<a href="https://colab.research.google.com/github/fjadidi2001/fake_news_detection/blob/main/DansE_Mar29.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data

# Mount Drive in Colab

In [2]:
# Colab-only step: mount Google Drive at /content/drive so files stored there
# can be opened by path from later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load the Dataset



In [6]:
import pandas as pd
import numpy as np

# Replace with your actual file path
# (this one points into the mounted Google Drive, so it only resolves on Colab).
file_path = '/content/drive/MyDrive/Projects/Hayat/facebook-fact-check.csv'


# latin-1 maps every byte to a character, so decoding cannot fail.
# NOTE(review): if the CSV is actually UTF-8, non-ASCII text will be read as
# mojibake instead of raising — confirm the file's real encoding.
df = pd.read_csv(file_path, encoding='latin-1')

# Initial Data Inspection

In [7]:
# Quick look at the freshly loaded frame: the first rows, then the number of
# missing values in each column.
print(df.head(2))  # See first 2 rows
print("\nMissing values:\n", df.isnull().sum())

     account_id       post_id    Category               Page  \
0  1.840000e+14  1.040000e+15  mainstream  ABC News Politics   
1  1.840000e+14  1.040000e+15  mainstream  ABC News Politics   

                                            Post URL Date Published Post Type  \
0  https://www.facebook.com/ABCNewsPolitics/posts...      9/19/2016     video   
1  https://www.facebook.com/ABCNewsPolitics/posts...      9/19/2016      link   

               Rating Debate  share_count  reaction_count  comment_count  \
0  no factual content    NaN          NaN           146.0           15.0   
1         mostly true    NaN          1.0            33.0           34.0   

                                        Context Post  
0  WATCH: &quot;JEB EXCLAMATION POINT!&quot; - Je...  
1  Can either candidate move the needle in the de...  

Missing values:
 account_id           0
post_id              0
Category             0
Page                 0
Post URL             0
Date Published       0
Post Type    

# Handle Missing Values

In [10]:
# Label-like columns: a missing entry becomes an explicit placeholder string.
for text_col, placeholder in (('Rating', 'Unknown'), ('Debate', 'Not Specified')):
    df[text_col] = df[text_col].fillna(placeholder)

# Engagement counts: a missing entry takes the median of its own column.
numeric_cols = ['share_count', 'reaction_count', 'comment_count']
column_medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(column_medians)

# Alternative to filling: drop the rows whose critical columns are missing.
# df = df.dropna(subset=['important_column'])

# Additional Preprocessing

In [12]:
import html

# Convert date to datetime format (the raw values look like 9/19/2016).
df['Date Published'] = pd.to_datetime(df['Date Published'], format='%m/%d/%Y')

# Clean text columns.
# The post text carries HTML entities (e.g. &quot;), which a plain removal of
# the '"' character never matches. Decode the entities first, then drop the
# double quotes. Missing values are passed through untouched.
df['Context Post'] = (
    df['Context Post']
    .map(html.unescape, na_action='ignore')
    .str.replace('"', '', regex=False)
)

In [13]:
# The id columns were parsed as floats (they print as 1.84e+14). Casting a
# float straight to str yields values such as '184000000000000.0', so convert
# to a whole-number dtype first and only then to text. Going through
# pd.to_numeric also keeps this cell safe to re-run once the columns are
# already strings.
for id_col in ('account_id', 'post_id'):
    df[id_col] = (
        pd.to_numeric(df[id_col])
        .round()
        .astype('Int64')
        .astype(str)
    )

In [14]:
# Replace any missing value in these label columns with the 'Unknown'
# placeholder so they contain no NaN afterwards.
categorical_cols = ['Category', 'Page', 'Post Type']
df[categorical_cols] = df[categorical_cols].fillna('Unknown')

In [15]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2282 entries, 0 to 2281
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   account_id      2282 non-null   object        
 1   post_id         2282 non-null   object        
 2   Category        2282 non-null   object        
 3   Page            2282 non-null   object        
 4   Post URL        2282 non-null   object        
 5   Date Published  2282 non-null   datetime64[ns]
 6   Post Type       2282 non-null   object        
 7   Rating          2282 non-null   object        
 8   Debate          2282 non-null   object        
 9   share_count     2282 non-null   float64       
 10  reaction_count  2282 non-null   float64       
 11  comment_count   2282 non-null   float64       
 12  Context Post    2282 non-null   object        
dtypes: datetime64[ns](1), float64(3), object(9)
memory usage: 231.9+ KB
None
account_id        0
post_id        

In [16]:
def processElement(elem):
    """Tokenise the text part of an ``(id, text)`` record.

    ``elem`` is indexed positionally: item 0 is returned unchanged as the
    identifier, item 1 is passed to ``tkn.createCorpus`` with stop-word
    removal switched off. Use the 'Context Post' column instead of 'content'
    as the text source if needed.

    NOTE(review): ``tkn`` is not defined in this cell; it must already exist
    in the notebook namespace when this function is called — confirm where it
    comes from.

    Returns:
        tuple: ``(identifier, result of tkn.createCorpus)``.
    """
    identifier, raw_text = elem[0], elem[1]
    return identifier, tkn.createCorpus(raw_text, remove_stopwords=False)

# Main

In [5]:
# NOTE(review): this cell carries execution count 5 while the cell that imports
# `glove` carries count 2 and its saved output is "ModuleNotFoundError: No
# module named 'glove'". Run this install before the import cell, and confirm
# that glove-python actually installs on this runtime.
!pip install glove-python mittens gensim  # Install additional dependencies

Collecting glove-python
  Using cached glove_python-0.1.0.tar.gz (263 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mittens
  Downloading mittens-0.2-py3-none-any.whl.metadata (377 bytes)
Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy (from glove-python)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy (from glove-python)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading mittens-0.2-py3-none-any.whl (15 kB)
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)

In [2]:
from mittens import Mittens
from glove import Corpus, Glove
from gensim.models import Word2Vec, FastText
from gensim import corpora
import numpy as np

class WordEmbeddings:
    """Integer-encode a tokenised corpus and train embedding matrices for it.

    ``corpus`` is a list of documents; each document is a list of sentences;
    each sentence is a list of tokens. Call :meth:`preprareDocuments` first:
    it builds the vocabulary (``word2id``, ids starting at 1) that every
    ``word2*Embedding`` method needs. Each embedding method returns a
    ``(no_words, no_components)`` matrix in which row ``i`` is the vector of
    the word whose id is ``i`` and row 0 is all zeros (no word has id 0).
    """

    def __init__(self, corpus, normalize_tfidf=False):
        """Store the corpus and initialise empty vocabulary state.

        Args:
            corpus: list of documents -> list of sentences -> list of tokens.
            normalize_tfidf: stored on the instance; not read by any method
                of this class.
        """
        self.corpus = corpus
        self.normalize_tfidf = normalize_tfidf
        self.documents = []   # one flat list of word ids per document
        self.sentences = []   # every sentence of every document, in order
        self.word2id = {}     # token -> integer id (first id is 1)
        self.no_words = 0     # vocabulary size + 1 (row 0 is the zero row)
        self.max_size = 0     # max size of largest document
        self.no_docs = len(self.corpus)

    def preprareDocuments(self):
        """Build the vocabulary and encode every document as word ids.

        The method starts from a clean state on every call, so running it
        again (for instance when a notebook cell is re-executed) rebuilds the
        same result instead of appending duplicates and re-issuing ids.

        Returns:
            numpy.ndarray: a 2-D integer array when all documents have the
            same number of tokens; otherwise a 1-D object array holding one
            list of ids per document (building a regular array from ragged
            lists raises on recent NumPy versions).
        """
        self.documents = []
        self.sentences = []
        self.word2id = {}
        self.max_size = 0

        for document in self.corpus:
            doc = []
            for sentence in document:
                self.sentences.append(sentence)
                for word in sentence:
                    # A word seen for the first time gets the next free id.
                    doc.append(self.word2id.setdefault(word, len(self.word2id) + 1))
            self.max_size = max(self.max_size, len(doc))
            self.documents.append(doc)

        self.no_words = len(self.word2id) + 1

        if len({len(doc) for doc in self.documents}) <= 1:
            return np.array(self.documents)

        ragged = np.empty(len(self.documents), dtype=object)
        for index, doc in enumerate(self.documents):
            ragged[index] = doc
        return ragged

    # Correctly spelled alias; the original name is kept for existing callers.
    prepareDocuments = preprareDocuments

    def _allocateMatrix(self, no_components):
        """Return an uninitialised (no_words, no_components) matrix with row 0 zeroed."""
        if self.no_words == 0:
            raise RuntimeError(
                "vocabulary is empty: call preprareDocuments() before training embeddings"
            )
        matrix = np.empty(shape=(self.no_words, no_components))
        matrix[0] = 0.0
        return matrix

    def _fitGlove(self, window_size, no_components, epochs, workers, learning_rate):
        """Fit GloVe on ``self.sentences``; return ``(corpus, model)``."""
        corpus = Corpus()
        # Build the co-occurrence matrix that GloVe is trained on.
        corpus.fit(self.sentences, window=window_size)
        model = Glove(no_components=no_components, learning_rate=learning_rate)
        model.fit(corpus.matrix, epochs=epochs, no_threads=workers, verbose=False)
        model.add_dictionary(corpus.dictionary)
        return corpus, model

    def word2vecEmbedding(self, window_size=10, no_components=128, epochs=100, workers=4, sg=0, learning_rate=0.05):
        """Train gensim Word2Vec on the sentences and return the id-indexed matrix.

        Args:
            window_size: context window passed to Word2Vec.
            no_components: embedding dimensionality.
            epochs: number of training epochs.
            workers: number of worker threads.
            sg: passed through to Word2Vec's ``sg`` switch.
            learning_rate: passed to Word2Vec as ``alpha``.

        Returns:
            numpy.ndarray: the matrix, also stored as ``self.word2vec``.
        """
        self.word2vec = self._allocateMatrix(no_components)
        # min_count=1 keeps every token, so each vocabulary word has a vector.
        model = Word2Vec(self.sentences, vector_size=no_components, window=window_size, min_count=1, workers=workers, sg=sg, alpha=learning_rate, epochs=epochs)

        for word, word_id in self.word2id.items():
            self.word2vec[word_id] = np.array(model.wv[word])

        return self.word2vec

    def word2GloVeEmbedding(self, window_size=10, no_components=128, epochs=100, workers=4, learning_rate=0.05):
        """Train GloVe on the sentences and return the id-indexed matrix.

        ``learning_rate`` is now forwarded to the GloVe model (it used to be
        ignored in favour of a hard-coded 0.05, which is still the default).

        Returns:
            numpy.ndarray: the matrix, also stored as ``self.word2glove``.
        """
        self.word2glove = self._allocateMatrix(no_components)
        corpus, model = self._fitGlove(window_size, no_components, epochs, workers, learning_rate)

        for word, word_id in self.word2id.items():
            self.word2glove[word_id] = model.word_vectors[corpus.dictionary[word]]

        return self.word2glove

    def word2FastTextEmbeddings(self, window_size=10, no_components=128, epochs=100, workers=4, sg=0, learning_rate=0.05):
        """Train gensim FastText on the sentences and return the id-indexed matrix.

        Parameters have the same meaning as in :meth:`word2vecEmbedding`.

        Returns:
            numpy.ndarray: the matrix, also stored as ``self.word2fasttext``.
        """
        self.word2fasttext = self._allocateMatrix(no_components)
        model = FastText(self.sentences, vector_size=no_components, window=window_size, min_count=1, workers=workers, sg=sg, alpha=learning_rate, epochs=epochs)

        for word, word_id in self.word2id.items():
            self.word2fasttext[word_id] = np.array(model.wv[word])

        return self.word2fasttext

    def word2MittensEmbedding(self, window_size=10, no_components=128, epochs=100, workers=4, learning_rate=0.05):
        """Fit Mittens, initialised from GloVe vectors trained on the same sentences.

        Args:
            window_size: co-occurrence window for the GloVe corpus.
            no_components: embedding dimensionality (GloVe and Mittens).
            epochs: GloVe epochs and Mittens ``max_iter``.
            workers: threads used for the GloVe fit.
            learning_rate: learning rate of the GloVe initialisation (it used
                to be ignored in favour of a hard-coded 0.05). It is not
                passed to Mittens, which keeps its own default.

        Returns:
            numpy.ndarray: the matrix, also stored as ``self.word2mittens``.
        """
        self.word2mittens = self._allocateMatrix(no_components)
        corpus, glove_model = self._fitGlove(window_size, no_components, epochs, workers, learning_rate)

        # Mittens takes vocab[i] as the word for row/column i of the
        # co-occurrence matrix, so order the vocabulary by the corpus' own ids
        # rather than assuming they coincide with self.word2id's order.
        vocabulary = sorted(corpus.dictionary, key=corpus.dictionary.get)
        word2glove = {
            word: glove_model.word_vectors[corpus.dictionary[word]]
            for word in vocabulary
        }

        mittens_model = Mittens(n=no_components, max_iter=epochs)
        fitted = mittens_model.fit(corpus.matrix.toarray(), vocab=vocabulary, initial_embedding_dict=word2glove)

        # Copy each fitted row to the row addressed by this class' word id.
        for row, word in enumerate(vocabulary):
            self.word2mittens[self.word2id[word]] = fitted[row]

        return self.word2mittens

if __name__ == '__main__':
    # Toy corpus: the same three-sentence document, three times over. Each
    # copy is built from fresh lists so the documents share no objects.
    sentence_templates = (
        ('Hello', 'this', 'tutorial', 'on', 'how', 'convert', 'word', ' integer', 'format'),
        ('this', 'beautiful', 'day'),
        ('Jack', 'going', 'office'),
    )
    corpus = [[list(tokens) for tokens in sentence_templates] for _ in range(3)]

    # Encode the documents as word ids.
    we = WordEmbeddings(corpus)
    docs = we.preprareDocuments()
    print(docs.shape, docs, sep='\n')

    # Train each embedding in turn, printing its result before the next starts.
    w2v = we.word2vecEmbedding()
    print(w2v)

    w2f = we.word2FastTextEmbeddings()
    print(w2f.shape, w2f, sep='\n')

    w2g = we.word2GloVeEmbedding()
    print(w2g.shape, w2g, sep='\n')

    w2m = we.word2MittensEmbedding()
    print(w2m.shape, w2m, sep='\n')

    print("\n\n")

    # Vector of the word with id 1 under each of the four models.
    print(w2v[1], w2f[1], w2g[1], w2m[1], sep='\n')

ModuleNotFoundError: No module named 'glove'

In [6]:
# Installs the Python development headers and the C/C++ build toolchain.
# NOTE(review): presumably needed so pip can compile glove-python from its
# source tarball — confirm, and run this before the pip install if so.
!sudo apt-get install python3-dev build-essential


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
python3-dev is already the newest version (3.10.6-1~22.04.1).
python3-dev set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
