<a href="https://colab.research.google.com/github/fjadidi2001/fake_news_detection/blob/main/DansE_Mar29.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data

# Mount Drive in Colab

In [2]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset
# path used further down (which lives under that directory) can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load the Dataset



In [6]:
import pandas as pd
import numpy as np

# Replace with your actual file path
# (this default points into the Google Drive mounted at /content/drive).
file_path = '/content/drive/MyDrive/Projects/Hayat/facebook-fact-check.csv'


# Load the whole CSV into a DataFrame, decoding the file as Latin-1.
df = pd.read_csv(file_path, encoding='latin-1')

# Initial Data Inspection

In [7]:
# Peek at the first two records, then list how many cells are missing per column.
print(df.iloc[:2])
print("\nMissing values:\n", df.isna().sum())

     account_id       post_id    Category               Page  \
0  1.840000e+14  1.040000e+15  mainstream  ABC News Politics   
1  1.840000e+14  1.040000e+15  mainstream  ABC News Politics   

                                            Post URL Date Published Post Type  \
0  https://www.facebook.com/ABCNewsPolitics/posts...      9/19/2016     video   
1  https://www.facebook.com/ABCNewsPolitics/posts...      9/19/2016      link   

               Rating Debate  share_count  reaction_count  comment_count  \
0  no factual content    NaN          NaN           146.0           15.0   
1         mostly true    NaN          1.0            33.0           34.0   

                                        Context Post  
0  WATCH: &quot;JEB EXCLAMATION POINT!&quot; - Je...  
1  Can either candidate move the needle in the de...  

Missing values:
 account_id           0
post_id              0
Category             0
Page                 0
Post URL             0
Date Published       0
Post Type    

# Handle Missing Values

In [10]:
# Count columns: a missing value is replaced by the median of its own column.
numeric_cols = ['share_count', 'reaction_count', 'comment_count']
df[numeric_cols] = df[numeric_cols].fillna(value=df[numeric_cols].median())

# Label columns: a missing value is replaced by an explicit placeholder string.
df['Rating'] = df['Rating'].fillna(value='Unknown')
df['Debate'] = df['Debate'].fillna(value='Not Specified')

# Not done here, but an alternative strategy is to drop the rows that lack a
# critical value instead of filling them (DataFrame.dropna with `subset`).

# Additional Preprocessing

In [12]:
import html

# Convert date to datetime format (the raw values look like 9/19/2016).
df['Date Published'] = pd.to_datetime(df['Date Published'], format='%m/%d/%Y')

# Clean text columns.
# The post text is HTML-escaped (quotes arrive as the entity &quot;), so the
# entities are decoded first; only then does removing the '"' character catch
# every quotation mark. Non-string cells (e.g. NaN) are passed through as-is.
df['Context Post'] = (
    df['Context Post']
    .map(lambda text: html.unescape(text) if isinstance(text, str) else text)
    .str.replace('"', '', regex=False)  # literal match, not a regex pattern
)

In [13]:
# Store the identifiers as text so they are never treated as quantities.
# read_csv parsed them as float64, and converting a float straight to str
# yields values such as '184000000000000.0'. Going through a nullable integer
# first gives the plain digit string '184000000000000'. The dtype check keeps
# the cell safe to re-run once the columns already hold strings.
# (A missing id would become the string '<NA>'; the inspection above reports
# no missing values in either column.)
for id_col in ('account_id', 'post_id'):
    ids = df[id_col]
    if pd.api.types.is_float_dtype(ids):
        ids = ids.astype('Int64')
    df[id_col] = ids.astype(str)

In [14]:
# Label-like columns: any cell still missing receives the 'Unknown' placeholder.
categorical_cols = ['Category', 'Page', 'Post Type']
df[categorical_cols] = df[categorical_cols].fillna(value='Unknown')

In [15]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df.info()

# Per-column count of cells that are still missing after the cleaning steps.
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2282 entries, 0 to 2281
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   account_id      2282 non-null   object        
 1   post_id         2282 non-null   object        
 2   Category        2282 non-null   object        
 3   Page            2282 non-null   object        
 4   Post URL        2282 non-null   object        
 5   Date Published  2282 non-null   datetime64[ns]
 6   Post Type       2282 non-null   object        
 7   Rating          2282 non-null   object        
 8   Debate          2282 non-null   object        
 9   share_count     2282 non-null   float64       
 10  reaction_count  2282 non-null   float64       
 11  comment_count   2282 non-null   float64       
 12  Context Post    2282 non-null   object        
dtypes: datetime64[ns](1), float64(3), object(9)
memory usage: 231.9+ KB
None
account_id        0
post_id        

In [16]:
def processElement(elem):
    """Run the text of one record through ``tkn.createCorpus``.

    Args:
        elem: An indexable record. Item 0 is returned untouched as the
            identifier; item 1 is the text handed to ``tkn.createCorpus``
            (use the 'Context Post' value instead of 'content' if needed).

    Returns:
        A ``(identifier, processed_text)`` tuple, where ``processed_text`` is
        whatever ``tkn.createCorpus(text, remove_stopwords=False)`` returns.

    Note:
        ``tkn`` is not defined in the visible part of this notebook; it must
        already exist in the kernel when this function is called.
    """
    return elem[0], tkn.createCorpus(elem[1], remove_stopwords=False)

# Main: Word Embeddings (Word2Vec and GloVe)

In [1]:
# Install required libraries
!pip install glove-python-binary mittens gensim

from glove import Corpus, Glove
from gensim.models import Word2Vec, FastText
from mittens import Mittens, GloVe
import numpy as np

class WordEmbeddings:
    """Encode a tokenized corpus as integer ids and train embedding matrices.

    The corpus is a list of documents; each document is a list of sentences;
    each sentence is a list of tokens. Token ids start at 1, so id 0 is never
    assigned to a token: it is used as the padding id, and row 0 of every
    embedding matrix stays all-zero.

    Attributes:
        corpus: The corpus passed to the constructor.
        sentences: Flat list of every sentence in the corpus (filled by
            ``prepareDocuments``); this is the training input of the models.
        word2id: Mapping token -> integer id (filled by ``prepareDocuments``).
        no_words: Vocabulary size plus one (for the reserved id 0).
        max_size: Token count of the longest document.
        no_docs: Number of documents in the corpus.
    """

    def __init__(self, corpus):
        self.corpus = corpus
        self.sentences = []
        self.word2id = {}
        self.no_words = 0
        self.max_size = 0
        self.no_docs = len(self.corpus)

    def _require_prepared(self):
        """Raise if there are no sentences to train an embedding model on."""
        if not self.sentences:
            raise RuntimeError(
                "No sentences available: call prepareDocuments() on a "
                "non-empty corpus before training embeddings."
            )

    def prepareDocuments(self):
        """Build the vocabulary and return the id-encoded, padded documents.

        Every document is flattened into one sequence of token ids (sentence
        boundaries are dropped) and right-padded with 0 up to the length of
        the longest document, so documents of different lengths still form a
        regular 2-D array.

        Calling this method again rebuilds the state from scratch, so
        re-running a notebook cell does not duplicate ``self.sentences``.

        Returns:
            Integer array of shape ``(number of documents, self.max_size)``.
        """
        # Reset everything this method derives, making repeated calls idempotent.
        self.sentences = []
        self.word2id = {}
        self.max_size = 0

        documents = []
        for document in self.corpus:
            doc = []
            for sentence in document:
                self.sentences.append(sentence)
                for word in sentence:
                    # First sighting of a word gets the next free id (1, 2, ...).
                    doc.append(self.word2id.setdefault(word, len(self.word2id) + 1))
            self.max_size = max(self.max_size, len(doc))
            documents.append(doc)

        # +1 accounts for the reserved padding id 0.
        self.no_words = len(self.word2id) + 1

        padded = np.zeros((len(documents), self.max_size), dtype=int)
        for row, doc in enumerate(documents):
            if doc:
                padded[row, :len(doc)] = doc
        return padded

    def word2vecEmbedding(self, no_components=128, window_size=10, epochs=100, workers=4, sg=0, learning_rate=0.05):
        """Train gensim Word2Vec on the corpus sentences.

        Args:
            no_components: Embedding dimensionality (gensim ``vector_size``).
            window_size: Context window (gensim ``window``).
            epochs: Number of training epochs.
            workers: Number of worker threads.
            sg: Forwarded unchanged to gensim's ``sg`` option.
            learning_rate: Initial learning rate (gensim ``alpha``).

        Returns:
            Array of shape ``(self.no_words, no_components)`` whose row ``i``
            is the vector of the word with id ``i``; row 0 is all zeros.
            The same array is stored in ``self.word2vec``.

        Raises:
            RuntimeError: If no sentences have been prepared yet.
        """
        self._require_prepared()
        model = Word2Vec(sentences=self.sentences, vector_size=no_components, window=window_size,
                         min_count=1, workers=workers, sg=sg, alpha=learning_rate, epochs=epochs)

        self.word2vec = np.zeros((self.no_words, no_components))
        for word, idx in self.word2id.items():
            if word in model.wv:
                self.word2vec[idx] = model.wv[word]
        return self.word2vec

    def word2GloVeEmbedding(self, no_components=128, window_size=10, epochs=100, workers=4, learning_rate=0.05):
        """Train a GloVe model (``glove`` package) on the corpus sentences.

        Args:
            no_components: Embedding dimensionality.
            window_size: Co-occurrence window used when fitting the corpus.
            epochs: Number of training epochs.
            workers: Number of training threads (``no_threads``).
            learning_rate: Learning rate of the GloVe model. Defaults to the
                value that was previously fixed inside this method.

        Returns:
            Array of shape ``(self.no_words, no_components)`` whose row ``i``
            is the vector of the word with id ``i``; row 0 is all zeros.
            The same array is stored in ``self.word2glove``.

        Raises:
            RuntimeError: If no sentences have been prepared yet.
            ImportError: If the ``glove`` package cannot be imported.
        """
        self._require_prepared()

        # Imported here because the package is optional and may not be
        # installable in every environment; failing at call time keeps the
        # rest of the class usable and yields an actionable message.
        try:
            from glove import Corpus, Glove
        except ImportError as exc:
            raise ImportError(
                "word2GloVeEmbedding() needs the 'glove' package "
                "(pip distribution 'glove-python-binary'), which could not be imported."
            ) from exc

        corpus = Corpus()
        corpus.fit(self.sentences, window=window_size)
        model = Glove(no_components=no_components, learning_rate=learning_rate)
        model.fit(corpus.matrix, epochs=epochs, no_threads=workers, verbose=False)
        model.add_dictionary(corpus.dictionary)

        self.word2glove = np.zeros((self.no_words, no_components))
        for word, idx in self.word2id.items():
            if word in corpus.dictionary:
                # corpus.dictionary has its own numbering; translate to ours.
                self.word2glove[idx] = model.word_vectors[corpus.dictionary[word]]
        return self.word2glove

if __name__ == '__main__':
    # Tiny demo corpus: a single document made of three tokenized sentences
    # (nesting is corpus -> documents -> sentences -> tokens).
    corpus = [[['Hello', 'this', 'tutorial', 'on', 'how', 'convert', 'word', 'integer', 'format'],
               ['this', 'beautiful', 'day'],
               ['Jack', 'going', 'office']]]

    # Build the vocabulary and the id-encoded documents.
    we = WordEmbeddings(corpus)
    docs = we.prepareDocuments()
    print("Documents Shape:", docs.shape)

    # Train Word2Vec with the method's default settings and show the size of
    # the resulting matrix (one row per word id, plus the all-zero row 0).
    w2v = we.word2vecEmbedding()
    print("Word2Vec Embeddings:", w2v.shape)

    # Same for GloVe, again with the method's default settings.
    w2g = we.word2GloVeEmbedding()
    print("GloVe Embeddings:", w2g.shape)


[31mERROR: Could not find a version that satisfies the requirement glove-python-binary (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for glove-python-binary[0m[31m
[0m

ModuleNotFoundError: No module named 'glove'