<a href="https://colab.research.google.com/github/fjadidi2001/fake_news_detection/blob/main/DansE_Mar29.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data

# Mount Drive in Colab

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at the mount point below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Load the Dataset



In [2]:
import html
import os

import numpy as np
import pandas as pd

# Path of the fact-check CSV on the mounted Drive; point this at your own copy.
file_path = '/content/drive/MyDrive/Projects/Hayat/facebook-fact-check.csv'

# The file is decoded as latin-1 instead of pandas' default encoding.
df = pd.read_csv(filepath_or_buffer=file_path, encoding='latin-1')

# Initial Data Inspection

In [3]:
# Show the first two records, then the number of null entries per column.
first_rows = df.head(2)
print(first_rows)  # See first 2 rows

null_counts = df.isnull().sum()
print("\nMissing values:\n", null_counts)

     account_id       post_id    Category               Page  \
0  1.840000e+14  1.040000e+15  mainstream  ABC News Politics   
1  1.840000e+14  1.040000e+15  mainstream  ABC News Politics   

                                            Post URL Date Published Post Type  \
0  https://www.facebook.com/ABCNewsPolitics/posts...      9/19/2016     video   
1  https://www.facebook.com/ABCNewsPolitics/posts...      9/19/2016      link   

               Rating Debate  share_count  reaction_count  comment_count  \
0  no factual content    NaN          NaN           146.0           15.0   
1         mostly true    NaN          1.0            33.0           34.0   

                                        Context Post  
0  WATCH: &quot;JEB EXCLAMATION POINT!&quot; - Je...  
1  Can either candidate move the needle in the de...  

Missing values:
 account_id           0
post_id              0
Category             0
Page                 0
Post URL             0
Date Published       0
Post Type    

# Handle Missing Values

In [4]:
# Strategy 1: categorical columns receive an explicit placeholder label
categorical_placeholders = {'Rating': 'Unknown', 'Debate': 'Not Specified'}
for column_name, placeholder in categorical_placeholders.items():
    df[column_name] = df[column_name].fillna(placeholder)

# Strategy 2: numeric count columns fall back to their own column median
numeric_cols = ['share_count', 'reaction_count', 'comment_count']
column_medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(column_medians)

# Alternative: drop rows whose critical fields are missing instead of imputing
# df = df.dropna(subset=['important_column'])

# Additional Preprocessing

In [7]:
# Convert date to datetime format (values are written month/day/year)
df['Date Published'] = pd.to_datetime(df['Date Published'], format='%m/%d/%Y')

# Clean text columns.
# The raw post text carries HTML-escaped quotes (&quot;), which a replace of
# the literal '"' character never matches. Decode HTML entities first, then
# drop the quote characters. Missing values are passed through untouched.
df['Context Post'] = (
    df['Context Post']
    .map(html.unescape, na_action='ignore')
    .str.replace('"', '', regex=False)
)

In [8]:
# Store both identifier columns as text so they act as labels, not numbers.
# NOTE(review): the head() preview prints these columns in float notation, so
# the resulting strings may be float renderings (e.g. a trailing ".0") - confirm.
for id_column in ('account_id', 'post_id'):
    df[id_column] = df[id_column].astype(str)

In [9]:
# Label any gaps in the descriptive columns with an explicit 'Unknown' value.
categorical_cols = ['Category', 'Page', 'Post Type']
filled_categoricals = df[categorical_cols].fillna('Unknown')
df[categorical_cols] = filled_categoricals

In [10]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called bare; wrapping it in print() appends a stray "None" line.
df.info()
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2282 entries, 0 to 2281
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   account_id      2282 non-null   object        
 1   post_id         2282 non-null   object        
 2   Category        2282 non-null   object        
 3   Page            2282 non-null   object        
 4   Post URL        2282 non-null   object        
 5   Date Published  2282 non-null   datetime64[ns]
 6   Post Type       2282 non-null   object        
 7   Rating          2282 non-null   object        
 8   Debate          2282 non-null   object        
 9   share_count     2282 non-null   float64       
 10  reaction_count  2282 non-null   float64       
 11  comment_count   2282 non-null   float64       
 12  Context Post    2282 non-null   object        
dtypes: datetime64[ns](1), float64(3), object(9)
memory usage: 231.9+ KB
None
account_id        0
post_id        

In [11]:
def processElement(elem):
    """Run the text part of an ``(id, text)`` pair through ``tkn.createCorpus``.

    ``elem[0]`` is taken as the identifier and ``elem[1]`` as the text. The
    text is passed to ``tkn.createCorpus`` (defined outside this cell) with
    ``remove_stopwords=False``.

    Returns:
        A ``(identifier, processed_text)`` tuple.

    Note: use 'Context Post' instead of 'content' as the text source if needed.
    """
    identifier, raw_text = elem[0], elem[1]
    return identifier, tkn.createCorpus(raw_text, remove_stopwords=False)

# Main source

In [17]:
from google.colab import drive
drive.mount('/content/drive')

# Download and save to Drive (run once)
# !wget http://nlp.stanford.edu/data/glove.6B.zip -O /content/drive/MyDrive/glove.6B.zip
!unzip /content/drive/MyDrive/glove.6B.zip -d /content/drive/MyDrive/glove

# Load from Drive in future sessions
embeddings_index = {}
with open('/content/drive/MyDrive/glove/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors.')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Archive:  /content/drive/MyDrive/glove.6B.zip
  inflating: /content/drive/MyDrive/glove/glove.6B.50d.txt  
  inflating: /content/drive/MyDrive/glove/glove.6B.100d.txt  
  inflating: /content/drive/MyDrive/glove/glove.6B.200d.txt  
  inflating: /content/drive/MyDrive/glove/glove.6B.300d.txt  
Found 400000 word vectors.


In [None]:


class WordEmbeddings:
    """Encode a tokenized corpus as integer ids and train embedding matrices.

    The corpus is iterated as documents -> sentences -> words. Word ids start
    at 1, and row 0 of every embedding matrix built by this class is an
    all-zero vector. ``prepareDocuments`` must be called before any of the
    embedding methods, because it builds the vocabulary they rely on.
    """

    def __init__(self, corpus, normalize_tfidf=False):
        """Store the corpus and initialise empty vocabulary state.

        Args:
            corpus: sized collection of documents; each document is an iterable
                of sentences and each sentence a sequence of hashable words.
            normalize_tfidf: stored on the instance; no method of this class
                reads it.
        """
        self.corpus = corpus
        self.normalize_tfidf = normalize_tfidf
        self.documents = []   # one flat list of word ids per document
        self.sentences = []   # every sentence of the corpus, in reading order
        self.word2id = {}     # word -> integer id (ids start at 1)
        self.no_words = 0     # vocabulary size + 1 (accounts for row/id 0)
        self.max_size = 0     # length of the longest encoded document
        self.no_docs = len(self.corpus)

    def prepareDocuments(self):
        """Build the vocabulary and encode each document as a list of word ids.

        All sentences of a document are concatenated into one id list. The
        method resets its own state first, so calling it again (for example
        when a notebook cell is re-run) rebuilds the same result instead of
        appending duplicate documents and sentences.

        Returns:
            list[list[int]]: one list of word ids per document.
        """
        self.documents = []
        self.sentences = []
        self.word2id = {}
        self.max_size = 0

        for document in self.corpus:
            doc = []
            for sentence in document:
                self.sentences.append(sentence)
                for word in sentence:
                    # First sighting of a word assigns the next free id (1, 2, ...).
                    doc.append(self.word2id.setdefault(word, len(self.word2id) + 1))
            self.max_size = max(self.max_size, len(doc))
            self.documents.append(doc)

        self.no_words = len(self.word2id) + 1
        return self.documents

    def _buildCooccurrenceMatrix(self, window_size):
        """Count, per word, the words within ``window_size`` positions in a sentence.

        Returns a dense square array with one row/column per vocabulary word;
        index ``word_id - 1`` addresses a word. Memory grows quadratically
        with the vocabulary size.
        """
        vocab_size = len(self.word2id)
        cooc_matrix = np.zeros((vocab_size, vocab_size))
        for sentence in self.sentences:
            indices = [self.word2id[word] - 1 for word in sentence]
            for i, word_idx in enumerate(indices):
                start = max(0, i - window_size)
                stop = min(len(indices), i + window_size + 1)
                for j in range(start, stop):
                    if i != j:
                        cooc_matrix[word_idx, indices[j]] += 1
        return cooc_matrix

    def _buildEmbeddingMatrix(self, vector_for, no_components):
        """Return a ``(no_words, no_components)`` array; row 0 stays all zeros.

        ``vector_for(word)`` supplies the vector stored at the word's id row.
        """
        matrix = np.zeros((self.no_words, no_components))
        for word, idx in self.word2id.items():
            matrix[idx] = vector_for(word)
        return matrix

    def word2vecEmbedding(self, window_size=10, no_components=128, epochs=100, workers=os.cpu_count(), sg=0, learning_rate=0.05):
        """Train ``Word2Vec`` on the collected sentences and return the id-indexed matrix.

        Args:
            window_size: passed as ``window``.
            no_components: embedding width, passed as ``vector_size``.
            epochs: passed as ``epochs``.
            workers: passed as ``workers``; the default is evaluated once, when
                the class is defined.
            sg: passed as ``sg``.
            learning_rate: passed as ``alpha``.

        Returns:
            numpy.ndarray of shape ``(no_words, no_components)``; also stored
            as ``self.word2vec``.
        """
        model = Word2Vec(self.sentences, vector_size=no_components, window=window_size, min_count=1,
                         workers=workers, sg=sg, alpha=learning_rate, epochs=epochs)
        self.word2vec = self._buildEmbeddingMatrix(lambda word: model.wv[word], no_components)
        return self.word2vec

    def word2GloVeEmbedding(self, window_size=10, no_components=128, epochs=100, workers=os.cpu_count(), learning_rate=0.05):
        """Fit ``GloVe`` on a windowed co-occurrence matrix and return the id-indexed matrix.

        Args:
            window_size: co-occurrence window on each side of a word.
            no_components: embedding width, passed as ``n``.
            epochs: passed as ``max_iter`` (it used to be ignored).
            workers: accepted for signature parity with the other methods; unused.
            learning_rate: passed as ``learning_rate``.

        Returns:
            numpy.ndarray of shape ``(no_words, no_components)``; also stored
            as ``self.word2glove``.
        """
        cooc_matrix = self._buildCooccurrenceMatrix(window_size)
        model = GloVe(n=no_components, max_iter=epochs, learning_rate=learning_rate)
        embeddings = model.fit(cooc_matrix)
        # Row k of the fitted embeddings belongs to the word with id k + 1.
        self.word2glove = self._buildEmbeddingMatrix(
            lambda word: embeddings[self.word2id[word] - 1], no_components)
        return self.word2glove

    def word2FastTextEmbeddings(self, window_size=10, no_components=128, epochs=100, workers=os.cpu_count(), sg=0, learning_rate=0.05):
        """Train ``FastText`` on the collected sentences and return the id-indexed matrix.

        Arguments are forwarded exactly as in ``word2vecEmbedding``.

        Returns:
            numpy.ndarray of shape ``(no_words, no_components)``; also stored
            as ``self.word2fasttext``.
        """
        model = FastText(self.sentences, vector_size=no_components, window=window_size, min_count=1,
                         workers=workers, sg=sg, alpha=learning_rate, epochs=epochs)
        self.word2fasttext = self._buildEmbeddingMatrix(lambda word: model.wv[word], no_components)
        return self.word2fasttext

    def word2MittensEmbedding(self, window_size=10, no_components=128, epochs=100, workers=os.cpu_count(), learning_rate=0.05):
        """Fit ``Mittens`` on a windowed co-occurrence matrix and return the id-indexed matrix.

        Args:
            window_size: co-occurrence window on each side of a word.
            no_components: embedding width, passed as ``n``.
            epochs: passed as ``max_iter``.
            workers: accepted for signature parity with the other methods; unused.
            learning_rate: passed as ``learning_rate``. It used to be passed as
                the ``mittens`` argument, which is a different setting, so the
                learning rate itself was never applied.

        Returns:
            numpy.ndarray of shape ``(no_words, no_components)``; also stored
            as ``self.word2mittens``.
        """
        vocab = list(self.word2id.keys())  # insertion order == id order
        cooc_matrix = self._buildCooccurrenceMatrix(window_size)
        model = Mittens(n=no_components, max_iter=epochs, learning_rate=learning_rate)
        embeddings = model.fit(cooc_matrix, vocab=vocab)
        # Row k of the fitted embeddings belongs to the word with id k + 1.
        self.word2mittens = self._buildEmbeddingMatrix(
            lambda word: embeddings[self.word2id[word] - 1], no_components)
        return self.word2mittens

if __name__ == '__main__':
    # Toy corpus: three documents, each holding the same three tokenized sentences.
    sample_document = [
        ['Hello', 'this', 'tutorial', 'on', 'how', 'convert', 'word', 'integer', 'format'],
        ['this', 'beautiful', 'day'],
        ['Jack', 'going', 'office'],
    ]
    corpus = [[list(sentence) for sentence in sample_document] for _ in range(3)]

    def _show(label, matrix):
        """Print the label next to the matrix shape, then the matrix itself."""
        print(label, matrix.shape)
        print(matrix)

    we = WordEmbeddings(corpus)
    docs = we.prepareDocuments()
    print(np.array(docs, dtype=object).shape)
    print(docs)

    # Train each embedding in turn and report it right after training.
    w2v = we.word2vecEmbedding()
    _show("Word2Vec:", w2v)

    w2f = we.word2FastTextEmbeddings()
    _show("FastText:", w2f)

    w2g = we.word2GloVeEmbedding()
    _show("GloVe:", w2g)

    w2m = we.word2MittensEmbedding()
    _show("Mittens:", w2m)

    # Compare the vector stored at row 1 (the first word id) across all four.
    print("\n\nComparison for word ID 1:")
    for label, matrix in (("Word2Vec:", w2v), ("FastText:", w2f), ("GloVe:", w2g), ("Mittens:", w2m)):
        print(label, matrix[1])

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: mittens 0.2
Uninstalling mittens-0.2:
  Successfully uninstalled mittens-0.2
Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Found existing installation: scipy 1.13.1
Uninstalling scipy-1.13.1:
  Successfully uninstalled scipy-1.13.1
Found existing installation: smart-open 7.1.0
Uninstalling smart-open-7.1.0:
  Successfully uninstalled smart-open-7.1.0
Found existing installation: wrapt 1.17.2
Uninstalling wrapt-1.17.2:
  Successfully uninstalled wrapt-1.17.2
[0mCollecting numpy==1.26.4
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting mittens==0.2
  Using cached mittens-0.2-py3-none-any.whl.metadata (377 bytes)
Collecting gensim==4.3.3
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.met