In [10]:
import kagglehub
import os
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

# Fetch the fake-news dataset through kagglehub and report where its files
# were placed on disk.
dataset_path = kagglehub.dataset_download("saurabhshahane/fake-news-classification")
print("Path to dataset files:", dataset_path)

# Load the dataset's CSV file into a DataFrame.
path_join = os.path.join(dataset_path, 'WELFake_Dataset.csv')
df = pd.read_csv(filepath_or_buffer=path_join)

# Plain-text preview of the first rows.
print(df.head())

Path to dataset files: /Users/fionamagee/.cache/kagglehub/datasets/saurabhshahane/fake-news-classification/versions/77
   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
1     Did they post their votes for Hillary already?      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1  


In [11]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix , classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [12]:
# Target vector: the dataset's binary `label` column.
y = df.label

# Show the class balance. The Series is printed explicitly because a notebook
# only renders a cell's *final* expression automatically; a bare expression in
# the middle of a cell is evaluated and silently discarded.
# NOTE(review): confirm the 1 -> 'real' / 0 -> 'fake' mapping against the
# dataset card before relying on these names.
print('Ratio of real and fake news:')
print(y.value_counts(normalize=True).rename({1: 'real', 0: 'fake'}))

# Replace missing values (e.g. NaN titles) with empty strings, then display
# the per-column null count to confirm none remain.
df = df.fillna('')
df.isnull().sum()

Ratio of real and fake news:


Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

Using Multinomial Naive Bayes 
-------------
Each article is vectorized into word counts, and Bayes' theorem is applied to those counts to predict whether the article is real or fake news.

$$
P(\text{Fake} \mid \text{Words}) =
\frac{P(\text{Words} \mid \text{Fake}) \times P(\text{Fake})}
     {P(\text{Words})}
$$

In [13]:
# Hold out 33% of the articles for evaluation; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    random_state=53,
    test_size=0.33,
)

In [14]:
# Bag-of-words features with English stop words removed.
count_vectorizer = CountVectorizer(stop_words='english')

# The vocabulary is learned from the training split only ...
count_train = count_vectorizer.fit_transform(raw_documents=X_train)
# ... and reused unchanged to encode the test split.
count_test = count_vectorizer.transform(raw_documents=X_test)

In [15]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with default settings, fitted on the training
# word-count matrix and its labels.
nb_classifier = MultinomialNB()
nb_classifier.fit(X=count_train, y=y_train)

In [16]:
# Predicted labels for the held-out word-count matrix.
pred = nb_classifier.predict(X=count_test)

In [17]:
# Per-class precision / recall / F1 of the Naive Bayes predictions on the test split.
nb_report = classification_report(y_true=y_test, y_pred=pred)
print(nb_report)

              precision    recall  f1-score   support

           0       0.87      0.91      0.89     11640
           1       0.91      0.87      0.89     12165

    accuracy                           0.89     23805
   macro avg       0.89      0.89      0.89     23805
weighted avg       0.89      0.89      0.89     23805



Bidirectional LSTM (recurrent network)
-------

In [18]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter

In [19]:
import tensorflow as tf

import keras

from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, Activation, RepeatVector, Bidirectional, LSTM, Dropout, Embedding
from sklearn.model_selection import train_test_split 
from keras.losses import sparse_categorical_crossentropy
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping


In [21]:
# Whitespace-delimited word count of every article. Computed once and reused
# for both statistics below, so the corpus is only split a single time.
word_counts = df['text'].astype(str).str.split().str.len()

max_length = word_counts.max()
print(f"The longest article in the dataset has {max_length} words.")

# It is also helpful to see the average length to avoid extreme padding
avg_length = word_counts.mean()
print(f"The average length is {avg_length:.2f} words.")

The longest article in the dataset has 24234 words.
The average length is 540.55 words.


In [22]:
# Layer that turns raw article strings into integer token-id sequences.
text_vectorization_layer = tf.keras.layers.TextVectorization(
    output_mode='int',
    max_tokens=10000,            # upper bound on the vocabulary size
    output_sequence_length=500,  # fixed sequence length of the layer's output
)

# Build the vocabulary from the training texts only.
text_vectorization_layer.adapt(X_train)

In [24]:
# Text classifier: raw string -> token ids -> embeddings -> two stacked
# bidirectional LSTMs -> dense head with a single sigmoid output.
# Every layer is taken from `tf.keras.layers` so the model is not assembled
# from a mix of the `keras` and `tf.keras` namespaces.
model = tf.keras.Sequential([
    text_vectorization_layer,
    tf.keras.layers.Embedding(
        # Size the embedding table from the adapted vectorizer instead of
        # repeating its `max_tokens` value, so the two can never drift apart.
        input_dim=text_vectorization_layer.vocabulary_size(),
        output_dim=128,
    ),
    # return_sequences=True hands the full sequence to the next LSTM.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    # Single sigmoid unit, paired with the binary cross-entropy loss below.
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# compile model
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

# model summary
model.summary()

BERT
-------