In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import kagglehub
from collections import Counter

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM, Dense

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## 1. Download dataset and EDA

In [2]:
path = kagglehub.dataset_download("ronikdedhia/fake-news")
print("Path to dataset files:", path)

Path to dataset files: C:\Users\Chiara\.cache\kagglehub\datasets\ronikdedhia\fake-news\versions\1


In [3]:
os.listdir(path)

['fake_news.csv']

In [4]:
df = pd.read_csv(f"{path}\\fake_news.csv")

In [5]:
df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


label = 1 $\rightarrow$ unreliable

label = 0 $\rightarrow$ reliable

In [6]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
# Drop null values

df = df.dropna()

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18285 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      18285 non-null  int64 
 1   title   18285 non-null  object
 2   author  18285 non-null  object
 3   text    18285 non-null  object
 4   label   18285 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 857.1+ KB


In [9]:
# Get the dependent and the independent features

X = df.drop("label", axis=1)
y = df["label"]

print("X shape: ", X.shape)
print("y shape: ", y.shape)

X shape:  (18285, 4)
y shape:  (18285,)


## 2. Text pre-processing

* Choice of vocabulary size
* Tokenization, stemming and stopword
* One Hot representation
* Padding

In [10]:
# How many unique words?

all_words = []

for title in df["title"]:
    all_words.extend(title.split())

word_counts = Counter(all_words)

print("Total words: ", len(all_words))
print("Unique words: ", len(word_counts))

Total words:  229006
Unique words:  37887


In [12]:
# How many words to cover 80% of the corpus?

total_words = sum(word_counts.values())
cumulative = 0

for i, (word, count) in enumerate(word_counts.most_common()):
    cumulative += count
    if cumulative / total_words >= 0.8:
        print("I need ", i, "words to cover 80%")
        break

I need  5497 words to cover 80%


I choose the vocqbulqry size $\rightarrow$ 5000 words.

In [13]:
vocabulary_size = 5000

In [14]:
messages = X.copy()
messages["title"][1]

'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

In [15]:
messages.reset_index(inplace=True)

In [18]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chiara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Pre-processing on "title"

ps = PorterStemmer()
corpus = []

for i in range(0, len(messages)):
    title = re.sub("[^a-zA-Z]", " ", messages["title"][i])
    title = title.lower()
    title = title.split()

    title = [ps.stem(word) for word in title if not word in stopwords.words("english")]
    title = " ".join(title)
    corpus.append(title)

In [23]:
corpus[1]

'flynn hillari clinton big woman campu breitbart'

In [24]:
# One hot representation

onehot_repr = [one_hot(words, vocabulary_size) for words in corpus]
onehot_repr[1]

[3551, 4496, 955, 4871, 4234, 1206, 4587]

In [25]:
# Study of the lengths of sentences

lengths = [len(seq) for seq in onehot_repr]

print(f"Mean {np.mean(lengths)}")
print(f"Median {np.median(lengths)}")
print(f"90 percentile {np.percentile(lengths, 90)}")
print(f"95 percentile {np.percentile(lengths, 95)}")
print(f"99 percentile {np.percentile(lengths, 99)}")
print(f"Max length {np.max(lengths)}")

Mean 8.725731473885698
Median 9.0
90 percentile 12.0
95 percentile 13.0
99 percentile 15.0
Max length 47


I choose length 20 for sentences.

In [28]:
# Padding

sent_length = 20
embedded_docs = pad_sequences(onehot_repr, padding="pre", maxlen=sent_length)

print(embedded_docs[1])

[   0    0    0    0    0    0    0    0    0    0    0    0    0 3551
 4496  955 4871 4234 1206 4587]


## 3. Modelling: LSTM

In [36]:
# Model

embedding_vector_features = 40

model = Sequential()
model.add(Embedding(vocabulary_size, embedding_vector_features, input_shape=(sent_length,)))
model.add(LSTM(100))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

  super().__init__(**kwargs)


In [37]:
model.summary()

In [39]:
# Convert to array

X_final = np.array(embedded_docs)
y_final = np.array(y)

print("X_final shape: ", X_final.shape)
print("y_final shape: ", y_final.shape)

X_final shape:  (18285, 20)
y_final shape:  (18285,)


In [42]:
# Split train and test data

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=42)

In [43]:
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)

X_train shape:  (14628, 20)
X_test shape:  (3657, 20)


In [44]:
# Training

model.fit(X_train, y_train, epochs=10, batch_size=64)

Epoch 1/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8816 - loss: 0.2730
Epoch 2/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9436 - loss: 0.1413
Epoch 3/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9578 - loss: 0.1059
Epoch 4/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9696 - loss: 0.0820
Epoch 5/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9764 - loss: 0.0664
Epoch 6/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9810 - loss: 0.0526
Epoch 7/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9871 - loss: 0.0384
Epoch 8/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9910 - loss: 0.0285
Epoch 9/10
[1m229/229[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x267ea2ebf70>

## 4. Performances

In [46]:
# Prediction 

y_pred = model.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, 0)

[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [48]:
confusion_matrix(y_test, y_pred)

array([[1879,  203],
       [ 140, 1435]])

In [50]:
accuracy_score(y_test, y_pred)

0.9062072737216298

In [54]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.90      0.92      2082
           1       0.88      0.91      0.89      1575

    accuracy                           0.91      3657
   macro avg       0.90      0.91      0.90      3657
weighted avg       0.91      0.91      0.91      3657



## 5. Attempt to increase accuracy: dropout

In [56]:
# Model with dropout

embedding_vector_features = 40

model = Sequential()
model.add(Embedding(vocabulary_size, embedding_vector_features, input_shape=(sent_length,)))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

  super().__init__(**kwargs)


In [57]:
# Training

model.fit(X_train, y_train, epochs=10, batch_size=64)

Epoch 1/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8601 - loss: 0.2957
Epoch 2/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9397 - loss: 0.1457
Epoch 3/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9554 - loss: 0.1150
Epoch 4/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9663 - loss: 0.0900
Epoch 5/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9761 - loss: 0.0663
Epoch 6/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9832 - loss: 0.0519
Epoch 7/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9867 - loss: 0.0402
Epoch 8/10
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9880 - loss: 0.0338
Epoch 9/10
[1m229/229[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x267ed9ee920>

In [58]:
# Prediction 

y_pred = model.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, 0)

[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [59]:
confusion_matrix(y_test, y_pred)

array([[1908,  174],
       [ 128, 1447]])

In [60]:
accuracy_score(y_test, y_pred)

0.917418649165983

In [61]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      2082
           1       0.89      0.92      0.91      1575

    accuracy                           0.92      3657
   macro avg       0.91      0.92      0.92      3657
weighted avg       0.92      0.92      0.92      3657



Accuracy has slightly improved, but not significantly.