## import libraries

In [13]:
import pandas as pd
import numpy as np

In [14]:
df = pd.read_csv("https://raw.githubusercontent.com/lutzhamel/fake-news/refs/heads/master/data/fake_or_real_news.csv")
df.head()

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [15]:
df.shape

(6335, 4)

In [16]:
df = pd.read_csv("/content/fake_news.csv", on_bad_lines='skip', engine='python', encoding='latin1')
df.head(2)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didnât Even See Comeyâs...,Darrell Lucus,House Dem Aide: We Didnât Even See Comeyâs...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0


In [17]:
df.shape

(3286, 5)

In [18]:
df.isnull().sum()

Unnamed: 0,0
id,0
title,88
author,330
text,7
label,0


In [19]:
df = df.dropna()

In [20]:
df.shape

(2868, 5)

In [21]:
X = df.drop('label', axis=1)
y = df['label']

In [22]:
import tensorflow as tf
tf.__version__

'2.19.0'

In [23]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [24]:
# Vocabulary size
voc_size = 5000

#### OneHot Representation

In [25]:
df.sample(2)

Unnamed: 0,id,title,author,text,label
3112,2898,BREAKING : Hillary Campaign Manager Deletes hi...,Amy Moreno,BREAKING : Hillary Campaign Manager Deletes hi...,1
267,267,Huma Abedin Seeks FBI Immunity Deal,Sean Adl-Tabatabai,"Posted on October 30, 2016 by Sean Adl-Tabatab...",1


In [26]:
messages = X.copy()
messages.head(2)

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didnât Even See Comeyâs...,Darrell Lucus,House Dem Aide: We Didnât Even See Comeyâs...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...


In [27]:
messages['title'][1]

'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

In [28]:
import nltk
import re
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

#### Dataset Preprocessing - NLTK

In [29]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus =[]

messages.reset_index(drop=True, inplace=True)

for i in range(0, len(messages)):
  # print(i)
  review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
  review = review.lower()
  review = review.split()

  review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
  review = ' '.join(review)
  corpus.append(review)

corpus

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri',
 'jacki mason hollywood would love trump bomb north korea lack tran bathroom exclus video breitbart',
 'beno hamon win french socialist parti presidenti nomin new york time',
 'back channel plan ukrain russia courtesi trump associ new york time',
 'obama organ action partner soro link indivis disrupt trump agenda',
 'bbc comedi sketch real housew isi caus outrag',
 'russian research discov secret nazi militari base treasur hunter arctic photo',
 'us offici see link trump russia',
 'ye paid govern troll social media blog forum websit',
 'major leagu soccer argentin find home success new york time',
 'well fargo chief abruptli step new york time',
 'anonym donor pay million releas everyon arrest dakota access pipelin',
 'fbi close hilla

In [30]:
corpus[1]

'flynn hillari clinton big woman campu breitbart'

In [31]:
onehot_repr = [one_hot(words, voc_size) for words in corpus]
onehot_repr

[[3115, 2300, 80, 1385, 2481, 1432, 557, 3406, 636, 2994],
 [1341, 888, 172, 3296, 4546, 1701, 2403],
 [2384, 3202, 2673, 2307],
 [3098, 1826, 3464, 2217, 2087, 1325],
 [659, 4546, 2253, 4592, 4612, 742, 4546, 3554, 454, 3523],
 [1226,
  3860,
  4417,
  233,
  2338,
  3195,
  1086,
  141,
  3169,
  2121,
  1062,
  4817,
  3207,
  412,
  2403],
 [2325, 4343, 4220, 1786, 3034, 993, 293, 2311, 4897, 2944, 425],
 [2196, 886, 3960, 2282, 4019, 4038, 3195, 4565, 4897, 2944, 425],
 [1393, 4462, 4729, 1492, 2082, 2602, 2709, 1637, 3195, 2970],
 [3049, 4605, 3779, 4591, 3649, 1160, 2744, 2406],
 [2325, 1657, 2566, 3424, 3882, 1186, 4559, 588, 1588, 2968, 4134],
 [2217, 2136, 2481, 2602, 3195, 4019],
 [772, 4242, 4885, 798, 4141, 2166, 1555, 1043, 4241],
 [4415, 4206, 1945, 2500, 3428, 649, 3608, 4897, 2944, 425],
 [193, 3275, 3365, 1621, 277, 4897, 2944, 425],
 [194, 4558, 4428, 703, 3210, 1568, 2115, 2790, 324, 1754],
 [1346, 3702, 888],
 [1727, 4603, 206, 3494, 3195, 4902, 818, 2403],
 [574, 

In [None]:
corpus