In [2]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
df = pd.read_csv('/content/FakeNewsNet.csv')
df.shape

(23196, 5)

In [6]:
df.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [7]:
df.isnull().sum()

title              0
news_url         330
source_domain    330
tweet_num          0
real               0
dtype: int64

In [8]:
df.fillna('',inplace=True)
df.isnull().sum()

title            0
news_url         0
source_domain    0
tweet_num        0
real             0
dtype: int64

In [9]:
df['content'] = df['source_domain']+''+df['title']

In [11]:
df.head()

Unnamed: 0,title,news_url,source_domain,tweet_num,real,content
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1,toofab.comKandi Burruss Explodes Over Rape Acc...
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1,www.today.comPeople's Choice Awards 2018: The ...
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1,www.etonline.comSophia Bush Sends Sweet Birthd...
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1,www.dailymail.co.ukColombian singer Maluma spa...
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1,www.zerchoo.comGossip Girl 10 Years Later: How...


In [12]:
X = df.drop(columns='real',axis=1)
y = df['real']
print(X,y)


                                                   title  \
0      Kandi Burruss Explodes Over Rape Accusation on...   
1      People's Choice Awards 2018: The best red carp...   
2      Sophia Bush Sends Sweet Birthday Message to 'O...   
3      Colombian singer Maluma sparks rumours of inap...   
4      Gossip Girl 10 Years Later: How Upper East Sid...   
...                                                  ...   
23191  Pippa Middleton wedding: In case you missed it...   
23192  Zayn Malik & Gigi Hadid’s Shocking Split: Why ...   
23193  Jessica Chastain Recalls the Moment Her Mother...   
23194  Tristan Thompson Feels "Dumped" After Khloé Ka...   
23195  Kelly Clarkson Performs a Medley of Kendrick L...   

                                                news_url  \
0      http://toofab.com/2017/05/08/real-housewives-a...   
1      https://www.today.com/style/see-people-s-choic...   
2      https://www.etonline.com/news/220806_sophia_bu...   
3      https://www.dailymail.co.uk/news

STEMMING

In [13]:
porter_stem = PorterStemmer()

In [14]:
def stemi(content):
  stemmed_content = re.sub('[^a-zA-Z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [porter_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [15]:
df['content']=df['content'].apply(stemi)

In [16]:
print(df['content'])

0        toofab comkandi burruss explod rape accus real...
1        www today compeopl choic award best red carpet...
2        www etonlin comsophia bush send sweet birthday...
3        www dailymail co ukcolombian singer maluma spa...
4        www zerchoo comgossip girl year later upper ea...
                               ...                        
23191    www express co ukpippa middleton wed case miss...
23192    hollywoodlif comzayn malik gigi hadid shock sp...
23193    www justjar comjessica chastain recal moment m...
23194    www intouchweekli comtristan thompson feel dum...
23195    www billboard comkelli clarkson perform medley...
Name: content, Length: 23196, dtype: object


In [17]:
X = df['content'].values
y = df['real'].values

TEXTUAL_DATA TO NUMERICAL DATA

In [20]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [21]:
print(X)

  (0, 17558)	0.21182632965506829
  (0, 16280)	0.29282877079168707
  (0, 13902)	0.2530032860807785
  (0, 13611)	0.23319435395794588
  (0, 13560)	0.3002668208547604
  (0, 8412)	0.2599228337199497
  (0, 6678)	0.35283888625772475
  (0, 3760)	0.3992969125266123
  (0, 1798)	0.3850682022082062
  (0, 630)	0.3073311902721236
  (0, 57)	0.25808393887630005
  (1, 18141)	0.09188926307468818
  (1, 16245)	0.3326774472870324
  (1, 13682)	0.34075075978911906
  (1, 9997)	0.30172396322540357
  (1, 4278)	0.4826246477349814
  (1, 2354)	0.38104246959127513
  (1, 2047)	0.3494732054103103
  (1, 1200)	0.3098208273796828
  (1, 836)	0.2783761277576564
  (2, 18141)	0.05333413170996292
  (2, 16414)	0.2983921420290402
  (2, 15695)	0.2294893916415434
  (2, 15324)	0.14721282060332433
  (2, 14540)	0.2599067612707775
  :	:
  (23194, 16123)	0.2596370009045142
  (23194, 13728)	0.30653638336232764
  (23194, 10948)	0.2731206551727222
  (23194, 9797)	0.29995574361896993
  (23194, 9572)	0.25704895655596943
  (23194, 9380)	0.

In [22]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y,random_state=2)

In [23]:
model = LogisticRegression()

In [24]:
model.fit(X_train,y_train)

In [25]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_pred,y_test)
print(accuracy)

0.853448275862069


PREDICTION SYSTEM

In [29]:
X_new = X_test[0]

prediction = model.predict(X_test[0])
print(y_test[0])
if(prediction[0]==0):
  print("REAL NEWS")
else:
  print("FAKE NEWS")

1
FAKE NEWS
