In [1]:
# import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# load the dataset
df=pd.read_csv('fake_news_detection.csv')
df

Unnamed: 0,label,text
0,REAL,Payal has accused filmmaker Anurag Kashyap of ...
1,FAKE,A four-minute-long video of a woman criticisin...
2,FAKE,"Republic Poll, a fake Twitter account imitatin..."
3,REAL,"Delhi teen finds place on UN green list, turns..."
4,REAL,Delhi: A high-level meeting underway at reside...
...,...,...
3724,REAL,19:17 (IST) Sep 20\n\nThe second round of coun...
3725,REAL,19:17 (IST) Sep 20\n\nThe second round of coun...
3726,FAKE,The Bengaluru City Police’s official Twitter h...
3727,REAL,"Sep 20, 2020, 08:00AM IST\n\nSource: TOI.in\n\..."


In [3]:
# categories of label column
df['label'].unique()

array(['REAL', 'FAKE'], dtype=object)

In [4]:
# replace the label categories
df['label'].replace({'FAKE':0,'REAL':1},inplace=True)

In [5]:
df

Unnamed: 0,label,text
0,1,Payal has accused filmmaker Anurag Kashyap of ...
1,0,A four-minute-long video of a woman criticisin...
2,0,"Republic Poll, a fake Twitter account imitatin..."
3,1,"Delhi teen finds place on UN green list, turns..."
4,1,Delhi: A high-level meeting underway at reside...
...,...,...
3724,1,19:17 (IST) Sep 20\n\nThe second round of coun...
3725,1,19:17 (IST) Sep 20\n\nThe second round of coun...
3726,0,The Bengaluru City Police’s official Twitter h...
3727,1,"Sep 20, 2020, 08:00AM IST\n\nSource: TOI.in\n\..."


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3729 entries, 0 to 3728
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   3729 non-null   int64 
 1   text    3721 non-null   object
dtypes: int64(1), object(1)
memory usage: 58.4+ KB


In [7]:
# check the null values
df.isnull().sum()

label    0
text     8
dtype: int64

In [8]:
df.dropna(inplace=True)

In [10]:
# check the duplicated values
df.duplicated().sum()

1492

In [11]:
df.drop_duplicates(inplace=True)

In [12]:
df.iloc[16]

label                                                    1
text     NEW DELHI: As many as 21 states, mostly ruled ...
Name: 17, dtype: object

In [13]:
df.reset_index(inplace=True)

In [14]:
# import NLP libraries
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wnl=WordNetLemmatizer()

In [15]:
# remove the punctuations,stopwords
corpus=[]
for i in range(len(df)):
    rp=re.sub('[^a-zA-Z]'," ",df['text'][i])
    rp=rp.lower()
    rp=rp.split()
    rp=[wnl.lemmatize(word) for word in rp if word not in set(stopwords.words('english'))]
    rp=" ".join(rp)
    corpus.append(rp)

In [16]:
corpus

['payal accused filmmaker anurag kashyap behaving inappropriately video went viral maintained stance speaking etimes said wanted speak long time today finally thought must get head tweeted incident sometime ago metoo movement happened many people told delete tweet else would stop getting work manager advised remove tweet complied post anurag blocked whatsapp',
 'four minute long video woman criticising government anti citizenship amendment act rally delhi earlier january going viral false claim woman show late prime minister atal bihari vajpayee niece caption hindi translates respected vajpayee ji niece finally broken silence listen say translated hindi also read muslim politician disguised hindu anti caa protest video come time protest citizenship amendment act proposed national register citizen gone unabated month video woman seen saying following hindi britisher lot bad least outsider first land came afar came still difference indian government britisher britisher educated illiterat

In [17]:
x=df['text']
y=df['label']

In [20]:
# train_test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [21]:
# vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization=TfidfVectorizer()
x_train=vectorization.fit_transform(x_train)
x_test=vectorization.transform(x_test)

In [22]:
#modeling & evaluation
from sklearn.linear_model import LogisticRegression
LR=LogisticRegression()
LR.fit(x_train,y_train)

#prediction
ypred_train=LR.predict(x_train)
ypred_test=LR.predict(x_test)

# train & test accuracy
from sklearn.metrics import accuracy_score
print('train accuracy:',accuracy_score(y_train,ypred_train))
print('test accuracy:',accuracy_score(y_test,ypred_test))

train accuracy: 0.9820527201346047
test accuracy: 0.9304932735426009


In [23]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,ypred_test))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96       370
           1       1.00      0.59      0.74        76

    accuracy                           0.93       446
   macro avg       0.96      0.80      0.85       446
weighted avg       0.94      0.93      0.92       446



In [24]:
#modeling & evaluation
from sklearn.tree import DecisionTreeClassifier
DT=DecisionTreeClassifier(random_state=0)
DT.fit(x_train,y_train)

#prediction
ypred_train=DT.predict(x_train)
ypred_test=DT.predict(x_test)

# train & test accuracy
from sklearn.metrics import accuracy_score
print('train_accuracy:',accuracy_score(ypred_train,y_train))
print('test_accuracy:',accuracy_score(ypred_test,y_test))

train_accuracy: 1.0
test_accuracy: 0.9730941704035875


In [25]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,ypred_test))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       370
           1       0.93      0.91      0.92        76

    accuracy                           0.97       446
   macro avg       0.96      0.95      0.95       446
weighted avg       0.97      0.97      0.97       446



In [26]:
#modeling & evaluation
from sklearn.ensemble import GradientBoostingClassifier
GB=GradientBoostingClassifier()
GB.fit(x_train,y_train)

#prediction
ypred_train=GB.predict(x_train)
ypred_test=GB.predict(x_test)

# train & test accuracy
from sklearn.metrics import accuracy_score
print('train_accuracy:',accuracy_score(ypred_train,y_train))
print('test_accuracy:',accuracy_score(ypred_test,y_test))

train_accuracy: 1.0
test_accuracy: 0.9932735426008968


In [27]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,ypred_test))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       370
           1       0.97      0.99      0.98        76

    accuracy                           0.99       446
   macro avg       0.99      0.99      0.99       446
weighted avg       0.99      0.99      0.99       446



In [28]:
#modeling & evaluation
from sklearn.ensemble import RandomForestClassifier
RF=RandomForestClassifier()
RF.fit(x_train,y_train)

#prediction
ypred_train=RF.predict(x_train)
ypred_test=RF.predict(x_test)

# train & test accuracy
from sklearn.metrics import accuracy_score
print('train_accuracy:',accuracy_score(ypred_train,y_train))
print('test_accuracy:',accuracy_score(ypred_test,y_test))

train_accuracy: 1.0
test_accuracy: 0.9865470852017937


In [29]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test,ypred_test))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       370
           1       1.00      0.92      0.96        76

    accuracy                           0.99       446
   macro avg       0.99      0.96      0.98       446
weighted avg       0.99      0.99      0.99       446



In [34]:
def pred(text):
    if text==0:
        return'fake news'
    else:
        return'not a fake news'
def manual_testing(news):
    testing_news={'text':[news]}
    new_def_test=pd.DataFrame(testing_news)
    corpus=[]
    for i in range(len(new_def_test)):
        rp=re.sub('[^a-zA-Z]'," ",new_def_test['text'][i])
        rsw=rp.lower()
        rsw=rsw.split()
        rsw=[wnl.lemmatize(word) for word in rsw if word not in set(stopwords.words('english'))]
        rsw=" ".join(rsw)
        corpus.append(rsw)
    new_test=vectorization.transform(corpus)
    pred_LR=LR.predict(new_test)
    pred_DT=DT.predict(new_test)
    pred_GB=GB.predict(new_test)
    pred_RF=RF.predict(new_test)

    return print("\n\nlr prediction:{} \ndt prediction:{} \ngb prediction:{} \nrf prediction:{}".format(pred(pred_LR[0]),pred(pred_DT[0]),pred(pred_GB[0]),pred(pred_RF[0])))

In [35]:
# check the given news is fake or not a fake news
news=str(input())
manual_testing(news)

"A woman whose photos were stolen to create the fake persona of 'Dr. Aisha' - a catfish Twitter account posing as a frontline doctor who died of COVID-19, is a medical student in South Africa, BOOM has found out.  Speaking to BOOM, the woman requesting anonymity said she only recently learnt that her photos were being used by a fake account. ""I am getting calls asking if I am dead. Those photos are mine and were taken by someone who stalked my social media account and the accounts of my family,"" said the woman who is a final year medical student at the Mthatha campus of the Walter Sisulu University. The woman whose name is not Aisha, spoke to BOOM over texts and also sent us her college identification card as proof to show that it was indeed her photos used by the fake account.    On August 1, Twitter was filled with condolence messages for a Dr Aisha, after a handle shared photos of a woman highlighting her deteriorating health and misled netizens. Several verified handles including

In [36]:
news=str(input())
manual_testing(news)

"Delhi: A high-level meeting underway at residence of Rajya Sabha Chairman, Venkaiah Naidu, over the ruckus createdâ€¦ https://t.co/OfoRpOwd4b â€” ANI (@ANI) 1600599438000  A watershed moment in the history of Indian agriculture! Congratulations to our hardworking farmers on the passageâ€¦ https://t.co/CdLW6Itzmc â€” Narendra Modi (@narendramodi) 1600595449000  For decades, the Indian farmer was bound by various constraints and bullied by middlemen. The bills passed by Parliâ€¦ https://t.co/GfF1nhaFeZ â€” Narendra Modi (@narendramodi) 1600595492000  Our agriculture sector is in desperate need of latest technology that assists the industrious farmers. Now, with thâ€¦ https://t.co/VGK1o0pPQb â€” Narendra Modi (@narendramodi) 1600595705000  NEW DELHI: The Rajya Sabha on Sunday passed two key farm bills amid a bedlam as opposition members charged towards the podium of the presiding officer, flung the rule book at him, tore official papers and heckled him over their demand for a division of