In [81]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier


In [82]:
df_news = pd.read_csv("WELFake_Dataset.csv")
df_news.head()


Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [83]:
df_news.shape

(72134, 4)

In [84]:
df_news.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [85]:
df_news = df_news.dropna()

In [86]:
df_news.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

In [87]:
ps = PorterStemmer()

stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been",
    "before", "being", "below", "between", "both", "but", "by", "can", "can't", "did", "didn't", "do", "does", "doesn't", "doing",
    "don't", "during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't", "having", "he",
    "her", "here", "hers", "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", "isn't", "it", "its", "itself",
    "just", "'ll", "m", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "now", "o", "of", "off", "on", "once",
    "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "same", "shan't", "she", "should", "shouldn't", "so",
    "some", "such", "than", "that", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they", "this", "those",
    "through", "to", "too", "under", "until", "up", "very", "was","wasn't", "we", "were", "weren't", "what", "when", "where",
    "which", "while", "who", "whom", "why", "will", "won't", "with", "won't", "would", "wouldn't", "yet", "you", "your", "yours",
    "yourself", "yourselves"
]

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [ps.stem(word) for word in stemmed_content if not word in stopwords]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content



In [88]:
df_news['title'] = df_news['title'].apply(stemming)

In [89]:
df_news.head

<bound method NDFrame.head of        Unnamed: 0                                              title  \
0               0  law enforc high alert follow threat cop white ...   
2               2  unbeliev obama s attorney gener say charlott r...   
3               3  bobbi jindal rais hindu use stori christian co...   
4               4  satan russia unv imag terrifi new supernuk wes...   
5               5  time christian group sue amazon splc design ha...   
...           ...                                                ...   
72129       72129  russian steal research trump hack u s democrat...   
72130       72130  watch giuliani demand democrat apolog trump s ...   
72131       72131       migrant refus leav train refuge camp hungari   
72132       72132  trump tussl give unpopular mexican leader much...   
72133       72133         goldman sach endors hillari clinton presid   

                                                    text  label  
0      No comment is expected from Bara

In [90]:
X = df_news['title'].values
Y = df_news['label'].values
X

array(['law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video',
       'unbeliev obama s attorney gener say charlott rioter peac protest home state north carolina video',
       'bobbi jindal rais hindu use stori christian convers woo evangel potenti bid',
       ..., 'migrant refus leav train refuge camp hungari',
       'trump tussl give unpopular mexican leader much need shot arm',
       'goldman sach endors hillari clinton presid'], dtype=object)

In [91]:
vec = TfidfVectorizer()
vec.fit(X)
X = vec.transform(X)

In [92]:
print(X)

  (0, 18970)	0.19119199199155718
  (0, 18516)	0.12953081995001134
  (0, 17236)	0.25417143325832803
  (0, 17133)	0.24861338779858738
  (0, 9628)	0.2281779531036376
  (0, 7823)	0.2673841297708583
  (0, 6672)	0.4856722738685227
  (0, 6369)	0.2892696681391208
  (0, 5461)	0.31817689197461024
  (0, 3643)	0.24869119486730346
  (0, 1782)	0.334723411455833
  (0, 404)	0.3194027206998875
  (1, 18516)	0.13424895962371275
  (1, 17902)	0.35973057550211296
  (1, 16322)	0.1998676543814105
  (1, 14983)	0.1608430986300803
  (1, 14483)	0.35810711329768474
  (1, 13495)	0.2268087560849422
  (1, 12654)	0.2790489798405972
  (1, 11924)	0.1686662338838085
  (1, 11778)	0.22306828908813514
  (1, 7956)	0.26921648198053316
  (1, 6820)	0.2652110968000934
  (1, 2892)	0.36407357799494927
  (1, 2647)	0.3081355901867282
  :	:
  (71533, 1748)	0.4930846869221777
  (71533, 756)	0.39873664445651247
  (71534, 17489)	0.39738586728341585
  (71534, 14085)	0.35335720051740843
  (71534, 14081)	0.32382708160669216
  (71534, 10854

In [93]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)


In [94]:
X_train.shape, X_test.shape

((57229, 19503), (14308, 19503))

In [95]:
model3 = RandomForestClassifier()
model3.fit(X_train, Y_train)

In [100]:
Y_pred = model3.predict(X_train)
print(accuracy_score( Y_pred,Y_train))

0.9998427370738612


In [101]:
Y_pred = model3.predict(X_test)
print(accuracy_score( Y_pred,Y_test))

0.9080234833659491


In [102]:

input_data = X_test[1]
prediction = model3.predict(input_data)
if prediction[0] == 1:
    print('The News is a Fake news')
else:
    print('The News is a Real news')


The News is a Fake news


In [104]:
df_news['title'][0]

'law enforc high alert follow threat cop white blacklivesmatt fyf terrorist video'