In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
dff = pd.read_csv('Datasets/fake-news/Fake.csv', usecols = ['text'])
dft = pd.read_csv('Datasets/fake-news/True.csv', usecols = ['text'])
dff = dff.dropna()
dft = dft.dropna()
dff['class'] = 0
dft['class'] = 1

In [2]:
frames = [dff,dft]
dfn = pd.concat(frames)
dfn.info

<bound method DataFrame.info of                                                     text  class
0      Donald Trump just couldn t wish all Americans ...      0
1      House Intelligence Committee Chairman Devin Nu...      0
2      On Friday, it was revealed that former Milwauk...      0
3      On Christmas day, Donald Trump announced that ...      0
4      Pope Francis used his annual Christmas Day mes...      0
...                                                  ...    ...
21412  BRUSSELS (Reuters) - NATO allies on Tuesday we...      1
21413  LONDON (Reuters) - LexisNexis, a provider of l...      1
21414  MINSK (Reuters) - In the shadow of disused Sov...      1
21415  MOSCOW (Reuters) - Vatican Secretary of State ...      1
21416  JAKARTA (Reuters) - Indonesia will buy 11 Sukh...      1

[44898 rows x 2 columns]>

In [3]:
stop_words = set(stopwords.words('english')) 
def preprocessor(text):
    text = (re.sub('[\W]+', ' ', text.lower()))
    return text

In [4]:
dfn['text'] = dfn['text'].apply(preprocessor)

In [5]:
dfn.head()

Unnamed: 0,text,class
0,donald trump just couldn t wish all americans ...,0
1,house intelligence committee chairman devin nu...,0
2,on friday it was revealed that former milwauke...,0
3,on christmas day donald trump announced that h...,0
4,pope francis used his annual christmas day mes...,0


In [6]:
tokenizer = RegexpTokenizer(r'\w+')
dfn['text'] = dfn['text'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [7]:
dfn.head()

Unnamed: 0,text,class
0,"[donald, trump, just, couldn, t, wish, all, am...",0
1,"[house, intelligence, committee, chairman, dev...",0
2,"[on, friday, it, was, revealed, that, former, ...",0
3,"[on, christmas, day, donald, trump, announced,...",0
4,"[pope, francis, used, his, annual, christmas, ...",0


In [8]:
lemmatizer = WordNetLemmatizer()
def lemmat(text):
    lem_text = [lemmatizer.lemmatize(i) for i in text]
    return lem_text
def remove_stopwords(text):
    words = [word for word in text if word not in stop_words]
    return words
def untokenize(list):
    return " ".join(list)


In [9]:
#dfn['title'] = dfn['title'].apply(remove_stopwords)
#dfn['text'] = dfn['text'].apply(lemmat)


In [10]:
dfn.head()

Unnamed: 0,text,class
0,"[donald, trump, just, couldn, t, wish, all, am...",0
1,"[house, intelligence, committee, chairman, dev...",0
2,"[on, friday, it, was, revealed, that, former, ...",0
3,"[on, christmas, day, donald, trump, announced,...",0
4,"[pope, francis, used, his, annual, christmas, ...",0


In [11]:
dfn['text'] = dfn['text'].apply(untokenize)

In [12]:
dfn.head()

Unnamed: 0,text,class
0,donald trump just couldn t wish all americans ...,0
1,house intelligence committee chairman devin nu...,0
2,on friday it was revealed that former milwauke...,0
3,on christmas day donald trump announced that h...,0
4,pope francis used his annual christmas day mes...,0


In [13]:
tfidf = TfidfVectorizer()
label = {0:'fake', 1:'true'}
X_train, X_test, y_train, y_test = train_test_split(dfn['text'],dfn['class'], test_size=0.3, random_state=42)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


headline= "U.S. Navy warns mariners to stay clear of its warships in the Persian Gulf"
Z_test = [" ".join(tokenizer.tokenize(headline.lower()))]

Z_test_tfidf = tfidf.transform(Z_test)
print("processed headline:", Z_test)

processed headline: ['u s navy warns mariners to stay clear of its warships in the persian gulf']


In [14]:
clf = LogisticRegression(random_state=0).fit(X_train_tfidf, y_train)
clf.score(X_test_tfidf, y_test)

0.9859688195991091

In [15]:
print('Prediction: %s\nProbability: %.2f%%' %(label[clf.predict(Z_test_tfidf)[0]],np.max(clf.predict_proba(Z_test_tfidf))*100))

Prediction: fake
Probability: 73.66%
