In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import pickle
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
dff = pd.read_csv('Datasets/fake-news/Fake.csv', usecols = ['title'])
dft = pd.read_csv('Datasets/fake-news/True.csv', usecols = ['title'])
dff = dff.dropna()
dft = dft.dropna()
dff['class'] = 0
dft['class'] = 1

In [2]:
frames = [dff,dft]
dfn = pd.concat(frames)
dfn.info

<bound method DataFrame.info of                                                    title  class
0       Donald Trump Sends Out Embarrassing New Year’...      0
1       Drunk Bragging Trump Staffer Started Russian ...      0
2       Sheriff David Clarke Becomes An Internet Joke...      0
3       Trump Is So Obsessed He Even Has Obama’s Name...      0
4       Pope Francis Just Called Out Donald Trump Dur...      0
...                                                  ...    ...
21412  'Fully committed' NATO backs new U.S. approach...      1
21413  LexisNexis withdrew two products from Chinese ...      1
21414  Minsk cultural hub becomes haven from authorities      1
21415  Vatican upbeat on possibility of Pope Francis ...      1
21416  Indonesia to buy $1.14 billion worth of Russia...      1

[44898 rows x 2 columns]>

In [3]:
stop_words = set(stopwords.words('english')) 
def preprocessor(text):
    text = (re.sub('[\W]+', ' ', text.lower()))
    return text

In [4]:
dfn['title'] = dfn['title'].apply(preprocessor)

In [5]:
dfn.head()

Unnamed: 0,title,class
0,donald trump sends out embarrassing new year ...,0
1,drunk bragging trump staffer started russian ...,0
2,sheriff david clarke becomes an internet joke...,0
3,trump is so obsessed he even has obama s name...,0
4,pope francis just called out donald trump dur...,0


In [6]:
tokenizer = RegexpTokenizer(r'\w+')
dfn['title'] = dfn['title'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [7]:
dfn.head()

Unnamed: 0,title,class
0,"[donald, trump, sends, out, embarrassing, new,...",0
1,"[drunk, bragging, trump, staffer, started, rus...",0
2,"[sheriff, david, clarke, becomes, an, internet...",0
3,"[trump, is, so, obsessed, he, even, has, obama...",0
4,"[pope, francis, just, called, out, donald, tru...",0


In [8]:
lemmatizer = WordNetLemmatizer()
def lemmat(text):
    lem_text = [lemmatizer.lemmatize(i) for i in text]
    return lem_text
def remove_stopwords(text):
    words = [word for word in text if word not in stop_words]
    return words
def untokenize(list):
    return " ".join(list)


In [9]:
#dfn['title'] = dfn['title'].apply(remove_stopwords)
dfn['title'] = dfn['title'].apply(lemmat)


In [10]:
dfn.head()

Unnamed: 0,title,class
0,"[donald, trump, sends, out, embarrassing, new,...",0
1,"[drunk, bragging, trump, staffer, started, rus...",0
2,"[sheriff, david, clarke, becomes, an, internet...",0
3,"[trump, is, so, obsessed, he, even, ha, obama,...",0
4,"[pope, francis, just, called, out, donald, tru...",0


In [11]:
dfn['title'] = dfn['title'].apply(untokenize)

In [12]:
dfn.head()

Unnamed: 0,title,class
0,donald trump sends out embarrassing new year s...,0
1,drunk bragging trump staffer started russian c...,0
2,sheriff david clarke becomes an internet joke ...,0
3,trump is so obsessed he even ha obama s name c...,0
4,pope francis just called out donald trump duri...,0


In [27]:
tfidf = TfidfVectorizer()
label = {0:'fake', 1:'true'}
X_train, X_test, y_train, y_test = train_test_split(dfn['title'],dfn['class'], test_size=0.5, random_state=42)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


headline= "Skittles used instead of salt to help clear ice and snow from roads in Canada"
Z_test = [" ".join(lemmat(tokenizer.tokenize(headline.lower())))]

Z_test_tfidf = tfidf.transform(Z_test)
print("processed headline:", Z_test)

processed headline: ['skittle used instead of salt to help clear ice and snow from road in canada']


In [24]:
clf = LogisticRegression(random_state=0).fit(X_train_tfidf, y_train)
clf.score(X_test_tfidf, y_test)

0.946634593968551

In [25]:
print('Prediction: %s\nProbability: %.2f%%' %(label[clf.predict(Z_test_tfidf)[0]],np.max(clf.predict_proba(Z_test_tfidf))*100))

Prediction: fake
Probability: 55.90%
