In [73]:
from google.colab import drive
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import classification_report, confusion_matrix

import pickle
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Preprocessing

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [173]:
true = "/content/drive/MyDrive/Colab Notebooks/News Data/True.csv"
df_true = pd.read_csv(true)
low_memory=False 
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [172]:
false = "/content/drive/MyDrive/Colab Notebooks/News Data/Fake.csv"
df_fake = pd.read_csv(false)
low_memory=False 
df_fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [174]:
df_true['ID'] = 1
df_fake['ID'] = 0

In [175]:
num_rows = [df_true.loc[:5000][:], df_fake.loc[:5000][:]]

In [176]:
df = pd.concat(num_rows)

Check for any null values that would upset the processing

In [178]:
print (df.isnull().sum())
df.shape

title      0
text       0
subject    0
date       0
ID         0
dtype: int64


(10002, 5)

Drop unnecessary data (noise)

In [180]:
df = df.drop("subject", axis = 1)
df = df.drop("date", axis = 1)

In [182]:
x = df.drop("ID", axis = 1)
y = df['ID']

# Data Cleaning

Replace all digits and punctuation

In [183]:
df.replace('\d+', '', regex=True)
df['text'] = df['text'].str.replace('[^\w\s]', '')
df.head()

  


Unnamed: 0,title,text,ID
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON Reuters The head of a conservative...,1
1,U.S. military to accept transgender recruits o...,WASHINGTON Reuters Transgender people will be...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON Reuters The special counsel invest...,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON Reuters Trump campaign adviser Geo...,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLEWASHINGTON Reuters President Donald Tr...,1


Make all the text lowercase

In [184]:
df['text'] = df['text'].apply(lambda x: x.lower())

# Remove Stop Words

In [185]:
nltk.download('stopwords')
stop = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,text,ID
0,"As U.S. budget fight looms, Republicans flip t...",washington reuters head conservative republica...,1
1,U.S. military to accept transgender recruits o...,washington reuters transgender people allowed ...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,washington reuters special counsel investigati...,1
3,FBI Russia probe helped by Australian diplomat...,washington reuters trump campaign adviser geor...,1
4,Trump wants Postal Service to charge 'much mor...,seattlewashington reuters president donald tru...,1


# **Tokenization**

In [186]:
import re
def tokenize(text):
    split = re.split("\W+",text) 
    return split
df['text_token']= df['text'].apply(lambda x: tokenize(x.lower()))
df.head()

Unnamed: 0,title,text,ID,text_token
0,"As U.S. budget fight looms, Republicans flip t...",washington reuters head conservative republica...,1,"[washington, reuters, head, conservative, repu..."
1,U.S. military to accept transgender recruits o...,washington reuters transgender people allowed ...,1,"[washington, reuters, transgender, people, all..."
2,Senior U.S. Republican senator: 'Let Mr. Muell...,washington reuters special counsel investigati...,1,"[washington, reuters, special, counsel, invest..."
3,FBI Russia probe helped by Australian diplomat...,washington reuters trump campaign adviser geor...,1,"[washington, reuters, trump, campaign, adviser..."
4,Trump wants Postal Service to charge 'much mor...,seattlewashington reuters president donald tru...,1,"[seattlewashington, reuters, president, donald..."


# New Dataframe from the cleaning

In [40]:
train_df.to_csv("cleaned_train_df.csv")
train_label.to_csv("cleaned_train_label.csv")

train = "/content/cleaned_train_df.csv"
train = pd.read_csv(train)

test = "/content/cleaned_train_label.csv"
test = pd.read_csv(test)

train.head()

Unnamed: 0.1,Unnamed: 0,text
0,0,WASHINGTON (Reuters) - U.S. President Donald T...
1,1,WASHINGTON (Reuters) - Top White House officia...
2,2,"ISE-SHIMA, Japan (Reuters) - U.S. President Ba..."
3,3,If you ve ever watched the Hunger Games movies...
4,4,The draft version of the Democratic Party plat...


# **Vectorization**

Train/Test split and converting each row to strings

In [187]:
x = df[:1000]
t = df[:1000]

data = df['text_token'].apply(lambda x: np.str_(x))
target = df['ID'].apply(lambda x: np.str_(x))
X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.3, random_state= 0)
X_train.shape

(7001,)

In [192]:
tfidf_v = TfidfVectorizer(max_features=5000, ngram_range=(1,3))
x = tfidf_v.fit_transform(data).toarray()
y = df['ID']
x.shape

(10002, 5000)

In [193]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3, random_state= 0)

# Classification Model

Logistic Regression

In [194]:
logreg = LogisticRegression(class_weight = 'balanced')
logreg.fit(X_train, Y_train)
Accuracy = logreg.score(X_test, Y_test)
print(f'Accuracy: {round(Accuracy*100,2)}%')

Accuracy: 99.57%


Naive Bayes

In [195]:
NB = MultinomialNB()
NB.fit(X_train, Y_train)
Accuracy2 = NB.score(X_test, Y_test)
print(f'Accuracy: {round(Accuracy2*100,2)}%')

Accuracy: 97.5%


Decision Tree

In [196]:
clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)
Accuracy3 = clf.score(X_test, Y_test)
print(f'Accuracy: {round(Accuracy3*100,2)}%')

Accuracy: 99.67%


Passive Aggressive Classifier

In [199]:
pac=PassiveAggressiveClassifier()
pac.fit(X_train,Y_train)
y_pred=pac.predict(X_test)
score=accuracy_score(Y_test,y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 99.83%


Decision Tree Confusion Matrix

In [200]:
pac=PassiveAggressiveClassifier()
pac.fit(X_train,Y_train)
y_pred=pac.predict(X_test)

print(classification_report(Y_test, y_pred))
print('\n')
print(confusion_matrix(Y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1514
           1       1.00      1.00      1.00      1487

    accuracy                           1.00      3001
   macro avg       1.00      1.00      1.00      3001
weighted avg       1.00      1.00      1.00      3001



[[1511    3]
 [   5 1482]]


# **Save The Model**

In [201]:
pickle.dump(pac, open('fake_news_model2.pkl', 'wb'))

In [202]:
pickle.dump(tfidf_v, open('tfidfvec.pkl', 'wb'))

Load model and vectorization 

In [203]:
joblib_model = pickle.load(open('fake_news_model2.pkl', 'rb'))

In [204]:
joblib_vect = pickle.load(open('tfidfvec.pkl', 'rb'))

In [219]:
from nltk.stem.porter import PorterStemmer
import re
ps = PorterStemmer()
review = re.sub('[^a-zA-Z]', ' ', df_true['text'][1])
review = review.lower()
review = review.split()
    
review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
review = ' '.join(review)
review

'washington reuter transgend peopl allow first time enlist u militari start monday order feder court pentagon said friday presid donald trump administr decid appeal rule block transgend ban two feder appeal court one washington one virginia last week reject administr request put hold order lower court judg requir militari begin accept transgend recruit jan justic depart offici said administr challeng rule depart defens announc releas independ studi issu come week rather litig interim appeal occur administr decid wait dod studi continu defend presid law author district court meantim offici said speak condit anonym septemb pentagon said creat panel senior offici studi implement direct trump prohibit transgend individu serv defens depart feb submit plan trump lawyer repres current serv transgend servic member aspir recruit said expect administr appeal rule conserv major suprem court hope would happen pentagon spokeswoman heather babb said statement mandat court order depart defens prepar 

In [220]:
val = tfidf_v.transform([review]).toarray()

In [221]:
pac.predict(val)

array([1])