In [1]:
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
import string

DATA COLLECTION 

In [2]:
data_fake = pd.read_csv('Fake.csv')
data_true = pd.read_csv('True.csv')

In [3]:
data_fake["class"] = 0
data_true["class"] = 1

In [4]:
data_fake.shape, data_true.shape

((5000, 5), (5000, 5))

DATA PREPROCESSING 

In [5]:
data_merge = pd.concat([data_fake, data_true], axis=0)
data_merge.head(10)

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
5,Racist Alabama Cops Brutalize Black Boy While...,The number of cases of cops brutalizing and ki...,News,"December 25, 2017",0
6,"Fresh Off The Golf Course, Trump Lashes Out A...",Donald Trump spent a good portion of his day a...,News,"December 23, 2017",0
7,Trump Said Some INSANELY Racist Stuff Inside ...,In the wake of yet another court decision that...,News,"December 23, 2017",0
8,Former CIA Director Slams Trump Over UN Bully...,Many people have raised the alarm regarding th...,News,"December 22, 2017",0
9,WATCH: Brand-New Pro-Trump Ad Features So Muc...,Just when you might have thought we d get a br...,News,"December 21, 2017",0


In [6]:
data_merge.columns

Index(['title', 'text', 'subject', 'date', 'class'], dtype='object')

In [7]:
data = data_merge.drop(['subject','date'], axis = 1)

In [8]:
data.isnull().sum()

title    0
text     0
class    0
dtype: int64

In [9]:
data = data.sample(frac = 1)

In [10]:
data.reset_index(inplace = True)
data.drop(['index'], axis = 1, inplace = True)

In [11]:
data.columns

Index(['title', 'text', 'class'], dtype='object')

In [12]:
data.head()

Unnamed: 0,title,text,class
0,BREAKING: Benghazi-Obsessed GOP Congressman J...,Rep. Jason Chaffetz (R-Utah) has been a thorn ...,0
1,U.S. list of NAFTA goals 'not earth-shattering...,OTTAWA (Reuters) - Canada sees few surprises o...,1
2,U.S. calls Myanmar moves against Rohingya 'eth...,WASHINGTON (Reuters) - The United States on We...,1
3,Chicago teachers union eyes future security wi...,CHICAGO (Reuters) - After trying for years to ...,1
4,White House says it is reviewing House bill on...,WASHINGTON (Reuters) - The White House said it...,1


DATA CLEANING 

In [13]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('punkt')

def preprocessing(text):
    text = text.lower()
    text = re.sub('\[.*?\]','',text)
    text = re.sub("\\W"," ",text)
    text = re.sub('https?://\S+|www\.\S+','',text)
    text = re.sub('<.*?>+','',text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*','',text)
    text = re.sub('[%s]' % re.escape(string.punctuation) ,'',text)  #Removes all punctuation characters using string.punctuation
    words = word_tokenize(text)                 #Tokenize the words
    stop_words = set(stopwords.words('english'))    #stores all stopwords in english as a set stop_words
    words = [word for word in words if word not in stop_words]  #remove stopwords
    stemmer = PorterStemmer()               #porterstemmer initializes to stemmer 
    words = [stemmer.stem(word) for word in words]  #stemming process
    text = ' '.join(words)                  #Joins all the words after stopwords removal and stemmming as a text
    return text


[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [14]:
data['text'] = data['text'].apply(preprocessing)
data['title'] = data['title'].apply(preprocessing)
data.head(10)

Unnamed: 0,title,text,class
0,break benghazi obsess gop congressman jason ch...,rep jason chaffetz r utah thorn side democrat ...,0
1,u list nafta goal earth shatter canada sourc,ottawa reuter canada see surpris u list goal r...,1
2,u call myanmar move rohingya ethnic cleans,washington reuter unit state wednesday call my...,1
3,chicago teacher union eye futur secur charter ...,chicago reuter tri year stymi growth charter s...,1
4,white hous say review hous bill russia sanction,washington reuter white hous said review bill ...,1
5,watch msnbc chri hay apolog air releas decept ...,msnbc chri hay consist proven strong voic libe...,1
6,u commerc secretari say trump endors border ta...,washington reuter new u commerc secretari wilb...,0
7,conway mexico pay wall want keep drug pour ame...,donald trump wall stupid money wast idea idiot...,0
8,trump will discuss solar power border wall eve...,trump took time today white hous meet republic...,0
9,new hampshir governor ask trump stop indonesia...,boston reuter new hampshir governor chri sunun...,1


FEATURE SELECTION 

In [15]:
data['combined'] = data['title']+data['text']
x = data['combined']
y = data['class']

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.25)
print(x_test.head(10))

532     trump get buri whine media classifi foundat le...
7251    u militari indefinit delay ban cluster bombwas...
1671    u treasuri outlin sweep reform capit marketwas...
1431    internet collect cring photograph snap pic tru...
4453    fox news frantic tri cover trump golf trip abs...
1535    trump end nine day oversea trip flourish troub...
5134    know trump decid pari climat deal germaniberli...
8837    trump order syrian air strike dinner xiwashing...
380     obama attorney gener spit trump face circuit c...
1051    nra mock kim kardashian rob gunpoint horrifi t...
Name: combined, dtype: object


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
print(xv_train.shape)
xv_test = vectorization.transform(x_test)
print(xv_train)
from joblib import dump
dump(vectorization, 'f.joblib')

(7500, 36011)
  (0, 12053)	0.038552057903332035
  (0, 29150)	0.0732861193491837
  (0, 5328)	0.0707635148171831
  (0, 33797)	0.03180339571110769
  (0, 14574)	0.05839419091511505
  (0, 10671)	0.031190590122365452
  (0, 18260)	0.041447956161009936
  (0, 35407)	0.025785696303593605
  (0, 24380)	0.11804467771386022
  (0, 5725)	0.09179212284744623
  (0, 6790)	0.054126154699678894
  (0, 6267)	0.071215684475677
  (0, 6028)	0.04616706593499958
  (0, 399)	0.08085453687503272
  (0, 31724)	0.047874188009213024
  (0, 5910)	0.10891966649226743
  (0, 29815)	0.08055208214174661
  (0, 17972)	0.061464777842057186
  (0, 31330)	0.03883265275603171
  (0, 28395)	0.1255844055429636
  (0, 10877)	0.05331801868654928
  (0, 29666)	0.04925584138454715
  (0, 10093)	0.03440371508930318
  (0, 17597)	0.043059486569633
  (0, 11248)	0.04817638344453198
  :	:
  (7499, 35642)	0.0820299201912906
  (7499, 2138)	0.019934561074636745
  (7499, 6309)	0.0265551170440591
  (7499, 894)	0.03168359419472883
  (7499, 31665)	0.018382

['f.joblib']

TRAINING AND TESTING OF THE MODEL 

Random Forest -

In [18]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(random_state = 0)
RF.fit(xv_train, y_train)

In [19]:
from joblib import dump
dump(RF, 'RF.pk1')

['RF.pk1']

In [20]:
pred_rf = RF.predict(xv_test)
print(pred_rf)

[0 1 1 ... 1 0 1]


In [21]:
print(" Random Forest Accuracy = ",RF.score(xv_test, y_test))

 Random Forest Accuracy =  0.918


In [22]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

cm_rf = confusion_matrix(y_test, pred_rf)
print("Confusion Matrix for Random Forest:")
print(cm_rf)

# Assuming cm_rf is the confusion matrix from your random forest predictions
tn, fp, fn, tp = cm_rf.ravel()
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)



Confusion Matrix for Random Forest:
[[1148  112]
 [  93 1147]]
Precision: 0.9110405083399523
Recall: 0.925
F1 Score: 0.9179671868747499


In [23]:
from sklearn.ensemble import RandomForestClassifier

def predict_news_rf(input_text):
    input_text = preprocessing(input_text)
    input_vector = vectorization.transform([input_text])
    prediction = RF.predict(input_vector)
    return "Fake" if prediction[0] == 0 else "Real"

input_text =""
prediction_rf = predict_news_rf(input_text)
print(f"The input text is predicted as: {prediction_rf}")

The input text is predicted as: Real


LOGISTIC REGRESSION -

In [24]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(xv_train, y_train)


In [25]:
dump(LR, 'LR.pk1')

['LR.pk1']

In [26]:
pred_lr = LR.predict(xv_test)

In [27]:
print(" Logistic Regression Accuracy = ",LR.score(xv_test, y_test))

 Logistic Regression Accuracy =  0.9216


In [28]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

cm_rf = confusion_matrix(y_test, pred_lr)
print("Confusion Matrix for Logistic Regression :")
print(cm_rf)

# Assuming cm_rf is the confusion matrix from your random forest predictions
tn, fp, fn, tp = cm_rf.ravel()

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Confusion Matrix for Logistic Regression :
[[1142  118]
 [  78 1162]]
Precision: 0.9078125
Recall: 0.9370967741935484
F1 Score: 0.9222222222222222


In [29]:
def predict_news(input_text):
    input_text = preprocessing(input_text)
    input_vector = vectorization.transform([input_text])
    prediction = LR.predict(input_vector)
    return "Fake" if prediction[0] == 0 else "Real"

input_text =" "
prediction = predict_news(input_text)
print(f"The input text is predicted as: {prediction}")


The input text is predicted as: Real


SVM

In [30]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm_classifier = SVC(kernel='linear')

svm_classifier.fit(xv_train, y_train)

In [31]:
from joblib import dump
dump(svm_classifier, 'SVM.pk1')

['SVM.pk1']

In [32]:
y_pred = svm_classifier.predict(xv_test)

In [33]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(" SVM Accuracy = ",accuracy)

 SVM Accuracy =  0.9292


In [34]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

cm_svm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix for SVM :")
print(cm_svm)

# Assuming cm_rf is the confusion matrix from your random forest predictions
tn, fp, fn, tp = cm_svm.ravel()

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)



Confusion Matrix for SVM :
[[1149  111]
 [  66 1174]]
Precision: 0.9136186770428015
Recall: 0.9467741935483871
F1 Score: 0.92990099009901


In [35]:
def predict_news_svm(input_text):
    input_text = preprocessing(input_text)
    input_vector = vectorization.transform([input_text])
    prediction = svm_classifier.predict(input_vector)
    return "Fake" if prediction[0] == 0 else "Real"

input_text = " "
prediction = predict_news_svm(input_text)
print(f"The input text is predicted as: {prediction}")

The input text is predicted as: Real


PASSIVE AGRESSIVE CLASSIFER

In [36]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Create and train the Passive Aggressive Classifier
PAC = PassiveAggressiveClassifier(random_state=0,C=0.1)
PAC.fit(xv_train, y_train)

In [37]:
from joblib import dump
dump(PAC, 'PAC.pk1')

['PAC.pk1']

In [38]:
pred_pac = PAC.predict(xv_test)

In [39]:
accuracy = accuracy_score(y_test, pred_pac)
print(" Passive Agressive Classifier Accuracy = ",accuracy)

 Passive Agressive Classifier Accuracy =  0.9184


In [40]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

cm_pac = confusion_matrix(y_test, pred_pac)
print("Confusion Matrix for PAC :")
print(cm_pac)

# Assuming cm_rf is the confusion matrix from your random forest predictions
tn, fp, fn, tp = cm_pac.ravel()

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)



Confusion Matrix for PAC :
[[1148  112]
 [  92 1148]]
Precision: 0.9111111111111111
Recall: 0.9258064516129032
F1 Score: 0.9184


In [41]:
def predict_news(input_text):
    input_text = preprocessing(input_text)
    input_vector = vectorization.transform([input_text])
    prediction = PAC.predict(input_vector)
    return "Fake" if prediction[0] == 0 else "Real"

input_text = " "
prediction = predict_news(input_text)
print(f"The input text is predicted as: {prediction}")

The input text is predicted as: Real
