In [28]:
#Import all Required Libraries

#data manipulation and analysis
import pandas as pd 

#working with arrays
import numpy as np 

#sklearn library contains a lot of efficient tools for machine learning and statistical modeling including classification, 
#regression, clustering and dimensionality reduction
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB

#Pickle is used for serializing and de-serializing Python object structures, 
#also called marshalling or flattening. 
#Serialization refers to the process of converting an object in memory to a byte stream that can be stored on disk 
#or sent over a network
import pickle

In [29]:
# Load the dataset

# Location of the labelled news CSV. It is kept in one named constant so the
# machine-specific absolute path is easy to spot and to change when the
# notebook is run on another machine.
DATA_PATH = "C:\\Users\\data\\Desktop\\Fake News Detection\\fake_or_real_news.csv"

# pd.read_csv parses the CSV file into a DataFrame
dataset = pd.read_csv(DATA_PATH)

# Features: the 'text' column (article body), used as the model input
x = dataset['text']

# Target: the 'label' column, the class the models learn to predict
y = dataset['label']

In [30]:
# Preview the dataset

# head() returns the first n rows with all columns; n=5 is the pandas default
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [31]:
# Data size as a (rows, columns) tuple

# .shape is an attribute of the DataFrame (not a function call); it reports
# the number of rows and the number of columns
dataset.shape

(6335, 4)

In [32]:
# Data-quality check: look for missing values before modelling.
# Nothing is modified here; the data is only inspected.

# isna() flags every missing cell; any() then reduces that to one boolean per
# column, True if the column contains at least one missing value
dataset.isna().any()

Unnamed: 0    False
title         False
text          False
label         False
dtype: bool

In [33]:
# Split the data into training and test sets in an 80:20 ratio:
# 80% of the rows are used for training, the remaining 20% for evaluation.

# random_state fixes the shuffle performed before splitting, so a
# Restart & Run All reproduces the same split (and therefore comparable
# scores in the cells below) instead of a different one on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [34]:
# Convert the raw article text into TF-IDF features (term frequency-inverse
# document frequency): a term weighs more the more often it occurs in a
# document and the rarer it is across the whole collection.
#   stop_words='english' -> drop common English words ("and", "like", ...)
#   max_df=0.7           -> ignore terms found in more than 70% of documents
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Learn the vocabulary and IDF weights from the training text only,
# and encode the training documents in the same step
tfidf_train = tfidf_vectorizer.fit_transform(x_train)

# Encode the test documents with the vocabulary learned above (no re-fitting)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [35]:
# Train and evaluate a PassiveAggressiveClassifier, an online (incremental)
# learning algorithm.
#
# It is simple to implement because it has a closed-form update rule: the
# weight vector is left unchanged for samples that are classified correctly
# (passive) and adjusted for each misclassified training sample (aggressive).

# random_state is fixed because the classifier shuffles the training data
# between passes by default; without a seed the reported accuracy changes
# from run to run.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(tfidf_train, y_train)

# Predict on the held-out TF-IDF features and report accuracy as a percentage
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100,2)}%')

Accuracy: 92.74%


In [39]:
# Bundle vectorisation and classification into a single estimator, so one
# fit/predict call on raw text runs both steps and the vectoriser never has
# to be applied by hand.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),  # raw text -> TF-IDF features
    ('nbmodel', MultinomialNB()),                      # multinomial Naive Bayes classifier
])


In [41]:
# Fit the pipeline on the raw training text: the 'tfidf' step is fitted and
# applied first, then the 'nbmodel' step is trained on the resulting features
pipeline.fit(x_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('nbmodel', MultinomialNB())])

In [42]:
# Accuracy of the Naive Bayes pipeline on the held-out test set.
# Note: this reassigns `score`, replacing the PassiveAggressiveClassifier value.
score=pipeline.score(x_test,y_test)
print('accuracy',score)

# The recorded run printed 0.8398, i.e. an accuracy of roughly 84%

accuracy 0.8397790055248618


In [43]:
# Predicted labels for the raw test text; used by the classification report
# and the confusion matrix in the following cells
pred = pipeline.predict(x_test)

In [44]:
# Performance evaluation table for the Naive Bayes pipeline:
# precision, recall, f1-score and support for each class, plus overall averages

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

        FAKE       0.95      0.71      0.81       624
        REAL       0.77      0.97      0.86       643

    accuracy                           0.84      1267
   macro avg       0.86      0.84      0.84      1267
weighted avg       0.86      0.84      0.84      1267



In [45]:
# Confusion matrix for the same predictions.
# Rows are the true labels and columns the predicted labels; in the recorded
# run the row sums match the FAKE / REAL support values of the report above,
# so the first row is FAKE and the second row is REAL.

print(confusion_matrix(y_test, pred))

[[442 182]
 [ 21 622]]


In [46]:
# Persist the fitted pipeline (TF-IDF vectoriser + Naive Bayes model) so it can
# be reloaded later and applied directly to raw text.
# NOTE(review): this saves the Naive Bayes pipeline, not the
# PassiveAggressiveClassifier that scored higher above - confirm this is intended.
# Only unpickle model.pkl from a trusted source: loading a pickle can execute
# arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file, pickle.HIGHEST_PROTOCOL)