# Fake News Detection.

In [2]:
## Intro

In [1]:
# Importing the libraries.
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
# Importing the dataset
news = pd.read_csv('news.csv')
news.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [3]:
# Splitting the data to train and test
X = news['text']
y = news['label']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [4]:
# Create a pipeline that creates a bag of words then applies Multinomial Naive Bayes model.
pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                     ('nbmodel', MultinomialNB())])

In [5]:
# Training our data
pipeline.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('nbmodel', MultinomialNB())])

In [7]:
# Let's try predicting the label for our test data.
pred = pipeline.predict(X_test)
pred

array(['REAL', 'FAKE', 'REAL', ..., 'FAKE', 'REAL', 'REAL'], dtype='<U4')

In [10]:
# Evaluating the model
report = classification_report(y_test, pred)
matrix = confusion_matrix(y_test, pred)
print(report)
print(matrix)

              precision    recall  f1-score   support

        FAKE       0.95      0.70      0.81       632
        REAL       0.76      0.97      0.85       635

    accuracy                           0.83      1267
   macro avg       0.86      0.83      0.83      1267
weighted avg       0.86      0.83      0.83      1267

[[442 190]
 [ 22 613]]


In [11]:
# Serializing the file
with open('model.pickle', 'wb') as handle:
    pickle.dump(pipeline, handle, protocol=pickle.HIGHEST_PROTOCOL)