In [2]:
import pandas as pd
import numpy as np
import itertools

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

import matplotlib.pyplot as plt


In [3]:
df = pd.read_csv("news.csv", index_col=None)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
dataset = df.drop("Unnamed: 0", axis=1) # axis 1 means column, axis 0 means row

In [6]:
X = dataset["text"]
y = dataset["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [7]:
count_vectorizer = CountVectorizer(stop_words="english") # stop_words means to remove common words like "is", "was", "were", "the", "is", etc.
count_train = count_vectorizer.fit_transform(X_train) # it creates a model from the training data and then predicts the training data

count_test = count_vectorizer.transform(X_test)

'''
#  .fit() method means to create a model from the training data
#  .fit_transform() method means model create + predict  from training data
'''



'\n#  .fit() method means to create a model from the training data\n#  .fit_transform() method means model create + predict  from training data\n'

In [8]:
print(count_train.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [9]:
len(count_vectorizer.get_feature_names_out()) # To get the number of features in the training data

56801

In [10]:
# Linear Classification
from sklearn.linear_model import LogisticRegression

reg = LogisticRegression()
reg.fit(count_train, y_train) # train the model with the training data
pred = reg.predict(count_test) # predict the test data
print(pred)


score = metrics.accuracy_score(y_test, pred) # calculate the accuracy of the model
print("Accuracy: %0.3f"  % score )

cm = metrics.confusion_matrix(y_test, pred, labels=["FAKE", "REAL"])

['FAKE' 'FAKE' 'FAKE' ... 'REAL' 'FAKE' 'FAKE']
Accuracy: 0.913


In [11]:
print(cm)

[[988  83]
 [ 99 921]]


In [12]:
from sklearn.metrics import classification_report
report = classification_report(y_test, pred)

# Evaluation Actual (y_test) - Predicted (pred)

In [13]:
print(report)

              precision    recall  f1-score   support

        FAKE       0.91      0.92      0.92      1071
        REAL       0.92      0.90      0.91      1020

    accuracy                           0.91      2091
   macro avg       0.91      0.91      0.91      2091
weighted avg       0.91      0.91      0.91      2091



In [14]:
# Get First row from the dataset
print(dataset["text"][0])

Daniel Greenfield, a Shillman Journalism Fellow at the Freedom Center, is a New York writer focusing on radical Islam. 
In the final stretch of the election, Hillary Rodham Clinton has gone to war with the FBI. 
The word “unprecedented” has been thrown around so often this election that it ought to be retired. But it’s still unprecedented for the nominee of a major political party to go war with the FBI. 
But that’s exactly what Hillary and her people have done. Coma patients just waking up now and watching an hour of CNN from their hospital beds would assume that FBI Director James Comey is Hillary’s opponent in this election. 
The FBI is under attack by everyone from Obama to CNN. Hillary’s people have circulated a letter attacking Comey. There are currently more media hit pieces lambasting him than targeting Trump. It wouldn’t be too surprising if the Clintons or their allies were to start running attack ads against the FBI. 
The FBI’s leadership is being warned that the entire left

In [15]:
print(count_train[[0]])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 152 stored elements and shape (1, 56801)>
  Coords	Values
  (0, 39371)	2
  (0, 29886)	1
  (0, 18756)	1
  (0, 26482)	2
  (0, 12723)	1
  (0, 45820)	1
  (0, 49614)	1
  (0, 35654)	1
  (0, 15846)	2
  (0, 51775)	5
  (0, 23990)	1
  (0, 10533)	4
  (0, 12626)	2
  (0, 39599)	1
  (0, 20249)	1
  (0, 2625)	1
  (0, 13885)	1
  (0, 44932)	1
  (0, 43222)	1
  (0, 21231)	6
  (0, 43952)	6
  (0, 20679)	1
  (0, 45133)	1
  (0, 2586)	1
  (0, 53360)	1
  :	:
  (0, 13579)	1
  (0, 32470)	1
  (0, 16591)	1
  (0, 51232)	1
  (0, 2375)	1
  (0, 6216)	1
  (0, 2627)	1
  (0, 54279)	1
  (0, 25601)	1
  (0, 42242)	1
  (0, 48507)	1
  (0, 52786)	1
  (0, 55621)	1
  (0, 29341)	2
  (0, 12411)	1
  (0, 22105)	1
  (0, 50073)	1
  (0, 53108)	1
  (0, 32118)	1
  (0, 55517)	1
  (0, 52762)	1
  (0, 52971)	1
  (0, 52679)	1
  (0, 11070)	1
  (0, 9944)	1


In [16]:
count_train[[0]]

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 152 stored elements and shape (1, 56801)>

In [None]:
#  **************************** Recheck the prediction of the first row **************************** #

# count_vectorizer = CountVectorizer(stop_words="english")
# count_train = count_vectorizer.fit_transform(X_train)
# print(count_train)


<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1103130 stored elements and shape (4244, 56801)>
  Coords	Values
  (0, 39371)	2
  (0, 29886)	1
  (0, 18756)	1
  (0, 26482)	2
  (0, 12723)	1
  (0, 45820)	1
  (0, 49614)	1
  (0, 35654)	1
  (0, 15846)	2
  (0, 51775)	5
  (0, 23990)	1
  (0, 10533)	4
  (0, 12626)	2
  (0, 39599)	1
  (0, 20249)	1
  (0, 2625)	1
  (0, 13885)	1
  (0, 44932)	1
  (0, 43222)	1
  (0, 21231)	6
  (0, 43952)	6
  (0, 20679)	1
  (0, 45133)	1
  (0, 2586)	1
  (0, 53360)	1
  :	:
  (4243, 36069)	1
  (4243, 55860)	1
  (4243, 52613)	1
  (4243, 52905)	1
  (4243, 52616)	1
  (4243, 53548)	1
  (4243, 55782)	1
  (4243, 26778)	1
  (4243, 47236)	1
  (4243, 15081)	1
  (4243, 4180)	1
  (4243, 39152)	1
  (4243, 9525)	4
  (4243, 51326)	1
  (4243, 54485)	1
  (4243, 24622)	1
  (4243, 17442)	1
  (4243, 10187)	3
  (4243, 33680)	1
  (4243, 25010)	4
  (4243, 54486)	1
  (4243, 30857)	1
  (4243, 16819)	1
  (4243, 53403)	1
  (4243, 17186)	1


In [17]:
X_train[[0]]

0    Daniel Greenfield, a Shillman Journalism Fello...
Name: text, dtype: object

In [19]:
reg.predict(count_train[[0]])

array(['REAL'], dtype=object)