Import the Libraries

In [27]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [28]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Dataset : Kaggle

Loading the Dataset

In [30]:
news_dataset = pd.read_csv('/content/data.csv')

In [31]:
news_dataset.shape

(4009, 4)

In [32]:
news_dataset.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [33]:
#Check the null values
news_dataset.isnull().sum()

Unnamed: 0,0
URLs,0
Headline,0
Body,21
Label,0


In [34]:
# replacing the null values with empty string
news_dataset = news_dataset.fillna('')

In [35]:
news_dataset.isnull().sum()

Unnamed: 0,0
URLs,0
Headline,0
Body,0
Label,0


In [36]:
print(news_dataset['Headline'])

0              Four ways Bob Corker skewered Donald Trump
1       Linklater's war veteran comedy speaks to moder...
2       Trump’s Fight With Corker Jeopardizes His Legi...
3       Egypt's Cheiron wins tie-up with Pemex for Mex...
4             Jason Aldean opens 'SNL' with Vegas tribute
                              ...                        
4004                                      Trends to Watch
4005    Trump Jr. Is Soon To Give A 30-Minute Speech F...
4006          Ron Paul on Trump, Anarchism & the AltRight
4007    China to accept overseas trial data in bid to ...
4008    Vice President Mike Pence Leaves NFL Game Beca...
Name: Headline, Length: 4009, dtype: object


Stemming:

Stemming is the process of reducing a word to its root word.

In [39]:
port_stem = PorterStemmer()

In [42]:
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [43]:
news_dataset['Headline'] = news_dataset['Headline'].apply(stemming)

In [44]:
news_dataset['Headline']

Unnamed: 0,Headline
0,four way bob corker skewer donald trump
1,linklat war veteran comedi speak modern americ...
2,trump fight corker jeopard legisl agenda
3,egypt cheiron win tie pemex mexican onshor oil...
4,jason aldean open snl vega tribut
...,...
4004,trend watch
4005,trump jr soon give minut speech
4006,ron paul trump anarch altright
4007,china accept oversea trial data bid speed drug...


In [53]:
# seperating the data & labels
X = news_dataset['Headline'].values
Y = news_dataset['Label'].values

In [54]:
print(X)
print(Y)

['four way bob corker skewer donald trump'
 'linklat war veteran comedi speak modern america say star'
 'trump fight corker jeopard legisl agenda' ...
 'ron paul trump anarch altright'
 'china accept oversea trial data bid speed drug approv'
 'vice presid mike penc leav nfl game anti american protest']
[1 1 1 ... 0 1 0]


In [55]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

In [58]:
print(X)

  (0, 5242)	0.37863563249625337
  (0, 4987)	0.19663377847748262
  (0, 4400)	0.5210398276773331
  (0, 1895)	0.38469689571798976
  (0, 1400)	0.32732150493643364
  (0, 1046)	0.38469689571798976
  (0, 531)	0.37863563249625337
  (1, 5222)	0.2663522983858765
  (1, 5157)	0.35098177557086435
  (1, 4553)	0.29755081676117606
  (1, 4495)	0.3621799099417042
  (1, 4198)	0.21748052230065945
  (1, 3107)	0.3688809885457156
  (1, 2803)	0.4114013985546997
  (1, 956)	0.38576635805583787
  (1, 144)	0.2910665140650854
  (2, 4987)	0.1886343870160031
  (2, 2748)	0.4762862128463209
  (2, 2552)	0.4998430547683992
  (2, 1789)	0.3531024617500656
  (2, 1046)	0.3690467816496352
  (2, 72)	0.4762862128463209
  (3, 5306)	0.2250193597747793
  (3, 4859)	0.33100081042556057
  (3, 3530)	0.3654630937185681
  :	:
  (4005, 2030)	0.3815661475297205
  (4006, 4987)	0.2123495309806764
  (4006, 4101)	0.4908314641056838
  (4006, 3514)	0.42271782854191553
  (4006, 156)	0.5173498757944015
  (4006, 132)	0.5173498757944015
  (4007, 4

Splitting the dataset into training and testing data

In [59]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

Train the model - Logistic Regression

In [60]:
model = LogisticRegression()

In [61]:
model.fit(X_train, Y_train)

Evaluation: Accuracy Score

In [63]:
#Accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.95385095104459


In [65]:
#Accuracy score on the testing data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.85785536159601


Make a predictive system

In [67]:
X_new = X_test[0]
prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[0]
The news is Real
