In [1]:
# Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [36]:
# Creating the dataframe object

df = pd.read_csv('train.csv')

In [3]:
# shows the first 5 rows of the dataframe 

df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
#  Getting the information about shape of DataFrame, coolumnwise information about null values if any, data types 

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [5]:
# checking various statistical parameters of each numeric columns

df.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [6]:
# to get the null values in each column

df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
# Counts the number of each of the target labels

df['label'].value_counts()

label
1    10413
0    10387
Name: count, dtype: int64

In [10]:
news = df.fillna(' ') # filling all the null values with an empty string

In [11]:
# combining the title and author columns into one 'content' column

news['content'] = news['title'] + ' ' + news['author']

In [12]:
news.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Consortiumne...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


In [13]:
#  seperating the dataset into X and y

x = news.drop('label', axis=1)
y = news['label']

In [14]:
#  Calling the PorterStemmer class

port_stem = PorterStemmer()

In [15]:
# function to perform complete stemming operation

def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if word not in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [24]:
# stemming the contents of the column 'content' 

news['content'] = news['content'].apply(stemming)

In [25]:
# printing the outcome
news['content']

0        hous dem aid even see comey letter jason chaff...
1        flynn hillari clinton big woman campu breitbar...
2                   truth might get fire consortiumnew com
3        civilian kill singl us airstrik identifi jessi...
4        iranian woman jail fiction unpublish stori wom...
                               ...                        
20795    rapper trump poster child white supremaci jero...
20796    n f l playoff schedul matchup odd new york tim...
20797    maci said receiv takeov approach hudson bay ne...
20798    nato russia hold parallel exercis balkan alex ...
20799                            keep f aliv david swanson
Name: content, Length: 20800, dtype: object

In [26]:
# seperating the data and the label

x = news['content'].values
y = news['label'].values

In [37]:
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 210687 stored elements and shape (20800, 17128)>
  Coords	Values
  (0, 267)	0.27010124977708766
  (0, 2483)	0.3676519686797209
  (0, 2959)	0.2468450128533713
  (0, 3600)	0.3598939188262559
  (0, 3792)	0.2705332480845492
  (0, 4973)	0.233316966909351
  (0, 7005)	0.21874169089359144
  (0, 7692)	0.24785219520671603
  (0, 8630)	0.29212514087043684
  (0, 8909)	0.3635963806326075
  (0, 13473)	0.2565896679337957
  (0, 15686)	0.28485063562728646
  (1, 1497)	0.2939891562094648
  (1, 1894)	0.15521974226349364
  (1, 2223)	0.3827320386859759
  (1, 2813)	0.19094574062359204
  (1, 3568)	0.26373768806048464
  (1, 5503)	0.7143299355715573
  (1, 6816)	0.1904660198296849
  (1, 16799)	0.30071745655510157
  (2, 2943)	0.3179886800654691
  (2, 3103)	0.46097489583229645
  (2, 5389)	0.3866530551182615
  (2, 5968)	0.3474613386728292
  (2, 9620)	0.49351492943649944
  :	:
  (20797, 3643)	0.2115550061362374
  (20797, 7042)	0.21799048897828685
  (20797,

In [38]:
print(y)

[1 0 1 ... 0 1 1]


In [29]:
#  converting the textual data to numerica data

vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(x)

In [30]:
print(x)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 210687 stored elements and shape (20800, 17128)>
  Coords	Values
  (0, 7005)	0.21874169089359144
  (0, 3792)	0.2705332480845492
  (0, 267)	0.27010124977708766
  (0, 4973)	0.233316966909351
  (0, 13473)	0.2565896679337957
  (0, 2959)	0.2468450128533713
  (0, 8630)	0.29212514087043684
  (0, 7692)	0.24785219520671603
  (0, 2483)	0.3676519686797209
  (0, 15686)	0.28485063562728646
  (0, 3600)	0.3598939188262559
  (0, 8909)	0.3635963806326075
  (1, 5503)	0.7143299355715573
  (1, 6816)	0.1904660198296849
  (1, 2813)	0.19094574062359204
  (1, 1497)	0.2939891562094648
  (1, 16799)	0.30071745655510157
  (1, 2223)	0.3827320386859759
  (1, 1894)	0.15521974226349364
  (1, 3568)	0.26373768806048464
  (2, 15611)	0.41544962664721613
  (2, 9620)	0.49351492943649944
  (2, 5968)	0.3474613386728292
  (2, 5389)	0.3866530551182615
  (2, 3103)	0.46097489583229645
  :	:
  (20797, 9588)	0.17455348025522197
  (20797, 7042)	0.21799048897828685
  (207

In [31]:
# splitting the X & y features into the train and the test set

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y,random_state=42)

In [32]:
print(x_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 168499 stored elements and shape (16640, 17128)>
  Coords	Values
  (0, 10306)	0.08144775542675378
  (0, 16996)	0.08426036972343232
  (0, 15295)	0.08267566305236994
  (0, 15239)	0.26806475543422315
  (0, 16928)	0.28324397032237353
  (0, 336)	0.28324397032237353
  (0, 2432)	0.2407581496307299
  (0, 6200)	0.3276223924841172
  (0, 9956)	0.3774387904751837
  (0, 6421)	0.3774387904751837
  (0, 7176)	0.3774387904751837
  (0, 15107)	0.3929849120235453
  (1, 16847)	0.2613586680827983
  (1, 10306)	0.09056209501028727
  (1, 16996)	0.09368945244117634
  (1, 15295)	0.09192741056100139
  (1, 41)	0.2988901388344103
  (1, 2526)	0.2564530301035042
  (1, 1517)	0.2243545754142816
  (1, 4209)	0.2988901388344103
  (1, 6189)	0.30422609645953386
  (1, 16198)	0.36834795425798766
  (1, 10663)	0.3642846887576505
  (1, 4870)	0.3432894213503973
  (1, 4526)	0.3642846887576505
  :	:
  (16637, 5917)	0.3023461336407092
  (16637, 1887)	0.3175702956874389
  

In [39]:
print(x_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 42188 stored elements and shape (4160, 17128)>
  Coords	Values
  (0, 16226)	0.2197304744445337
  (0, 3173)	0.30220187476862226
  (0, 15154)	0.28147520515507646
  (0, 11811)	0.2736526668277463
  (0, 14625)	0.3334903874308643
  (0, 12672)	0.29724177794048273
  (0, 16820)	0.2700450057325363
  (0, 1691)	0.27334097688945147
  (0, 10889)	0.4074828305477443
  (0, 11054)	0.43934964127276355
  (1, 10306)	0.09804272955135314
  (1, 16996)	0.1014284138022778
  (1, 15295)	0.09952082326457516
  (1, 13775)	0.2478722101521351
  (1, 12672)	0.29840188035224147
  (1, 7835)	0.31685402927230133
  (1, 3618)	0.2807492849199086
  (1, 14495)	0.307026929254979
  (1, 7613)	0.35836836573750197
  (1, 14513)	0.3512537689930698
  (1, 8469)	0.3564842869281156
  (1, 2901)	0.40907319182785806
  (2, 2127)	0.3441449871306693
  (2, 4812)	0.3699166001602211
  (2, 3407)	0.36706564038941
  :	:
  (4157, 6633)	0.3574581433940959
  (4157, 13265)	0.48134330106333706
 

In [40]:
x.count_nonzero()

210687

In [35]:
# Implementing a classification model

classifier = LogisticRegression()

In [41]:
# Training the dataset on the model

classifier.fit(x_train, y_train)

In [45]:
# predicting the target column using training set

y_train_pred = classifier.predict(x_train)

In [46]:
# predicting the target column using test set

y_test_pred = classifier.predict(x_test)

In [49]:
# Evaluating the model accuracy and other parameters 

from sklearn.metrics import confusion_matrix, accuracy_score
cm_test = confusion_matrix(y_test, y_test_pred)
# cm_train = confusion_matrix(y_test, y_train_pred)
acs_test = accuracy_score(y_test, y_test_pred)
# acs_train = accuracy_score(y_test, y_train_pred)

In [39]:
# confusion matrix for test set predictions

print(cm_test)

[[1992   85]
 [  18 2065]]


In [40]:
# accuracy for the test set results 

print(acs_test)

0.9752403846153846


In [41]:
print(x_test[0])

  (0, 1691)	0.27334097688945147
  (0, 3173)	0.30220187476862226
  (0, 10889)	0.4074828305477443
  (0, 11054)	0.43934964127276355
  (0, 11811)	0.2736526668277463
  (0, 12672)	0.29724177794048273
  (0, 14625)	0.3334903874308643
  (0, 15154)	0.28147520515507646
  (0, 16226)	0.2197304744445337
  (0, 16820)	0.2700450057325363


In [42]:
print(x_test[1])
print('\n************************')
print(x_test[2])

  (0, 2901)	0.40907319182785806
  (0, 3618)	0.2807492849199086
  (0, 7613)	0.35836836573750197
  (0, 7835)	0.31685402927230133
  (0, 8469)	0.3564842869281156
  (0, 10306)	0.09804272955135314
  (0, 12672)	0.29840188035224147
  (0, 13775)	0.2478722101521351
  (0, 14495)	0.307026929254979
  (0, 14513)	0.3512537689930698
  (0, 15295)	0.09952082326457516
  (0, 16996)	0.1014284138022778

************************
  (0, 1788)	0.47665928472957086
  (0, 2127)	0.3441449871306693
  (0, 3407)	0.36706564038941
  (0, 4812)	0.3699166001602211
  (0, 5996)	0.4655292822474496
  (0, 15270)	0.4075133937130352


In [50]:
#  predicting the news as fake or real 

for i in range(x_test.count_nonzero()):
    
    x_new=x_test[i]
    prediction=classifier.predict(x_new)
    print(prediction)
    if (prediction[0]==0):
        print("The news is real")
    else:
        print("The news is fake")

[0]
The news is real
[0]
The news is real
[1]
The news is fake
[1]
The news is fake
[0]
The news is real
[0]
The news is real
[0]
The news is real
[0]
The news is real
[1]
The news is fake
[0]
The news is real
[1]
The news is fake
[0]
The news is real
[0]
The news is real
[0]
The news is real
[0]
The news is real
[0]
The news is real
[1]
The news is fake
[0]
The news is real
[1]
The news is fake
[1]
The news is fake
[1]
The news is fake
[1]
The news is fake
[0]
The news is real
[0]
The news is real
[1]
The news is fake
[0]
The news is real
[1]
The news is fake
[1]
The news is fake
[1]
The news is fake
[1]
The news is fake
[0]
The news is real
[0]
The news is real
[0]
The news is real
[0]
The news is real
[1]
The news is fake
[0]
The news is real
[0]
The news is real
[1]
The news is fake
[1]
The news is fake
[0]
The news is real
[0]
The news is real
[1]
The news is fake
[1]
The news is fake
[0]
The news is real
[1]
The news is fake
[1]
The news is fake
[1]
The news is fake
[1]
The news 

IndexError: index (4160) out of range