In [1]:
import numpy as np
import pandas as pd
import nltk
import nltk.corpus
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer


import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df.shape

(20800, 5)

In [5]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

I didn't want to delete any of the data using dropna, rather I preferred to replace null values with the modes of each field having null values.

In [6]:
df['title']=df['title'].fillna(df['title'].mode()[0])
df['author']=df['author'].fillna(df['author'].mode()[0])
df['text']=df['text'].fillna(df['text'].mode()[0])

In [7]:
df['title']=df['title'].str.lower() 
df['author']=df['author'].str.lower()
# This avoids having multiple copies of the same words. For example, 
#'Give' and 'give' will be taken as different words if this code is ignored.

In [8]:
# Stopwords are the English words which does not add much meaning to a sentence. They can safely be 
# ignored without sacrificing the meaning of the sentence. Examples are the, he, have etc. Such words 
# are already captured in corpus (import nltk.corpus). Their exclusion from texts, which results into 
# word reduction, often times increases the accuracy of training model.

nltk.download('stopwords')

stop_words = stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Student\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
from sklearn.feature_extraction import text
stop=text.ENGLISH_STOP_WORDS
print(stop)
# the sklearn stopwords was employed to further reduce the words in the text and thus increase the 
# accuracy of the model. Stopwords in sklearn are different from those from nltk.corpus

frozenset({'each', 'everywhere', 'other', 'above', 'con', 'whither', 'beyond', 'well', 'find', 'still', 'de', 'hundred', 'nothing', 'latterly', 'none', 'whatever', 'too', 'few', 'where', 'ours', 'describe', 'everyone', 'six', 'yet', 'since', 'anyone', 'could', 'moreover', 'throughout', 'ten', 'former', 'whom', 'twenty', 'towards', 'would', 'amongst', 'us', 'about', 'nowhere', 'might', 'you', 'of', 'these', 'third', 'there', 'were', 'itself', 'detail', 'ever', 'his', 'anyhow', 'sincere', 'those', 'ltd', 'cannot', 'themselves', 'the', 'couldnt', 'mine', 'up', 'behind', 'name', 'next', 'some', 'their', 'however', 're', 'somewhere', 'this', 'will', 'am', 'became', 'being', 'forty', 'serious', 'thereafter', 'herself', 'but', 'anywhere', 'indeed', 'eight', 'four', 'bottom', 'wherein', 'had', 'top', 'bill', 'even', 'toward', 'to', 'therein', 'seems', 'him', 'whenever', 'thick', 'etc', 'back', 'that', 'un', 'fire', 'with', 'during', 'besides', 'out', 'all', 'afterwards', 'another', 'if', 'fron

In [11]:
df['revtitle'] = df['title'].astype(str)# this code was considered as a result of an attribute error
#from python stating title column is not string
df['clean_title'] = df['revtitle'].apply(lambda x: ' '.join([word for word in x.split() if word not 
                                                             in (stop_words) and (stop)]))
#using the lambda x and split functions in python

In [12]:
df['rev'] = df['author'].astype(str)# same as above
df['clean_author'] = df['rev'].apply(lambda x: ' '.join([word for word in x.split() if word not in 
                                                         (stop_words) and (stop)]))

In [13]:
df['content']=df['clean_title']+' '+df['clean_author']
df['content'] #concertenating both clean_title and clean_author

0        house dem aide: didn’t even see comey’s letter...
1        flynn: hillary clinton, big woman campus - bre...
2                 truth might get fired consortiumnews.com
3        15 civilians killed single us airstrike identi...
4        iranian woman jailed fictional unpublished sto...
                               ...                        
20795    rapper t.i.: trump ’poster child white suprema...
20796    n.f.l. playoffs: schedule, matchups odds - new...
20797    macy’s said receive takeover approach hudson’s...
20798    nato, russia hold parallel exercises balkans a...
20799                       keeps f-35 alive david swanson
Name: content, Length: 20800, dtype: object

In [14]:
print(list(df))

['id', 'title', 'author', 'text', 'label', 'revtitle', 'clean_title', 'rev', 'clean_author', 'content']


In [15]:
df.head()

Unnamed: 0,id,title,author,text,label,revtitle,clean_title,rev,clean_author,content
0,0,house dem aide: we didn’t even see comey’s let...,darrell lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide: we didn’t even see comey’s let...,house dem aide: didn’t even see comey’s letter...,darrell lucus,darrell lucus,house dem aide: didn’t even see comey’s letter...
1,1,"flynn: hillary clinton, big woman on campus - ...",daniel j. flynn,Ever get the feeling your life circles the rou...,0,"flynn: hillary clinton, big woman on campus - ...","flynn: hillary clinton, big woman campus - bre...",daniel j. flynn,daniel j. flynn,"flynn: hillary clinton, big woman campus - bre..."
2,2,why the truth might get you fired,consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,why the truth might get you fired,truth might get fired,consortiumnews.com,consortiumnews.com,truth might get fired consortiumnews.com
3,3,15 civilians killed in single us airstrike hav...,jessica purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 civilians killed in single us airstrike hav...,15 civilians killed single us airstrike identi...,jessica purkiss,jessica purkiss,15 civilians killed single us airstrike identi...
4,4,iranian woman jailed for fictional unpublished...,howard portnoy,Print \nAn Iranian woman has been sentenced to...,1,iranian woman jailed for fictional unpublished...,iranian woman jailed fictional unpublished sto...,howard portnoy,howard portnoy,iranian woman jailed fictional unpublished sto...


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            20800 non-null  int64 
 1   title         20800 non-null  object
 2   author        20800 non-null  object
 3   text          20800 non-null  object
 4   label         20800 non-null  int64 
 5   revtitle      20800 non-null  object
 6   clean_title   20800 non-null  object
 7   rev           20800 non-null  object
 8   clean_author  20800 non-null  object
 9   content       20800 non-null  object
dtypes: int64(2), object(8)
memory usage: 1.6+ MB


In [17]:
df['content'][0]

'house dem aide: didn’t even see comey’s letter jason chaffetz tweeted darrell lucus'

In [18]:
ps=PorterStemmer()
#stemming the content column of interest. Stemming is targeted towards replacing similar words with 
#their root word e.g actress, acted, acting, action....all have a root word called act. Stemming also 
#reduces the length of words

In [19]:
new_content=[]#declaring the stemmed text as a list stored in a variable called new_content
for w in df.content[:]:
    new_content.append(ps.stem(w))
print(new_content)



In [20]:
df['stemmed_stopwords_removed']=new_content
df.head()

Unnamed: 0,id,title,author,text,label,revtitle,clean_title,rev,clean_author,content,stemmed_stopwords_removed
0,0,house dem aide: we didn’t even see comey’s let...,darrell lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide: we didn’t even see comey’s let...,house dem aide: didn’t even see comey’s letter...,darrell lucus,darrell lucus,house dem aide: didn’t even see comey’s letter...,house dem aide: didn’t even see comey’s letter...
1,1,"flynn: hillary clinton, big woman on campus - ...",daniel j. flynn,Ever get the feeling your life circles the rou...,0,"flynn: hillary clinton, big woman on campus - ...","flynn: hillary clinton, big woman campus - bre...",daniel j. flynn,daniel j. flynn,"flynn: hillary clinton, big woman campus - bre...","flynn: hillary clinton, big woman campus - bre..."
2,2,why the truth might get you fired,consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,why the truth might get you fired,truth might get fired,consortiumnews.com,consortiumnews.com,truth might get fired consortiumnews.com,truth might get fired consortiumnews.com
3,3,15 civilians killed in single us airstrike hav...,jessica purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 civilians killed in single us airstrike hav...,15 civilians killed single us airstrike identi...,jessica purkiss,jessica purkiss,15 civilians killed single us airstrike identi...,15 civilians killed single us airstrike identi...
4,4,iranian woman jailed for fictional unpublished...,howard portnoy,Print \nAn Iranian woman has been sentenced to...,1,iranian woman jailed for fictional unpublished...,iranian woman jailed fictional unpublished sto...,howard portnoy,howard portnoy,iranian woman jailed fictional unpublished sto...,iranian woman jailed fictional unpublished sto...


In [21]:
print(type(df.stemmed_stopwords_removed))

<class 'pandas.core.series.Series'>


In [22]:
df['stemmed_stopwords_removed']

0        house dem aide: didn’t even see comey’s letter...
1        flynn: hillary clinton, big woman campus - bre...
2                 truth might get fired consortiumnews.com
3        15 civilians killed single us airstrike identi...
4        iranian woman jailed fictional unpublished sto...
                               ...                        
20795    rapper t.i.: trump ’poster child white suprema...
20796    n.f.l. playoffs: schedule, matchups odds - new...
20797    macy’s said receive takeover approach hudson’s...
20798    nato, russia hold parallel exercises balkans a...
20799                       keeps f-35 alive david swanson
Name: stemmed_stopwords_removed, Length: 20800, dtype: object

In [23]:
X=df['stemmed_stopwords_removed']
Y=df.label

In [24]:
#converting textual documents to numerical values
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
clf=Pipeline([
    ('vectorizer',CountVectorizer()),
    ('lr',LogisticRegression())
    
])
#vectorizer=TfidfVectorizer()

In [25]:
# vectorizer.fit(X)

In [26]:
# X=vectorizer.transform(X)

In [27]:
# X.shape

In [28]:
# Y.shape

In [29]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42,stratify=Y)

In [30]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()

In [31]:
X_train

18697    thousands mourners celebrate gwen ifill’s tena...
19320    oklahoma governor vetoes bill would charge abo...
13879          fascism india - countercurrents.org pam key
15282    ‘terrifying’: at&t spying americans profit, ne...
17746    new project veritas release: rigging election ...
                               ...                        
13180    poll: corrupt federal agency? anonymous coward...
11811    hillary campaign hates pander “f*cking dumb” m...
17375    gene tests identify breast cancer patients ski...
20189    france: police siege muslim migrant areas arou...
3515     donald trump & hillary clinton ~ rap song (the...
Name: stemmed_stopwords_removed, Length: 16640, dtype: object

In [32]:
X_train.shape

(16640,)

In [33]:
Y_train

18697    0
19320    0
13879    1
15282    1
17746    1
        ..
13180    1
11811    1
17375    0
20189    1
3515     1
Name: label, Length: 16640, dtype: int64

In [34]:
clf.fit(X_train,Y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('lr', LogisticRegression())])

In [35]:
y_predicted=clf.predict(X_test)
y_predicted

array([0, 0, 1, ..., 0, 1, 0], dtype=int64)

In [36]:
from sklearn import metrics
metrics.accuracy_score(Y_test,y_predicted)

0.9872596153846154

In [37]:
metrics.f1_score(Y_test,y_predicted)

0.987377947130269

In [38]:
clf.score(X_test,Y_test)

0.9872596153846154

In [39]:
clf.score(X_train,Y_train)

0.99921875

### pre-processing of the test file for prediction

In [40]:
submit=pd.read_csv('submit.csv')
test=pd.read_csv('test.csv')

In [41]:
submit.head()

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,0
3,20803,1
4,20804,1


In [42]:
test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [43]:
test.shape

(5200, 4)

In [44]:
test.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [45]:
test['title']=test['title'].fillna(test['title'].mode()[0])
test['author']=test['author'].fillna(test['author'].mode()[0])
test['text']=test['text'].fillna(test['text'].mode()[0])

In [46]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5200 non-null   object
 2   author  5200 non-null   object
 3   text    5200 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


In [47]:
test['revtitle'] = test['title'].astype(str)
test['clean_title'] = test['revtitle'].apply(lambda x: ' '.join([word for word in x.split() if word not 
                                                             in (stop_words) and (stop)]))


In [48]:
test['rev'] = test['author'].astype(str)
test['clean_author'] = test['rev'].apply(lambda x: ' '.join([word for word in x.split() if word not in 
                                                         (stop_words) and (stop)]))

In [49]:
test['content']=test['clean_title']+' '+test['clean_author']
test['content'] 

0       Specter Trump Loosens Tongues, Not Purse Strin...
1       Russian warships ready strike terrorists near ...
2       #NoDAPL: Native American Leaders Vow Stay All ...
3       Tim Tebow Will Attempt Another Comeback, This ...
4       Keiser Report: Meme Wars (E995) Truth Broadcas...
                              ...                        
5195    The Bangladeshi Traffic Jam That Never Ends - ...
5196    John Kasich Signs One Abortion Bill Ohio Vetoe...
5197    California Today: What, Exactly, Is Your Sushi...
5198    300 US Marines To Be Deployed To Russian Borde...
5199    Awkward Sex, Onscreen Off - The New York Times...
Name: content, Length: 5200, dtype: object

In [50]:
stemmed_test=[]
for w in test.content[:]:
    stemmed_test.append(ps.stem(w))
print(stemmed_test)



In [51]:
test['test_stemmed_stopwords_removed']=stemmed_test
test.head()

Unnamed: 0,id,title,author,text,revtitle,clean_title,rev,clean_author,content,test_stemmed_stopwords_removed
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...","Specter of Trump Loosens Tongues, if Not Purse...","Specter Trump Loosens Tongues, Not Purse Strin...",David Streitfeld,David Streitfeld,"Specter Trump Loosens Tongues, Not Purse Strin...","specter trump loosens tongues, not purse strin..."
1,20801,Russian warships ready to strike terrorists ne...,Pam Key,Russian warships ready to strike terrorists ne...,Russian warships ready to strike terrorists ne...,Russian warships ready strike terrorists near ...,Pam Key,Pam Key,Russian warships ready strike terrorists near ...,russian warships ready strike terrorists near ...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,#NoDAPL: Native American Leaders Vow to Stay A...,#NoDAPL: Native American Leaders Vow Stay All ...,Common Dreams,Common Dreams,#NoDAPL: Native American Leaders Vow Stay All ...,#nodapl: native american leaders vow stay all ...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...","Tim Tebow Will Attempt Another Comeback, This ...","Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,Daniel Victor,"Tim Tebow Will Attempt Another Comeback, This ...","tim tebow will attempt another comeback, this ..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,Keiser Report: Meme Wars (E995),Keiser Report: Meme Wars (E995),Truth Broadcast Network,Truth Broadcast Network,Keiser Report: Meme Wars (E995) Truth Broadcas...,keiser report: meme wars (e995) truth broadcas...


In [52]:
test['label']=submit.label
test.head()

Unnamed: 0,id,title,author,text,revtitle,clean_title,rev,clean_author,content,test_stemmed_stopwords_removed,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...","Specter of Trump Loosens Tongues, if Not Purse...","Specter Trump Loosens Tongues, Not Purse Strin...",David Streitfeld,David Streitfeld,"Specter Trump Loosens Tongues, Not Purse Strin...","specter trump loosens tongues, not purse strin...",0
1,20801,Russian warships ready to strike terrorists ne...,Pam Key,Russian warships ready to strike terrorists ne...,Russian warships ready to strike terrorists ne...,Russian warships ready strike terrorists near ...,Pam Key,Pam Key,Russian warships ready strike terrorists near ...,russian warships ready strike terrorists near ...,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,#NoDAPL: Native American Leaders Vow to Stay A...,#NoDAPL: Native American Leaders Vow Stay All ...,Common Dreams,Common Dreams,#NoDAPL: Native American Leaders Vow Stay All ...,#nodapl: native american leaders vow stay all ...,0
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...","Tim Tebow Will Attempt Another Comeback, This ...","Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,Daniel Victor,"Tim Tebow Will Attempt Another Comeback, This ...","tim tebow will attempt another comeback, this ...",1
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,Keiser Report: Meme Wars (E995),Keiser Report: Meme Wars (E995),Truth Broadcast Network,Truth Broadcast Network,Keiser Report: Meme Wars (E995) Truth Broadcas...,keiser report: meme wars (e995) truth broadcas...,1


In [53]:
test.isnull().sum()

id                                0
title                             0
author                            0
text                              0
revtitle                          0
clean_title                       0
rev                               0
clean_author                      0
content                           0
test_stemmed_stopwords_removed    0
label                             0
dtype: int64

In [54]:
Z=test['test_stemmed_stopwords_removed']
T=test.label

In [55]:
# vectorizer.fit(Z)

In [56]:
# Z=vectorizer.transform(Z)

In [57]:
clf.fit(Z,T)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('lr', LogisticRegression())])

In [58]:
# new_=clf.predict(test.test_stemmed_stopwords_removed)
# newl=pd.DataFrame(new_,columns=['prediction_'])
# newl

In [59]:
prediction=clf.predict(Z)
submit_mine=pd.DataFrame(prediction,columns=['prediction_submit'])

submit_mine

Unnamed: 0,prediction_submit
0,0
1,1
2,0
3,1
4,1
...,...
5195,0
5196,1
5197,0
5198,1


In [60]:
prediction[0]

0

In [61]:
if prediction[0]==0:
    print ('The article is not fake')
else:
    print('The article is fake')

The article is not fake


In [62]:
submit_mine.to_csv('submit_liz.csv')

In [63]:
#for quick comparison between the submit.label given in the dataset and the one predicted 
#(prediction_submit) by the model
new_df=submit_mine
new_df['submit.label']=submit.label
new_df.head()

Unnamed: 0,prediction_submit,submit.label
0,0,0
1,1,1
2,0,0
3,1,1
4,1,1


In [64]:
new_df.tail()

Unnamed: 0,prediction_submit,submit.label
5195,0,0
5196,1,1
5197,0,0
5198,1,1
5199,0,0


### Correct prediction above