In [2]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import metrics, cluster
from sklearn.decomposition import PCA #PCA

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# nltk.download('stopwords')
# nltk.download('wordnet')

import string



In [3]:
# Import data

df = pd.read_csv('fake_news_dataset.csv')

In [4]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
    df.shape

(20800, 5)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [7]:
# Dropping ID column

df = df.drop(columns=['id'])

In [8]:
# Deleting rows with Null Text Values

df = df.loc[df['text'].notnull()]

In [9]:
df.shape

(20761, 4)

In [10]:
df['label'].value_counts()

0    10387
1    10374
Name: label, dtype: int64

In [11]:
pd.options.mode.chained_assignment = None  
df['corpus'] = df['title'] + ' ' + df['author'] + ' ' + df['text']

In [12]:
df

Unnamed: 0,title,author,text,label,corpus
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Consortiumne...
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...
...,...,...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0,Rapper T.I.: Trump a ’Poster Child For White S...
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0,"N.F.L. Playoffs: Schedule, Matchups and Odds -..."
20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0,Macy’s Is Said to Receive Takeover Approach by...
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1,"NATO, Russia To Hold Parallel Exercises In Bal..."


In [13]:
df = df.drop(['title','author','text'], axis=1)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20761 entries, 0 to 20799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   20761 non-null  int64 
 1   corpus  18285 non-null  object
dtypes: int64(1), object(1)
memory usage: 486.6+ KB


In [15]:
df = df.loc[df['corpus'].notnull()]

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18285 entries, 0 to 20799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   18285 non-null  int64 
 1   corpus  18285 non-null  object
dtypes: int64(1), object(1)
memory usage: 428.6+ KB


In [17]:
df

Unnamed: 0,label,corpus
0,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,1,Why the Truth Might Get You Fired Consortiumne...
3,1,15 Civilians Killed In Single US Airstrike Hav...
4,1,Iranian woman jailed for fictional unpublished...
...,...,...
20795,0,Rapper T.I.: Trump a ’Poster Child For White S...
20796,0,"N.F.L. Playoffs: Schedule, Matchups and Odds -..."
20797,0,Macy’s Is Said to Receive Takeover Approach by...
20798,1,"NATO, Russia To Hold Parallel Exercises In Bal..."


In [18]:
x_train = df['corpus']
y_train = df['label']

In [19]:
x_train

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        FLYNN: Hillary Clinton, Big Woman on Campus - ...
2        Why the Truth Might Get You Fired Consortiumne...
3        15 Civilians Killed In Single US Airstrike Hav...
4        Iranian woman jailed for fictional unpublished...
                               ...                        
20795    Rapper T.I.: Trump a ’Poster Child For White S...
20796    N.F.L. Playoffs: Schedule, Matchups and Odds -...
20797    Macy’s Is Said to Receive Takeover Approach by...
20798    NATO, Russia To Hold Parallel Exercises In Bal...
20799    What Keeps the F-35 Alive David Swanson   Davi...
Name: corpus, Length: 18285, dtype: object

In [20]:
# Text cleaning function

def text_process(text):
    '''
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Return the cleaned text as a list of words
    4. Remove words
    '''
    stemmer = WordNetLemmatizer()
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join([i for i in nopunc if not i.isdigit()])
    nopunc =  [word.lower() for word in nopunc.split() if word not in stopwords.words('english')]
    return [stemmer.lemmatize(word) for word in nopunc]

In [21]:
tfidfconvert = TfidfVectorizer(analyzer=text_process).fit(x_train)


In [22]:
x_transformed=tfidfconvert.transform(x_train)

In [23]:
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=100)
clustered = kmeans.fit_predict(x_transformed)

In [24]:
testing_df = {'Corpus': x_train, 'Labels': y_train, 'Prediction': clustered}
testing_df = pd.DataFrame(data=testing_df)

testing_df.head(10)

Unnamed: 0,Corpus,Labels,Prediction
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1,0
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0,0
2,Why the Truth Might Get You Fired Consortiumne...,1,0
3,15 Civilians Killed In Single US Airstrike Hav...,1,0
4,Iranian woman jailed for fictional unpublished...,1,0
5,Jackie Mason: Hollywood Would Love Trump if He...,0,0
7,Benoît Hamon Wins French Socialist Party’s Pre...,0,1
9,"A Back-Channel Plan for Ukraine and Russia, Co...",0,1
10,Obama’s Organizing for Action Partners with So...,0,0
11,"BBC Comedy Sketch ""Real Housewives of ISIS"" Ca...",0,0


In [25]:
correct = 0
incorrect = 0
for index, row in testing_df.iterrows():
    if row['Labels'] == row['Prediction']:
        correct += 1
    else:
        incorrect += 1
        
print("Correctly clustered news: " + str((correct*100)/(correct+incorrect)) + "%")

Correctly clustered news: 45.15176374077112%
