## Detecting Fake News and Real News

**Contributors:** Clayton Fields, Evi Ofekeze, Justin Carpenter

In [15]:
import numpy as np
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import re
import textstat as ts
import xgboost as xg
from sklearn import svm
import csv

import scipy.sparse
import scipy.sparse.csgraph

In [16]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, make_scorer, roc_auc_score, average_precision_score, precision_score, f1_score

### Politifact: Load and Format News Content Data

In [17]:
#get real news content data
temp = pd.read_json('PolitiFact/RealNewsContent/PolitiFact_Real_2-Webpage.json',orient='index')
pf_real_news =  pd.DataFrame(columns=temp.index)
for i in range(1,121):
    path='PolitiFact/RealNewsContent/PolitiFact_Real_'+str(i)+'-Webpage.json'
    df = pd.read_json(path,orient='index')
    df = df.transpose()
    pf_real_news = pd.concat([pf_real_news, df])
del(df,path,i)


#get fake news content data
pf_fake_news =  pd.DataFrame(columns=temp.index)
for i in range(1,121):
    path='PolitiFact/FakeNewsContent/PolitiFact_Fake_'+str(i)+'-Webpage.json'
    df = pd.read_json(path,orient='index')
    df = df.transpose()
    pf_fake_news = pd.concat([pf_fake_news, df])

del(df,path,i,temp)


#Assing Classe
pf_fake_news['Real'] = 0
pf_real_news['Real'] = 1

print(pf_real_news.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 0 to 0
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   top_img         118 non-null    object
 1   text            120 non-null    object
 2   authors         118 non-null    object
 3   keywords        118 non-null    object
 4   meta_data       118 non-null    object
 5   canonical_link  118 non-null    object
 6   images          118 non-null    object
 7   title           120 non-null    object
 8   url             118 non-null    object
 9   summary         118 non-null    object
 10  movies          118 non-null    object
 11  publish_date    81 non-null     object
 12  source          118 non-null    object
 13  Real            120 non-null    int64 
dtypes: int64(1), object(13)
memory usage: 14.1+ KB
None


In [18]:
pf_real_news = pf_real_news.reset_index(drop=True)
pf_fake_news.head()

Unnamed: 0,top_img,text,authors,keywords,meta_data,canonical_link,images,title,url,summary,movies,publish_date,source,Real
0,http://occupydemocrats.com/wp-content/uploads/...,335 SHARES SHARE THIS STORY\n\nRepublican atta...,[Colin Taylor],[],{'generator': 'Powered by Visual Composer - dr...,http://occupydemocrats.com/2016/01/12/virginia...,[http://occupydemocrats.com/wp-content/uploads...,Virginia Republican Wants Schools To Check Chi...,http://www.occupydemocrats.com/virginia-republ...,,[],{'$date': 1452628948000},http://www.occupydemocrats.com,0
0,http://www.americanpoliticnews.com/wp-content/...,Denzel Washington Switches to Trump Shocks Hol...,[],[],"{'og': {'site_name': 'American Politic', 'desc...",http://www.americanpoliticnews.com/news/denzel...,[http://www.americanpoliticnews.com/wp-content...,Denzel Washington Switches to Trump Shocks Hol...,http://www.americanpoliticnews.com/news/denzel...,,[],{'$date': 1472491609000},http://www.americanpoliticnews.com,0
0,http://dailysnark.com/wp-content/uploads/2016/...,Ad\n\nYou may asked what the Unites States did...,[],[],{'generator': 'Powered by Slider Revolution 5....,http://dailysnark.com/harambe-dead-gorilla-got...,[http://dailysnark.com/wp-content/uploads/2016...,"Harambe, A Dead Gorilla, Got Over 15,000 Votes...",http://dailysnark.com/harambe-dead-gorilla-got...,,[],{'$date': 1478666395000},http://dailysnark.com,0
0,https://cdn.vox-cdn.com/thumbor/sye0FzRVD4YBWJ...,"Last night, a twitter account by the name of @...",[Nov Est],[],"{'outbrainsection': 'us-world', 'msapplication...",https://www.theverge.com/2016/11/25/13748226/c...,[https://cdn.vox-cdn.com/thumbor/BDjczkjZSskRO...,The CNN porn scare is how fake news spreads,http://www.theverge.com/2016/11/25/13748226/cn...,,[],{'$date': 1480032000000},http://www.theverge.com,0
0,http://spinzon.com/wp-content/uploads/2016/11/...,Lady Gaga has opened up about the perils of fa...,[],[],{'description': 'Gaga also said that she finds...,http://spinzon.com/lady-gaga-reveals-plan-cove...,[https://i0.wp.com/assets.pinterest.com/images...,Lady Gaga Reveals Plan To Cover Her Face Again...,http://spinzon.com/lady-gaga-reveals-plan-cove...,,[],{'$date': 1480072046000},http://spinzon.com,0


There are two news content sets for stories categorized by Politifact one consisting of fake and real stories. Both data sets contain 120 stories with the same 13 fields for each. Above is an informational readout for the Politifact fake news data with the name of each field where the names are mostly self-expanatory. Note that the text column contains the entire text of the news article.

### Politifact: Load and Format User/Network Data

In [19]:
# get user info data
pf_news = pd.read_csv('PolitiFact/News.txt',header=None,names=['news'])
pf_users = pd.read_csv('PolitiFact/User.txt',header=None,names=['users'])
pf_news_user = pd.read_csv('PolitiFact/PolitiFactNewsUser.txt',header=None,names=['news_users'])
pf_user_user = pd.read_csv('PolitiFact/PolitiFactUserUser.txt',header=None,names=['followers'])

# build graph
G_pf = nx.DiGraph()
G_pf.add_nodes_from(pf_users)
G_pf.add_edges_from(pf_user_user['followers'].str.split('\t'))
print('The number of nodes in the network is ',G_pf.number_of_nodes())
print('The number of edges in the network is ',G_pf.number_of_edges())
print('The number of user-news relationships is', len(pf_news_user))

The number of nodes in the network is  23866
The number of edges in the network is  574744
The number of user-news relationships is 32791


There are 23,866 distinct twitter users represented in the data set. 

There are 574,744 user relationships that represent who is following who. 

Finally there are 32,791 user-news relationships that tell which user shared which story and how many times the story was shared.

### BuzzFeed: Load and Format News Content Data

In [20]:
#get real news content data
temp = pd.read_json('BuzzFeed/RealNewsContent/BuzzFeed_Real_2-Webpage.json',orient='index')
bf_real_news =  pd.DataFrame(columns=temp.index)
for i in range(1,92):
    path='BuzzFeed/RealNewsContent/BuzzFeed_Real_'+str(i)+'-Webpage.json'
    df = pd.read_json(path,orient='index')
    df = df.transpose()
    bf_real_news = pd.concat([bf_real_news,df])
del(df,path,i)

#get fake news content data
bf_fake_news =  pd.DataFrame(columns=temp.index)
for i in range(1,92):
    path='BuzzFeed/FakeNewsContent/BuzzFeed_Fake_'+str(i)+'-Webpage.json'
    df = pd.read_json(path,orient='index')
    df = df.transpose()
    bf_fake_news = pd.concat([bf_fake_news, df])
del(df,path,i,temp)
bf_fake_news['Real'] = 0
bf_real_news['Real'] = 1

bf_real_news.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91 entries, 0 to 0
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   top_img         91 non-null     object
 1   text            91 non-null     object
 2   authors         91 non-null     object
 3   keywords        91 non-null     object
 4   meta_data       91 non-null     object
 5   canonical_link  91 non-null     object
 6   images          91 non-null     object
 7   title           91 non-null     object
 8   url             91 non-null     object
 9   summary         91 non-null     object
 10  movies          91 non-null     object
 11  publish_date    56 non-null     object
 12  source          91 non-null     object
 13  Real            91 non-null     int64 
dtypes: int64(1), object(13)
memory usage: 10.7+ KB


In [21]:
bf_fake_news = bf_fake_news.reset_index(drop=True)
bf_fake_news

Unnamed: 0,top_img,text,authors,keywords,meta_data,canonical_link,images,title,url,summary,movies,publish_date,source,Real
0,http://addictinginfo.addictinginfoent.netdna-c...,I woke up this morning to find a variation of ...,[Wendy Gittleson],[],{'publisher': 'Addicting Info | The Knowledge ...,http://addictinginfo.com/2016/09/19/proof-the-...,"[http://i.imgur.com/JeqZLhj.png, http://addict...",Proof The Mainstream Media Is Manipulating The...,http://www.addictinginfo.org/2016/09/19/proof-...,,[],{'$date': 1474243200000},http://www.addictinginfo.org,0
1,http://usherald.com/wp-content/uploads/2015/05...,Thanks in part to the declassification of Defe...,[Bob Amoroso],[],"{'generator': 'WordPress 4.8.1', 'og': {'site_...",http://usherald.com/breaking-declassified-docs...,[http://usherald.com/wp-content/uploads/2015/0...,Declassified Docs Show That Obama Admin Create...,http://usherald.com/breaking-declassified-docs...,,[],{'$date': 1432650030000},http://usherald.com,0
2,http://eaglerising.com/wp-content/uploads/2016...,The Democrats are using an intimidation tactic...,[View All Posts],[],{'description': 'There is evidence the birth c...,http://eaglerising.com/36841/why-is-it-racist-...,[http://2lv0hm3wvpix464wwy2zh7d1.wpengine.netd...,Why is it ‚ÄúRACIST‚Äù to Question Someone‚Äôs Birth...,http://eaglerising.com/36841/why-is-it-racist-...,,[],{'$date': 1474243356000},http://eaglerising.com,0
3,http://100percentfedup.com/wp-content/uploads/...,Dolly Kyle has written a scathing ‚Äútell all‚Äù b...,[Fed Up],[],"{'googlebot': 'noimageindex', 'generator': 'Po...",http://100percentfedup.com/hillary-on-disabled...,[https://www.facebook.com/tr?id=15790889156864...,HILLARY ON DISABLED CHILDREN During Easter Egg...,http://100percentfedup.com/hillary-on-disabled...,,[],{'$date': 1466439263000},http://100percentfedup.com,0
4,http://clashdaily.com/wp-content/uploads/2016/...,The Haitians in the audience have some newswor...,"[Rich Witmer, Doug Giles]",[],"{'googlebot': 'noimageindex', 'og': {'site_nam...",http://clashdaily.com/2016/09/watch-trump-visi...,[http://clashdaily.wpengine.netdna-cdn.com/wp-...,'Reporters' FLEE When Clintons Get EXPOSED!,http://clashdaily.com/2016/09/watch-trump-visi...,,[https://www.youtube.com/embed/x5IS6Ya005E?fea...,{'$date': 1474208802000},http://clashdaily.com,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,http://rightwingnews.com/wp-content/uploads/20...,BREAKING: Steps to FORCE FBI Director Comey to...,[Cassy Fiano],[],"{'googlebot': 'noimageindex', 'og': {'site_nam...",http://rightwingnews.com/hillary-clinton-2/bre...,[http://rightwingnews.com/wp-content/uploads/2...,BREAKING: Steps to FORCE FBI Director Comey to...,http://rightwingnews.com/hillary-clinton-2/bre...,,[],{'$date': 1474907754000},http://rightwingnews.com,0
87,http://rightwingnews.com/wp-content/uploads/20...,Hillary‚Äôs TOP Donor Country Just Auctioned Off...,[Terresa Monroe-hamilton],[],"{'googlebot': 'noimageindex', 'og': {'site_nam...",http://rightwingnews.com/hillary-clinton-2/hil...,[http://1.gravatar.com/avatar/d35b77ff6c390071...,Hillary‚Äôs TOP Donor Country Just Auctioned Off...,http://rightwingnews.com/hillary-clinton-2/hil...,,[],{'$date': 1474912025000},http://rightwingnews.com,0
88,http://conservativetribune.com/wp-content/uplo...,Advertisement - story continues below\n\nThe f...,"[Martin Lioll, John Falkenberg, Ben Marquis, K...",[],{'description': 'People are already calling th...,http://conservativetribune.com/lester-holt-lie...,[http://conservativetribune.com/wp-content/upl...,Cavuto Just Exposed Lester Holt's Lies During ...,http://conservativetribune.com/lester-holt-lie...,,[https://www.youtube.com/embed/ThwaDSaoGU8?fea...,{'$date': 1474934400000},http://conservativetribune.com,0
89,http://assets.thepoliticalinsider.com.s3.amazo...,\n\nThere‚Äôs a lot to be discussed about last n...,[],[],{'description': 'One thing that has baffled ma...,http://thepoliticalinsider.com/first-president...,[http://1.gravatar.com/avatar/71be986d321b3d52...,People Noticed Something Odd About Hillary's O...,http://www.thepoliticalinsider.com/first-presi...,,[],{'$date': 1475000011000},http://www.thepoliticalinsider.com,0


The BuzzFeed set is organized in the same fashion as the Politifact dataset with the same features. In this set though, we have only 91 stories for each set for a total of 182 stories.

### BuzzFeed: Load and Format User/Network Data

In [22]:
# get user info data
bf_news = pd.read_csv('BuzzFeed/News.txt',header=None,names=['news'])
bf_users = pd.read_csv('BuzzFeed/User.txt',header=None,names=['users'])
bf_news_user = pd.read_csv('BuzzFeed/BuzzFeedNewsUser.txt',header=None,names=['news_users'])
bf_user_user = pd.read_csv('BuzzFeed/BuzzFeedUserUser.txt',header=None,names=['followers'])

# build graph
G_bf = nx.DiGraph()
G_bf.add_nodes_from(bf_users)
G_bf.add_edges_from(bf_user_user['followers'].str.split('\t'))
print('The number of nodes in the network is ',G_bf.number_of_nodes())
print('The number of edges in the network is ',G_bf.number_of_edges())
print('The number of user-news relationships is', len(bf_news_user))


The number of nodes in the network is  15258
The number of edges in the network is  634750
The number of user-news relationships is 22779


In this network we have 15,258 distinct twitter users and 634,750 user relationships represented in the data set. Finally there are 22,779 user-news relationships that tell which user shared which story and how many times.


## Feature Engineering: Social Network


### Number of Times Shared

In [23]:
# Politifact
pf_news = pd. concat([pf_real_news, pf_fake_news]).reset_index(drop=True)
pf_news.index
pf_news['news_number']  = pf_news.index +1
pf_news_user['news_users']=pf_news_user['news_users'].str.split('\t')
pf_news_user=pf_news_user['news_users'].apply(pd.Series)
pf_news_user.rename(columns={0:'news',1:'user',2:'num_shared'},inplace=True)
pf_news_user['num_shared'] = pf_news_user['num_shared'].astype('int')
pf_news_user['news'] = pf_news_user['news'].astype('int')
df=pf_news_user.groupby('news')['num_shared'].sum()
df=df.reset_index(drop=True)
pf_news['num_shared']=df
del(df)
pf_news['num_shared'] = pf_news['num_shared'].astype('int')

# BuzzFeed
bf_news = pd.concat([bf_real_news,bf_fake_news]).reset_index(drop=True)
bf_news.index
bf_news['news_number']  = bf_news.index +1
bf_news['news_number']
bf_news_user['news_users']=bf_news_user['news_users'].str.split('\t')
bf_news_user=bf_news_user['news_users'].apply(pd.Series)
bf_news_user.rename(columns={0:'news',1:'user',2:'num_shared'},inplace=True)
bf_news_user['num_shared'] = bf_news_user['num_shared'].astype('int')
bf_news_user['news'] = bf_news_user['news'].astype('int')
df=bf_news_user.groupby('news')['num_shared'].sum()
df=df.reset_index(drop=True)
bf_news['num_shared']=df
del(df)
bf_news['num_shared'] = bf_news['num_shared'].astype('int')

### Num Times Shared by top 2 percent

In [24]:
# Politifact
degree_cent = nx.in_degree_centrality(G_pf)
sort_dict_degree= dict(sorted((value, key) for (key,value) in degree_cent.items())) 
top= -int(.02*G_pf.number_of_nodes())
print(top)
temp=list(sort_dict_degree.values())[top:]
foo=pf_news_user.loc[pf_news_user['user'].isin(temp),'news'].sort_values()
foo1=foo.groupby(foo).count()
foo1.index=foo1.index-1
pf_news['shared_by_top']=foo1
pf_news.loc[pf_news['shared_by_top'].isna(),'shared_by_top']=0
# pf_news[['shared_by_top','Real']]

# Buzzfeed
degree_cent = nx.in_degree_centrality(G_bf)
sort_dict_degree= dict(sorted((value, key) for (key,value) in degree_cent.items())) 
top= -int(.02*G_bf.number_of_nodes())
print(top)
temp=list(sort_dict_degree.values())[top:]
foo=bf_news_user.loc[bf_news_user['user'].isin(temp),'news'].sort_values()
foo1=foo.groupby(foo).count()
foo1.index=foo1.index-1
bf_news['shared_by_top']=foo1
bf_news.loc[bf_news['shared_by_top'].isna(),'shared_by_top']=0
#bf_news['shared_by_top']

-477
-305


### Avg. Number of Followers Shared by

In [25]:
# Politifact
degree=pd.DataFrame.from_dict(G_pf.in_degree).drop(0).rename(columns={0:'user',1:'followers'})
degree.reset_index(drop=True,inplace=True)
degree['user']=degree['user'].astype('int')
pf_news_user['user']=pf_news_user['user'].astype('int')
df=degree.merge(pf_news_user,on='user')
df=df.groupby('news')['followers'].mean().reset_index()
pf_news['avg_follower']=df['followers']
del(df)
pf_news['avg_follower']

# Buzzfeed
degree=pd.DataFrame.from_dict(G_bf.in_degree).drop(0).rename(columns={0:'user',1:'followers'})
degree.reset_index(drop=True,inplace=True)
degree['user']=degree['user'].astype('int')
bf_news_user['user']=bf_news_user['user'].astype('int')
df=degree.merge(bf_news_user,on='user')
df=df.groupby('news')['followers'].mean().reset_index()
bf_news['avg_follower']=df['followers']
del(df)
#bf_news['avg_follower']

### Avg. Number of followees shared by

In [26]:
# Politifact
degree=pd.DataFrame.from_dict(G_pf.out_degree).drop(0).rename(columns={0:'user',1:'followees'})
degree.reset_index(drop=True,inplace=True)
degree['user']=degree['user'].astype('int')
#pf_news_user['user']=pf_news_user['user'].astype('int')
df=degree.merge(pf_news_user,on='user')
df=df.groupby('news')['followees'].mean().reset_index()
pf_news['avg_followee']=df['followees']
del(df)
#pf_news['avg_followee']

# BuzzFeed
degree=pd.DataFrame.from_dict(G_bf.out_degree).drop(0).rename(columns={0:'user',1:'followees'})
degree.reset_index(drop=True,inplace=True)
degree['user']=degree['user'].astype('int')
#bf_news_user['user']=bf_news_user['user'].astype('int')
df=degree.merge(bf_news_user,on='user')
df=df.groupby('news')['followees'].mean().reset_index()
bf_news['avg_followee']=df['followees']
del(df)
#bf_news['avg_followee']

### Followee to Follower Ratio

In [27]:
pf_news['f_ratio']=pf_news['avg_followee']/pf_news['avg_follower']
bf_news['f_ratio']=bf_news['avg_followee']/bf_news['avg_follower']
#bf_news['f_ratio']

In [28]:
# df.head()# news = news.append([bf_news,pf_news],ignore_index=True)

In [None]:
# df['user'] = df.user.astype(int)
# df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15257 entries, 1 to 15257
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   user       15257 non-null  int64  
 1   Followers  15257 non-null  int64  
 2   Fraction   15257 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 357.7 KB


In [29]:
bf_news_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22779 entries, 0 to 22778
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   news        22779 non-null  int64
 1   user        22779 non-null  int64
 2   num_shared  22779 non-null  int64
dtypes: int64(3)
memory usage: 534.0 KB


In [30]:
# G_bf.degree
# G_bf_outD = dict(G_bf.out_degree())
# k = [k for k,v in G_bf_outD.items()]
# v = [v for k,v in G_bf_outD.items()]
# df = pd.DataFrame({'user': k, 'Followers': v, 'Fraction': v/np.sum(v)}).drop(0,axis=0)
# df_ = df.merge(bf_news_user, on='user')
# bf_news_user.head()
# df.head()

In [31]:
G_bf.degree
G_bf_inD = dict(G_bf.in_degree())
k = [k for k,v in G_bf_inD.items()]
v = [v for k,v in G_bf_inD.items()]
df = pd.DataFrame({'Users': k, 'Followers': v, 'Fraction': v/np.sum(v)}).drop(0,axis=0)
df.head()

Unnamed: 0,Users,Followers,Fraction
1,48,2,3e-06
2,1,4,6e-06
3,899,19,3e-05
4,6781,3,5e-06
5,10097,10,1.6e-05


**Betweeness Centrality for Buzz Feed**

In [32]:
# G_bf_b = nx.betweenness_centrality(G_bf, endpoints=True,normalized=True)

In [None]:
# new_G_bf_b = G_bf_b.copy()

In [None]:
# new_G_bf_b

**Closeness Centrality for Buzz Feed**

In [None]:
# G_bf_c = nx.closeness_centrality(G_bf)

**Betweenness Centrality for Politifact**

In [None]:
# G_pf_b = nx.betweenness_centrality(G_pf, endpoints=True,normalized=True)

**Closeness Centrality for Politifact**

In [None]:
# G_pf_c = nx.closeness_centrality(G_pf)

In [33]:

# with open('G_bf_b.csv', 'w') as csv_file:  
#     writer = csv.writer(csv_file)
#     for key, value in G_bf_b.items():
      #  writer.writerow([key, value])
with open('Term Project/G_bf_b.csv') as csv_file:
    file = csv.reader(csv_file)
    G_bf_b = dict(file)
# G_bf_b

In [34]:
# with open('G_bf_c.csv', 'w') as csv_file:  
#     writer = csv.writer(csv_file)
#     for key, value in G_bf_c.items():
#        writer.writerow([key, value])
with open('Term Project/G_bf_c.csv') as csv_file:
    file = csv.reader(csv_file)
    G_bf_c  = dict(file)
# G_bf_c

In [35]:
# with open('G_pf_b.csv', 'w') as csv_file:  
#     writer = csv.writer(csv_file)
#     for key, value in G_pf_b .items():
#        writer.writerow([key, value])
with open('Term Project/G_pf_b.csv') as csv_file:
    file = csv.reader(csv_file)
    G_pf_b = dict(file)
# G_pf_b

In [36]:
# with open('G_pf_c.csv', 'w') as csv_file:  
#     writer = csv.writer(csv_file)
#     for key, value in G_pf_c.items():
#        writer.writerow([key, value])
with open('Term Project/G_pf_c.csv') as csv_file:
    file = csv.reader(csv_file)
    G_pf_c = dict(file)
# G_pf_c

In [37]:
#Buzz Feed betweenness centrality
btw_central = G_bf_b

k = [k for k,v in btw_central.items()]
v = [v for k,v in btw_central.items()]
btw_central = pd.DataFrame({'user': k, 'betweenness_centrality': v}).rename(columns = {0:'user',1:'betweenness_centrality'}).drop(0,axis = 0).reset_index()
btw_central['user'] = btw_central['user'].astype('i4')
btw_central['betweenness_centrality'] = btw_central['betweenness_centrality'].astype('float')
df = btw_central.merge(bf_news_user, on = 'user')
df = df.groupby('news')['betweenness_centrality'].mean().reset_index()
bf_news['betweenness_centrality'] = df['betweenness_centrality']
# bf_news.head()

In [38]:
#Buzz Feed closenness centrality
close_central = G_bf_c

k = [k for k,v in close_central.items()]
v = [v for k,v in close_central.items()]

close_central  = pd.DataFrame({'user': k, 'closenness_centrality': v}).rename(columns = {0:'user',1:'closenness_centrality'}).drop(0,axis = 0).reset_index()
close_central['user'] = close_central['user'].astype('i4')
close_central['closenness_centrality'] = close_central['closenness_centrality'].astype('float')
df = close_central.merge(bf_news_user, on = 'user')
df = df.groupby('news')['closenness_centrality'].mean().reset_index()
bf_news['closenness_centrality'] = df['closenness_centrality']
# bf_news

In [39]:
#Plotifact betweenness centrality
btw_central = G_pf_b

k = [k for k,v in btw_central.items()]
v = [v for k,v in btw_central.items()]

btw_central = pd.DataFrame({'user': k, 'betweenness_centrality': v}).rename(columns = {0:'user',1:'betweenness_centrality'}).drop(0,axis = 0).reset_index()
btw_central['user'] = btw_central['user'].astype('i4')
btw_central['betweenness_centrality'] = btw_central['betweenness_centrality'].astype('float')
df = btw_central.merge(pf_news_user, on = 'user')
df = df.groupby('news')['betweenness_centrality'].mean().reset_index()
pf_news['betweenness_centrality'] = df['betweenness_centrality']
# pf_news.head()

In [40]:
#Politifact closeness centrality
close_central = G_pf_c

k = [k for k,v in close_central.items()]
v = [v for k,v in close_central.items()]

close_central = pd.DataFrame({'user': k, 'closenness_centrality': v}).rename(columns = {0:'user',1:'closenness_centrality'}).drop(0,axis = 0).reset_index()
close_central['user'] = close_central['user'].astype('i4')
close_central['closenness_centrality'] = close_central['closenness_centrality'].astype('float')
df = close_central.merge(pf_news_user, on = 'user')
df = df.groupby('news')['closenness_centrality'].mean().reset_index()
pf_news['closenness_centrality'] = df['closenness_centrality']
# pf_news

## Union of News Content Data Sets

In [41]:
cols = list(pf_news.columns)
news = pd.DataFrame(columns=cols)
news = pd.concat([news,bf_fake_news],axis=0,ignore_index=True)
news = news.sample(frac=1).reset_index(drop=True)
news.head()

Unnamed: 0,top_img,text,authors,keywords,meta_data,canonical_link,images,title,url,summary,...,source,Real,news_number,num_shared,shared_by_top,avg_follower,avg_followee,f_ratio,betweenness_centrality,closenness_centrality
0,http://conservativetribune.com/wp-content/uplo...,Olympic Committee Buckles to LGBT Groups... Ma...,[],[],"{'viewport': 'width=device-width, initial-scal...",,[http://stripe.rs-1198-a.com/stripe/beacon?cs_...,Californians Had Special Way to View the Eclip...,http://conservativebyte.com/2016/09/hillary-ba...,,...,http://conservativebyte.com,0,,,,,,,,
1,http://usherald.com/wp-content/uploads/2015/05...,Thanks in part to the declassification of Defe...,[Bob Amoroso],[],"{'generator': 'WordPress 4.8.1', 'og': {'site_...",http://usherald.com/breaking-declassified-docs...,[http://usherald.com/wp-content/uploads/2015/0...,Declassified Docs Show That Obama Admin Create...,http://usherald.com/breaking-declassified-docs...,,...,http://usherald.com,0,,,,,,,,
2,http://100percentfedup.com/wp-content/uploads/...,Dolly Kyle has written a scathing ‚Äútell all‚Äù b...,[Fed Up],[],"{'googlebot': 'noimageindex', 'generator': 'Po...",http://100percentfedup.com/hillary-on-disabled...,[https://www.facebook.com/tr?id=15790889156864...,HILLARY ON DISABLED CHILDREN During Easter Egg...,http://100percentfedup.com/hillary-on-disabled...,,...,http://100percentfedup.com,0,,,,,,,,
3,http://freedomdaily.com/wp-content/uploads/201...,6.6k SHARES Facebook Twitter\n\nGerman Chancel...,[],[],{'description': 'German chancellor Angela Merk...,http://freedomdaily.com/boom-merkel-admits-flo...,[http://1csabj4ddrd61fgqez2e4nss.wpengine.netd...,BOOM! Merkel Admits Flooding Germany With Musl...,http://freedomdaily.com/boom-merkel-admits-flo...,,...,http://freedomdaily.com,0,,,,,,,,
4,http://occupydemocrats.com/wp-content/uploads/...,11.3k SHARES SHARE THIS STORY\n\nDuring a Repu...,"[Grant Stern, Brett Bose, Natalie Dickinson]",[],{'generator': 'Powered by Visual Composer - dr...,http://occupydemocrats.com/2016/09/23/just-new...,[http://occupydemocrats.com/wp-content/uploads...,Newsweek Accuses Trump Of Committing A Felony,http://occupydemocrats.com/2016/09/23/just-new...,,...,http://occupydemocrats.com,0,,,,,,,,


Because the four news content datasets have the same fields and do not conatin any information exclusive to the individual datasets, in contrast to the network data, we can merge them into a single dataset for use in our data mining algorithms. This has the positive effect of increasing our sample size to 422 total news stories. The ratio of true stories to false stories is 50:50. Note that we added a logical variable 'Real' to indicate whether the story is real or fake and the order of the instances has been randomized to eliminate patterns.

## Feature Engineering: News Content

In [46]:
import emoji
# from emoji import UNICODE_EMOJI
from emoji import unicode_codes
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/eviofekeze/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/eviofekeze/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [60]:
list(emoji.unicode_codes.EMOJI_DATA)

['ü•á',
 'ü•à',
 'ü•â',
 'üÜé',
 'üèß',
 'üÖ∞Ô∏è',
 'üÖ∞',
 'üá¶üá´',
 'üá¶üá±',
 'üá©üáø',
 'üá¶üá∏',
 'üá¶üá©',
 'üá¶üá¥',
 'üá¶üáÆ',
 'üá¶üá∂',
 'üá¶üá¨',
 '‚ôí',
 'üá¶üá∑',
 '‚ôà',
 'üá¶üá≤',
 'üá¶üáº',
 'üá¶üá®',
 'üá¶üá∫',
 'üá¶üáπ',
 'üá¶üáø',
 'üîô',
 'üÖ±Ô∏è',
 'üÖ±',
 'üáßüá∏',
 'üáßüá≠',
 'üáßüá©',
 'üáßüáß',
 'üáßüáæ',
 'üáßüá™',
 'üáßüáø',
 'üáßüáØ',
 'üáßüá≤',
 'üáßüáπ',
 'üáßüá¥',
 'üáßüá¶',
 'üáßüáº',
 'üáßüáª',
 'üáßüá∑',
 'üáÆüá¥',
 'üáªüá¨',
 'üáßüá≥',
 'üáßüá¨',
 'üáßüá´',
 'üáßüáÆ',
 'üÜë',
 'üÜí',
 'üá∞üá≠',
 'üá®üá≤',
 'üá®üá¶',
 'üáÆüá®',
 '‚ôã',
 'üá®üáª',
 '‚ôë',
 'üáßüá∂',
 'üá∞üáæ',
 'üá®üá´',
 'üá™üá¶',
 'üáπüá©',
 'üá®üá±',
 'üá®üá≥',
 'üá®üáΩ',
 'üéÑ',
 'üá®üáµ',
 'üá®üá®',
 'üá®üá¥',
 'üá∞üá≤',
 'üá®üá¨',
 'üá®üá©',
 'üá®üá∞',
 'üá®üá∑',
 'üá≠üá∑',
 'üá®üá∫',
 'üá®üáº',
 'üá®üáæ',
 'üá®üáø',
 'üá®üáÆ',
 'üá©üá∞',


In [61]:
def num_all_caps(string):
    if len(re.findall(r"([A-Z]+\s?[A-Z]+[^a-z0-9\W])",string)) > 0:
      return 0
    return 1
    # return len(re.findall(r"([A-Z]+\s?[A-Z]+[^a-z0-9\W])",string))
def num_exmarks(string):
    if string.count("!") > 0:
      return 0
    return 1
    # return string.count("!") 

def num_all_caps_or_exmarks(string):
    if len(re.findall(r"([A-Z]+\s?[A-Z]+[^a-z0-9\W])",string)) > 0:
      return 0
    if string.count("!") > 0:
      return 0
    return 1

def title_ln(text):
  totalln = 0 
  for character in word_tokenize(text):
      totalln +=1
  if totalln > 11:
    return 0
  return 1
  # return totalln
def text_ln(text):
  totalln = 0 
  for character in word_tokenize(text):
      totalln +=1
  if totalln > 500:
    return 1
  return 0
  # return totalln
def text_has_emoji(text):
    for character in text:
        if character in list(emoji.unicode_codes.EMOJI_DATA):
            return 1
    return 0
def text_word_len(text):
  longln = 0
  totalln = 0 
  for character in word_tokenize(text):
    if character not in stop_words:
      if len(character) > 6:
        longln += 1
      totalln +=1
  if totalln != 0:
    if longln / totalln > 0.3:
      return 0
  return 1
  # if totalln != 0:
  #   return longln / totalln
  # return 0
def stop_word_title(text):
  stopln = 0
  totalln = 0 
  for character in word_tokenize(text):
    if character in stop_words:
        stopln += 1
    totalln +=1
  if totalln != 0:
    if stopln / totalln > 0.14:
      return 1
  return 0
  # if totalln != 0:
  #   return stopln / totalln
  # return 0
def stop_word_text(text):
  stopln = 0
  totalln = 0 
  for character in word_tokenize(text):
    if character in stop_words:
        stopln += 1
    totalln +=1
  if totalln != 0:
    if stopln / totalln > 0.37:
      return 1
  return 0
  # if totalln != 0:
  #   return stopln / totalln
  # return 0
news['title_allcaps']=news['title'].apply(num_all_caps)
news['title_num_exmarks'] = news['title'].apply(num_exmarks)
news['title_allcaps_or_exmarks']=news['title'].apply(num_all_caps_or_exmarks)
news['title_length']=news['title'].apply(title_ln)
news['text_length']=news['text'].apply(text_ln)
news['title_isascii']=news['title'].apply(text_has_emoji)
news['text_isascii']=news['text'].apply(text_has_emoji)
news['title_comp']=news['title'].apply(text_word_len)
news['text_comp']=news['text'].apply(text_word_len)
news['title_stopwords']=news['title'].apply(stop_word_title)
news['text_stopwords']=news['text'].apply(stop_word_text)
# Readability Scores
# news['flesch_score'] =news['text'].apply(ts.flesch_reading_ease).apply(lambda x: x**2)
news['flesch_score'] =news['text'].apply(ts.flesch_reading_ease)
news['dale_chall_score'] =news['text'].apply(ts.dale_chall_readability_score)

## Prepare Data Set for Data Mining

### Select Features

In [69]:
feature_list = ['title_isascii','title_allcaps','title_num_exmarks','title_length',
               'text_length','flesch_score','dale_chall_score', 'title_comp', 
                'text_comp', 'title_stopwords', 'text_stopwords', 'num_shared',
               'shared_by_top','avg_follower', 'avg_followee', 'f_ratio',
                'betweenness_centrality','closenness_centrality','Real',"title_allcaps_or_exmarks"]
news_ft = news[feature_list].copy()
news_ft['num_shared']=news_ft['num_shared'].astype('int')
news_ft['Real']=news_ft['Real'].astype('int')
news_ft.info()

ValueError: cannot convert float NaN to integer

## Modelling

In [64]:

metrics_dict={
'Logistic Regression':{'accuracy':0,'precision':0,'recall':0,'f1':0},
'XgBoost':{'accuracy':0,'precision':0,'recall':0,'f1':0},
'Naive Bayes':{'accuracy':0,'precision':0,'recall':0,'f1':0},
'Support Vector Machine':{'accuracy':0,'precision':0,'recall':0,'f1':0},
'Decision Tree':{'accuracy':0,'precision':0,'recall':0,'f1':0},
'Random Forest Classifier':{'accuracy':0,'precision':0,'recall':0,'f1':0},
'Voting Classifier':{'accuracy':0,'precision':0,'recall':0,'f1':0}}

metrics_df = pd.DataFrame.from_dict(metrics_dict,orient='index')

### Using Logistic Regression


In [68]:
# X_ft = news_ft.drop(columns=['Real'])
X_ft = news_ft[[          
          'text_length',
          # 'flesch_score',
          'dale_chall_score',
          # 'num_shared',
          # 'avg_follower',
          # 'avg_followee',
          # 'closenness_centrality',
          # 'betweenness_centrality',
          # 'title_allcaps',
          'title_allcaps_or_exmarks',

          # 'title_num_exmarks',
          'title_length',
          'shared_by_top',
          'title_comp', 
          'text_comp', 
          'title_stopwords', 
          'text_stopwords',
          'f_ratio',
          ]]
y = news_ft['Real']
count = CountVectorizer(stop_words='english')
X_txt = count.fit_transform(news['text'])
X_txt = X_txt.todense()

X = np.hstack((X_ft,X_txt))
X=X.astype('float')
y = y.astype('int')

try:
    from sklearn.utils._testing import ignore_warnings
except ImportError:
    from sklearn.utils.testing import ignore_warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
# warnings.filterwarnings("ignore", category=ConvergenceWarning)
# warnings.filterwarnings("ignore", category=FutureWarning)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

precision_list=[]
recall_list=[]
f1_list=[]
accuracy_list=[]
auc_scores = []
AVG_precision_scores = []

#X = X.to_numpy()
y = y.to_numpy()

for tr_ind, tst_ind in skf.split(X,y):
    X_train = X[tr_ind]
    X_test = X[tst_ind]
    y_train = y[tr_ind]
    y_test = y[tst_ind]
    #classification
    lg_clf = LogisticRegression(max_iter=200)
    lg_clf.fit(X_train, y_train)  
    # predict the labels on test dataset
    predictions = lg_clf.predict(X_test)

    #evaluation
    precision_list.append(metrics.precision_score(y_test, predictions))
    recall_list.append(metrics.recall_score(y_test, predictions))
    f1_list.append(metrics.f1_score(y_test, predictions))
    accuracy_list.append(metrics.accuracy_score(y_test,predictions))
    auc_scores.append(roc_auc_score(y_test, predictions))
    AVG_precision_scores.append(average_precision_score(y_test, predictions))
#     print(metrics.confusion_matrix(y_test,predictions))
metrics_df.loc['Logistic Regression','precision'] = round(np.mean(precision_list)*100,3)
metrics_df.loc['Logistic Regression','recall'] = round(np.mean(recall_list)*100,3)
metrics_df.loc['Logistic Regression','f1'] = round(np.mean(f1_list)*100,3)
metrics_df.loc['Logistic Regression','accuracy'] = round(np.mean(accuracy_list)*100,3)
print(" precision  = ", round(np.mean(precision_list)*100,3),"\n", 
      "recall     = ",round(np.mean(recall_list)*100,3),"\n",
      "f1         = ",round(np.mean(f1_list)*100,3),"\n",
      "accuracy   = ",round(np.mean(accuracy_list)*100,3),"\n", 
      "AUROC      = ", round(np.mean(auc_scores)*100,3),"\n",
      "Average Precision = ", round(np.mean(AVG_precision_scores)*100,3),"\n", )



ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

### XG Boost

In [None]:
#X= news_ft.drop(columns=['Real'])
X_ft = news_ft[[          
          'text_length',
          'flesch_score',
          'dale_chall_score',
          # 'num_shared',
          # 'avg_follower',
          # 'avg_followee',
          # 'closenness_centrality',
          # 'betweenness_centrality',
          # 'title_allcaps',
          'title_allcaps_or_exmarks',
          'title_num_exmarks',
          'title_length',
          'shared_by_top',
          'title_comp', 
          'text_comp', 
          'title_stopwords', 
          'text_stopwords',
          'f_ratio',
          ]]
y = news_ft['Real']
# y = news['Real']

count = CountVectorizer(stop_words='english')
X_txt = count.fit_transform(news['text'])
X_txt = X_txt.todense()

X = np.hstack((X_ft,X_txt))
X=X.astype('float')
y = y.astype('int')
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

precision_list=[]
recall_list=[]
f1_list=[]
accuracy_list=[]

#X = X.to_numpy()
y = y.to_numpy()


param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'error'
num_round = 10

for tr_ind, tst_ind in skf.split(X,y):
    X_train = X[tr_ind,:]
    X_test = X[tst_ind,:]
    y_train = y[tr_ind]
    y_test = y[tst_ind]
    # evallist = [(dtest, 'eval'), (dtrain, 'train')]
    #classification
    xg_clf = xg.XGBClassifier(max_depth=10, learning_rate=0.5, n_estimators=8,
                       objective='binary:logistic', booster='gbtree',min_child_weight=10)
    xg_clf.fit(X_train, y_train)  
    # predict the labels on test dataset
    predictions = xg_clf.predict(X_test)

    #evaluation
    precision_list.append(metrics.precision_score(y_test, predictions))
    recall_list.append(metrics.recall_score(y_test, predictions))
    f1_list.append(metrics.f1_score(y_test, predictions))
    accuracy_list.append(metrics.accuracy_score(y_test,predictions))
    auc_scores.append(roc_auc_score(y_test, predictions))
    AVG_precision_scores.append(average_precision_score(y_test, predictions))

metrics_df.loc['XgBoost','precision'] =  round(np.mean(precision_list)*100,3)
metrics_df.loc['XgBoost','recall'] = round(np.mean(recall_list)*100,3)
metrics_df.loc['XgBoost','f1'] = round(np.mean(f1_list)*100,3)
metrics_df.loc['XgBoost','accuracy'] = round(np.mean(accuracy_list)*100,3)

print(" precision  = ", round(np.mean(precision_list)*100,3),"\n", 
      "recall     = ",round(np.mean(recall_list)*100,3),"\n",
      "f1         = ",round(np.mean(f1_list)*100,3),"\n",
      "accuracy   = ",round(np.mean(accuracy_list)*100,3),"\n", 
      "AUROC      = ", round(np.mean(auc_scores)*100,3),"\n",
      "Average Precision = ", round(np.mean(AVG_precision_scores)*100,3),"\n", )

 precision  =  78.466 
 recall     =  71.539 
 f1         =  74.517 
 accuracy   =  75.583 
 AUROC      =  78.782 
 Average Precision =  73.693 



### Naive Bayes


In [None]:
wordCount = CountVectorizer(stop_words='english')
wordCount = wordCount.fit(news['text'])
text_vector = wordCount.transform(news['text'])

temp = text_vector.toarray()
text_vector_df = pd.DataFrame(temp, columns=[f'TV{i}' for i in range(15428)], index=news.index)
new_df = news_ft.join(text_vector_df, on = news_ft.index)

In [None]:
# X = text_vector_df.join(news_ft[['title_allcaps','dale_chall_score','title_num_exmarks','f_ratio','shared_by_top', 
#                         'title_allcaps_or_exmarks','title_comp', 'text_comp', 'title_stopwords', 'text_stopwords']], on=news_ft.index)
X = text_vector_df.join(news_ft[['title_allcaps','dale_chall_score','title_num_exmarks','f_ratio','shared_by_top']], on=news_ft.index)

y = new_df['Real'].astype('i4')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

precision_list=[]
recall_list=[]
f1_list=[]
accuracy_list=[]
auc_scores = []
AVG_precision_scores = []


for tr_ind, tst_ind in skf.split(X,y):
  X_train = X.iloc[tr_ind]
  X_test = X.iloc[tst_ind]
  y_train = y[tr_ind]
  y_test = y[tst_ind]

  mn_clf = MultinomialNB()
  mn_clf.fit(X_train, y_train) 
  predictions = mn_clf.predict(X_test)
  proba = mn_clf.predict_proba(X_test)[:,1]
      

    #evaluation
  precision = metrics.precision_score(y_test, predictions)
  recall = metrics.recall_score(y_test, predictions)
  f1 = metrics.f1_score(y_test, predictions)
  accuracy = metrics.accuracy_score(y_test,predictions)
      # print(metrics.confusion_matrix(y_test,predictions))
      
  precision_list.append(precision)
  recall_list.append(recall)
  f1_list.append(f1)
  accuracy_list.append(accuracy)
  auc_scores.append(roc_auc_score(y_test, proba))
  AVG_precision_scores.append(average_precision_score(y_test, proba))

metrics_df.loc['Naive Bayes','precision'] = round(np.mean(precision_list)*100,3)
metrics_df.loc['Naive Bayes','recall'] = round(np.mean(recall_list)*100,3)
metrics_df.loc['Naive Bayes','f1'] = round(np.mean(f1_list)*100,3)
metrics_df.loc['Naive Bayes','accuracy'] = round(np.mean(accuracy_list)*100,3)

print(" precision  = ", round(np.mean(precision_list)*100,3),"\n", 
      "recall     = ",round(np.mean(recall_list)*100,3),"\n",
      "f1         = ",round(np.mean(f1_list)*100,3),"\n",
      "accuracy   = ",round(np.mean(accuracy_list)*100,3),"\n", 
      "AUROC      = ", round(np.mean(auc_scores)*100,3),"\n",
      "Average Precision = ", round(np.mean(AVG_precision_scores)*100,3),"\n", )

 precision  =  82.537 
 recall     =  84.341 
 f1         =  83.377 
 accuracy   =  83.174 
 AUROC      =  87.629 
 Average Precision =  86.864 



### Support Vector Machine

In [None]:
wordCount = TfidfVectorizer(stop_words='english')
wordCount = wordCount.fit(news['text'])
text_vector = wordCount.transform(news['text'])

temp = text_vector.toarray()
text_vector_df = pd.DataFrame(temp, columns=[f'TV{i}' for i in range(15428)], index=news.index)
new_df = news_ft.join(text_vector_df, on = news_ft.index)

In [None]:
X = text_vector_df.join(news_ft[['title_allcaps','dale_chall_score','title_num_exmarks','f_ratio','shared_by_top',
          'title_comp', 'text_comp', 'title_stopwords', 'text_stopwords']], on=news_ft.index)
y = new_df['Real'].astype('i4')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

precision_list=[]
recall_list=[]
f1_list=[]
accuracy_list=[]
auc_scores = []
AVG_precision_scores = []


for tr_ind, tst_ind in skf.split(X,y):
  X_train = X.iloc[tr_ind]
  X_test = X.iloc[tst_ind]
  y_train = y[tr_ind]
  y_test = y[tst_ind]

  sv_clf = svm.SVC(kernel= 'rbf', class_weight='balanced',probability=True, decision_function_shape='ovo',C=85)
  sv_clf.fit(X_train, y_train) 
  predictions = sv_clf.predict(X_test)
  proba = sv_clf.predict_proba(X_test)[:,1]
      

    #evaluation
  precision = metrics.precision_score(y_test, predictions)
  recall = metrics.recall_score(y_test, predictions)
  f1 = metrics.f1_score(y_test, predictions)
  accuracy = metrics.accuracy_score(y_test,predictions)
      # print(metrics.confusion_matrix(y_test,predictions))
      
  precision_list.append(precision)
  recall_list.append(recall)
  f1_list.append(f1)
  accuracy_list.append(accuracy)
  auc_scores.append(roc_auc_score(y_test, proba))
  AVG_precision_scores.append(average_precision_score(y_test, proba))

metrics_df.loc['Support Vector Machine','precision'] = round(np.mean(precision_list)*100,3)
metrics_df.loc['Support Vector Machine','recall'] = round(np.mean(recall_list)*100,3)
metrics_df.loc['Support Vector Machine','f1'] = round(np.mean(f1_list)*100,3)
metrics_df.loc['Support Vector Machine','accuracy'] = round(np.mean(accuracy_list)*100,3)

print(" precision  = ", round(np.mean(precision_list)*100,3),"\n", 
      "recall     = ",round(np.mean(recall_list)*100,3),"\n",
      "f1         = ",round(np.mean(f1_list)*100,3),"\n",
      "accuracy   = ",round(np.mean(accuracy_list)*100,3),"\n", 
      "AUROC      = ", round(np.mean(auc_scores)*100,3),"\n",
      "Average Precision = ", round(np.mean(AVG_precision_scores)*100,3),"\n", )

 precision  =  74.366 
 recall     =  78.195 
 f1         =  76.157 
 accuracy   =  75.591 
 AUROC      =  85.562 
 Average Precision =  86.209 



### Decision Tree

In [None]:
wordCount = CountVectorizer(stop_words='english')
wordCount = wordCount.fit(news['text'])
text_vector = wordCount.transform(news['text'])
tnew = news.copy()

temp = text_vector.toarray()
text_vector_df = pd.DataFrame(temp, columns=[f'TV{i}' for i in range(15428)], index=news.index)
new_df = news_ft.join(text_vector_df, on = news_ft.index)
X = text_vector_df.join(news_ft[[          
          'text_length',
          # 'flesch_score',
          'dale_chall_score',
          # 'num_shared',
          # 'avg_follower',
          # 'avg_followee',
          'closenness_centrality',
          # 'betweenness_centrality',
          # 'title_allcaps',
          'title_allcaps_or_exmarks',
          # 'title_num_exmarks',
          # 'title_length',
          'shared_by_top',
          'title_comp', 
          'text_comp', 
          # 'title_stopwords', 
          # 'text_stopwords',
          # 'f_ratio',
          ]], on=news_ft.index)
y = tnew['Real'].astype('i4')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

precision_list=[]
recall_list=[]
f1_list=[]
accuracy_list=[]
auc_scores = []
AVG_precision_scores = []


for tr_ind, tst_ind in skf.split(X,y):
  X_train = X.iloc[tr_ind]
  X_test = X.iloc[tst_ind]
  y_train = y[tr_ind]
  y_test = y[tst_ind]

  tr_clf = DecisionTreeClassifier( )
  tr_clf = tr_clf.fit(X_train, y_train)
  predictions = tr_clf.predict(X_test)
  proba = tr_clf.predict_proba(X_test)[:,1]
      

    #evaluation
  precision = metrics.precision_score(y_test, predictions)
  recall = metrics.recall_score(y_test, predictions)
  f1 = metrics.f1_score(y_test, predictions)
  accuracy = metrics.accuracy_score(y_test,predictions)
      # print(metrics.confusion_matrix(y_test,predictions))
      
  precision_list.append(precision)
  recall_list.append(recall)
  f1_list.append(f1)
  accuracy_list.append(accuracy)
  auc_scores.append(roc_auc_score(y_test, proba))
  AVG_precision_scores.append(average_precision_score(y_test, proba))

metrics_df.loc['Decision Tree','precision'] = round(np.mean(precision_list)*100,3)
metrics_df.loc['Decision Tree','recall'] = round(np.mean(recall_list)*100,3)
metrics_df.loc['Decision Tree','f1'] = round(np.mean(f1_list)*100,3)
metrics_df.loc['Decision Tree','accuracy'] = round(np.mean(accuracy_list)*100,3)

print(" precision  = ", round(np.mean(precision_list)*100,3),"\n", 
      "recall     = ",round(np.mean(recall_list)*100,3),"\n",
      "f1         = ",round(np.mean(f1_list)*100,3),"\n",
      "accuracy   = ",round(np.mean(accuracy_list)*100,3),"\n", 
      "AUROC      = ", round(np.mean(auc_scores)*100,3),"\n",
      "Average Precision = ", round(np.mean(AVG_precision_scores)*100,3),"\n", )

 precision  =  84.171 
 recall     =  79.081 
 f1         =  80.93 
 accuracy   =  81.504 
 AUROC      =  81.512 
 Average Precision =  76.854 



### Random Forest Classifier

In [None]:
X = text_vector_df.join(news_ft[[          
          # 'title_allcaps',
          'title_allcaps_or_exmarks',
          # 'title_num_exmarks',

          'title_length',
          # 'text_length',

          # 'flesch_score',

          'dale_chall_score',
          # 'num_shared',

          'shared_by_top',
          'avg_follower',
          'title_comp', 

          'text_comp', 
          'title_stopwords', 
          # 'text_stopwords',
          # 'avg_followee',
          'f_ratio',
          # 'closenness_centrality',
          # 'betweenness_centrality',
          ]], on=news_ft.index)
y = tnew['Real'].astype('i4')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

precision_list=[]
recall_list=[]
f1_list=[]
accuracy_list=[]
auc_scores = []
AVG_precision_scores = []


for tr_ind, tst_ind in skf.split(X,y):
  X_train = X.iloc[tr_ind]
  X_test = X.iloc[tst_ind]
  y_train = y[tr_ind]
  y_test = y[tst_ind]

  rnd_clf = RandomForestClassifier(n_estimators= 1000)
  rnd_clf = rnd_clf.fit(X_train, y_train)
  predictions = rnd_clf.predict(X_test)
  proba = rnd_clf.predict_proba(X_test)[:,1]
      

    #evaluation
  precision = metrics.precision_score(y_test, predictions)
  recall = metrics.recall_score(y_test, predictions)
  f1 = metrics.f1_score(y_test, predictions)
  accuracy = metrics.accuracy_score(y_test,predictions)
      # print(metrics.confusion_matrix(y_test,predictions))
      
  precision_list.append(precision)
  recall_list.append(recall)
  f1_list.append(f1)
  accuracy_list.append(accuracy)
  auc_scores.append(roc_auc_score(y_test, proba))
  AVG_precision_scores.append(average_precision_score(y_test, proba))

metrics_df.loc['Random Forest Classifier','precision'] = round(np.mean(precision_list)*100,3)
metrics_df.loc['Random Forest Classifier','recall'] = round(np.mean(recall_list)*100,3)
metrics_df.loc['Random Forest Classifier','f1'] = round(np.mean(f1_list)*100,3)
metrics_df.loc['Random Forest Classifier','accuracy'] = round(np.mean(accuracy_list)*100,3)

print(" precision  = ", round(np.mean(precision_list)*100,3),"\n", 
      "recall     = ",round(np.mean(recall_list)*100,3),"\n",
      "f1         = ",round(np.mean(f1_list)*100,3),"\n",
      "accuracy   = ",round(np.mean(accuracy_list)*100,3),"\n", 
      "AUROC      = ", round(np.mean(auc_scores)*100,3),"\n",
      "Average Precision = ", round(np.mean(AVG_precision_scores)*100,3),"\n", )


 precision  =  83.763 
 recall     =  76.268 
 f1         =  79.649 
 accuracy   =  80.56 
 AUROC      =  90.028 
 Average Precision =  91.301 



### Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier
X = text_vector_df.join(news_ft[['title_allcaps','title_isascii','dale_chall_score','title_num_exmarks','f_ratio','shared_by_top',
                      'title_comp', 'text_comp', 'title_stopwords', 'text_stopwords']], on=news_ft.index)
y = new_df['Real'].astype('i4')

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

precision_list=[]
recall_list=[]
f1_list=[]
accuracy_list=[]
auc_scores = []
AVG_precision_scores = []


for tr_ind, tst_ind in skf.split(X,y):
  X_train = X.iloc[tr_ind]
  X_test = X.iloc[tst_ind]
  y_train = y[tr_ind]
  y_test = y[tst_ind]

  # clf = svm.SVC(kernel= 'rbf', class_weight='balanced',probability=True, decision_function_shape='ovo',C=85)
  # clf.fit(X_train, y_train) 

  log_clf_ = LogisticRegression(max_iter=200)
  NV_clf_ = MultinomialNB()
  svm_clf_ = svm.SVC(kernel= 'rbf', class_weight='balanced',probability=True, decision_function_shape='ovo',C=85)
  tr_clf_ = RandomForestClassifier(n_estimators=1000)
  xg_clf_ = xg.XGBClassifier(max_depth=10, learning_rate=0.5, n_estimators=8,
                       objective='binary:logistic', booster='gbtree',min_child_weight=10)


  vtg_clf = VotingClassifier(
      estimators=[('lr', log_clf_), ('NV', NV_clf_),('forest', tr_clf_)],voting='soft')

  vtg_clf.fit(X_train, y_train)


  predictions = vtg_clf.predict(X_test)
  proba = vtg_clf.predict_proba(X_test)[:,1]
      

    #evaluation
  precision = metrics.precision_score(y_test, predictions)
  recall = metrics.recall_score(y_test, predictions)
  f1 = metrics.f1_score(y_test, predictions)
  accuracy = metrics.accuracy_score(y_test,predictions)
      # print(metrics.confusion_matrix(y_test,predictions))
      
  precision_list.append(precision)
  recall_list.append(recall)
  f1_list.append(f1)
  accuracy_list.append(accuracy)
  auc_scores.append(roc_auc_score(y_test, proba))
  AVG_precision_scores.append(average_precision_score(y_test, proba))


metrics_df.loc['Voting Classifier','precision'] = round(np.mean(precision_list)*100,3)
metrics_df.loc['Voting Classifier','recall'] = round(np.mean(recall_list)*100,3)
metrics_df.loc['Voting Classifier','f1'] = round(np.mean(f1_list)*100,3)
metrics_df.loc['Voting Classifier','accuracy'] = round(np.mean(accuracy_list)*100,3)

print(" precision  = ", round(np.mean(precision_list)*100,3),"\n", 
      "recall     = ",round(np.mean(recall_list)*100,3),"\n",
      "f1         = ",round(np.mean(f1_list)*100,3),"\n",
      "accuracy   = ",round(np.mean(accuracy_list)*100,3),"\n", 
      "AUROC      = ", round(np.mean(auc_scores)*100,3),"\n",
      "Average Precision = ", round(np.mean(AVG_precision_scores)*100,3),"\n", )

 precision  =  85.164 
 recall     =  83.389 
 f1         =  84.216 
 accuracy   =  84.359 
 AUROC      =  91.907 
 Average Precision =  92.154 



## Summary of Results

In [None]:
metrics_df

Unnamed: 0,accuracy,precision,recall,f1
Logistic Regression,81.989,84.326,78.627,81.14
XgBoost,75.583,78.466,71.539,74.517
Naive Bayes,83.174,82.537,84.341,83.377
Support Vector Machine,75.591,74.366,78.195,76.157
Decision Tree,81.504,84.171,79.081,80.93
Random Forest Classifier,80.56,83.763,76.268,79.649
Voting Classifier,84.359,85.164,83.389,84.216
