In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 150)

In [2]:
fakenews = set(['fake', 'conspiracy'])
relevant = ['reliable', 'fake']

def label_news(field):
    '''function for labeling news articles as either fake, reliable or unknown'''
    if field in fakenews:
        return 'fake'
    elif field == 'reliable':
        return 'reliable'
    else:
        return 'unknown'
    
def bin_target(x):
    '''returns 0 for reliable content and 1 for fake'''
    if x=="reliable":
        return 0
    if x=="fake":
        return 1

In [3]:
#Reading in preprocessed data
big_dataset_preprocessed = pd.read_pickle('data/995,000_rows_preprocessed.pkl')
labeled_content = pd.DataFrame(big_dataset_preprocessed[['content', 'type']])

print(labeled_content['type'].value_counts())

type
reliable                      218564
political                     194518
bias                          133232
fake                          104883
conspiracy                     97314
rumor                          56445
unknown                        43534
unreliable                     35332
clickbait                      27412
junksci                        14040
satire                         13160
hate                            8779
2018-02-10 13:43:39.521661         1
Name: count, dtype: int64


In [4]:
#Preparing the content without the extra scraped data

labeled_content['type'] = labeled_content['type'].apply(lambda x: label_news(x)).astype('category')

print(labeled_content['type'].value_counts())
labeled_content = labeled_content[labeled_content["type"].isin(relevant)]
print(labeled_content['type'].value_counts())

print(len(labeled_content))



type
unknown     574239
reliable    218564
fake        202197
Name: count, dtype: int64
type
reliable    218564
fake        202197
unknown          0
Name: count, dtype: int64
420761


In [5]:
X = labeled_content['content']
y = labeled_content['type'].apply(lambda x: bin_target(x))

len(X)

420761

In [6]:
#Split data into 80% training and 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Split the newly created test data equally into validation and test data (10% each of the total dataset)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [7]:
#baseline: logistic regression where x is length of article
X_len_train = pd.DataFrame(X_train.apply(lambda x: len(x))) #get feature with length of article
X_len_val = pd.DataFrame(X_val.apply(lambda x: len(x))) #get X for validation

model = LogisticRegression()
reg = model.fit(X_len_train, y_train)

y_pred = model.predict(X_len_val)
acc = accuracy_score(y_pred,y_val)

print(acc)

0.6007225021389866


In [8]:
#Reading in extra scraped preprocessed data
scraped_data_preprocessed = pd.read_pickle('data/scraped_data_preprocessed.pkl')
scraped_data_content = pd.DataFrame(scraped_data_preprocessed)

In [9]:
#adding column with default value 'reliable'
scraped_data_content['type'] = 'reliable'
#Renaming 'text' column to 'content'
scraped_data_content.rename(columns={'text' : 'content'}, inplace=True)

In [10]:
#Concatenating the labeled content and the extra scraped and labeled content
labeled_content_extended = pd.concat([labeled_content, scraped_data_content])

len(labeled_content_extended)

425210

In [11]:
#Labeling news as either fake or reliable
labeled_content_extended['type'] = labeled_content_extended['type'].apply(lambda x: label_news(x)).astype('category')

print(labeled_content_extended['type'].value_counts())

labeled_content_extended.head()

type
reliable    223013
fake        202197
Name: count, dtype: int64


Unnamed: 0,content,type
1,"[cost, senat, bank, committe, jp, morgan, buy, cur000000, bribe, news, hedg, hour, time, jami, dimon, sit, senat, bank, committe, prove, smarter, ...",fake
3,"[julia, geist, ask, draw, pictur, comput, scientist, year, numberyearold, sketch, businessman, wear, glass, tie, classmat, draw, similar, depict, ...",reliable
4,"[number, compil, studi, vaccin, danger, activist, post, sep, number, number, shortag, research, negat, effect, wide, varieti, vaccin, gardasil, se...",fake
5,"[spend, major, wake, hour, stare, content, comput, smartphon, will, ignor, ocular, havoc, blue, light, electron, devic, eyestrain, dryness, irrita...",reliable
11,"[republican, privat, medicar, make, day, sen, chuck, schumer, getti, schumer, accus, gop, war, senior, share, facebook, share, twitter, futur, sen...",reliable


In [12]:
labeled_content_extended.to_pickle('labeled_content_extended.pkl')

In [13]:
#Prepraring the content with extra scraped data
X = labeled_content_extended['content']
y = labeled_content_extended['type'].apply(lambda x: bin_target(x))

In [14]:
#Split data into 80% training and 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Split the newly created test data equally into validation and test data (10% each of the total dataset)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [15]:
#baseline: logistic regression where x is length of article
X_len_train = pd.DataFrame(X_train.apply(lambda x: len(x))) #get feature with length of article
X_len_val = pd.DataFrame(X_val.apply(lambda x: len(x))) #get X for validation

model = LogisticRegression()
reg = model.fit(X_len_train, y_train)

y_pred = model.predict(X_len_val)
acc = accuracy_score(y_pred,y_val)

print(acc)

0.6107100021166012
