In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import functions as funs

#Notebook display settings: no cap on the number of rows shown,
#and up to 150 characters per cell before truncation.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = 150

In [2]:
#Raw labels passed to funs.label_news as the "fake news" group
fakenews = {'fake', 'conspiracy'}
#Labels kept after relabeling; rows with any other label are filtered out
relevant = ['reliable', 'fake']

In [3]:
#Load the labeled articles from the local pickle and make sure we hold a DataFrame.
#NOTE(review): unpickling executes arbitrary code - only load pickle files from a trusted source.
labeled_content = pd.DataFrame(pd.read_pickle('data/labeled_content.pkl'))

In [5]:
#Preparing the content without the extra scraped data.
#Step 1: relabel every article through funs.label_news, given the `fakenews` label set
#(presumably collapses those labels into 'fake' - confirm in functions.py),
#and store the result as a categorical column.
relabeled_type = labeled_content['type'].apply(lambda raw_label: funs.label_news(raw_label, fakenews))
labeled_content['type'] = relabeled_type.astype('category')

#Step 2: keep only the rows whose label is listed in `relevant`
has_relevant_label = labeled_content["type"].isin(relevant)
labeled_content = labeled_content[has_relevant_label]

In [6]:
#Feature: the article text. Target: the label encoded by funs.bin_target
#(presumably a binary 0/1 encoding - confirm in functions.py).
X = labeled_content['content']
y = labeled_content['type'].apply(funs.bin_target)

In [7]:
#80% training, 10% validation, 10% test:
#first hold out 20% of the data, then cut that hold-out in half.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=0, shuffle=True
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=0, shuffle=True
)

In [11]:
#Baseline: logistic regression whose only feature is the length of the article text.
#Each feature matrix is a one-column DataFrame holding len() of every article.
X_len_train = X_train.apply(len).to_frame() #training feature: article length
X_len_val = X_val.apply(len).to_frame() #same feature for the validation set

#No need for penalty since there is no risk of overfitting to one feature when there only is one
model = LogisticRegression(penalty=None)
reg = model.fit(X_len_train, y_train) #fit() returns the fitted estimator itself

y_pred = model.predict(X_len_val)
#accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value is unchanged,
#but the documented order keeps this correct if the metric is ever swapped for an asymmetric one.
acc = accuracy_score(y_val, y_pred)

print(acc)

0.5976328548341097


In [12]:
#Load the extra scraped, already preprocessed articles from the local pickle
#and wrap the loaded object in a DataFrame.
#NOTE(review): unpickling executes arbitrary code - only load pickle files from a trusted source.
scraped_data_preprocessed = pd.read_pickle('data/scraped_data_preprocessed.pkl')
scraped_data_content = pd.DataFrame(data=scraped_data_preprocessed)

In [13]:
#Tag every scraped article as 'reliable' and rename 'text' to 'content',
#the column name the feature is read from elsewhere in this notebook.
#Built as a new frame instead of mutating with inplace=True, so the object loaded from the pickle
#(which may share data with this frame) is not modified as a side effect.
scraped_data_content = (
    scraped_data_content
    .assign(type='reliable')
    .rename(columns={'text': 'content'})
)

In [15]:
#Concatenating the labeled content and the extra scraped and labeled content.
#ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the row labels of both
#inputs are kept and may collide, which breaks label-based lookups and alignment on the result.
labeled_content_extended = pd.concat([labeled_content, scraped_data_content], ignore_index=True)

In [16]:
#Re-labeling articles as either "fake" or "reliable":
#run every label of the combined frame through funs.label_news with the `fakenews` set
#and store the result as a categorical column.
relabeled_extended_type = labeled_content_extended['type'].apply(
    lambda raw_label: funs.label_news(raw_label, fakenews)
)
labeled_content_extended['type'] = relabeled_extended_type.astype('category')

In [17]:
#Preparing the content with extra scraped data.
#Feature: the article text. Target: the label encoded by funs.bin_target
#(presumably a binary 0/1 encoding - confirm in functions.py).
X = labeled_content_extended['content']
y = labeled_content_extended['type'].apply(funs.bin_target)

In [18]:
#80% training, 10% validation, 10% test:
#first hold out 20% of the data, then cut that hold-out in half.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=0, shuffle=True
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=0, shuffle=True
)

In [19]:
#Baseline on the extended data: logistic regression whose only feature is the length of the article text.
#Each feature matrix is a one-column DataFrame holding len() of every article.
X_len_train = X_train.apply(len).to_frame() #training feature: article length
X_len_val = X_val.apply(len).to_frame() #same feature for the validation set

#Again no need for penalty since there is no risk of overfitting to one feature when there only is one
model = LogisticRegression(penalty=None)
reg = model.fit(X_len_train, y_train) #fit() returns the fitted estimator itself

y_pred = model.predict(X_len_val)
#accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value is unchanged,
#but the documented order keeps this correct if the metric is ever swapped for an asymmetric one.
acc = accuracy_score(y_val, y_pred)

print(acc)

0.608734507655041
