In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Display settings: never truncate the number of printed rows, and allow up to
# 150 characters per cell before text columns are cut off.
pd.options.display.max_rows = None
pd.options.display.max_colwidth = 150

In [16]:
# Raw type values that label_news collapses into the single 'fake' class.
fakenews = {'fake', 'conspiracy'}
# Labels of interest; not referenced by the functions visible in this notebook section.
relevant = ['reliable', 'fake']

def label_news(field):
    '''Collapse a raw type value into 'fake', 'reliable' or 'unknown'.

    Any value contained in ``fakenews`` maps to 'fake', the exact value
    'reliable' maps to 'reliable', and everything else maps to 'unknown'.
    '''
    if field in fakenews:
        return 'fake'
    return 'reliable' if field == 'reliable' else 'unknown'
    
def bin_target(x):
    '''Encode a label as a binary target: 0 for "reliable", 1 for "fake".

    Any other value matches neither branch and yields None.
    '''
    if x == "reliable":
        return 0
    return 1 if x == "fake" else None
    
def word_count_reg(field,word):
    '''Count how many items of ``field`` compare equal to ``word``.

    ``field`` is iterated item by item, so a list of tokens is counted per
    token, whereas a plain string is compared one character at a time.
    '''
    return sum(1 for token in field if token == word)
    
def gridSearch(solver,penalties,data,target):
    '''Grid search to see whether a better logistic-regression model exists for the same X.

    Every combination of the given solvers and penalties with a fixed grid of
    ``C`` values is evaluated by 5-fold cross-validation, scored on accuracy.

    Parameters
    ----------
    solver : candidate values for LogisticRegression's ``solver``.
    penalties : candidate values for LogisticRegression's ``penalty``.
    data : features passed to ``GridSearchCV.fit``.
    target : labels passed to ``GridSearchCV.fit``.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    '''
    param_grid = {
        'solver': solver,
        'penalty': penalties,
        'C': [0.00001, 0.001, 0.01, 0.1, 1.0, 10],
    }
    search = GridSearchCV(LogisticRegression(), param_grid, scoring='accuracy', cv=5)
    search.fit(data, target)
    return search

In [17]:
# Load the pickled dataset; the path is relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
labeled_content = pd.read_pickle('data/labeled_content.pkl')

In [18]:
# Features come from the 'content' column, the prediction target from the 'type' column.
X = labeled_content['content']
y = labeled_content['type']

In [19]:
# 80/10/10 split: first hold out 20% of the rows, then cut that holdout in half
# into validation and test sets. random_state=0 keeps both splits reproducible.
X_train, X_val, y_train, y_val= train_test_split(X,y,test_size=0.2,random_state=0,shuffle=True)
# X_val / y_val are rebound here: they enter as the 20% holdout and leave as the 10% validation set.
X_val, X_test, y_val, y_test = train_test_split(X_val,y_val, test_size=0.5,random_state=0,shuffle=True)

In [20]:
def reg_from_one_word(word):
    '''Baseline 2: logistic regression whose only feature is the count of ``word`` per document.

    The splits are not passed in: the function reads the module-level
    ``X_train``, ``X_val``, ``y_train`` and ``y_val`` at call time, so the
    result reflects whichever dataset was split most recently.

    Parameters
    ----------
    word : value counted in each document via ``word_count_reg``.

    Returns
    -------
    Accuracy of the fitted model on the validation split.
    '''
    # Wrap the per-document counts in one-column frames so the estimator
    # receives a 2-D feature matrix.
    X_word_train = pd.DataFrame(X_train.apply(lambda doc: word_count_reg(doc, word)))
    X_word_val = pd.DataFrame(X_val.apply(lambda doc: word_count_reg(doc, word)))
    model = LogisticRegression()
    model.fit(X_word_train, y_train)
    y_pred = model.predict(X_word_val)
    # accuracy_score's signature is (y_true, y_pred); the score is the same
    # either way, but the conventional order avoids surprises if the metric is swapped.
    return accuracy_score(y_val, y_pred)

# Validation accuracy when the only feature is the per-document count of "fact".
print(reg_from_one_word("fact"))

0.6211617073866337


In [22]:
# Load the extended pickled dataset; the path is relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
labeled_content_extended = pd.read_pickle('data/labeled_content_extended.pkl')

In [23]:
# Rebind X / y to the extended dataset: 'content' as features, 'type' as target.
X = labeled_content_extended['content']
y = labeled_content_extended['type']

In [24]:
# 80/10/10 split of the extended data (20% holdout, then halved into validation and test).
# This overwrites the module-level X_train / X_val / y_train / y_val that reg_from_one_word reads.
X_train, X_val, y_train, y_val= train_test_split(X,y,test_size=0.2,random_state=0,shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(X_val,y_val, test_size=0.5,random_state=0,shuffle=True)

In [25]:
# Re-run the "fact"-count baseline; reg_from_one_word reads the module-level splits,
# so this call is evaluated on the extended-dataset split created just before it.
print(reg_from_one_word("fact"))

0.6221866842266174
