In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Notebook display settings, written with the attribute form of the pandas options API.
# max_rows = None lifts the row cap, so a displayed frame is rendered in full.
pd.options.display.max_rows = None
# Column text is shown up to 150 characters wide before it is truncated.
pd.options.display.max_colwidth = 150

In [42]:
# Source labels that are collapsed into the single 'fake' class.
fakenews = {'fake', 'conspiracy'}
# Class labels kept for modelling.
relevant = ['reliable', 'fake']

def label_news(field):
    '''Collapse a raw source label into 'fake', 'reliable' or 'unknown'.

    Any label contained in `fakenews` becomes 'fake', the exact label
    'reliable' is kept as is, and every other value becomes 'unknown'.
    '''
    if field in fakenews:
        return 'fake'
    return 'reliable' if field == 'reliable' else 'unknown'
    
def bin_target(x):
    '''Encode a class label as the binary model target.

    'reliable' maps to 0 and 'fake' maps to 1; any other value yields None.
    '''
    if x == "reliable":
        return 0
    return 1 if x == "fake" else None

In [43]:
#Reading in preprocessed data
# NOTE(review): read_pickle can execute arbitrary code while loading - only use trusted files.
big_dataset_preprocessed = pd.read_pickle('995,000_rows_preprocessed.pkl')
# Keep only the article content and its source label. The explicit .copy() makes
# labeled_content an independent frame, so the column assignments done on it later
# never touch big_dataset_preprocessed.
labeled_content = big_dataset_preprocessed[['content', 'type']].copy()

len(labeled_content)

995000

In [52]:
#Preparing the content without the extra scraped data

# Collapse the raw source labels to fake / reliable / unknown.
labeled_content['type'] = labeled_content['type'].apply(label_news).astype('category')
# Keep only the rows whose label is one of the modelled classes.
is_relevant = labeled_content["type"].isin(relevant)
labeled_content = labeled_content[is_relevant]

# Features are the article content; the target is the label encoded by bin_target.
X = labeled_content['content']
y = labeled_content['type'].apply(bin_target)

len(X)

420761

In [53]:
#Split data into 80% training and a 20% hold-out set
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#Split the hold-out set equally into validation and test data (10% each of the total dataset)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [54]:
#baseline: logistic regression whose only feature is len() of each article's content
# NOTE(review): len() counts characters if content is a string and items if it is a
# token list - confirm which of the two the preprocessing produced.
X_len_train = X_train.apply(len).to_frame() #one-column feature frame for training
X_len_val = X_val.apply(len).to_frame() #same feature for the validation set

model = LogisticRegression()
reg = model.fit(X_len_train, y_train) #fit() returns the estimator itself, so reg is model

y_pred = model.predict(X_len_val)
#accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value is unchanged
acc = accuracy_score(y_val, y_pred)

print(acc)

0.6007225021389866


In [55]:
# Load the additionally scraped, already preprocessed articles and wrap them in a DataFrame.
# NOTE(review): read_pickle can execute arbitrary code while loading - only use trusted files.
scraped_data_preprocessed = pd.read_pickle('scraped_data_preprocessed.pkl')
scraped_data_content = pd.DataFrame(data=scraped_data_preprocessed)

In [56]:
# Align the scraped articles with the labelled corpus: rename 'text' to 'content'
# and give every row the default label 'reliable'.
# Built as a new frame instead of mutating in place (no inplace=True), so
# scraped_data_preprocessed is not modified through shared data and re-running
# the cell always starts from the same input.
scraped_data_content = (
    scraped_data_content
    .rename(columns={'text': 'content'})
    .assign(type='reliable')
)

In [57]:
#Concatenating the labeled content and the extra scraped and labeled content.
#ignore_index=True gives the result a fresh 0..n-1 index: labeled_content still carries
#the row labels of the corpus it was filtered from, and the scraped frame's own labels
#may repeat them, which would make label-based lookups ambiguous.
labeled_content_extended = pd.concat(
    [labeled_content, scraped_data_content],
    ignore_index=True,
)

len(labeled_content_extended)

425210

In [None]:
#Labeling news as either fake or reliable
# Run every label through label_news, then store the column as a categorical.
relabeled = labeled_content_extended['type'].apply(label_news)
labeled_content_extended['type'] = relabeled.astype('category')

# Class balance after adding the scraped articles.
labeled_content_extended['type'].value_counts()

type
reliable    223013
fake        202197
Name: count, dtype: int64

In [58]:
#Preparing the content with extra scraped data
# Features are the article content; the target is the label encoded by bin_target.
X = labeled_content_extended['content']
y = labeled_content_extended['type'].apply(bin_target)

In [59]:
#Split data into 80% training and a 20% hold-out set
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#Split the hold-out set equally into validation and test data (10% each of the total dataset)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [60]:
#baseline: logistic regression whose only feature is len() of each article's content
# NOTE(review): len() counts characters if content is a string and items if it is a
# token list - confirm which of the two the preprocessing produced.
X_len_train = X_train.apply(len).to_frame() #one-column feature frame for training
X_len_val = X_val.apply(len).to_frame() #same feature for the validation set

model = LogisticRegression()
reg = model.fit(X_len_train, y_train) #fit() returns the estimator itself, so reg is model

y_pred = model.predict(X_len_val)
#accuracy_score takes (y_true, y_pred); accuracy is symmetric, so the value is unchanged
acc = accuracy_score(y_val, y_pred)

print(acc)

0.6107100021166012


In [None]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# import numpy as np

# def fun(doc):
#     return doc

# vectorizer = TfidfVectorizer(
#     tokenizer = fun,
#     preprocessor= fun,
#     token_pattern=None
# )

In [None]:
# X = vectorizer.fit_transform(X)

# print(X.shape)

(420761, 955247)


In [None]:
# np.save("X_tfidf.npy", X, allow_pickle=True, fix_imports=True)