## Takes full_df (containing all features of the scraped data), creates classifiers and uses them to find the top logits.

In [1]:
import pickle
from pathlib import Path

# First, load the dataframe
with open(Path.cwd() / 'sp_df_tokenized.pkl', 'rb') as f:
    df = pickle.load(f)

df.head()

Unnamed: 0,index,site,text,tokenized
0,0,dailyexpose.uk,[…] Does the Covid-19 Virus contain Genet...,"[…, doe, covid, viru, contain, genet, sequenc,..."
1,1,rumble.com,Note\n \t\t\tthat this Policy may be modified ...,"[note, thi, polici, may, modifi, time, time, s..."
2,2,harpers.org,"It wasn’t just about the PowerPoint, though; i...","[’, powerpoint, though, retrospect, powerpoint..."
3,3,kanekoa.substack.com,"The pathologist cited “rare, severe side effec...","[pathologist, cite, “, rare, sever, side, effe..."
4,6,pattyporter.net,Support local candidates running for office wi...,"[support, local, candid, run, offic, boot, gro..."


In [2]:
with open('content_logit_tokens.pkl', 'rb') as f:
    content_logit_tokens = pickle.load(f)
with open('meta_logit_tokens.pkl', 'rb') as f:
    meta_logit_tokens = pickle.load(f)

In [3]:
print(len(content_logit_tokens))
print(len(meta_logit_tokens))

87650
14663


In [None]:
import pickle
from pathlib import Path

def vocab_list_from_logit_tokens(logit_tokens, num_tokens=100):
    '''
    Takes logit_tokens output by logit_explained_variance() and creates vocab_list for vocabify_dataframe
    ignores non-alphanumeric tokens
    '''
    vocab_list = []
    i = 0
    while len(vocab_list) < num_tokens:
        i += 1
        non_ascii = False
        for x in logit_tokens[i]:
            for char in x:
                if ord(char) < 65 or ord(char) > 122:
                    non_ascii = True
        if not non_ascii:
            vocab_list.append(logit_tokens[i])
    return vocab_list

content_vocab_list_500 = vocab_list_from_logit_tokens(content_logit_tokens, num_tokens=500)
meta_vocab_list_500 = vocab_list_from_logit_tokens(meta_logit_tokens, num_tokens=500)

print(content_vocab_list_500)
print(meta_vocab_list_500)

In [6]:
from analysis_functions import *

def vocabify_dataframe(df, col, new_col_name, vocab_list):
    '''
    vocab_list : list of tokens to filter df[col] with

    takes a dataframe, and creates a new column ('reduced_tokens') from 'tokenized'. Basically filters the list of tokens in 'tokenized' to only include that in 'vocab_list'

    after vocabifying dataframe, should be able to train logistic regression with reduced dimensionality vector
    '''
    print(f'\ntokenizing {col} into {new_col_name}')
    df[new_col_name] = df.apply(lambda row: [x for x in row[col] if x in vocab_list], axis=1)

    return df

df = vocabify_dataframe(df, 'tokenized', 'reduced_content_500', content_vocab_list_500)
df = vocabify_dataframe(df, 'meta_tokenized', 'reduced_meta_500', meta_vocab_list_500)

df.head()


tokenizing tokenized into reduced_content_500

tokenizing meta_tokenized into reduced_meta_500


KeyError: 'meta_tokenized'

In [11]:
# Get TF-IDF representations of the proper columns

meta_tfidf_100, meta_vocab_100, _ = tfidf_transformation(df, 'reduced_meta_100')
content_tfidf_100, content_vocab_100, _ = tfidf_transformation(df, 'reduced_content_100')
meta_tfidf_500, meta_vocab_500, _ = tfidf_transformation(df, 'reduced_meta_500')
content_tfidf_500, content_vocab_500, _ = tfidf_transformation(df, 'reduced_content_500')
meta_tfidf_1000, meta_vocab_1000, _ = tfidf_transformation(df, 'reduced_meta_1000')
content_tfidf_1000, content_vocab_1000, _ = tfidf_transformation(df, 'reduced_content_1000')

In [21]:
def add_tfidf_vectors_to_dataframe(df, tfidf, new_col_name):
    assert len(df) == tfidf.shape[0], 'ERROR: size mismatch'
    
    # Convert to array
    tfidf = tfidf.toarray()
    # Normalize
    tfidf = tfidf / np.max(tfidf)

    new_col = []
    for i in range(tfidf.shape[0]):
        new_col.append(tfidf[i,:])

    df[new_col_name] = new_col
    return df

df = add_tfidf_vectors_to_dataframe(df, meta_tfidf_100, 'meta_tfidf_100')
df = add_tfidf_vectors_to_dataframe(df, content_tfidf_100, 'content_tfidf_100')
df = add_tfidf_vectors_to_dataframe(df, meta_tfidf_500, 'meta_tfidf_500')
df = add_tfidf_vectors_to_dataframe(df, content_tfidf_500, 'content_tfidf_500')
df = add_tfidf_vectors_to_dataframe(df, meta_tfidf_1000, 'meta_tfidf_1000')
df = add_tfidf_vectors_to_dataframe(df, content_tfidf_1000, 'content_tfidf_1000')
df.head()

Unnamed: 0,site,y,tokenized,meta_tokenized,global_index,subindex,domain,label,vectorized_links,reduced_content,...,reduced_content_500,reduced_meta_500,reduced_content_1000,reduced_meta_1000,meta_tfidf_100,content_tfidf_100,meta_tfidf_500,content_tfidf_500,meta_tfidf_1000,content_tfidf_1000
0,http://www.google.com,0,"[robust, integr, connect, cowork, via, googl, ...","[offici, gmail, help, center, find, tip, tutor...",0,0,.google.com,real,"[0.18734177215189873, 0.0, 10.0]","[googl, googl, cooki, product, googl, account,...",...,"[googl, googl, action, list, shop, cooki, mani...","[offici, help, find, use, answer, question, br...","[googl, googl, chat, send, invit, action, list...","[offici, help, find, use, answer, frequent, qu...","[0.00013044355184170067, 0.0, 0.00020095475506...","[0.0009109320298328371, 0.0029521434893754624,...","[0.00013044355184170067, 0.0, 0.00036633413545...","[0.0, 0.0, 0.0, 0.0004451551200154094, 0.00144...","[0.0, 0.00013044355184170067, 0.0, 0.0, 0.0003...","[0.0, 0.0, 0.0, 1.0126034675947897e-05, 0.0004..."
1,http://www.youtube.com,0,"[doctor, mike, doctor, mike, doctor, mike, ver...","[video, live, stream, live, video, cbsn, covid...",1,1,.youtube.com,real,"[0.2180906622101525, 0.0, 10.0]","[current, googl, scroll, scroll, access, googl...",...,"[mike, mike, mike, play, live, play, live, rec...","[live, live, coronaviru, vaccin, access, vacci...","[mike, mike, mike, verifi, view, play, verifi,...","[live, live, coronaviru, vaccin, access, vacci...","[0.00026088710368340134, 0.0, 0.00015071606629...","[0.0007327061979090211, 0.0010234097429834936,...","[0.00026088710368340134, 0.0, 0.0, 0.000150716...","[0.0, 0.0, 0.0, 0.00035805955305587284, 0.0005...","[0.0, 0.00026088710368340134, 0.0, 0.0, 0.0, 0...","[0.0, 0.0, 0.0, 5.063017337973948e-05, 0.00035..."
2,http://www.baidu.com,0,"[åström, karl, johan, iee, review, micropor, m...","[investor, relat, websit, contain, inform, bai...",2,3,.baidu.com,real,"[0.024390243902439025, 0.0, 10.0]",[product],...,"[review, institut, health, health, transact, e...","[investor, websit, inform, busi, stockhold, in...","[review, institut, health, health, food, scien...","[investor, websit, inform, busi, stockhold, in...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.6462038087577...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,http://www.qq.com,0,"[scan, bind, meanwhil, must, inform, exist, me...",[none],3,4,.qq.com,real,"[0.0, 0.0, 3.0]","[code, code, code, account, code, account, error]",...,"[agreement, join, registr, also, registr, agre...",[none],"[agreement, invit, join, registr, red, also, r...",[none],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 3.936191319167283e-05, 0.0, 1.9615273021...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.9235416712804245e-05, 0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0126034675947897e-05, 0.0, 0..."
4,http://www.facebook.com,0,"[discrimin, base, upon, race, religion, color,...","[facebook, compani, meta, build, technolog, he...",4,7,.facebook.com,real,"[0.2856230031948882, 0.0, 10.0]","[ad, data, english, upload, ad, product, data,...",...,"[color, includ, health, relat, medic, protect,...","[facebook, compani, build, help, connect, find...","[color, origin, includ, health, relat, medic, ...","[facebook, compani, build, technolog, help, co...","[0.0, 0.0, 5.023868876613474e-05, 0.0, 0.0, 0....","[0.0005148746255576905, 0.0014170288749002218,...","[0.0, 0.0, 9.158353386307677e-05, 5.0238688766...","[0.0, 0.0, 3.4364342205938227e-05, 0.000251609...","[0.0, 0.0, 0.0, 0.0, 9.158353386307677e-05, 0....","[0.0, 0.0, 3.4364342205938227e-05, 1.012603467..."


## Investigate the optimal classifier performance for each of the reduced tfidif columns

In [22]:
# with open('full_df_with_reduced.pkl', 'wb') as f:
#     pickle.dump(df, f)

In [1]:
import pickle
import pandas as pd

with open('full_df_with_reduced.pkl', 'rb') as f:
    df = pickle.load(f)

In [2]:
print(df.columns)

Index(['site', 'y', 'tokenized', 'meta_tokenized', 'global_index', 'subindex',
       'domain', 'label', 'vectorized_links', 'reduced_content',
       'reduced_meta', 'meta_tfidf', 'content_tfidf', 'combined_features',
       'reduced_content_100', 'reduced_meta_100', 'reduced_content_500',
       'reduced_meta_500', 'reduced_content_1000', 'reduced_meta_1000',
       'meta_tfidf_100', 'content_tfidf_100', 'meta_tfidf_500',
       'content_tfidf_500', 'meta_tfidf_1000', 'content_tfidf_1000'],
      dtype='object')


In [4]:
import numpy as np
df['vec_links_normalized'] = df.apply(lambda row: (np.array(row.vectorized_links)+0.5)/10.5, axis=1)

In [41]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import numpy as np

def grid_search_logistic_regression(df, col, param_grid = None, col_is_tfidf=False):
    if not param_grid:
        param_grid = [
                     {'classifier' : [LogisticRegression()],
                      'classifier__penalty' : ['l1', 'l2'],
                      # 'classifier__C' : np.logspace(-1, 4, 20),
                      'classifier__C' : np.logspace(-4, 4, 20),
                      'classifier__solver' : ['liblinear']}
                      ]

    if not col_is_tfidf:
        X_tfidf, tf_vocab, X = tfidf_transformation(df, col)
    else:
        X_tfidf = np.array(df[col].tolist())
    y = df['y'].to_numpy()
    X_train, _, y_train, _ = train_test_split(X_tfidf, y)

    pipe = Pipeline([('classifier' , LogisticRegression(max_iter=1000))])

    model = GridSearchCV(pipe, param_grid=param_grid, cv=5, verbose=3, n_jobs=-1).fit(X_train, y_train)
    best_model = model.fit(X_train, y_train)

    print('best estimator:')
    print(best_model.best_estimator_)
    return

grid_search_logistic_regression(df=df, col='vec_links_normalized', col_is_tfidf=True)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
Fitting 5 folds for each of 40 candidates, totalling 200 fits
best estimator:
Pipeline(steps=[('classifier',
                 LogisticRegression(C=29.763514416313132, penalty='l1',
                                    solver='liblinear'))])


In [15]:
from tqdm import tqdm
from analysis_functions import *

def train_logistic_regression(df, col, col_is_tfidf=False):
    '''
    model, tf_X_train, tf_X_test, tf_y_train, tf_y_test, X, tf_vocab = train_logistic_regression(df, 'tokenized')
    tf_y_pred = model.predict(tf_X_test)
    '''
    if not col_is_tfidf:
        X_tfidf, tf_vocab, X = tfidf_transformation(df, col)
    else:
        X_tfidf = np.array(df[col].tolist())
        tf_vocab = None

    y = df['y'].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y)
    
    # Model optimized for tf-idf combined
    model = LogisticRegression(C=3792.690191, penalty='l1', solver='liblinear').fit(X_train, y_train)
    # model = LogisticRegression(C=100000, penalty='l1', solver='liblinear').fit(X_train, y_train)

    if not col_is_tfidf:
        return model, X_train, X_test, y_train, y_test, X.toarray(), tf_vocab
    else:
        return model, X_train, X_test, y_train, y_test, X_tfidf, tf_vocab

def evaluate_logistic_regression(model, tf_X_test, tf_y_test, tf_vocab, PRINT=True):
    
    tf_y_pred = model.predict(tf_X_test)
    
    f1 = f1_score(tf_y_test, tf_y_pred)
    precision = precision_score(tf_y_test, tf_y_pred)
    recall = recall_score(tf_y_test, tf_y_pred)
    accuracy = accuracy_score(tf_y_test, tf_y_pred)
    
    baseline = np.sum(tf_y_test) / tf_y_test.shape[0]
    if baseline < 0.5:
        baseline = 1 - baseline

    if PRINT:
        print('\n\nlogistic regression classifier\n-------------\naccuracy: {:.4} %\nbaseline: {:.4} %'.format(accuracy*100, np.max(baseline)*100))
        print('\nf1:         {:.4f}\nprecision:  {:.4f}\nrecall:     {:.4f}\n\n'.format(f1, precision, recall))

        if tf_vocab is not None:
            print('size of vocab: {}\n'.format(len(tf_vocab)))

            fake_idx = model.coef_.argsort()[0][-20:][::-1]
            real_idx = model.coef_.argsort()[0][:20][::-1]

            real_words = []
            fake_words = []

            for i in range(len(real_idx)):
                real_words.append(tf_vocab[real_idx[i]])
                fake_words.append(tf_vocab[fake_idx[i]])

            top_words = pd.DataFrame(list(zip(real_words, fake_words)), columns=['top "real" words', 'top "fake" words'])
            print(top_words)

    return accuracy, f1, precision, recall

def confidence_interval_logistic_regression(num_iterations, df, col='combined_features', col_is_tfidf=False, PRINT=False):
    '''
    Trains logistic regression classifiers on unmodified TF-IDF vectors from corpus
    '''
    print('Checking combined features')
    results = np.zeros(shape=(4, num_iterations))
    for j in tqdm(range(num_iterations)):
        model, _, X_test, _, y_test, _, tf_vocab = train_logistic_regression(df, col, col_is_tfidf)
        accuracy, f1, precision, recall = evaluate_logistic_regression(model, X_test, y_test, tf_vocab, PRINT=PRINT)
        results[0, j] = accuracy
        results[1, j] = f1
        results[2, j] = precision
        results[3, j] = recall

    results_df = populate_results_dataframe(results)
    print(results_df)
    # boxplot_results(results)

confidence_interval_logistic_regression(100, df, col='combined_text_500', col_is_tfidf=True)

Checking combined features


  9%|▉         | 9/100 [00:22<03:48,  2.51s/it]

In [None]:
confidence_interval_linear_SVM(100, df, col='combined_text_500', col_is_tfidf=True)

#### SVM

In [9]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def grid_search_linear_SVM(df, col, param_grid = None, col_is_tfidf=False):
    if not param_grid:
        # param_grid = [
                     # {'classifier' : [SVC()],
                      # 'classifier__kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
                      # 'classifier__C' : np.logspace(-4, 4, 20),
                      # 'classifier__probability' : [True, False],
                      # 'classifier__tol' : [1e-4, 1e-3, 1e-2],
                      # 'classifier__decision_function_shape' : ['ovo', 'ovr']}
                      # ]
        param_grid = [
                     {'classifier' : [SVC()],
                      'classifier__kernel' : ['linear'],
                      'classifier__C' : np.logspace(-4, 4, 20),
                      'classifier__probability' : [True],
                      'classifier__tol' : [1e-4],
                      'classifier__decision_function_shape' : ['ovo']}
                      ]

    if not col_is_tfidf:
        X_tfidf, tf_vocab, X = tfidf_transformation(df, col)
    else:
        X_tfidf = np.array(df[col].tolist())
        tf_vocab = None
        
    y = df['y'].to_numpy()
    X_train, _, y_train, _ = train_test_split(X_tfidf, y)

    pipe = Pipeline([('classifier' , SVC())])

    model = GridSearchCV(pipe, param_grid=param_grid, cv=5, verbose=10, n_jobs=-1).fit(X_train, y_train)
    best_model = model.fit(X_train, y_train)

    print('best estimator:')
    print(best_model.best_estimator_)
    return

grid_search_linear_SVM(df=df, col='vec_links_normalized', col_is_tfidf=True)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Fitting 5 folds for each of 20 candidates, totalling 100 fits
best estimator:
Pipeline(steps=[('classifier',
                 SVC(C=10000.0, decision_function_shape='ovo', kernel='linear',
                     probability=True, tol=0.0001))])


In [12]:
from tqdm import tqdm
from analysis_functions import *

def train_linear_SVM(df, col, col_is_tfidf=False):
    if not col_is_tfidf:
        X_tfidf, tf_vocab, X = tfidf_transformation(df, col)
    else:
        X_tfidf = np.array(df[col].tolist())
        tf_vocab = None
        
    y = df['y'].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y)

    if not col_is_tfidf:
        # optimized for tf-idf combined
        # model = SVC(C = 11.288378916846883, decision_function_shape='ovo', probability=True, tol=1e-4).fit(X_train.toarray(), y_train)

        # optimized for tf-idf, prepended, combined
        model = SVC(C=0.00026366508987303583, decision_function_shape='ovo',
                        kernel='linear', probability=True, tol=0.0001).fit(X_train.toarray(), y_train)
    else:
        model = SVC(C=10000, decision_function_shape='ovo', probability=True, tol=0.0001).fit(X_train, y_train)

    if not col_is_tfidf:
        return model, X_train, X_test, y_train, y_test, X.toarray(), tf_vocab
    else:
        return model, X_train, X_test, y_train, y_test, X_tfidf, tf_vocab

def evaluate_linear_SVM(model, X_test, y_test, col_is_tfidf = False):
    
    if not col_is_tfidf:
        y_pred = model.predict(X_test.toarray())
    else:
        y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    return accuracy, f1, precision, recall

def confidence_interval_linear_SVM(num_iterations, df, col, col_is_tfidf=False):
    results = np.zeros(shape=(4, num_iterations))
    for j in tqdm(range(num_iterations)):
        model, _, X_test, _, y_test, _, vocab = train_linear_SVM(df, col=col, col_is_tfidf=col_is_tfidf)
        accuracy, f1, precision, recall = evaluate_linear_SVM(model, X_test, y_test, col_is_tfidf=col_is_tfidf)
        results[0, j] = accuracy
        results[1, j] = f1
        results[2, j] = precision
        results[3, j] = recall
    
    results_df = populate_results_dataframe(results)
    print(results_df)
    # boxplot_results(results)

    return results

results = confidence_interval_linear_SVM(100, df, col='vec_links_normalized', col_is_tfidf=True)

100%|██████████| 100/100 [2:03:58<00:00, 74.39s/it] 

      metric  lower_bound      mean  upper_bound
0   accuracy     0.946882  0.951027     0.955171
1         f1     0.909614  0.916681     0.923747
2  precision     0.968079  0.974043     0.980007
3     recall     0.852784  0.865827     0.878870





In [13]:
df['combined_text_500'] = df.apply(lambda row: np.concatenate((row.content_tfidf_500, row.meta_tfidf_500), axis=0),axis=1)
results = confidence_interval_logistic_regression(100, df, col='combined_text_500', col_is_tfidf=True)

TypeError: confidence_interval_logistic_regression() got an unexpected keyword argument 'col_is_tfidf'

In [25]:

results = confidence_interval_linear_SVM(100, df, col='meta_tfidf_500', col_is_tfidf=True)

100%|██████████| 100/100 [29:53<00:00, 17.93s/it]

      metric  lower_bound      mean  upper_bound
0   accuracy     0.939071  0.944203     0.949335
1         f1     0.895618  0.904871     0.914125
2  precision     0.952966  0.961369     0.969773
3     recall     0.840039  0.854789     0.869539





In [26]:

results = confidence_interval_linear_SVM(100, df, col='meta_tfidf_100', col_is_tfidf=True)

100%|██████████| 100/100 [08:39<00:00,  5.19s/it]

      metric  lower_bound      mean  upper_bound
0   accuracy     0.921989  0.927360     0.932730
1         f1     0.865866  0.874753     0.883641
2  precision     0.938092  0.948280     0.958468
3     recall     0.797867  0.811978     0.826088





Now, combine the vectors into a single feature, the order will be [content, meta, hyperlinking]

In [None]:
def combine_features(df):
    df['combined_features'] = df.apply(lambda row: np.concatenate((row.content_tfidf, row.meta_tfidf, row.vectorized_links), axis=0),axis=1)
    return df

df = combine_features(df)
df.head()

In [26]:
# Save df
with open('full_df.pkl', 'wb') as f:
    pickle.dump(df, f)
f.close()

## Investigate the relationship between classifier performance and number of tokens retained

In [None]:
from analysis_functions import confidence_interval_logistic_regression

results = confidence_interval_logistic_regression(100, df, 'content_tfidf')
