## Takes full_df (containing all features of the scraped data), creates classifiers and uses them to find the top logits.

In [2]:
import pickle
from pathlib import Path
# First, load the dataframe.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# files produced by this project.
# The `with` block closes the file on exit, so no explicit f.close() is needed.
with open(Path.cwd().parent / 'content_analysis' / 'classifier_comparison' / 'full_df.pkl', 'rb') as f:
    df = pickle.load(f)

df.head()

Unnamed: 0,global_index,subindex,site,label,y,text,tokenized,meta text,meta tokenized,domain,...,num_outgoing_real_sites,num_outgoing_sites,outgoing_fake_sites_list,outgoing_real_sites_list,outgoing_sites_list,percent_fake_incoming,percent_fake_outgoing,incoming_to_outgoing_ratio,color,vectorized_links
0,0,0,http://9to5mac.com,real,0,"In a memo that was leaked to the Verge , Cook ...","[memo, wa, leak, verg, cook, say, appl, “, eve...","News and reviews for Apple products, apps, an...","[news, review, appl, product, app, rumor, prov...",.9to5mac.com,...,9,9,[],"[.deadline.com, .theverge.com, .sfexaminer.com...","[.deadline.com, .theverge.com, .sfexaminer.com...",0.8,0.0,0.555556,blue,"[0.8, 0.0, 0.5555555555555556]"
1,1,1,http://wfae.org,real,0,Charlotte Talks Local News Roundup: Housing...,"[charlott, talk, local, news, roundup, hous, f...","Charlotte Podcasts,Charlotte music,Charlotte ...","[charlott, podcast, charlott, music, charlott,...",.wfae.org,...,6,6,[],"[.gao.gov, .npr.org, .nytimes.com, .seattletim...","[.gao.gov, .npr.org, .nytimes.com, .seattletim...",0.25,0.0,0.666667,blue,"[0.25, 0.0, 0.6666666666666666]"
2,2,2,http://climatefeedback.org,real,0,The workshop will then move to more concrete e...,"[workshop, move, concret, exampl, initi, tackl...",Scientific Reference to Reliable Information ...,"[scientif, refer, reliabl, inform, climat, cha...",.climatefeedback.org,...,8,8,[],"[.smithsonianmag.com, .politico.com, .accuweat...","[.smithsonianmag.com, .politico.com, .accuweat...",0.0,0.0,0.125,blue,"[0.0, 0.0, 0.125]"
3,3,3,http://wdbj7.com,real,0,Power Life by Tony Horton All Adult...,"[power, life, toni, horton, adult, due, larg, ...","wdbj, virginia local news, virginia weather, ...","[wdbj, virginia, local, news, virginia, weathe...",.wdbj7.com,...,2,2,[],"[.nasa.gov, .nbc12.com]","[.nasa.gov, .nbc12.com]",0.25,0.0,2.0,blue,"[0.25, 0.0, 2.0]"
4,4,4,http://keyc.com,real,0,Good To Know This Recommended by ...,"[good, know, thi, recommend, recommend, high, ...","keyc, keyc 12 keyc news 12, keyc tv, keyc new...","[keyc, keyc, keyc, news, keyc, tv, keyc, news,...",.keyc.com,...,2,2,[],"[.startribune.com, .wrtv.com]","[.startribune.com, .wrtv.com]",1.0,0.0,0.5,blue,"[1.0, 0.0, 0.5]"


Pseudocode:

For both meta tags and content:

Use logit_explained_variance to get a list of top logit_tokens
Use logit_tokens to generate a vocab_list
Use the vocab_list along with vocabify_dataframe to generate a new set of reduced_tokens
Featurize the reduced tokens with TF-IDF and save the result as another column

Then:

Combine all the featurized vectors together and train a logistic regression classifier on it

In [4]:
# NOTE(review): the star import hides which names come from analysis_functions
# (logit_explained_variance is used below); consider importing the needed names explicitly.
from analysis_functions import *

# Compute the logit tokens for the page-content column ('tokenized') and the meta-tag
# column ('meta tokenized'). Only the third return value is kept; the first two are discarded.
# These calls are slow: the recorded run shows roughly 20 minutes for 'tokenized' and
# over a minute for 'meta tokenized' at num_iterations=25.
_, _, content_logit_tokens = logit_explained_variance(df, 'tokenized', num_iterations=25)
_, _, meta_logit_tokens = logit_explained_variance(df, 'meta tokenized', num_iterations=25)

100%|██████████| 25/25 [19:38<00:00, 47.13s/it]
100%|██████████| 25/25 [01:17<00:00,  3.10s/it]


In [5]:
# Persist both logit-token results in the current working directory so they can be
# reloaded without recomputing them.
# Each `with` block closes its file on exit, so no explicit f.close() is needed.
with open('content_logit_tokens.pkl', 'wb') as f:
    pickle.dump(content_logit_tokens, f)
with open('meta_logit_tokens.pkl', 'wb') as f:
    pickle.dump(meta_logit_tokens, f)

In [3]:
import pickle
from pathlib import Path

# Reload the dataframe and the previously pickled logit tokens.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# files produced by this project.
# Each `with` block closes its file on exit, so no explicit f.close() is needed.
with open(Path.cwd().parent / 'content_analysis' / 'classifier_comparison' / 'full_df.pkl', 'rb') as f:
    df = pickle.load(f)
with open('content_logit_tokens.pkl', 'rb') as f:
    content_logit_tokens = pickle.load(f)
with open('meta_logit_tokens.pkl', 'rb') as f:
    meta_logit_tokens = pickle.load(f)

def vocab_list_from_logit_tokens(logit_tokens, num_tokens=30):
    '''
    Builds the vocab_list consumed by vocabify_dataframe from the logit_tokens
    output by logit_explained_variance().

    Collects logit_tokens[1] through logit_tokens[num_tokens] (inclusive), in that order.

    NOTE(review): lookups start at 1, not 0. Presumably logit_tokens is keyed by a
    1-based rank; if it is a plain 0-indexed list, its first entry is skipped. Confirm
    against logit_explained_variance().
    '''
    ranks = range(1, num_tokens + 1)
    return [logit_tokens[rank] for rank in ranks]

# Build one vocab list per token source (content first, then meta), using the
# default num_tokens of vocab_list_from_logit_tokens.
content_vocab_list, meta_vocab_list = (
    vocab_list_from_logit_tokens(tokens)
    for tokens in (content_logit_tokens, meta_logit_tokens)
)

# Show both lists, one per line.
print(content_vocab_list, meta_vocab_list, sep='\n')

['wordpress', 'obituari', 'privaci', 'web', 'uncategor', 'republ', 'di', 'kamala', 'trump', 'prank', 'yakima', 'brainberri', 'liberti', 'biden', 'dr', 'vermont', 'click', 'mask', 'net', 'menu', 'newslett', 'subscrib', 'freedom', 'browser', 'funer', 'ddo', 'art', 'plandem', 'der', 'harri']
['archiv', 'none', 'china', 'biden', 'trump', 'weather', 'counti', 'sport', 'liberti', 'scienc', 'area', 'citi', 'di', 'page', 'dr', 'site', 'hi', 'team', 'parent', 'healthi', 'evid', 'polic', 'climat', 'plandem', 'freedom', 'signal', 'florida', 'medium', 'tea', 'greek']


In [None]:
from analysis_functions import *

# Reduce the 'tokenized' column to the content vocabulary; the result is expected in a
# new 'reduced_content' column (it is read back from df by the TF-IDF step later on).
# NOTE(review): df is rebound to the helper's return value -- presumably the same frame
# with the new column added; confirm against vocabify_dataframe.
df = vocabify_dataframe(df, 'tokenized', 'reduced_content', content_vocab_list)
df.head()

In [None]:
# Reduce the 'meta tokenized' column to the meta vocabulary; the result is expected in a
# new 'reduced_meta' column (it is read back from df by the TF-IDF step later on).
# NOTE(review): df is rebound to the helper's return value -- presumably the same frame
# with the new column added; confirm against vocabify_dataframe.
df = vocabify_dataframe(df, 'meta tokenized', 'reduced_meta', meta_vocab_list)
df.head()

In [7]:
# Save the dataframe to 'full_df.pkl' in the current working directory.
# The `with` block closes the file on exit, so no explicit f.close() is needed.
with open('full_df.pkl', 'wb') as f:
    pickle.dump(df, f)

In [8]:
# Get TF-IDF representations of the reduced meta and content token columns.
# The first two return values of tfidf_transformation are kept as the TF-IDF matrix
# and its vocabulary; the third return value is not used here.

meta_tfidf, meta_vocab, _ = tfidf_transformation(df, 'reduced_meta')
content_tfidf, content_vocab, _ = tfidf_transformation(df, 'reduced_content')

In [None]:
def add_tfidf_vectors_to_dataframe(df, tfidf, new_col_name):
    '''
    Stores each row of a TF-IDF matrix in df as a dense vector under new_col_name.

    The matrix is densified with tfidf.toarray() and divided by its global maximum.
    An empty or all-zero matrix is left unscaled, because dividing by a zero maximum
    would fill every vector with NaN (and np.max raises on an empty array).

    Parameters:
        df: dataframe to extend; it is modified in place.
        tfidf: matrix-like object exposing .shape and .toarray(), with one row per row of df.
        new_col_name: name of the column that receives the per-row vectors.

    Returns:
        The same df object, with the new column added.

    Raises:
        AssertionError: if df and tfidf have a different number of rows.
    '''
    assert len(df) == tfidf.shape[0], 'ERROR: size mismatch'

    # Convert to a dense array
    dense = tfidf.toarray()

    # Normalize by the global maximum, skipping the cases where that is impossible
    if dense.size:
        peak = np.max(dense)
        if peak != 0:
            dense = dense / peak

    # Iterating a 2-D array yields its rows, giving one 1-D vector per dataframe row
    df[new_col_name] = list(dense)
    return df

# Store each row's normalized meta-tag TF-IDF vector in a new 'meta_tfidf' column.
df = add_tfidf_vectors_to_dataframe(df, meta_tfidf, 'meta_tfidf')
df.head()

In [None]:
# Store each row's normalized content TF-IDF vector in a new 'content_tfidf' column.
df = add_tfidf_vectors_to_dataframe(df, content_tfidf, 'content_tfidf')
df.head()

Now combine the vectors into a single feature vector per row. The order is [content, meta, hyperlinking].

In [None]:
def combine_features(df):
    '''
    Adds a 'combined_features' column to df and returns df.

    For every row, the new value is the concatenation of content_tfidf, meta_tfidf
    and vectorized_links, in that order. df itself is modified.
    '''
    def _stack_row(row):
        # Order matters: content first, then meta, then the hyperlink features
        parts = [row['content_tfidf'], row['meta_tfidf'], row['vectorized_links']]
        return np.concatenate(parts)

    df['combined_features'] = df.apply(_stack_row, axis=1)
    return df

# Add the 'combined_features' column (content, meta, then link features) and preview the result.
df = combine_features(df)
df.head()

In [26]:
# Save df, now including the TF-IDF and combined feature columns, to 'full_df.pkl'
# in the current working directory.
# The `with` block closes the file on exit, so no explicit f.close() is needed.
with open('full_df.pkl', 'wb') as f:
    pickle.dump(df, f)