## "Full dataframe" refers to a single dataframe containing all domains and all features, so that classifiers can be trained simply by passing the dataframe to a function and selecting the appropriate columns

We are interested in three sets of features:
- Hyperlinking
- Content analysis
- Meta tags

In [2]:
import pickle
from pathlib import Path

import pandas as pd

# Root of the local checkout that holds every pickled input. This is the only
# machine-specific value in the cell; change it here when running elsewhere.
REPO_DIR = Path(r'C:\Users\ewais\Documents\GitHub\misinfo_detection')


def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files produced by this project.
    """
    # The ``with`` block closes the file on exit; no explicit close() is needed.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Start by loading the content dataframe
# It is saved as html_df_tokenized.pkl
content_df = load_pickle(REPO_DIR / 'content_analysis' / 'analyses v2' / 'html_df_tokenized.pkl')

# Then, load the meta tag dataframe
# It is saved as df_tokenized.pkl
meta_df = load_pickle(REPO_DIR / 'meta tag analysis' / 'df_tokenized.pkl')

# Load the hyperlinking dataframe
# It is saved as hyperlinking_df.pkl
hyperlinking_df = load_pickle(REPO_DIR / 'mutual_hyperlinking' / 'hyperlinking_df.pkl')


## meta_df is built on top of content_df (it has every content_df column plus the meta tag columns), so we will only use meta_df from here on

In [None]:
# Helper for getting a sense of the columns of each dataframe
def print_column_names(df_name, df):
    """Print a banner containing ``df_name``, then ``df.columns`` on the next line."""
    banner = f'~~~~~~~ {df_name} ~~~~~~~~'
    print(banner, df.columns, sep='\n')

# Show the columns of every loaded dataframe, one labelled listing per frame
for frame_label, frame in (
    ('Content', content_df),
    ('Meta Tags', meta_df),
    ('Hyperlinking', hyperlinking_df),
):
    print_column_names(frame_label, frame)

~~~~~~~ Content ~~~~~~~~
Index(['index', 'site', 'label', 'y', 'text', 'tokenized'], dtype='object')
~~~~~~~ Meta Tags ~~~~~~~~
Index(['index', 'site', 'label', 'y', 'text', 'tokenized', 'meta text',
       'meta tokenized'],
      dtype='object')
~~~~~~~ Hyperlinking ~~~~~~~~
Index(['global_index', 'subindex', 'domain', 'label', 'num_incoming_sites',
       'incoming_sites_list', 'num_incoming_real_sites',
       'incoming_real_sites_list', 'num_incoming_fake_sites',
       'incoming_fake_sites_list', 'num_outgoing_fake_sites',
       'num_outgoing_real_sites', 'num_outgoing_sites',
       'outgoing_fake_sites_list', 'outgoing_real_sites_list',
       'outgoing_sites_list', 'percent_fake_incoming', 'percent_fake_outgoing',
       'incoming_to_outgoing_ratio', 'color', 'vectorized_links', 'y'],
      dtype='object')


In [14]:
# Rework meta_df so it has the same index column names as hyperlinking_df:
#   - the existing 'index' column becomes 'subindex'
#   - the row index is materialised as the first column, 'global_index'
# The names are set explicitly instead of relying on the 'level_0' name that
# reset_index() generates when an 'index' column already exists.
# The guard keeps the cell safe to re-run: without it, a second run would
# silently add another 'subindex' column.
if 'global_index' not in meta_df.columns:
    meta_df = (
        meta_df
        .rename(columns={'index': 'subindex'})
        .rename_axis('global_index')
        .reset_index()
    )

In [28]:
# Preview the last five rows of meta_df
meta_df.tail(n=5)

Unnamed: 0,global_index,subindex,site,label,y,text,tokenized,meta text,meta tokenized
994,994,495,http://japaninsides.com,fake,1,"Over the generations, the most improved sector...","[gener, improv, sector, thi, fifteenth, versio...",Learn more about JapanJapan InsideLearn more ...,"[learn, japanjapan, insidelearn, japan, learn,..."
995,995,496,http://voxpoliticalonline.com,fake,1,The constituencies that the Conservatives won ...,"[constitu, conserv, north, england, contain, m...","Vox Political, Mike Sivier, politics, UK, Eng...","[vox, polit, mike, sivier, polit, uk, england,..."
996,996,497,http://magavoter.com,fake,1,"Cookies are uniquely assigned to you, and can ...","[cooki, uniqu, assign, onli, read, web, server...",See relevant content for Magavoter.com,"[see, relev, content, magavot, com]"
997,997,498,http://anthonyblogan.com,fake,1,"Current events related to politics, general ne...","[current, event, relat, polit, gener, news, in...",ANTHONYBLOGAN.com is an original news and opi...,"[anthonyblogan, com, origin, news, opinion, we..."
998,998,499,http://focusonthefamily.com,fake,1,Take a pregnancy for example. When the upstair...,"[take, pregnanc, exampl, upstair, brain, ha, d...","books, audio, parenting books, marriage books...","[book, audio, parent, book, marriag, book, eva..."


In [38]:
# Join the dataframes on global_index.
#
# DataFrame.join(other, on=...) matches the caller's column against the
# *index* of `other`, not against a column of the same name. Index the
# hyperlinking features by their own 'global_index' so the match really is
# global_index <-> global_index, whatever index the pickled frame carried.
assert hyperlinking_df['global_index'].is_unique, 'duplicate global_index in hyperlinking_df'

# Leave out the columns meta_df already provides (subindex, label, y) so the
# joined frame has no duplicated columns and needs no suffix clean-up.
hyperlinking_features = (
    hyperlinking_df
    .set_index('global_index')
    .drop(columns=['subindex', 'label', 'y'])
)
full_df = meta_df.join(hyperlinking_features, on='global_index')

# A left join against a unique key keeps exactly one row per meta_df row
assert len(full_df) == len(meta_df), 'join changed the number of rows'
print_column_names('Full', full_df)

~~~~~~~ Full ~~~~~~~~
Index(['global_index', 'subindex', 'site', 'label', 'y', 'text', 'tokenized',
       'meta text', 'meta tokenized', 'domain', 'num_incoming_sites',
       'incoming_sites_list', 'num_incoming_real_sites',
       'incoming_real_sites_list', 'num_incoming_fake_sites',
       'incoming_fake_sites_list', 'num_outgoing_fake_sites',
       'num_outgoing_real_sites', 'num_outgoing_sites',
       'outgoing_fake_sites_list', 'outgoing_real_sites_list',
       'outgoing_sites_list', 'percent_fake_incoming', 'percent_fake_outgoing',
       'incoming_to_outgoing_ratio', 'color', 'vectorized_links'],
      dtype='object')


In [39]:
# Save the dataframe
# The ``with`` block closes the file on exit, so no explicit close() is needed.
with open('full_df.pkl', 'wb') as f:
    pickle.dump(full_df, f)