## For use after intra_links.py has been run

In [1]:
import pickle
from pathlib import Path
import pandas as pd
pd.set_option('display.max_rows', 100)

def generate_dataframe(truncate_lists = False):
    '''
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ARGUMENTS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    truncate_lists : False or int
        - if False, full list of domains in dataframe remains
        - if True, each domain list is truncated to len(<domain list>) <= truncate_lists
    '''

    df = pd.DataFrame(columns=['index', 'site', 'num_links', 'link_dict'])

    with open('url_key_sp.pkl', 'rb') as f:
        key_dict = pickle.load(f)
    f.close()

    for i in range(100):
        try:
            path = Path.cwd().parent / 'link_lists' / (str(i) + '.pkl')
            with open(path, 'rb') as f:
                link_list = pickle.load(f)
            f.close()
        except:
            print('{}.pkl not present'.format(i))
            continue

        link_list = list(set(link_list)) # some lists have repeat domains
        link_list = [x for x in link_list if x] # some have empty strings
        if truncate_lists:
            link_list = link_list[-1*truncate_lists+1:]

        link_dict = {}
        for idx, link in enumerate(link_list):
            link_dict[idx] = link
        link_dict[idx+1] = key_dict[i] # add the original site to the link_dict

        df = df.append(
            {'index' : i,
            'site' : key_dict[i],
            'num_links' : len(link_dict),
            'link_dict': link_dict},
            ignore_index=True, 
        )

    return df

df = generate_dataframe(truncate_lists=100)

df.sort_values(by=['index']).reset_index(inplace=True)
df.head(100)

4.pkl not present
5.pkl not present
8.pkl not present
13.pkl not present
15.pkl not present
23.pkl not present
26.pkl not present
39.pkl not present
44.pkl not present
45.pkl not present
48.pkl not present
51.pkl not present
52.pkl not present
53.pkl not present
57.pkl not present
61.pkl not present
63.pkl not present
65.pkl not present
73.pkl not present
76.pkl not present
85.pkl not present
86.pkl not present
87.pkl not present
88.pkl not present
89.pkl not present
90.pkl not present
91.pkl not present
92.pkl not present
93.pkl not present
94.pkl not present
95.pkl not present
96.pkl not present
97.pkl not present
98.pkl not present
99.pkl not present


Unnamed: 0,index,site,num_links,link_dict
0,0,http://dailyexpose.uk,65,{0: 'https://dailyexpose.uk/author/rhoda-wilso...
1,1,http://rumble.com,61,{0: 'https://rumble.com/v113tdd-nba-playoffs-1...
2,2,http://harpers.org,67,{0: 'https://harpers.org/author/davidwonglouie...
3,3,http://kanekoa.substack.com,20,{0: 'https://kanekoa.substack.com/p/exposing-a...
4,6,http://pattyporter.net,3,"{0: 'https://www.pattyporter.net/home', 1: 'ht..."
5,7,http://independentside.com,33,{0: 'https://independentside.com/biden-says-th...
6,9,http://technofog.substack.com,18,{0: 'https://substack.com/signup?utm_source=su...
7,10,http://alexberenson.substack.com,26,"{0: 'https://alexberenson.substack.com/', 1: '..."
8,11,http://t.me,1,{25: 'http://t.me'}
9,12,http://gettr.com,20,{0: 'https://about.gettr.com/press/football-le...


In [None]:
# Remove duds
df = df.drop([8, 10, 16, 37], axis=0)

In [9]:
df = df.reset_index().drop('level_0', axis=1)

In [10]:
df.head()

Unnamed: 0,index,site,num_links,link_dict
0,0,http://dailyexpose.uk,65,{0: 'https://dailyexpose.uk/author/rhoda-wilso...
1,1,http://rumble.com,61,{0: 'https://rumble.com/v113tdd-nba-playoffs-1...
2,2,http://harpers.org,67,{0: 'https://harpers.org/author/davidwonglouie...
3,3,http://kanekoa.substack.com,20,{0: 'https://kanekoa.substack.com/p/exposing-a...
4,6,http://pattyporter.net,3,"{0: 'https://www.pattyporter.net/home', 1: 'ht..."


In [11]:
df.to_pickle('./sydney_powell_df.pkl')

In [8]:
# Create dummy dictionary
# test_df = df.head(2)
# print(test_df['link_dict'][1])

dict1 = {0: 'https://www.bbc.com/sport',
         1: 'https://www.bbc.com/reel',
         2: 'https://www.bbc.com/worklife/article/20210727-the-coded-words-of-job-adverts-that-reveal-important-clues'}

dict2 = {0: 'https://www.nytimes.com/section/sports', 
         1: 'https://www.nytimes.com/interactive/2021/07/27/opinion/jan-6-committee-facts.html?action=click&module=Opinion&pgtype=Homepage', 
         2: 'https://www.nytimes.com/live/2021/07/28/sports/olympics-tokyo-results-medals/?action=click&module=Top%20Stories&pgtype=Homepage'}

test_df = pd.DataFrame.from_dict(
    {
        'index' : [0, 1],
        'site' : ['https://bbc.com/', 'https://nytimes.com/'],
        'num_links' : [69, 420],
        'link_dict' : [dict1, dict2]
    }
)

test_df.to_pickle('./test_df.pkl')
