In [4]:
import pandas as pd
import networkx as nx

In [5]:
filename = "data/rappler-oct-2021-scan-processed.csv"
df = pd.read_csv(
    filename,
    usecols=["linker_name","linker_type","linker_url","source_name","source_type","source_url"],
    dtype=str
)

In [6]:
df

Unnamed: 0,linker_type,linker_name,linker_url,source_type,source_name,source_url
0,page,Rappler,www.facebook.com/rapplerdotcom,facebook,rapplerdotcom,www.facebook.com/rapplerdotcom
1,page,Nobel Prize,www.facebook.com/nobelprize,facebook,nobelprize,www.facebook.com/nobelprize
2,page,INQUIRER.net,www.facebook.com/inquirerdotnet,facebook,inquirerdotnet,www.facebook.com/inquirerdotnet
3,page,Nobel Prize,www.facebook.com/nobelprize,facebook,nobelprize,www.facebook.com/nobelprize
4,page,Rappler,www.facebook.com/rapplerdotcom,facebook,rapplerdotcom,www.facebook.com/rapplerdotcom
...,...,...,...,...,...,...
17405,page,Business Day,www.facebook.com/businessdaylive,external website,www.businesslive.co.za,www.businesslive.co.za
17406,page,CWA Canada,www.facebook.com/cwacanada,external website,www.bbc.com,www.bbc.com
17407,page,La Opinión,www.facebook.com/LaOpinionLA,external website,laopinion.com,laopinion.com
17408,page,Global Exchange,www.facebook.com/GlobalExchange,external website,www.democracynow.org,www.democracynow.org


In [5]:
sources = df[["source_name","source_type","source_url"]].drop_duplicates(subset=['source_url'])
sources

Unnamed: 0,source_name,source_type,source_url
0,rapplerdotcom,facebook,www.facebook.com/rapplerdotcom
1,nobelprize,facebook,www.facebook.com/nobelprize
2,inquirerdotnet,facebook,www.facebook.com/inquirerdotnet
5,StarImageArtist,facebook,www.facebook.com/StarImageArtist
8,philstarlife,facebook,www.facebook.com/philstarlife
...,...,...,...
17397,MWWomenforJustice,facebook,www.facebook.com/MWWomenforJustice
17398,www.radiosantafe.com,external website,www.radiosantafe.com
17404,www.ibtimes.co.in,external website,www.ibtimes.co.in
17407,laopinion.com,external website,laopinion.com


In [6]:
# Determine number of agents per source
hubs = pd.DataFrame()
for i in df.index.tolist():
    source = df.at[i,'source_url']
    dest = df.at[i,'linker_url']
    if source != dest:
        try:
            hubs.at[source,'connectors'] += 1
        except:
            hubs.at[source,'connectors'] = 1
hubs = hubs[ hubs['connectors'] > 20 ].sort_values(by='connectors',ascending=False).reset_index()
hubs = hubs[(~hubs['index'].isna()) & (hubs['index'] != 'bit.ly')]

In [7]:
hubs

Unnamed: 0,index,connectors
0,www.facebook.com/rapplerdotcom,3532.0
1,postly.app,376.0
2,www.rappler.com,329.0
3,www.facebook.com/nobelprize,293.0
4,www.facebook.com/subagenbangonpilipinas,236.0
...,...,...
82,https://youtube.com/watch?v=jVe7aeVne4g&featur...,22.0
83,www.facebook.com/CNNPhilippines,22.0
84,www.facebook.com/1113650032052216,22.0
85,https://www.facebook.com/photo.php?fbid=422676...,21.0


In [8]:
pd.merge(hubs,sources,left_on='index',right_on='source_url',how='left')

Unnamed: 0,index,connectors,source_name,source_type,source_url
0,www.facebook.com/rapplerdotcom,3532.0,rapplerdotcom,facebook,www.facebook.com/rapplerdotcom
1,postly.app,376.0,postly.app,external website,postly.app
2,www.rappler.com,329.0,www.rappler.com,external website,www.rappler.com
3,www.facebook.com/nobelprize,293.0,nobelprize,facebook,www.facebook.com/nobelprize
4,www.facebook.com/subagenbangonpilipinas,236.0,subagenbangonpilipinas,facebook,www.facebook.com/subagenbangonpilipinas
...,...,...,...,...,...
81,https://youtube.com/watch?v=jVe7aeVne4g&featur...,22.0,YouTube video: https://youtube.com/watch?v=jVe...,youtube,https://youtube.com/watch?v=jVe7aeVne4g&featur...
82,www.facebook.com/CNNPhilippines,22.0,CNNPhilippines,facebook,www.facebook.com/CNNPhilippines
83,www.facebook.com/1113650032052216,22.0,1113650032052216,facebook,www.facebook.com/1113650032052216
84,https://www.facebook.com/photo.php?fbid=422676...,21.0,Facebook profile:,facebook,https://www.facebook.com/photo.php?fbid=422676...


In [9]:
def shared_audience(pages,posts,site_ind1,site_ind2,threshold=0):
    site1 = pages.loc[site_ind1,'index']
    site2 = pages.loc[site_ind2,'index']
    agents1 = set(list(posts[posts['source_name'] == site1]['linker_slug']))
    agents2 = set(list(posts[posts['source_name'] == site2]['linker_slug']))
    common_agents = agents1.intersection(agents2)

    if len(common_agents) >= threshold:
        return {"shared": True, "commons": len(common_agents) }
    else:
        return {"shared": False, "commons": len(common_agents) }

In [10]:
pages_df = hubs
num_pages = len(hubs)

links = []
commons = []
page_inds = pages_df.index.tolist()
pairs = list(itertools.combinations(page_inds,2))
counter = 0
for pair in pairs:
    total_pairs = num_pages*(num_pages-1)/2
    res = shared_audience(pages=pages_df,posts=df,site_ind1=pair[0],site_ind2=pair[1])
    link = res['shared']
    commons.append(res['commons'])
    if link == True:
        links.append({
            "site1": pages_df.loc[pair[0],'index'],
            "site2": pages_df.loc[pair[1],'index'],
            "link": res['commons']
        })
    counter+= 1
    if counter%1000 == 0:
        print(f"Processed {counter} of {total_pairs} pairs")
    if counter == total_pairs:
        print(f"Processed {counter} of {total_pairs} pairs")

links = pd.DataFrame(links)

KeyError: 'linker_slug'

In [8]:
links

Unnamed: 0,site1,site2,link
0,rapplerdotcom,news.abs-cbn.com,49
1,rapplerdotcom,www.rappler.com,59
2,rapplerdotcom,www.gmanetwork.com,32
3,rapplerdotcom,News5Everywhere,42
4,rapplerdotcom,newsinfo.inquirer.net,39
...,...,...,...
2623,www.bulgaronline.com,www.getrealpundit.com,1
2624,www.bulgaronline.com,www.idntimes.com,0
2625,Facebook profile: Leon Junn,www.getrealpundit.com,0
2626,Facebook profile: Leon Junn,www.idntimes.com,0
