In [1]:
import networkx as nx
import pandas as pd

In [14]:
# Each row in links_final shows how a linker relates to another linker based on how many shared sources they have
filename_base = "fbh2_ct_scan_2016-processed-shared-sources"
links_final = pd.read_csv(f"results/{filename_base}-links.csv")
links_final = links_final[ links_final['link'] >= 4 ]
links_final

Unnamed: 0,site1,site2,link
0,www.facebook.com/groups/538773259559791,www.facebook.com/groups/1016387625038290,138
1,www.facebook.com/groups/538773259559791,www.facebook.com/groups/308734279321570,137
2,www.facebook.com/groups/1016387625038290,www.facebook.com/groups/859836347426689,98
3,www.facebook.com/groups/308734279321570,www.facebook.com/groups/1016387625038290,95
4,www.facebook.com/groups/308734279321570,www.facebook.com/groups/505037502920274,75
...,...,...,...
13320,www.facebook.com/groups/1695807327318825,www.facebook.com/groups/1749764535277188,4
13321,www.facebook.com/groups/287398064766941,www.facebook.com/groups/365684020289315,4
13322,www.facebook.com/groups/1695807327318825,www.facebook.com/groups/669427883158907,4
13323,www.facebook.com/groups/1695807327318825,www.facebook.com/groups/663999290411287,4


In [5]:
# See the link size distribution
import plotly.express as px
fig = px.histogram(links_final, x="link")
fig.show()

In [15]:
# Community detection algorithm that assigns a community to each linker
G = nx.Graph()
for i in links_final.index.tolist():
    source = links_final.at[i,'site1']
    dest = links_final.at[i,'site2']
    edge_size = links_final.at[i,'link']
    G.add_edge(source,dest, weight=edge_size)

from networkx import edge_betweenness_centrality as betweenness
def most_central_edge(G):
    centrality = betweenness(G, weight="weight")
    return max(centrality, key=centrality.get)
    
comp = nx.algorithms.community.centrality.girvan_newman(G, most_valuable_edge=most_central_edge)
communities = tuple(sorted(c) for c in next(comp))

In [16]:
# Checking how many linkers per community
for i in communities:
    print(len(i))

524
61
11
4
2
5
2
2


In [17]:
# Tag each node with its respective community
for i in range(len(communities)):
    for node in communities[i]:
        G.nodes[node]['name'] = node
        G.nodes[node]['comm'] = i

In [18]:
# Roots is a reference table for each linker, providing more info about each linker
roots = pd.read_csv(f"results/{filename_base}-nodes.csv")
roots

Unnamed: 0,index,connectors,linker_name,linker_type,linker_url
0,www.facebook.com/groups/538773259559791,2062.0,PHILIPPINE FEDERAL MOVEMENT INTERNATIONAL,group,www.facebook.com/groups/538773259559791
1,www.facebook.com/groups/308734279321570,1130.0,Freedom Wall (original),group,www.facebook.com/groups/308734279321570
2,www.facebook.com/groups/1016387625038290,1024.0,SARA DUTERTE SOLID SUPPORTERS,group,www.facebook.com/groups/1016387625038290
3,www.facebook.com/groups/859836347426689,443.0,(vf)DDS/ DIEHARD DUTERTE SUPPORTERS,group,www.facebook.com/groups/859836347426689
4,www.facebook.com/groups/505037502920274,376.0,Gising Na Pilipinas!,group,www.facebook.com/groups/505037502920274
...,...,...,...,...,...
2564,www.facebook.com/groups/1563625567284575,2.0,"Youth BJP , Odisha",group,www.facebook.com/groups/1563625567284575
2565,www.facebook.com/NamoKeralam,2.0,NAMO Keralam,page,www.facebook.com/NamoKeralam
2566,www.facebook.com/marendorser,2.0,Filipinos Endorsing Marcos A HERO,page,www.facebook.com/marendorser
2567,www.facebook.com/groups/422286841202782,2.0,BLF Inc. - Batang Licab Forever Inc.,group,www.facebook.com/groups/422286841202782


In [19]:
# Add a 'community' column to the roots table to include the tagged community of each node. If the linker has no tagged community,
# we tag it with '99', or no perceived community
roots = roots.set_index('index')
for root in roots.index.tolist():
    try:
        roots.at[root,'community'] = str(G.nodes[root]['comm'])
    except:
        roots.at[root,'community'] = str(99)

In [20]:
roots

Unnamed: 0_level_0,connectors,linker_name,linker_type,linker_url,community
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
www.facebook.com/groups/538773259559791,2062.0,PHILIPPINE FEDERAL MOVEMENT INTERNATIONAL,group,www.facebook.com/groups/538773259559791,0
www.facebook.com/groups/308734279321570,1130.0,Freedom Wall (original),group,www.facebook.com/groups/308734279321570,0
www.facebook.com/groups/1016387625038290,1024.0,SARA DUTERTE SOLID SUPPORTERS,group,www.facebook.com/groups/1016387625038290,0
www.facebook.com/groups/859836347426689,443.0,(vf)DDS/ DIEHARD DUTERTE SUPPORTERS,group,www.facebook.com/groups/859836347426689,0
www.facebook.com/groups/505037502920274,376.0,Gising Na Pilipinas!,group,www.facebook.com/groups/505037502920274,0
...,...,...,...,...,...
www.facebook.com/groups/1563625567284575,2.0,"Youth BJP , Odisha",group,www.facebook.com/groups/1563625567284575,99
www.facebook.com/NamoKeralam,2.0,NAMO Keralam,page,www.facebook.com/NamoKeralam,99
www.facebook.com/marendorser,2.0,Filipinos Endorsing Marcos A HERO,page,www.facebook.com/marendorser,99
www.facebook.com/groups/422286841202782,2.0,BLF Inc. - Batang Licab Forever Inc.,group,www.facebook.com/groups/422286841202782,99


In [21]:
roots.to_csv(f'results/{filename_base}-nodes-tagged.csv')

# Who are these roots sharing from?
We turn our attention to community 0 and community 3, since these are the largest tagged communities of linkers

In [25]:
# Master is the complete list of posts. For each post, we get the source and linker
master = pd.read_csv(
    f"data/{filename_base[:-15]}.csv",
    usecols=["linker_name","linker_type","linker_url","source_name","source_type","source_url"],
    dtype=str
)
master.head()

Unnamed: 0,linker_type,linker_name,linker_url,source_type,source_name,source_url
0,page,ABS-CBN,www.facebook.com/ABSCBNnetwork,external website,entertainment.abs-cbn.com,entertainment.abs-cbn.com
1,page,GMA News,www.facebook.com/gmanews,external website,www.gmanetwork.com,www.gmanetwork.com
2,page,PSSAP Underground Media,www.facebook.com/526965450801338,facebook,526965450801338,www.facebook.com/526965450801338
3,page,MOCHA USON BLOG,www.facebook.com/Mochablogger,facebook,Mochablogger,www.facebook.com/Mochablogger
4,page,MOCHA USON BLOG,www.facebook.com/Mochablogger,facebook,Mochablogger,www.facebook.com/Mochablogger


In [26]:
# Alias for communities to be observed
global_tags = {
    '0': 'comm_0',
    '1': 'comm_1'
}

In [28]:
# For each community of linkers, get the posts in master that contain these linkers.
# Root sources is then a dictionary with keys equal to the observed communities (i.e. keys in global tags)
# Each value in root sources is a dataframe of every unique source that the linkers of that community have sahred from.
# The sharers column shows how many linkers shared from that source. This column is specific to the community 
# (i.e. if Source X is shared 40 times in community 0 and 20 times in community 1, it will appear twice and the sharers column will show 40 and 20, respectively)
root_sources = {}
for i in global_tags.keys():
    roots_i = roots[roots['community'] == i].reset_index()
    root_sources[i] = (
        pd
        .merge(
            master,
            roots_i,
            left_on='linker_url',
            right_on='linker_url',
            how='inner'
        )
        .groupby('source_url')
        .agg(
            Sharers=('source_url','count'),
            Name=('source_name','first'),
            Type=('source_type','first')
        )
        .sort_values(by='Sharers',ascending=False)
        .reset_index()
    )
    root_sources[i]['community'] = [i]*len(root_sources[i])

In [29]:
# Concatenate all dataframes from the previous cell
root_sources_concat = pd.concat([root_sources[i] for i in root_sources.keys()])
root_sources_concat

Unnamed: 0,source_url,Sharers,Name,Type,community
0,www.facebook.com/rapplerdotcom,771,rapplerdotcom,facebook,0
1,www.facebook.com/Mochablogger,657,Mochablogger,facebook,0
2,www.facebook.com/526965450801338,648,526965450801338,facebook,0
3,news.abs-cbn.com,612,news.abs-cbn.com,external website,0
4,www.rappler.com,600,www.rappler.com,external website,0
...,...,...,...,...,...
445,www.facebook.com/530767340283701,1,530767340283701,facebook,1
446,www.facebook.com/533311433442726,1,533311433442726,facebook,1
447,www.facebook.com/615880665179442,1,615880665179442,facebook,1
448,https://www.facebook.com/photo.php?fbid=282761...,1,Facebook profile: Photos from Omaram Choudhary...,facebook,1


In [30]:
root_sources_concat.to_csv(f"results/{filename_base}-by-community.csv",index=False)

# Label sources and linkers

In [31]:
# Get all the unique sources from master
sources = master.drop_duplicates(subset=['source_url'])[["source_name","source_type","source_url"]]
sources

Unnamed: 0,source_name,source_type,source_url
0,entertainment.abs-cbn.com,external website,entertainment.abs-cbn.com
1,www.gmanetwork.com,external website,www.gmanetwork.com
2,526965450801338,facebook,www.facebook.com/526965450801338
3,Mochablogger,facebook,www.facebook.com/Mochablogger
6,pinoytrending.altervista.org,external website,pinoytrending.altervista.org
...,...,...,...
40424,PRO7PNP,facebook,www.facebook.com/PRO7PNP
40447,www.westernjournalism.com,external website,www.westernjournalism.com
40449,www.navhindtimes.in,external website,www.navhindtimes.in
40451,ktsmradio.iheart.com,external website,ktsmradio.iheart.com


In [32]:
# Assign a final community (color) to each source, use root_sources_concat as the reference table
# y = only in community 0
# b = only in community 1
# g = both in community 0 and 1
# w = no community
for i in sources.index.tolist():
    numComms = root_sources_concat[ root_sources_concat['source_url'] == sources.at[i,'source_url']]
    if len(numComms) == 2.0:
        sources.at[i,'color'] = 'g'
    elif len(numComms) == 1.0:
        if numComms.iloc[0]['community'] == '0':
            sources.at[i,'color'] = 'y'
        if numComms.iloc[0]['community'] == '1':
            sources.at[i,'color'] = 'b'
    else:
        sources.at[i,'color'] = 'w'
sources

Unnamed: 0,source_name,source_type,source_url,color
0,entertainment.abs-cbn.com,external website,entertainment.abs-cbn.com,y
1,www.gmanetwork.com,external website,www.gmanetwork.com,y
2,526965450801338,facebook,www.facebook.com/526965450801338,y
3,Mochablogger,facebook,www.facebook.com/Mochablogger,y
6,pinoytrending.altervista.org,external website,pinoytrending.altervista.org,y
...,...,...,...,...
40424,PRO7PNP,facebook,www.facebook.com/PRO7PNP,w
40447,www.westernjournalism.com,external website,www.westernjournalism.com,w
40449,www.navhindtimes.in,external website,www.navhindtimes.in,w
40451,ktsmradio.iheart.com,external website,ktsmradio.iheart.com,w


In [33]:
# Get all unique linkers from master
linkers = master.drop_duplicates(subset=['linker_url'])[["linker_name","linker_type","linker_url"]]
linkers

Unnamed: 0,linker_name,linker_type,linker_url
0,ABS-CBN,page,www.facebook.com/ABSCBNnetwork
1,GMA News,page,www.facebook.com/gmanews
2,PSSAP Underground Media,page,www.facebook.com/526965450801338
3,MOCHA USON BLOG,page,www.facebook.com/Mochablogger
7,ABS-CBN News,page,www.facebook.com/abscbnNEWS
...,...,...,...
40460,Save Our Schools; The Movement,page,www.facebook.com/162470283812585
40461,Network For Progress,page,www.facebook.com/networkforprogress
40462,Search Engine Marketing,group,www.facebook.com/groups/2211885060
40466,Fox News,page,www.facebook.com/FoxNews


In [34]:
# For each linker, get all sources connected to it.
# Then, lookup each source in the root_sources_concat table.
# Group these sources by how many times the linker shared it. (times_shared)
# Each source will have an assigned color (from the sources table)
for i in linkers.index.tolist():
    linker_sources = (
        master
            [master['linker_url'] == linkers.at[i,'linker_url']]
            .groupby('source_url')
            .agg(times_shared=('source_url','count'))
            .sort_values(by='times_shared',ascending=False)
            .reset_index()
    )
    linker_sources['color'] = linker_sources['source_url'].apply(
        lambda x: sources.set_index('source_url').at[x,'color']
    )
    try:
        color_tally = linker_sources.groupby('color').agg(c=('times_shared','sum')).reset_index()
        finalColor = color_tally.at[color_tally['c'].idxmax(),'color']
    except:
        finalColor = 'w'
    linkers.at[i,'color'] = finalColor
linkers

Unnamed: 0,linker_name,linker_type,linker_url,color
0,ABS-CBN,page,www.facebook.com/ABSCBNnetwork,w
1,GMA News,page,www.facebook.com/gmanews,y
2,PSSAP Underground Media,page,www.facebook.com/526965450801338,y
3,MOCHA USON BLOG,page,www.facebook.com/Mochablogger,y
7,ABS-CBN News,page,www.facebook.com/abscbnNEWS,y
...,...,...,...,...
40460,Save Our Schools; The Movement,page,www.facebook.com/162470283812585,g
40461,Network For Progress,page,www.facebook.com/networkforprogress,w
40462,Search Engine Marketing,group,www.facebook.com/groups/2211885060,y
40466,Fox News,page,www.facebook.com/FoxNews,w


In [35]:
# Concatenate the sources and linkers dataset, then rename some values for better visualizations
sources2 = sources.rename(columns={'source_name': 'name','source_type':'type','source_url':'url'})
sources2['type'] = ['source']*len(sources2)
sources2['size'] = sources2['url'].apply(lambda x: len(master[master['source_url'] == x]))
sources2['url'] = 'src_'+sources2['url']

linkers2 = linkers.rename(columns={'linker_name': 'name','linker_type':'type','linker_url':'url'})
linkers2['url'] = 'lnk_'+linkers2['url']
linkers2['type'] = ['linkers']*len(linkers2)
linkers2['size'] = [1]*len(linkers2)

allNodes = pd.concat([sources2,linkers2],ignore_index=True)
allNodes['finalColor'] = allNodes['type'] + allNodes['color']
allNodes = allNodes.drop_duplicates(subset=['url'])
allNodes['finalColor'] = allNodes['finalColor'].replace({
    'sourcew': 'No community (Source)',
    'sourcey': 'Community B (Source)',
    'sourceb': 'Community A (Source)',
    'sourceg': 'Community AB (Source)',
    'linkersw': 'No community (Sharer)',
    'linkersy': 'Community B (Sharer)',
    'linkersb': 'Community A (Sharer)',
    'linkersg': 'Community AB (Sharer)'
})
allNodes

Unnamed: 0,name,type,url,color,size,finalColor
0,entertainment.abs-cbn.com,source,src_entertainment.abs-cbn.com,y,9,Community B (Source)
1,www.gmanetwork.com,source,src_www.gmanetwork.com,y,336,Community B (Source)
2,526965450801338,source,src_www.facebook.com/526965450801338,y,738,Community B (Source)
3,Mochablogger,source,src_www.facebook.com/Mochablogger,y,767,Community B (Source)
4,pinoytrending.altervista.org,source,src_pinoytrending.altervista.org,y,62,Community B (Source)
...,...,...,...,...,...,...
14411,Save Our Schools; The Movement,linkers,lnk_www.facebook.com/162470283812585,g,1,Community AB (Sharer)
14412,Network For Progress,linkers,lnk_www.facebook.com/networkforprogress,w,1,No community (Sharer)
14413,Search Engine Marketing,linkers,lnk_www.facebook.com/groups/2211885060,y,1,Community B (Sharer)
14414,Fox News,linkers,lnk_www.facebook.com/FoxNews,w,1,No community (Sharer)


In [36]:
# Generate the links of the agent map
agentLinks = []
for i in master.index.tolist():
    agentLink = {
        'sourceUrl': 'src_' + str(master.at[i,'source_url']),
        'linkerUrl': 'lnk_' + str(master.at[i,'linker_url']),
        'linkId': str(master.at[i,'source_url']) + str(master.at[i,'linker_url'])
    }
    agentLinks.append(agentLink)

agentMap = (
    pd.DataFrame(agentLinks)
    .groupby(['sourceUrl','linkerUrl','linkId'])
    .agg(linkStrength=('linkId','count'))
    .sort_values(by='linkStrength',ascending=False)
    .reset_index()
)
agentMap

Unnamed: 0,sourceUrl,linkerUrl,linkId,linkStrength
0,src_www.facebook.com/groups/538773259559791,lnk_nan,www.facebook.com/groups/538773259559791nan,383
1,src_www.facebook.com/groups/308734279321570,lnk_nan,www.facebook.com/groups/308734279321570nan,289
2,src_www.facebook.com/IamAgainstPaidMedia,lnk_www.facebook.com/aapnomore4aamaadmi,www.facebook.com/IamAgainstPaidMediawww.facebo...,216
3,src_www.philnews.xyz,lnk_www.facebook.com/SecMartinAndanar,www.philnews.xyzwww.facebook.com/SecMartinAndanar,165
4,src_www.facebook.com/boycottabscbnstation,lnk_www.facebook.com/groups/538773259559791,www.facebook.com/boycottabscbnstationwww.faceb...,158
...,...,...,...,...
21027,src_t.co,lnk_www.facebook.com/605782269475008,t.cowww.facebook.com/605782269475008,1
21028,src_t.co,lnk_www.facebook.com/BabyfaceBenji,t.cowww.facebook.com/BabyfaceBenji,1
21029,src_t.co,lnk_www.facebook.com/EatingDisordersAwareness....,t.cowww.facebook.com/EatingDisordersAwareness....,1
21030,src_t.co,lnk_www.facebook.com/HugotWords,t.cowww.facebook.com/HugotWords,1


In [37]:
# Export agentMap and allNodes
agentMap.to_csv(f'results/{filename_base}-agent-map-color-coded-links.csv',index=False)
allNodes.to_csv(f'results/{filename_base}-agent-map-color-coded-nodes.csv',index=False)