In [2]:
import networkx as nx
import pandas as pd

In [23]:
# links_final has one row per pair of linkers (site1, site2); `link` says how
# many shared sources the two linkers have.
filename_base = "Political study - NLP Facebook Data - 20211124-processed-shared-sources"
links_all = pd.read_csv(f"results/{filename_base}-links.csv")
# Keep only the pairs whose `link` value is at least 3.
has_enough_shared_sources = links_all['link'] >= 3
links_final = links_all.loc[has_enough_shared_sources]
links_final

Unnamed: 0,site1,site2,link
0,www.facebook.com/KabataangBongbongMarcos,www.facebook.com/BBMYouthMovement,10
1,www.facebook.com/groups/376484219365408,www.facebook.com/groups/453741515435011,9
2,www.facebook.com/KabataangBongbongMarcos,www.facebook.com/Atty.LarryGadon2022,9
3,www.facebook.com/DocWillieOngOfficial,www.facebook.com/DocLizaOng,8
4,www.facebook.com/groups/881744732581920,www.facebook.com/groups/452799272396015,8
...,...,...,...
202,www.facebook.com/groups/1234783156572375,www.facebook.com/groups/927117404381091,3
203,www.facebook.com/groups/538773259559791,www.facebook.com/groups/547951305364179,3
204,www.facebook.com/iskomorenodomagoso,www.facebook.com/groups/628695250606685,3
205,www.facebook.com/groups/881744732581920,www.facebook.com/forthemotherlandph,3


In [13]:
# Plot the distribution of link sizes (the `link` column of links_final)
import plotly.express as px

fig = px.histogram(data_frame=links_final, x="link")
fig.show()

In [24]:
# Community detection: split the linker graph into communities with Girvan-Newman.
from networkx import edge_betweenness_centrality as betweenness


def most_central_edge(G):
    """Return the edge of G with the highest weighted edge betweenness centrality."""
    scores = betweenness(G, weight="weight")
    return max(scores, key=scores.get)


# Undirected graph: one edge per row of links_final, weighted by that row's `link` value.
G = nx.Graph()
for row_label in links_final.index:
    G.add_edge(
        links_final.at[row_label, 'site1'],
        links_final.at[row_label, 'site2'],
        weight=links_final.at[row_label, 'link'],
    )

# NOTE(review): networkx documents the `weight` of edge_betweenness_centrality as a
# distance used for shortest paths, whereas `link` counts shared sources (larger =
# more closely related). Confirm that using it directly as the weight is intended.
comp = nx.algorithms.community.centrality.girvan_newman(G, most_valuable_edge=most_central_edge)
# Only the first partition produced by the generator is used.
first_partition = next(comp)
communities = tuple(sorted(members) for members in first_partition)

In [25]:
# Print the number of linkers in each community, one per line, in community order
for community_size in map(len, communities):
    print(community_size)

67
6
4
4
4
2
4
2
2


In [26]:
# Store on every node of G its own name and the position of its community
# within `communities` (attributes 'name' and 'comm')
for comm_id, members in enumerate(communities):
    for member in members:
        G.nodes[member].update(name=member, comm=comm_id)

In [27]:
# roots is the reference table of linkers, giving more information about each one
roots_csv = f"results/{filename_base}-nodes.csv"
roots = pd.read_csv(roots_csv)
roots

Unnamed: 0,index,connectors,linker_name,linker_type,linker_url
0,www.facebook.com/iskomorenodomagoso,345.0,Isko Moreno Domagoso,page,www.facebook.com/iskomorenodomagoso
1,www.facebook.com/gmanews,153.0,GMA News,page,www.facebook.com/gmanews
2,www.facebook.com/abscbnNEWS,138.0,ABS-CBN News,page,www.facebook.com/abscbnNEWS
3,www.facebook.com/DocWillieOngOfficial,131.0,Doc Willie Ong,page,www.facebook.com/DocWillieOngOfficial
4,www.facebook.com/inquirerdotnet,110.0,INQUIRER.net,page,www.facebook.com/inquirerdotnet
...,...,...,...,...,...
806,www.facebook.com/HOYBOGOKA,2.0,Hoy BOGO.,page,www.facebook.com/HOYBOGOKA
807,www.facebook.com/121325864557146,2.0,MIGRANTE - DENMARK,page,www.facebook.com/121325864557146
808,www.facebook.com/OneIlocandia,2.0,One Ilocandia,page,www.facebook.com/OneIlocandia
809,www.facebook.com/groups/153932144667690,2.0,DZMM Teleradyo,group,www.facebook.com/groups/153932144667690


In [28]:
# Add a 'community' column to roots holding the community index (as a string) that each
# linker was tagged with in G. Linkers that are not nodes of G, or that carry no 'comm'
# attribute, get '99', i.e. no perceived community.
NO_COMMUNITY = str(99)

# Guarded so that re-running this cell does not fail once 'index' is already the index.
if 'index' in roots.columns:
    roots = roots.set_index('index')

# Build the node -> community lookup once. The previous bare `except:` around each
# lookup would also have silently hidden unrelated errors.
comm_by_node = {
    node: str(attrs['comm'])
    for node, attrs in G.nodes(data=True)
    if 'comm' in attrs
}
roots['community'] = [comm_by_node.get(root, NO_COMMUNITY) for root in roots.index]

In [29]:
# Display roots to inspect the 'community' column
roots

Unnamed: 0_level_0,connectors,linker_name,linker_type,linker_url,community
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
www.facebook.com/iskomorenodomagoso,345.0,Isko Moreno Domagoso,page,www.facebook.com/iskomorenodomagoso,3
www.facebook.com/gmanews,153.0,GMA News,page,www.facebook.com/gmanews,99
www.facebook.com/abscbnNEWS,138.0,ABS-CBN News,page,www.facebook.com/abscbnNEWS,4
www.facebook.com/DocWillieOngOfficial,131.0,Doc Willie Ong,page,www.facebook.com/DocWillieOngOfficial,2
www.facebook.com/inquirerdotnet,110.0,INQUIRER.net,page,www.facebook.com/inquirerdotnet,99
...,...,...,...,...,...
www.facebook.com/HOYBOGOKA,2.0,Hoy BOGO.,page,www.facebook.com/HOYBOGOKA,99
www.facebook.com/121325864557146,2.0,MIGRANTE - DENMARK,page,www.facebook.com/121325864557146,99
www.facebook.com/OneIlocandia,2.0,One Ilocandia,page,www.facebook.com/OneIlocandia,99
www.facebook.com/groups/153932144667690,2.0,DZMM Teleradyo,group,www.facebook.com/groups/153932144667690,99


In [30]:
# Save the community-tagged linker table; the index is written as the first CSV column
tagged_nodes_csv = f'results/{filename_base}-nodes-tagged.csv'
roots.to_csv(tagged_nodes_csv)

# Who are these roots sharing from?
We now focus on community 0 and community 1, since these are the largest tagged communities of linkers.

In [33]:
# master is the complete list of posts; for each post we keep its linker and its source.
# The data file is named like filename_base with its trailing "-shared-sources"
# (15 characters) removed. Every column is read as a string.
suffix_length = len("-shared-sources")
master_csv = f"data/{filename_base[:-suffix_length]}.csv"
master_columns = ["linker_name","linker_type","linker_url","source_name","source_type","source_url"]
master = pd.read_csv(master_csv, usecols=master_columns, dtype=str)
master.head()

Unnamed: 0,linker_type,linker_name,linker_url,source_type,source_name,source_url
0,page,KataKita,www.facebook.com/pageKataKita,facebook,pageKataKita,www.facebook.com/pageKataKita
1,page,Fan Idol,www.facebook.com/103793531351745,external website,avise.info,avise.info
2,group,Inday Sara for President Movement,www.facebook.com/groups/1070835103277770,facebook,globaldailymirror,www.facebook.com/globaldailymirror
3,group,"FERDINAND ""BONGBONG"" R. MARCOS, JR.",www.facebook.com/groups/2754357608223229,youtube,YouTube video: https://youtu.be/-rpjCg-Mdl4,https://youtu.be/-rpjCg-Mdl4
4,group,❤👉Ang Probinsyano👈❤,www.facebook.com/groups/173211334679596,facebook,randomreplay,www.facebook.com/randomreplay


In [34]:
# Communities to be observed: community tag (as stored in roots['community']) -> alias
global_tags = dict(zip(('0', '1'), ('comm_0', 'comm_1')))

In [35]:
# root_sources maps each observed community tag (the keys of global_tags) to a dataframe
# with one row per unique source that the linkers of that community have shared from.
# For each community:
#   1. take the linkers of roots tagged with that community,
#   2. inner-join master onto them by linker_url to keep only their rows,
#   3. group those rows by source_url.
# `Sharers` is the number of joined rows for the source, `Name` and `Type` come from the
# first such row. Counts are specific to the community: a source appearing 40 times in
# community 0 and 20 times in community 1 gets one row in each table, showing 40 and 20.
root_sources = {}
for tag in global_tags:
    community_linkers = roots.loc[roots['community'] == tag].reset_index()
    community_posts = master.merge(community_linkers, on='linker_url', how='inner')
    per_source = community_posts.groupby('source_url').agg(
        Sharers=('source_url', 'count'),
        Name=('source_name', 'first'),
        Type=('source_type', 'first'),
    )
    # Most-shared sources first
    ranked = per_source.sort_values(by='Sharers', ascending=False).reset_index()
    ranked['community'] = [tag] * len(ranked)
    root_sources[tag] = ranked

In [36]:
# Stack the per-community tables of root_sources into one long table
# (each table keeps its own row labels)
root_sources_concat = pd.concat(list(root_sources.values()))
root_sources_concat

Unnamed: 0,source_url,Sharers,Name,Type,community
0,www.facebook.com/manilabulletin,71,manilabulletin,facebook,0
1,www.facebook.com/forthemotherlandph,58,forthemotherlandph,facebook,0
2,www.facebook.com/108727998230047,53,108727998230047,facebook,0
3,www.facebook.com/News5Everywhere,52,News5Everywhere,facebook,0
4,www.facebook.com/SaraDuterteForPresidentMovement,48,SaraDuterteForPresidentMovement,facebook,0
...,...,...,...,...,...
79,https://www.facebook.com/photo.php?fbid=244305...,1,Facebook profile:,facebook,1
80,https://www.facebook.com/photo.php?fbid=243426...,1,Facebook profile:,facebook,1
81,https://www.facebook.com/photo.php?fbid=242751...,1,Facebook profile:,facebook,1
82,https://www.facebook.com/photo.php?fbid=240691...,1,Facebook profile:,facebook,1


In [37]:
# Save the per-community source table without its row labels
by_community_csv = f"results/{filename_base}-by-community.csv"
root_sources_concat.to_csv(by_community_csv, index=False)

# Label sources and linkers

In [38]:
# One row per distinct source_url in master (the first row seen for each URL is kept)
source_columns = ["source_name", "source_type", "source_url"]
sources = master.drop_duplicates(subset=['source_url'])[source_columns]
sources

Unnamed: 0,source_name,source_type,source_url
0,pageKataKita,facebook,www.facebook.com/pageKataKita
1,avise.info,external website,avise.info
2,globaldailymirror,facebook,www.facebook.com/globaldailymirror
3,YouTube video: https://youtu.be/-rpjCg-Mdl4,youtube,https://youtu.be/-rpjCg-Mdl4
4,randomreplay,facebook,www.facebook.com/randomreplay
...,...,...,...
14650,cibac.partylist,facebook,www.facebook.com/cibac.partylist
14652,Facebook profile: Anna Rhea,facebook,https://www.facebook.com/photo.php?fbid=109456...
14653,Facebook profile: Adolfo,facebook,https://www.facebook.com/photo.php?fbid=421131...
14655,Facebook profile: Micks,facebook,https://www.facebook.com/photo.php?fbid=139490...


In [39]:
# Assign a final community (color) to each source, using root_sources_concat as the reference table
# y = only in community 0
# b = only in community 1
# g = both in community 0 and 1
# w = no community
def _source_color(comms):
    """Return the color code for a source given the community tags it is listed under."""
    if len(comms) == 2:
        return 'g'
    if len(comms) == 1:
        return 'y' if comms[0] == '0' else 'b'
    return 'w'


# Group the reference table once, instead of re-filtering it for every source.
color_by_source_url = {
    url: _source_color(list(comms))
    for url, comms in root_sources_concat.groupby('source_url')['community']
}
# Sources that are not in the reference table belong to no observed community -> 'w'.
# assign() builds a new frame, so nothing is written cell by cell into a slice of master.
sources = sources.assign(
    color=sources['source_url'].map(lambda url: color_by_source_url.get(url, 'w'))
)
sources

Unnamed: 0,source_name,source_type,source_url,color
0,pageKataKita,facebook,www.facebook.com/pageKataKita,w
1,avise.info,external website,avise.info,w
2,globaldailymirror,facebook,www.facebook.com/globaldailymirror,g
3,YouTube video: https://youtu.be/-rpjCg-Mdl4,youtube,https://youtu.be/-rpjCg-Mdl4,w
4,randomreplay,facebook,www.facebook.com/randomreplay,w
...,...,...,...,...
14650,cibac.partylist,facebook,www.facebook.com/cibac.partylist,w
14652,Facebook profile: Anna Rhea,facebook,https://www.facebook.com/photo.php?fbid=109456...,w
14653,Facebook profile: Adolfo,facebook,https://www.facebook.com/photo.php?fbid=421131...,y
14655,Facebook profile: Micks,facebook,https://www.facebook.com/photo.php?fbid=139490...,y


In [40]:
# One row per distinct linker_url in master (the first row seen for each URL is kept)
linker_columns = ["linker_name", "linker_type", "linker_url"]
linkers = master.drop_duplicates(subset=['linker_url'])[linker_columns]
linkers

Unnamed: 0,linker_name,linker_type,linker_url
0,KataKita,page,www.facebook.com/pageKataKita
1,Fan Idol,page,www.facebook.com/103793531351745
2,Inday Sara for President Movement,group,www.facebook.com/groups/1070835103277770
3,"FERDINAND ""BONGBONG"" R. MARCOS, JR.",group,www.facebook.com/groups/2754357608223229
4,❤👉Ang Probinsyano👈❤,group,www.facebook.com/groups/173211334679596
...,...,...,...
14648,ART LOVERS PHILIPPINES,group,www.facebook.com/groups/1886335711429677
14649,Police Regional Office 7,page,www.facebook.com/PRO7PNP
14650,CIBAC Party-List,page,www.facebook.com/cibac.partylist
14652,PWETmalu ROdy,page,www.facebook.com/PsychopathSiDigong102


In [41]:
# Give every linker the color that accounts for most of what it shared:
#   1. look up the color of each shared source (from the sources table),
#   2. count, per linker and color, how many rows of master have that color,
#   3. keep the color with the highest count for each linker.
# Linkers with nothing to count get 'w' (no community).
# Built once here; previously this lookup was rebuilt for every source of every linker.
source_color = sources.set_index('source_url')['color']

# Rows lacking a source or linker URL cannot contribute to any linker's tally.
shares = master.dropna(subset=['source_url', 'linker_url'])
color_tally = (
    shares
    .assign(color=shares['source_url'].map(source_color))
    .groupby(['linker_url', 'color'])
    .size()
    .reset_index(name='times_shared')
)

# Highest count first; a tie goes to the alphabetically first color code, which is what
# idxmax returned on the per-linker tally that groupby had sorted by color.
dominant_color = (
    color_tally
    .sort_values(['linker_url', 'times_shared', 'color'], ascending=[True, False, True])
    .drop_duplicates(subset='linker_url')
    .set_index('linker_url')['color']
    .to_dict()
)

# Explicit default instead of a bare `except:` that hid every kind of error.
linkers = linkers.assign(
    color=linkers['linker_url'].map(lambda url: dominant_color.get(url, 'w'))
)
linkers

Unnamed: 0,linker_name,linker_type,linker_url,color
0,KataKita,page,www.facebook.com/pageKataKita,w
1,Fan Idol,page,www.facebook.com/103793531351745,w
2,Inday Sara for President Movement,group,www.facebook.com/groups/1070835103277770,g
3,"FERDINAND ""BONGBONG"" R. MARCOS, JR.",group,www.facebook.com/groups/2754357608223229,w
4,❤👉Ang Probinsyano👈❤,group,www.facebook.com/groups/173211334679596,w
...,...,...,...,...
14648,ART LOVERS PHILIPPINES,group,www.facebook.com/groups/1886335711429677,w
14649,Police Regional Office 7,page,www.facebook.com/PRO7PNP,y
14650,CIBAC Party-List,page,www.facebook.com/cibac.partylist,w
14652,PWETmalu ROdy,page,www.facebook.com/PsychopathSiDigong102,w


In [42]:
# Concatenate the sources and linkers datasets into one node table, then rename some
# values for better visualizations.

# Rows of master per source URL, counted once instead of filtering master for every source.
posts_per_source = master['source_url'].value_counts()

sources2 = sources.rename(columns={'source_name': 'name','source_type':'type','source_url':'url'})
sources2['type'] = 'source'
# Node size of a source = number of rows in master that share from it.
# Must be computed before the 'src_' prefix is added to the URL.
sources2['size'] = sources2['url'].map(posts_per_source).fillna(0).astype(int)
sources2['url'] = 'src_'+sources2['url']

linkers2 = linkers.rename(columns={'linker_name': 'name','linker_type':'type','linker_url':'url'})
linkers2['url'] = 'lnk_'+linkers2['url']
linkers2['type'] = 'linkers'
linkers2['size'] = 1

# Display label for each (type, color) combination.
# NOTE(review): color 'y' (only in community 0) is labelled "Community B" and color 'b'
# (only in community 1) is labelled "Community A" - confirm this naming is intended.
node_labels = {
    'sourcew': 'No community (Source)',
    'sourcey': 'Community B (Source)',
    'sourceb': 'Community A (Source)',
    'sourceg': 'Community AB (Source)',
    'linkersw': 'No community (Sharer)',
    'linkersy': 'Community B (Sharer)',
    'linkersb': 'Community A (Sharer)',
    'linkersg': 'Community AB (Sharer)'
}

allNodes = pd.concat([sources2,linkers2],ignore_index=True)
# Label first, then de-duplicate, so no column is assigned on the de-duplicated result.
allNodes['finalColor'] = (allNodes['type'] + allNodes['color']).replace(node_labels)
allNodes = allNodes.drop_duplicates(subset=['url'])
allNodes

Unnamed: 0,name,type,url,color,size,finalColor
0,pageKataKita,source,src_www.facebook.com/pageKataKita,w,10,No community (Source)
1,avise.info,source,src_avise.info,w,13,No community (Source)
2,globaldailymirror,source,src_www.facebook.com/globaldailymirror,g,135,Community AB (Source)
3,YouTube video: https://youtu.be/-rpjCg-Mdl4,source,src_https://youtu.be/-rpjCg-Mdl4,w,4,No community (Source)
4,randomreplay,source,src_www.facebook.com/randomreplay,w,3,No community (Source)
...,...,...,...,...,...,...
7099,ART LOVERS PHILIPPINES,linkers,lnk_www.facebook.com/groups/1886335711429677,w,1,No community (Sharer)
7100,Police Regional Office 7,linkers,lnk_www.facebook.com/PRO7PNP,y,1,Community B (Sharer)
7101,CIBAC Party-List,linkers,lnk_www.facebook.com/cibac.partylist,w,1,No community (Sharer)
7102,PWETmalu ROdy,linkers,lnk_www.facebook.com/PsychopathSiDigong102,w,1,No community (Sharer)


In [43]:
# Build the links of the agent map: one record per row of master, connecting the
# prefixed source URL to the prefixed linker URL. linkId is the two raw URLs joined.
source_ids = map(str, master['source_url'])
linker_ids = map(str, master['linker_url'])
agentLinks = [
    {
        'sourceUrl': 'src_' + source_id,
        'linkerUrl': 'lnk_' + linker_id,
        'linkId': source_id + linker_id
    }
    for source_id, linker_id in zip(source_ids, linker_ids)
]

# linkStrength = number of records per (source, linker) pair, strongest links first
link_counts = (
    pd.DataFrame(agentLinks)
    .groupby(['sourceUrl','linkerUrl','linkId'])
    .agg(linkStrength=('linkId','count'))
)
agentMap = link_counts.sort_values(by='linkStrength',ascending=False).reset_index()
agentMap

Unnamed: 0,sourceUrl,linkerUrl,linkId,linkStrength
0,src_www.facebook.com/News5Everywhere,lnk_www.facebook.com/News5Everywhere,www.facebook.com/News5Everywherewww.facebook.c...,461
1,src_www.facebook.com/BongGoPage,lnk_www.facebook.com/BongGoPage,www.facebook.com/BongGoPagewww.facebook.com/Bo...,236
2,src_www.facebook.com/inquirerdotnet,lnk_www.facebook.com/inquirerdotnet,www.facebook.com/inquirerdotnetwww.facebook.co...,185
3,src_www.facebook.com/235030693233048,lnk_www.facebook.com/iskomorenodomagoso,www.facebook.com/235030693233048www.facebook.c...,174
4,src_www.facebook.com/abscbnNEWS,lnk_www.facebook.com/abscbnNEWS,www.facebook.com/abscbnNEWSwww.facebook.com/ab...,173
...,...,...,...,...
6309,src_news.detik.com,lnk_www.facebook.com/detikcom,news.detik.comwww.facebook.com/detikcom,1
6310,src_news.abs-cbn.com,lnk_www.facebook.com/nowyouknowph,news.abs-cbn.comwww.facebook.com/nowyouknowph,1
6311,src_news.abs-cbn.com,lnk_www.facebook.com/groups/270316153480845,news.abs-cbn.comwww.facebook.com/groups/270316...,1
6312,src_news.abs-cbn.com,lnk_www.facebook.com/groups/209984009651605,news.abs-cbn.comwww.facebook.com/groups/209984...,1


In [44]:
# Write the agent-map links and nodes tables to results/ without their row labels
for table, csv_path in (
    (agentMap, f'results/{filename_base}-agent-map-color-coded-links.csv'),
    (allNodes, f'results/{filename_base}-agent-map-color-coded-nodes.csv'),
):
    table.to_csv(csv_path, index=False)