In [1]:
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
import glob
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities
import matplotlib.pyplot as plt

from bokeh.io import output_notebook, show, save
from bokeh.models import Range1d, Circle, ColumnDataSource, MultiLine, EdgesAndLinkedNodes, NodesAndLinkedEdges, LabelSet
from bokeh.plotting import figure
from bokeh.plotting import from_networkx
from bokeh.palettes import Blues8, Reds8, Purples8, Oranges8, Viridis8, Spectral8, Turbo256, Viridis256
from bokeh.transform import linear_cmap
from networkx.algorithms import community

from functools import reduce

In [2]:
# Specify source folder holding the CSV exports to combine.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
src_data_folder = r"C:\Users\SPARTA-USER\Documents\GitHub\tobacco_control\01_Data\Crowdtangle\URL Sharing"

# glob returns matches in arbitrary, OS-dependent order. Sort them so the row
# order of the combined frame is stable between runs (the later
# drop_duplicates(keep="first") step depends on which row comes first).
bm_extracts = sorted(glob.glob(join(src_data_folder, "*.csv")))
if not bm_extracts:
    # pd.concat on an empty sequence fails with an unhelpful
    # "No objects to concatenate"; report the real problem instead.
    raise FileNotFoundError(f"No CSV files found in {src_data_folder!r}")

# Import all CSVs into a single dataframe (union of columns, fresh 0..n-1 index)
df_from_each_file = (pd.read_csv(f) for f in bm_extracts)
extract = pd.concat(df_from_each_file, ignore_index=True, sort=True)

In [3]:
#--- Drop duplicates (rows sharing the same "URL" value), keeping the first one
extract = extract.drop_duplicates(subset="URL", keep="first")

#--- Drop rows with missing shared link
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not write into a slice of the previous frame
# (which is what triggers pandas' SettingWithCopyWarning).
extract = extract[extract["Link"].notnull()].copy()

#--- Create new fields for Name and Shared Link
# Row-wise coalesce: back-fill across the listed columns and take the first
# one, i.e. the first non-null value reading left to right.
extract["Name"] = extract[["Page Name", "Group Name", "Subreddit"]].bfill(axis=1).iloc[:, 0]
extract["Shared Link"] = extract[["Final Link", "Link"]].bfill(axis=1).iloc[:, 0]

#--- Weight: 'Total Interactions' with thousands separators removed, as float
extract["Weight"] = (
    extract["Total Interactions"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype("float")
)


In [20]:
from pyvis.network import Network

# Destination of the interactive HTML rendering.
# NOTE(review): absolute, machine-specific path — consider a configurable output directory.
url_net_html = 'C:/Users/SPARTA-USER/Documents/GitHub/tobacco_control/03_Outputs/CrowdTangle/URL Sharing/url_sharing.html'

url_net = Network(height='750px', width='100%', bgcolor='#222222', font_color='white')

# set the physics layout of the network
url_net.force_atlas_2based()

sources = extract['Name']
targets = extract['Shared Link']
weights = extract["Weight"]

# One edge per row: the sharing account (blue node) is linked to the shared
# link (red node); the row's Weight sets the red node's size and the edge value.
# NOTE(review): assumes every Name / Shared Link is a string — confirm there are
# no rows where all of the name columns are empty.
for src, dst, w in zip(sources, targets, weights):
    url_net.add_node(src, src, title=src, color='blue')
    url_net.add_node(dst, dst, title=dst, size=w, color='red')
    url_net.add_edge(src, dst, value=w, color='white')

neighbor_map = url_net.get_adj_list()

# add neighbor data to node hover data
for node in url_net.nodes:
    # Sort the neighbours so the hover text is identical from run to run
    # instead of following the arbitrary order of the adjacency collection.
    neighbors = sorted(neighbor_map[node['id']])
    node['title'] += ' Neighbors:<br>' + '<br>'.join(neighbors)
    node['value'] = len(neighbors)

url_net.show_buttons(filter_=['physics'])
url_net.show(url_net_html)

In [9]:
def cd (df, src, trgt, wt):
    """Build a graph from an edge-list dataframe and tabulate per-node statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Edge list, one edge per row.
    src, trgt : str
        Names of the columns holding the two endpoints of each edge.
    wt : str or list of str
        Column name(s) handed to ``nx.from_pandas_edgelist`` as ``edge_attr``.

    Returns
    -------
    tuple
        ``(G, network_stats)``: the networkx graph, and a DataFrame indexed by
        node with the columns "Degree", "Betweenness Centrality",
        "Eigenvector Centrality", "Closeness Centrality" and "Page Rank".
    """
    graph = nx.from_pandas_edgelist(df, source=src, target=trgt, edge_attr=wt)

    # Calculate network statistics: column label -> {node: score}.
    # NOTE(review): the edge attribute is stored under the column name given in
    # `wt`; none of the calls below is told to use it — confirm that unweighted
    # statistics are what is intended.
    metrics = {
        "Degree": dict(graph.degree()),
        "Betweenness Centrality": nx.betweenness_centrality(graph, normalized=True, endpoints=False),
        "Eigenvector Centrality": nx.eigenvector_centrality_numpy(graph),
        "Closeness Centrality": nx.closeness_centrality(graph),
        "Page Rank": nx.pagerank(graph, alpha=0.8),
    }

    # One single-column frame per statistic, indexed by node
    frames = [
        pd.DataFrame.from_dict(scores, orient='index', columns=[label])
        for label, scores in metrics.items()
    ]

    # Outer-merge the frames on their node index, left to right
    network_stats = frames[0]
    for frame in frames[1:]:
        network_stats = pd.merge(network_stats, frame, left_index=True, right_index=True, how='outer')
    return graph, network_stats

# cd() requires four arguments (frame, source column, target column, edge
# attribute column); the source column created earlier is "Name".
G, network_stats = cd(extract, "Name", "Shared Link", "Weight")

In [13]:
# Save the per-node statistics table as an Excel workbook.
network_stats_xlsx = "C:/Users/SPARTA-USER/Documents/GitHub/tobacco_control/03_Outputs/CrowdTangle/URL Sharing/Network_Stats_URL_Sharing.xlsx"
network_stats.to_excel(network_stats_xlsx)

In [4]:
# Keep only the edge-list columns (source, target, weight) and save them to Excel.
edge_columns = ["Name", "Shared Link", "Weight"]
extract2 = extract.loc[:, edge_columns]
extract2.to_excel(
    "C:/Users/SPARTA-USER/Documents/GitHub/tobacco_control/03_Outputs/CrowdTangle/URL Sharing/URL_Network_Data.xlsx"
)