## network data
0. Network-wide statistics
1. Creates dataframe for network-level statistics
2. Writes to csv file
3. Calculation notes below

In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

# Sort the matches: glob() returns paths in an arbitrary, filesystem-dependent
# order, and anything that iterates over this list inherits that order.
gml_files = sorted(glob('../output/network/*/*.gml'))

In [2]:
def calculate_graph_inf(graph, name=None):
    """Label a graph and print its NetworkX summary.

    Parameters
    ----------
    graph : networkx graph
        Graph to describe. Its ``name`` attribute is overwritten.
    name : str, optional
        Label stored in ``graph.name``. When omitted, the notebook-level
        variable ``filename`` is used, so existing one-argument calls keep
        behaving as before.

    Returns
    -------
    None
        The text produced by ``nx.info`` is printed, not returned.
    """
    if name is None:
        # Legacy behaviour: fall back to the global set by the calling loop.
        name = filename
    graph.name = name
    # print(...) with a single argument is valid on both Python 2 and 3,
    # unlike the bare `print info` statement.
    # NOTE(review): nx.info is not available in recent NetworkX releases --
    # confirm the version this notebook is pinned to.
    print(nx.info(graph))
    ## plot spring layout
    # plt.figure(figsize=(10,10))
    # nx.draw_spring(graph, arrows=True, with_labels=True)

def highest_centrality(cent_dict):
    """Return the ``(node, value)`` pair with the largest centrality value.

    Parameters
    ----------
    cent_dict : dict
        Mapping of node -> centrality value.

    Returns
    -------
    tuple
        ``(node, value)`` for the entry with the highest value. When several
        nodes share the highest value, the one with the largest node key is
        returned (entries are compared as ``(value, node)``).

    Raises
    ------
    ValueError
        If ``cent_dict`` is empty, since no highest entry exists.
    """
    if not cent_dict:
        raise ValueError("cent_dict is empty; no highest-centrality node exists")
    # .items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    # max() with a (value, node) key picks the same entry as sorting the
    # (value, node) pairs in descending order and taking the first one,
    # without sorting the whole dictionary.
    node, value = max(cent_dict.items(), key=lambda item: (item[1], item[0]))
    return (node, value)

In [3]:
# Column layout of the network-level statistics table, grouped by theme.
network_data_columns = [
    # identification
    'name', 'sentiment',
    # size
    '# nodes', '# edges', 'avg degree', 'density',
    # average centralities
    'avg deg cent', 'avg bet cent', 'avg clo cent',
    # most central node per measure
    'highest degc', 'highest betc', 'highest cloc',
    # directed degree averages
    'avg in-deg', 'avg out-deg',
    # components
    '# strong comp', '# weak comp', '# conn comp',
    # connectivity / mixing
    'avg node connect', 'deg assort coeff',
]

# Start from an empty frame that already carries the final column order.
network_data = pd.DataFrame(columns=network_data_columns)

In [4]:
# graph = directed, ugraph = undirected
#
# Builds one row of network-level statistics per GML file and stores the
# result in `network_data`. Rows are collected in a plain list and turned into
# a DataFrame once after the loop: this avoids growing a DataFrame row by row
# (quadratic, and DataFrame.append is gone from current pandas) and makes the
# cell idempotent -- re-running it rebuilds the table instead of duplicating
# rows.

records = []

for gml_graph in gml_files:
    graph = nx.read_gml(gml_graph)
    ugraph = graph.to_undirected() ## to undirected graph
    # `filename` is also read by calculate_graph_inf as the graph label
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 40)
    print(gml_graph)
    calculate_graph_inf(graph)
    calculate_graph_inf(ugraph)

    # calculate variables

    # the sentiment label is the name of the folder that holds the file
    sent = os.path.basename(filepath)
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    if nodes == 0:
        # the per-node averages below would divide by zero
        print('skipping empty graph: ' + gml_graph)
        continue
    density = nx.density(graph)

    # each centrality is computed once and reused for both the average and
    # the highest-ranked node
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    # list(...) is required on Python 3, where dict.values() is a view that
    # numpy does not convert into a numeric array
    avg_deg_cen = np.mean(list(deg_cen.values()))
    avg_bet_cen = np.mean(list(bet_cen.values()))
    avg_clo_cen = np.mean(list(clo_cen.values()))
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)

    # dict(...) accepts both a plain dict of degrees and an iterable of
    # (node, degree) pairs, whichever in_degree()/out_degree() returns
    in_deg = sum(dict(graph.in_degree()).values()) / float(nodes)
    out_deg = sum(dict(graph.out_degree()).values()) / float(nodes)
    avg_deg = in_deg + out_deg
    strong_comp = nx.number_strongly_connected_components(graph)
    weak_comp = nx.number_weakly_connected_components(graph)
    avg_node_con = nx.average_node_connectivity(graph)
    deg_assort_coeff = nx.degree_assortativity_coefficient(graph)
    conn_comp = nx.number_connected_components(ugraph)

    # save variables as one record (keys match network_data_columns)

    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    'avg degree':avg_deg,
                    'density':density,
                    'avg deg cent':avg_deg_cen,
                    'avg bet cent':avg_bet_cen,
                    'avg clo cent':avg_clo_cen,
                    'highest degc':highest_deg_cen,
                    'highest betc':highest_bet_cen,
                    'highest cloc':highest_clo_cen,
                    'avg in-deg':in_deg,
                    'avg out-deg':out_deg,
                    '# strong comp':strong_comp,
                    '# weak comp':weak_comp,
                    '# conn comp':conn_comp,
                    'avg node connect':avg_node_con,
                    'deg assort coeff':deg_assort_coeff,
                    }

    records.append(graph_values)

# passing `columns` keeps the declared column order, also when no file matched
network_data = pd.DataFrame(records, columns=network_data_columns)

----------------------------------------
../output/network/negative/article03.gml
Name: article03.gml
Type: MultiDiGraph
Number of nodes: 17
Number of edges: 10
Average in degree:   0.5882
Average out degree:   0.5882
Name: article03.gml
Type: MultiGraph
Number of nodes: 17
Number of edges: 10
Average degree:   1.1765
----------------------------------------
../output/network/negative/article05.gml
Name: article05.gml
Type: MultiDiGraph
Number of nodes: 22
Number of edges: 22
Average in degree:   1.0000
Average out degree:   1.0000
Name: article05.gml
Type: MultiGraph
Number of nodes: 22
Number of edges: 22
Average degree:   2.0000
----------------------------------------
../output/network/negative/article06.gml
Name: article06.gml
Type: MultiDiGraph
Number of nodes: 127
Number of edges: 121
Average in degree:   0.9528
Average out degree:   0.9528
Name: article06.gml
Type: MultiGraph
Number of nodes: 127
Number of edges: 121
Average degree:   1.9055
------------------------------------

In [5]:
# Display the statistics table built by the loop over gml_files
# (one row per GML file).
network_data

Unnamed: 0,name,sentiment,# nodes,# edges,avg degree,density,avg deg cent,avg bet cent,avg clo cent,highest degc,highest betc,highest cloc,avg in-deg,avg out-deg,# strong comp,# weak comp,# conn comp,avg node connect,deg assort coeff
0,article03.gml,negative,17,10,1.176471,0.036765,0.073529,0.000245,0.0375,"(parents, 0.1875)","(consent for vaccines, 0.00416666666667)","(parents, 0.2)",0.588235,0.588235,17,7,7,0.040441,
1,article05.gml,negative,22,22,2.0,0.047619,0.095238,0.000974,0.050095,"(Jim Carrey, 0.333333333333)","(mandatory vaccines, 0.0142857142857)","(Jim Carrey, 0.304761904762)",1.0,1.0,22,4,4,0.064935,0.132765
2,article06.gml,negative,127,121,1.905512,0.007562,0.015123,0.000212,0.012049,"(shingles vaccine, 0.134920634921)","(shingles vaccine, 0.0071746031746)","(shingles vaccine, 0.131499726327)",0.952756,0.952756,125,12,12,0.022747,-0.184981
3,article07.gml,negative,57,53,1.859649,0.016604,0.033208,0.000883,0.022391,"(CDC, 0.142857142857)","(evidence, 0.00974025974026)","(science fraud, 0.129017857143)",0.929825,0.929825,57,5,5,0.042293,-0.240594
4,article1.gml,negative,140,147,2.1,0.007554,0.015108,8e-05,0.009877,"(mercury, 0.107913669065)","(mercury, 0.00280210614117)","(CDC, 0.106766760505)",1.05,1.05,138,17,17,0.016084,-0.016839
5,article1001.gml,negative,134,134,2.0,0.007519,0.015038,0.000174,0.011085,"(SB 277, 0.157894736842)","(vaccine damage, 0.00398724082935)","(SB 277, 0.144760635767)",1.0,1.0,134,17,17,0.021154,-0.215881
6,article1021.gml,negative,64,64,2.0,0.015873,0.031746,0.000328,0.021927,"(SV40, 0.285714285714)","(SV40, 0.0143369175627)","(SV40, 0.183006535948)",1.0,1.0,64,10,10,0.033482,-0.271361
7,article152.gml,negative,78,67,1.717949,0.011156,0.022311,0.001176,0.020729,"(thimerosal, 0.207792207792)","(thimerosal, 0.0261449077239)","(thimerosal, 0.15012987013)",0.858974,0.858974,78,17,17,0.046953,-0.024056
8,article2308.gml,negative,66,56,1.69697,0.013054,0.026107,0.000219,0.017811,"(National Vaccine Injury Compensation Program,...","(National Vaccine Injury Compensation Program,...","(National Vaccine Injury Compensation Program,...",0.848485,0.848485,66,11,11,0.025408,-0.202044
9,article3335.gml,negative,120,128,2.133333,0.008964,0.017927,0.000374,0.017183,"(vaccines, 0.210084033613)","(vaccines, 0.0210084033613)","(adverse effects, 0.117647058824)",1.066667,1.066667,119,8,8,0.035854,-0.344902


In [14]:
# save dataframe to csv
# NOTE(review): the target path 'network_stats' has no '.csv' extension, and
# `index` is left at its default, so the row index is presumably written as an
# extra first column -- confirm that downstream readers expect both.
network_data.to_csv('network_stats', encoding = 'utf-8')

- - -

## Network-wide variable calculations

In [None]:
# degree histogram: returns a list of frequencies of degrees
# NOTE(review): `graph` is not defined in this cell; presumably it is the graph
# left over from the last iteration of the loop over gml_files -- confirm which
# network is intended here.
nx.degree_histogram(graph)

In [None]:
# Degree centrality of every node in `graph`, shown as a one-column table
# sorted from lowest to highest.
a = nx.degree_centrality(graph)
dfIn = (
    pd.DataFrame.from_dict(a, orient='index')
    .rename(columns={0: 'degree centrality'})
    .sort_values(by='degree centrality')
)
dfIn

In [None]:
# Betweenness centrality of every node in `graph`, shown as a one-column table
# sorted from lowest to highest.
a = nx.betweenness_centrality(graph)
dfIn = (
    pd.DataFrame.from_dict(a, orient='index')
    .rename(columns={0: 'betweenness centrality'})
    .sort_values(by='betweenness centrality')
)
dfIn

In [None]:
# Closeness centrality of every node in `graph`, shown as a one-column table
# sorted from lowest to highest.
a = nx.closeness_centrality(graph)
dfIn = (
    pd.DataFrame.from_dict(a, orient='index')
    .rename(columns={0: 'closeness centrality'})
    .sort_values(by='closeness centrality')
)
dfIn

In [None]:
# In-degree centrality of every node in `graph`, shown as a one-column table
# sorted from lowest to highest.
a = nx.in_degree_centrality(graph)
dfIn = (
    pd.DataFrame.from_dict(a, orient='index')
    .rename(columns={0: 'in deg centrality'})
    .sort_values(by='in deg centrality')
)
dfIn

In [None]:
# Out-degree centrality of every node in `graph`, shown as a one-column table
# sorted from lowest to highest.
b = nx.out_degree_centrality(graph)
dfIn = (
    pd.DataFrame.from_dict(b, orient='index')
    .rename(columns={0: 'out deg centrality'})
    .sort_values(by='out deg centrality')
)
dfIn

- - -

In [None]:
# current-flow betweenness centrality (graph must be connected; run for largest component)
#nx.current_flow_betweenness_centrality(graph)

# eigenvector centrality

# degree assortativity coefficient
# average neighbor degree; average degree connectivity (k nearest neighbors)

#nx.edge_connectivity(graph)
#nx.node_connectivity(graph)

# clustering coefficient (cannot be multigraph)
# nx.average_clustering(graph)

- - -