# network data
0. Network-wide statistics
1. Creates dataframe for network-level statistics
2. Writes to csv file
3. Calculation notes below

In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

# Sort the matches: glob() returns paths in an arbitrary, filesystem-dependent
# order, and a later cell only processes a fixed number of files from the front
# of this list, so an unsorted list makes the selected subset irreproducible.
gml_files = sorted(glob('../output/network/*/*.gml'))

In [2]:
def calculate_graph_inf(graph, name=None):
    """Label a graph and print its networkx summary.

    Parameters
    ----------
    graph : networkx graph
        Graph to describe. Its ``name`` attribute is overwritten as a side
        effect when a name is available.
    name : str, optional
        Value to store in ``graph.name``. When omitted, the notebook-level
        variable ``filename`` is used if it exists (this is what the original
        one-argument call relied on); if neither is available the graph keeps
        its current name instead of raising ``NameError``.
    """
    if name is None:
        # Backward-compatible fallback for existing one-argument callers that
        # set a global ``filename`` before calling this function.
        name = globals().get('filename')
    if name is not None:
        graph.name = name
    # print() with a single argument behaves the same on Python 2 and 3.
    print(nx.info(graph))
    ## plot spring layout
    # plt.figure(figsize=(10,10))
    # nx.draw_spring(graph, arrows=True, with_labels=True)

def highest_centrality(cent_dict):
    """Return the ``(node, value)`` pair with the largest centrality value.

    Parameters
    ----------
    cent_dict : dict
        Mapping of node -> centrality score.

    Returns
    -------
    tuple
        ``(node, value)`` for the highest score. When several nodes share the
        highest score, the one with the largest node key is returned (the same
        tie-break the previous sort-descending implementation produced).

    Raises
    ------
    ValueError
        If ``cent_dict`` is empty.
    """
    if not cent_dict:
        raise ValueError("cent_dict is empty; there is no highest-centrality node")
    # .items() exists on both Python 2 and 3 (.iteritems() is Python 2 only),
    # and max() finds the winner in one pass without sorting the whole dict.
    return max(cent_dict.items(), key=lambda item: (item[1], item[0]))

In [6]:
# Column layout of the network-level statistics table. The order given here
# is the column order of the dataframe.
network_data_columns = [
    # identification
    'name', 'sentiment',
    # size
    '# nodes', '# edges', 'avg degree', 'density',
    # mean centralities
    'avg deg cent', 'avg bet cent', 'avg clo cent',
    # most central node per measure
    'highest degc', 'highest betc', 'highest cloc',
    # directed degree averages
    'avg in-deg', 'avg out-deg',
    # component counts
    '# strong comp', '# weak comp', '# conn comp',
    # connectivity / assortativity
    'avg node connect', 'deg assort coeff',
]

# Start from a frame that has the headers but no rows yet.
network_data = pd.DataFrame(columns=network_data_columns)

In [7]:
# graph = directed, ugraph = undirected
#
# For each GML file: print a summary of the directed and undirected graph,
# compute the network-level statistics, and collect one row per graph.

# Number of GML files processed (the first MAX_GRAPHS entries of gml_files).
# The earlier version broke out of the loop after index 25, i.e. 26 files.
MAX_GRAPHS = 26


def _four_decimals(value):
    """Round ``value`` to four decimal places by formatting and re-parsing it."""
    return float("{0:.4f}".format(value))


# One dict of statistics per graph. The dataframe is built once after the loop
# rather than grown with DataFrame.append (quadratic, and removed in pandas 2).
network_rows = []

for gml_graph in gml_files[:MAX_GRAPHS]:
    graph = nx.read_gml(gml_graph)
    ugraph = graph.to_undirected() ## to undirected graph
    # calculate_graph_inf labels the graphs from the notebook-level `filename`,
    # so it must be assigned before the calls below.
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 40)
    print(gml_graph)
    calculate_graph_inf(graph)
    calculate_graph_inf(ugraph)

    # calculate variables

    # The sentiment label is the name of the directory that holds the file;
    # basename() also copes with platform-specific path separators.
    sent = os.path.basename(filepath)
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = _four_decimals(nx.density(graph))

    # Each centrality is computed once and reused for both the average and
    # the highest-scoring node.
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    # list(...) is required on Python 3, where dict.values() is a view that
    # numpy cannot average directly.
    avg_deg_cen = np.mean(list(deg_cen.values()))
    avg_bet_cen = np.mean(list(bet_cen.values()))
    avg_clo_cen = np.mean(list(clo_cen.values()))
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)

    # dict(...) accepts both the plain dict returned by networkx 1.x and the
    # degree view returned by networkx 2+.
    in_deg = sum(dict(graph.in_degree()).values()) / float(nodes)
    out_deg = sum(dict(graph.out_degree()).values()) / float(nodes)
    avg_deg = _four_decimals(in_deg + out_deg)

    strong_comp = nx.number_strongly_connected_components(graph)
    weak_comp = nx.number_weakly_connected_components(graph)
    conn_comp = nx.number_connected_components(ugraph)
    avg_node_con = _four_decimals(nx.average_node_connectivity(graph))
    deg_assort_coeff = _four_decimals(nx.degree_assortativity_coefficient(graph))

    # save variables into a row keyed by column name

    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    'avg degree':avg_deg,
                    'density':density,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'highest degc':highest_deg_cen,
                    'highest betc':highest_bet_cen,
                    'highest cloc':highest_clo_cen,
                    'avg in-deg':"%.4f" % in_deg,
                    'avg out-deg':"%.4f" % out_deg,
                    '# strong comp':strong_comp,
                    '# weak comp':weak_comp,
                    '# conn comp':conn_comp,
                    'avg node connect':avg_node_con,
                    'deg assort coeff':deg_assort_coeff
                    }

    network_rows.append(graph_values)

# Rebuild the table from scratch so that re-running this cell does not
# duplicate rows; column order follows network_data_columns.
network_data = pd.DataFrame(network_rows, columns=network_data_columns)

----------------------------------------
../output/network/negative/article03.gml
Name: article03.gml
Type: MultiDiGraph
Number of nodes: 18
Number of edges: 13
Average in degree:   0.7222
Average out degree:   0.7222
Name: article03.gml
Type: MultiGraph
Number of nodes: 18
Number of edges: 13
Average degree:   1.4444
----------------------------------------
../output/network/negative/article05.gml
Name: article05.gml
Type: MultiDiGraph
Number of nodes: 22
Number of edges: 25
Average in degree:   1.1364
Average out degree:   1.1364
Name: article05.gml
Type: MultiGraph
Number of nodes: 22
Number of edges: 25
Average degree:   2.2727
----------------------------------------
../output/network/negative/article06.gml
Name: article06.gml
Type: MultiDiGraph
Number of nodes: 124
Number of edges: 121
Average in degree:   0.9758
Average out degree:   0.9758
Name: article06.gml
Type: MultiGraph
Number of nodes: 124
Number of edges: 121
Average degree:   1.9516
------------------------------------

In [8]:
# display the collected network-level statistics (one row per processed graph)
network_data

Unnamed: 0,name,sentiment,# nodes,# edges,avg degree,density,avg deg cent,avg bet cent,avg clo cent,highest degc,highest betc,highest cloc,avg in-deg,avg out-deg,# strong comp,# weak comp,# conn comp,avg node connect,deg assort coeff
0,article03.gml,negative,18,13,1.4444,0.0425,0.085,0.0025,0.048,"(parents, 0.176470588235)","(immune system, 0.0147058823529)","(parents, 0.210084033613)",0.7222,0.7222,18,5,5,0.0686,
1,article05.gml,negative,22,25,2.2727,0.0541,0.1082,0.001,0.0558,"(Jim Carrey, 0.333333333333)","(mandatory vaccines, 0.00952380952381)","(Jim Carrey, 0.321428571429)",1.1364,1.1364,22,3,3,0.0758,0.1488
2,article06.gml,negative,124,121,1.9516,0.0079,0.0159,0.0003,0.0129,"(shingles vaccine, 0.130081300813)","(shingles vaccine, 0.00739704118353)","(shingles vaccine, 0.130216802168)",0.9758,0.9758,122,10,10,0.0254,-0.1979
3,article07.gml,negative,56,57,2.0357,0.0185,0.037,0.0024,0.0338,"(scientific fraud, 0.181818181818)","(CDC, 0.0276094276094)","(Rep. Bill Posey, 0.204642166344)",1.0179,1.0179,55,3,3,0.0802,-0.0986
4,article1.gml,negative,140,147,2.1,0.0076,0.0151,0.0001,0.0099,"(mercury, 0.107913669065)","(mercury, 0.00280210614117)","(CDC, 0.106766760505)",1.05,1.05,138,17,17,0.0161,-0.0168
5,article1001.gml,negative,134,134,2.0,0.0075,0.015,0.0002,0.0111,"(SB 277, 0.157894736842)","(vaccine damage, 0.00398724082935)","(SB 277, 0.144760635767)",1.0,1.0,134,17,17,0.0212,-0.2159
6,article1021.gml,negative,64,64,2.0,0.0159,0.0317,0.0003,0.0219,"(SV40, 0.285714285714)","(SV40, 0.0143369175627)","(SV40, 0.183006535948)",1.0,1.0,64,10,10,0.0335,-0.2714
7,article152.gml,negative,78,67,1.7179,0.0112,0.0223,0.0012,0.0207,"(thimerosal, 0.207792207792)","(thimerosal, 0.0261449077239)","(thimerosal, 0.15012987013)",0.859,0.859,78,17,17,0.047,-0.0241
8,article2308.gml,negative,66,56,1.697,0.0131,0.0261,0.0002,0.0178,"(National Vaccine Injury Compensation Program,...","(National Vaccine Injury Compensation Program,...","(National Vaccine Injury Compensation Program,...",0.8485,0.8485,66,11,11,0.0254,-0.202
9,article3335.gml,negative,120,128,2.1333,0.009,0.0179,0.0004,0.0172,"(vaccines, 0.210084033613)","(vaccines, 0.0210084033613)","(adverse effects, 0.117647058824)",1.0667,1.0667,119,8,8,0.0359,-0.3449


In [None]:
# save dataframe to csv
# NOTE(review): the target 'network_df' has no '.csv' extension and is resolved
# relative to the current working directory; to_csv also writes the row index
# as the first column by default -- confirm both are intended.
network_data.to_csv('network_df', encoding = 'utf-8')

- - -

# single network graph calculations

In [9]:
# for individual network: load one article graph and summarise both the
# directed graph and its undirected counterpart

graph = nx.read_gml('../output/network/negative/article03.gml')

ugraph = graph.to_undirected()
# print() with a single argument works on both Python 2 and Python 3
print(nx.info(graph))
print(nx.info(ugraph))

Name: 
Type: MultiDiGraph
Number of nodes: 18
Number of edges: 13
Average in degree:   0.7222
Average out degree:   0.7222
Name: 
Type: MultiGraph
Number of nodes: 18
Number of edges: 13
Average degree:   1.4444


In [10]:
# degree histogram: returns a list of frequencies of degrees
# (the list index is the degree, the entry is the number of nodes with that degree)
nx.degree_histogram(graph)

[0, 11, 6, 1]

In [11]:
# degree centrality

a = nx.degree_centrality(graph)
# one row per node, a single labelled value column, sorted ascending
dfIn = (
    pd.DataFrame.from_dict(a, orient='index')
    .rename(columns={0: 'degree centrality'})
    .sort_values(by='degree centrality')
)
dfIn

Unnamed: 0,degree centrality
parent,0.058824
children,0.058824
truthful,0.058824
preventable disease,0.058824
doctors,0.058824
get child vaccinated,0.058824
vaccine consent forms,0.058824
parents who know that vaccines are not safe,0.058824
safe,0.058824
vaccines are not safe,0.058824


In [12]:
# betweenness centrality

a = nx.betweenness_centrality(graph)
# one row per node, a single labelled value column, sorted ascending
dfIn = (
    pd.DataFrame.from_dict(a, orient='index')
    .rename(columns={0: 'betweenness centrality'})
    .sort_values(by='betweenness centrality')
)
dfIn

Unnamed: 0,betweenness centrality
parent,0.0
children,0.0
truthful,0.0
preventable disease,0.0
parents,0.0
doctors,0.0
get child vaccinated,0.0
vaccine consent forms,0.0
vaccine-injured children,0.0
parents who know that vaccines are not safe,0.0


In [13]:
# closeness centrality

a = nx.closeness_centrality(graph)
# one row per node, a single labelled value column, sorted ascending
dfIn = (
    pd.DataFrame.from_dict(a, orient='index')
    .rename(columns={0: 'closeness centrality'})
    .sort_values(by='closeness centrality')
)
dfIn

Unnamed: 0,closeness centrality
vaccine consent forms,0.0
children,0.0
truthful,0.0
vaccines are not safe,0.0
safe,0.0
preventable disease,0.0
vaccine-injured children,0.0
get child vaccinated,0.0
consent for vaccines,0.058824
parent,0.058824


In [14]:
# in degree centrality
a = nx.in_degree_centrality(graph)
# one row per node, a single labelled value column, sorted ascending
dfIn = (
    pd.DataFrame.from_dict(a, orient='index')
    .rename(columns={0: 'in deg centrality'})
    .sort_values(by='in deg centrality')
)
dfIn

Unnamed: 0,in deg centrality
parent,0.0
parents who know that vaccines are not safe,0.0
vaccinated children,0.0
parents,0.0
doctors,0.0
children,0.058824
truthful,0.058824
preventable disease,0.058824
immune system,0.058824
consent for vaccines,0.058824


In [15]:
# out degree centrality
b = nx.out_degree_centrality(graph)
# one row per node, a single labelled value column, sorted ascending
dfIn = (
    pd.DataFrame.from_dict(b, orient='index')
    .rename(columns={0: 'out deg centrality'})
    .sort_values(by='out deg centrality')
)
dfIn

Unnamed: 0,out deg centrality
vaccine consent forms,0.0
children,0.0
truthful,0.0
vaccines are not safe,0.0
safe,0.0
preventable disease,0.0
vaccine-injured children,0.0
get child vaccinated,0.0
immune system,0.058824
consent for vaccines,0.058824


- - -

In [None]:
# current-flow betweenness centrality (graph must be connected; run for largest component)
#nx.current_flow_betweenness_centrality(graph)

# eigenvector centrality

# degree assortativity coefficient
# average neighbor degree; average degree connectivity (k nearest neighbors)

#nx.edge_connectivity(graph)
#nx.node_connectivity(graph)

# clustering coefficient (cannot be multigraph)
# nx.average_clustering(graph)

- - -