# u_pos stats: network statistics for the undirected, positive-sentiment graph

In [1]:
# 1_network_df

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

plt.style.use('ggplot')
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60)


In [2]:
def calculate_graph_inf(graph, name=None):
    """Label ``graph`` and print the summary text produced by ``nx.info``.

    Parameters
    ----------
    graph : networkx graph
        Graph to describe. Its ``name`` attribute is overwritten.
    name : str, optional
        Label stored in ``graph.name``. When omitted, the notebook-level
        variable ``filename`` is used, which is what the existing
        ``calculate_graph_inf(graph)`` calls rely on.

    Returns
    -------
    None
        The summary is printed, not returned.

    Raises
    ------
    NameError
        If ``name`` is omitted and no global ``filename`` exists yet.
    """
    if name is None:
        # Backward-compatible fallback: the loading loops assign the global
        # ``filename`` right before calling this function.
        name = filename
    graph.name = name
    # Single-argument print() produces the same output on Python 2 and 3.
    print(nx.info(graph))

def highest_centrality(cent_dict):
    """Return the ``(node, value)`` pair with the largest centrality value.

    Parameters
    ----------
    cent_dict : dict
        Mapping of node -> centrality score.

    Returns
    -------
    tuple
        ``(node, value)`` for the highest score. When several nodes share the
        highest score, the one with the largest node key is returned (the
        same result as sorting ``(value, node)`` pairs in descending order).

    Raises
    ------
    IndexError
        If ``cent_dict`` is empty.
    """
    if not cent_dict:
        raise IndexError("cent_dict is empty: no node has a centrality value")
    # items() exists on Python 2 and 3 alike (iteritems() is Python-2-only),
    # and max() finds the winner in one pass instead of sorting every entry.
    node, value = max(cent_dict.items(), key=lambda item: (item[1], item[0]))
    return (node, value)

---
# Calculate network statistics

In [3]:
# Path(s) of the undirected GML network(s) to analyse
gml_files = glob('../output/network/u_pos.gml')

# Column layout of the per-graph summary table, grouped by theme
network_data_columns = [
    # identification
    'name', 'sentiment',
    # size and overall structure
    '# nodes', '# edges',
    #'avg deg',
    'density', 'deg assort coef',
    # mean centralities
    'avg deg cent', 'avg bet cent', 'avg clo cent',
    # top-ranked node for each centrality
    'high deg cent', 'high bet cent', 'high clo cent',
    # connectivity
    'avg node conn', '# conn comp', 'gc size',
]

# Empty table that already carries the final column order
network_data = pd.DataFrame(columns=network_data_columns)

In [4]:
# One summary dict per graph is collected here and turned into a table once
# after the loop. Re-running the cell therefore rebuilds the table instead of
# appending duplicate rows to it.
summary_rows = []

for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # graph-level variables (rounded to 4 decimals through string formatting)
    sent = "positive"
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))
    avg_node_con = float("{0:.4f}".format(nx.average_node_connectivity(graph)))
    deg_assort_coeff = float("{0:.4f}".format(nx.degree_assortativity_coefficient(graph)))

    # Each centrality is computed once and reused for both its mean and its
    # top-ranked node.
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)

    # list(...) is needed because on Python 3 dict.values() is a view that
    # numpy cannot average directly.
    avg_deg_cen = np.mean(list(deg_cen.values()))
    avg_bet_cen = np.mean(list(bet_cen.values()))
    avg_clo_cen = np.mean(list(clo_cen.values()))

    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)

    # '# conn comp' and 'gc size' are not filled in by this loop, so those
    # columns stay empty (NaN) in the resulting table.
    graph_values = {'name': filename,
                    'sentiment': sent,
                    '# nodes': nodes,
                    '# edges': edges,
                    'density': density,
                    'deg assort coef': deg_assort_coeff,
                    'avg deg cent': "%.4f" % avg_deg_cen,
                    'avg bet cent': "%.4f" % avg_bet_cen,
                    'avg clo cent': "%.4f" % avg_clo_cen,
                    'high deg cent': highest_deg_cen,
                    'high bet cent': highest_bet_cen,
                    'high clo cent': highest_clo_cen,
                    'avg node conn': avg_node_con
                    }
    summary_rows.append(graph_values)

# Build the table in one step with the predefined column order.
network_data = pd.DataFrame(summary_rows, columns=network_data_columns)

----------
../output/network/u_pos.gml
Name: u_pos.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1140
Average degree:   3.4969


In [5]:
# Display the per-graph summary table built by the loop above.
# NOTE(review): these statistics are computed on the full graph read from the
# GML file, not on its greatest connected component.
network_data

Unnamed: 0,name,sentiment,# nodes,# edges,density,deg assort coef,avg deg cent,avg bet cent,avg clo cent,high deg cent,high bet cent,high clo cent,avg node conn,# conn comp,gc size
0,u_pos.gml,positive,652.0,1140.0,0.0054,-0.0799,0.0054,0.0043,0.185,"(vaccines, 0.1044546851)","(parents, 0.218725048513)","(parents, 0.330742137194)",1.0567,,


In [7]:
# save the summary table as CSV (currently disabled; uncomment to write the file)
#network_data.to_csv('../output/df/u_pos.csv')

---
# all nodes table

In [8]:
# 2_node_df: node-level table listing every node with its centrality scores
data_columns = ['name', 'sentiment']

# graph-level bookkeeping table (graph name and sentiment)
data = pd.DataFrame(columns=data_columns)

# node-level table, starts empty
combined_df = pd.DataFrame()

In [9]:
# Results are collected per graph and assembled after the loop. Re-running the
# cell therefore rebuilds the tables instead of appending duplicate rows.
graph_rows = []
node_tables = []

for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 10)
    print(gml_graph)
    calculate_graph_inf(graph)

    # graph-level values
    sent = "positive"
    graph_values = {'name': filename,
                    'sentiment': sent
                    }
    graph_rows.append(graph_values)

    # Node-level metrics, each computed exactly once.
    # dict(...) copes with nx.degree returning either a plain dict or an
    # iterable of (node, degree) pairs.
    degree = dict(nx.degree(graph))
    deg_cent = nx.degree_centrality(graph)
    bet_cent = nx.betweenness_centrality(graph)
    clo_cent = nx.closeness_centrality(graph)

    # one single-column frame per metric, indexed by node
    frames = []
    for column, per_node in (('degree', degree),
                             ('deg cent', deg_cent),
                             ('bet cent', bet_cent),
                             ('clo cent', clo_cent)):
        frame = pd.DataFrame.from_dict(per_node, orient='index')
        frame.columns = [column]
        frames.append(frame)

    # concat node frames into node_df
    node_df = pd.concat(frames, axis=1)
    node_df.index.name = 'node'
    node_df = node_df.reset_index()

    # Prepend the graph-level constants to every node row. Assigning them
    # directly avoids forward-filling, which would also paper over any
    # genuinely missing metric value.
    df = node_df.copy()
    df.insert(0, 'sentiment', sent)
    df.insert(0, 'name', filename)
    node_tables.append(df)

data = pd.DataFrame(graph_rows, columns=data_columns)
# pd.concat rejects an empty list, so keep an empty table when nothing loaded.
combined_df = pd.concat(node_tables) if node_tables else pd.DataFrame()

----------
../output/network/u_pos.gml
Name: u_pos.gml
Type: MultiGraph
Number of nodes: 652
Number of edges: 1140
Average degree:   3.4969


In [10]:
# display the node-level table: one row per node of the loaded graph(s)
combined_df

Unnamed: 0,name,sentiment,node,degree,deg cent,bet cent,clo cent
0,u_pos.gml,positive,neighbors,1,0.001536,0.000000,0.177652
1,u_pos.gml,positive,vitamins,1,0.001536,0.000000,0.011151
2,u_pos.gml,positive,colleges,1,0.001536,0.000000,0.183437
3,u_pos.gml,positive,influenza,2,0.003072,0.000599,0.150718
4,u_pos.gml,positive,parents of autistic children,6,0.009217,0.004474,0.238568
5,u_pos.gml,positive,religious exemption,9,0.013825,0.005750,0.242208
6,u_pos.gml,positive,results,1,0.001536,0.000000,0.193034
7,u_pos.gml,positive,Scott Morrison,1,0.001536,0.000000,0.001536
8,u_pos.gml,positive,repetitive behaviors,1,0.001536,0.000000,0.112424
9,u_pos.gml,positive,Michael Mina,2,0.003072,0.000005,0.003072


---
## Connected components (undirected graph)

In [11]:
# NOTE: `graph` is the last graph loaded by the loop in the cells above.
# The components are materialised once and reused for everything below.
components = list(nx.connected_components(graph))

# list of connected component sizes, largest first (undirected graph)
connected_components = sorted((len(c) for c in components), reverse=True)

# connected components as independent subgraph copies (undirected graph);
# graph.subgraph(c).copy() is what connected_component_subgraphs yielded
subgraphs = [graph.subgraph(c).copy() for c in components]

# greatest component (undirected MultiGraph); a separate copy, so renaming it
# does not touch the entries of `subgraphs`
u_Gc = graph.subgraph(max(components, key=len)).copy()
u_Gc.name = "undirected Gc"

In [12]:
# Single-argument print() calls behave the same on Python 2 and Python 3.
# The join re-creates the separator space that the two-item print statement
# inserted, so the printed text is unchanged.
print(" ".join(["connected components = ", str(connected_components)]))
print(nx.info(u_Gc))

connected components =  [585, 15, 7, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Name: undirected Gc
Type: MultiGraph
Number of nodes: 585
Number of edges: 1088
Average degree:   3.7197


---
# Centrality tables

In [13]:
# make sure you're using the right graph: `gml_graph` and `graph` are the
# values left behind by the last iteration of the loading loop
# (single-argument print() works on both Python 2 and Python 3)
print(gml_files)
print(gml_graph)
print(graph)

['../output/network/u_pos.gml']
../output/network/u_pos.gml
u_pos.gml


In [14]:
def _centrality_table(scores, column):
    """Return a one-column DataFrame built from ``scores``, sorted ascending.

    ``scores`` maps node -> value; the nodes become the index and the values
    go into a column named ``column``.
    """
    table = pd.DataFrame.from_dict(scores, orient='index')
    table.columns = [column]
    return table.sort_values(by=[column])


# degree centrality
dc = nx.degree_centrality(graph)
dc_df = _centrality_table(dc, 'degree cent')

# betweenness centrality
bc = nx.betweenness_centrality(graph)
bc_df = _centrality_table(bc, 'betweenness cent')

# closeness centrality
cc = nx.closeness_centrality(graph)
cc_df = _centrality_table(cc, 'closeness cent')

In [15]:
# degree centrality per node, sorted ascending (lowest values first)
dc_df

Unnamed: 0,degree cent
neighbors,0.001536
arm,0.001536
elite list,0.001536
sex,0.001536
testing,0.001536
free vaccine,0.001536
Caribbean,0.001536
medical law,0.001536
strong-arm tactics,0.001536
gift from God,0.001536


In [16]:
# betweenness centrality per node, sorted ascending (lowest values first)
bc_df

Unnamed: 0,betweenness cent
neighbors,0.000000
public schools,0.000000
behavioral research,0.000000
diarrhea deaths,0.000000
efficacious,0.000000
Early Childhood Australia's chief executive,0.000000
arm,0.000000
elite list,0.000000
sex,0.000000
testing,0.000000


In [17]:
# closeness centrality per node, sorted ascending (lowest values first)
cc_df

Unnamed: 0,closeness cent
meningococcal conjugate booster,0.001536
autism-linked genes,0.001536
benefit,0.001536
short amount of time,0.001536
critical period,0.001536
16 years of age,0.001536
prenatal development,0.001536
factor,0.001536
government,0.001536
insulin,0.001536
