# network data
0. Network-wide statistics
1. Creates dataframe for network-level statistics
2. Writes to csv file
3. Calculation notes below

In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

gml_files = glob('../output/network/*/*.gml')

In [2]:
def calculate_graph_inf(graph):
    graph.name = filename
    info = nx.info(graph)
    print info
    ## plot spring layout
    # plt.figure(figsize=(10,10))
    # nx.draw_spring(graph, arrows=True, with_labels=True)

def highest_centrality(cent_dict):
    """Returns a tuple (node,value) with the node
    with largest value from centrality dictionary."""
    # create ordered tuple of centrality data
    cent_items = [(b,a) for (a,b) in cent_dict.iteritems()]
    # sort in descending order
    cent_items.sort()
    cent_items.reverse()
    return tuple(reversed(cent_items[0]))

In [3]:
# create empty dataframe with columns

network_data_columns = ['name',
                    'sentiment',
                    '# nodes',
                    '# edges',
                    'avg degree',
                    'density',
                    'avg deg cent',
                    'avg bet cent',
                    'avg clo cent',
                    'highest degc',
                    'highest betc',
                    'highest cloc',
                    'avg in-deg',
                    'avg out-deg',
                    '# strong comp',
                    '# weak comp',
                    '# conn comp',
                    'avg node connect',
                    'deg assort coeff'
                    ]

network_data = pd.DataFrame(columns = network_data_columns)

In [4]:
# graph = directed, ugraph = undirected

for graph_num, gml_graph in enumerate(gml_files):
    graph = nx.read_gml(gml_graph)
    ugraph = graph.to_undirected() ## to undirected graph
    (filepath, filename) = os.path.split(gml_graph)
    print('-' * 40)
    print(gml_graph)
    calculate_graph_inf(graph)
    calculate_graph_inf(ugraph)

    # calculate variables
    
    sent = filepath.split('/')[-1]
    nodes = nx.number_of_nodes(graph)
    edges = nx.number_of_edges(graph)
    density = float("{0:.4f}".format(nx.density(graph)))
    avg_deg_cen = np.array(nx.degree_centrality(graph).values()).mean()
    avg_bet_cen = np.array(nx.betweenness_centrality(graph).values()).mean()
    avg_clo_cen = np.array(nx.closeness_centrality(graph).values()).mean()
    in_deg = sum(graph.in_degree().values())/float(nx.number_of_nodes(graph))
    out_deg = sum(graph.out_degree().values())/float(nx.number_of_nodes(graph))
    avg_deg = float("{0:.4f}".format(in_deg + out_deg))
    strong_comp = nx.number_strongly_connected_components(graph)
    weak_comp =  nx.number_weakly_connected_components(graph)
    avg_node_con = float("{0:.4f}".format((nx.average_node_connectivity(graph))))
    deg_assort_coeff = float("{0:.4f}".format((nx.degree_assortativity_coefficient(graph))))
    conn_comp = nx.number_connected_components(ugraph)
    deg_cen = nx.degree_centrality(graph)
    bet_cen = nx.betweenness_centrality(graph)
    clo_cen = nx.closeness_centrality(graph)
    highest_deg_cen = highest_centrality(deg_cen)
    highest_bet_cen = highest_centrality(bet_cen)
    highest_clo_cen = highest_centrality(clo_cen)

    # save variables into list

    graph_values = {'name':filename,
                    'sentiment':sent,
                    '# nodes':nodes,
                    '# edges':edges,
                    'avg degree':avg_deg,
                    'density':density,
                    'avg deg cent':"%.4f" % avg_deg_cen,
                    'avg bet cent':"%.4f" % avg_bet_cen,
                    'avg clo cent':"%.4f" % avg_clo_cen,
                    'highest degc':highest_deg_cen,
                    'highest betc':highest_bet_cen,
                    'highest cloc':highest_clo_cen,
                    'avg in-deg':"%.4f" % in_deg,
                    'avg out-deg':"%.4f" % out_deg,
                    '# strong comp':strong_comp,
                    '# weak comp':weak_comp,
                    '# conn comp':conn_comp,
                    'avg node connect':avg_node_con,
                    'deg assort coeff':deg_assort_coeff
                    }
    
    network_data = network_data.append(graph_values, ignore_index=True)
    
    if graph_num == 0:
        break

----------------------------------------
../output/network/positive/article2.gml
Name: article2.gml
Type: MultiDiGraph
Number of nodes: 28
Number of edges: 23
Average in degree:   0.8214
Average out degree:   0.8214
Name: article2.gml
Type: MultiGraph
Number of nodes: 28
Number of edges: 23
Average degree:   1.6429


In [5]:
network_data

Unnamed: 0,name,sentiment,# nodes,# edges,avg degree,density,avg deg cent,avg bet cent,avg clo cent,highest degc,highest betc,highest cloc,avg in-deg,avg out-deg,# strong comp,# weak comp,# conn comp,avg node connect,deg assort coeff
0,article2.gml,positive,28,23,1.6429,0.0304,0.0608,0.0006,0.033,"(decrease in exemption rates, 0.222222222222)","(decrease in exemption rates, 0.00712250712251)","(decrease in exemption rates, 0.185185185185)",0.8214,0.8214,28,7,7,0.041,0.0631


In [None]:
# save dataframe to csv
network_data.to_csv('network_df', encoding = 'utf-8')

- - -

# single network graph calculations

In [None]:
# for individual network

graph = nx.read_gml('../output/network/negative/article03.gml')

ugraph = graph.to_undirected()
print nx.info(graph)
print nx.info(ugraph)

In [None]:
# degree histogram: returns a list of frequencies of degree values
nx.degree_histogram(graph)

In [None]:
# degree centrality

a = nx.degree_centrality(graph)
dfIn=pd.DataFrame.from_dict(a,orient='index')
dfIn.columns = ['degree centrality']
dfIn = dfIn.sort_values(by=['degree centrality'])
dfIn

In [None]:
# betweenness centrality

a = nx.betweenness_centrality(graph)
dfIn=pd.DataFrame.from_dict(a,orient='index')
dfIn.columns = ['betweenness centrality']
dfIn = dfIn.sort_values(by=['betweenness centrality'])
dfIn

In [None]:
# closeness centrality

a = nx.closeness_centrality(graph)
dfIn=pd.DataFrame.from_dict(a,orient='index')
dfIn.columns = ['closeness centrality']
dfIn = dfIn.sort_values(by=['closeness centrality'])
dfIn

In [None]:
# in degree centrality
a = nx.in_degree_centrality(graph)
dfIn=pd.DataFrame.from_dict(a,orient='index')
dfIn.columns = ['in deg centrality']
dfIn = dfIn.sort_values(by=['in deg centrality'])
dfIn

In [None]:
# out degree centrality
b = nx.out_degree_centrality(graph)
dfIn=pd.DataFrame.from_dict(b,orient='index')
dfIn.columns = ['out deg centrality']
dfIn = dfIn.sort_values(by=['out deg centrality'])
dfIn

- - -

In [None]:
# current-flow betweenness centrality (graph must be connected; run for largest component)
#nx.current_flow_betweenness_centrality(graph)

# eigenvector centrality

# degree assortativity coefficient
# average neighbor degree; average degree connectivity (k nearest neighbors)

#nx.edge_connectivity(graph)
#nx.node_connectivity(graph)

# clustering coefficient (cannot be multigraph)
# nx.average_clustering(graph)

## Connected components of a graph

In [8]:
for cc in nx.connected_component_subgraphs(ugraph):
    print cc, " has ", len(cc.nodes()), " nodes."

article2.gml  has  2  nodes.
article2.gml  has  2  nodes.
article2.gml  has  9  nodes.
article2.gml  has  3  nodes.
article2.gml  has  7  nodes.
article2.gml  has  2  nodes.
article2.gml  has  3  nodes.


- - -