# sentiment_graphs
0. (make_union & single_calc copy)
1. import positive, negative, neutral gml files
2. write to directed and undirected gml files
3. component and connectivity analysis
4. plot graph components and subgraphs

In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
from glob import glob

# Notebook-wide pandas display settings, applied in the order listed.
# NOTE(review): 'display.mpl_style' was deprecated and later removed from
# pandas -- confirm the pinned pandas version still accepts it.
for option_name, option_value in (
        ('display.mpl_style', 'default'),
        ('display.width', 5000),
        ('display.max_columns', 60)):
    pd.set_option(option_name, option_value)

#gml_files = glob('../output/network/*/*.gml')
# One list of GML file paths per sentiment class.
pos_files, neg_files, neu_files = (
    glob('../output/network/positive/*.gml'),
    glob('../output/network/negative/*.gml'),
    glob('../output/network/neutral/*.gml'),
)

def calculate_graph_inf(graph, filename=None):
    """Optionally name *graph*, then print its ``nx.info`` summary.

    Parameters
    ----------
    graph : networkx graph
        Graph to summarise.
    filename : str, optional
        When given, stored as ``graph.name`` before the summary is built.
        When omitted, the graph keeps whatever name it already has.

    The name used to be read from a module-level ``filename`` variable that
    only existed if a loading loop in another cell had already run; passing
    it explicitly removes that hidden dependency on execution order.
    """
    if filename is not None:
        graph.name = filename
    print(nx.info(graph))

def plot_graph(graph):
    """Print the ``nx.info`` summary of *graph* and draw it.

    The drawing uses a new 10x10 matplotlib figure, networkx's spring
    layout, and node labels switched on.
    """
    print(nx.info(graph))
    plt.figure(figsize=(10, 10))
    nx.draw_spring(graph, with_labels=True)

In [None]:
# don't run
def _load_sentiment_graphs(gml_paths):
    """Read every GML file in *gml_paths* and build its undirected companion.

    For each path the directed graph is read with ``nx.read_gml``, a
    separator line, the path and the ``nx.info`` summary are printed, and two
    graphs are collected: the graph as read and an undirected copy.

    Returns
    -------
    (graphs, ugraphs) : tuple of two lists
        Graphs as read, and their undirected copies, in the order of
        *gml_paths*. Both lists are empty when *gml_paths* is empty.
    """
    graphs = []
    ugraphs = []
    for gml_path in gml_paths:
        directed = nx.read_gml(gml_path)
        undirected = directed.to_undirected()
        # Edges that exist in both directions are added to the undirected
        # copy once more.
        # NOTE(review): on a simple Graph this re-add changes nothing; on a
        # multigraph it would create parallel edges -- confirm the intent.
        reciprocal_only = directed.to_undirected(reciprocal=True)
        undirected.add_edges_from(reciprocal_only.edges())
        print('-' * 40)
        print(gml_path)
        print(nx.info(directed))
        graphs.append(directed)
        ugraphs.append(undirected)
    return graphs, ugraphs


# Same loading procedure for each sentiment class.
pos_graphs, pos_ugraphs = _load_sentiment_graphs(pos_files)
neg_graphs, neg_ugraphs = _load_sentiment_graphs(neg_files)
neu_graphs, neu_ugraphs = _load_sentiment_graphs(neu_files)

In [None]:
# don't run
# compose_all merges a list of graphs, identifying nodes common to them
(pos_all, pos_uall,
 neg_all, neg_uall,
 neu_all, neu_uall) = [
    nx.compose_all(graph_group)
    for graph_group in (pos_graphs, pos_ugraphs,
                        neg_graphs, neg_ugraphs,
                        neu_graphs, neu_ugraphs)
]

# write each merged graph to its own gml file (relative to the working directory)
for merged_graph, gml_name in (
        (pos_all, "positive_all.gml"),
        (pos_uall, "positive_uall.gml"),
        (neg_all, "negative_all.gml"),
        (neg_uall, "negative_uall.gml"),
        (neu_all, "neutral_all.gml"),
        (neu_uall, "neutral_uall.gml")):
    nx.write_gml(merged_graph, gml_name)

- - -

# start here

In [2]:
#graph = nx.read_gml("positive_all.gml")
#ugraph = nx.read_gml("positive_uall.gml")

# Graph analysed by the rest of the notebook.
graph = nx.read_gml("../output/network/article_neg.gml")

# U: undirected view restricted to edges present in both directions.
U = graph.to_undirected(reciprocal=True)
e = U.edges()

# ugraph: plain undirected copy, with the reciprocal edges added once more.
# NOTE(review): a no-op on a simple Graph, but it would add parallel edges
# on a multigraph -- confirm which one read_gml returns for this file.
ugraph = graph.to_undirected()
ugraph.add_edges_from(e)

In [3]:
def drawIt(graph, what = 'graph'):
    """Announce and draw *graph* with a spring layout and node labels.

    *what* is only used in the printed message. Graphs with more than 20
    nodes get a dedicated 10x10 figure; graphs with more than 40 nodes are
    additionally drawn with node_size 70 and font_size 12.
    """
    node_count = graph.number_of_nodes()
    print("Drawing %s of size %s:" % (what, node_count))

    draw_options = {'with_labels': True}
    if node_count > 20:
        plt.figure(figsize=(10, 10))
    if node_count > 40:
        draw_options['node_size'] = 70
        draw_options['font_size'] = 12

    nx.draw_spring(graph, **draw_options)
    plt.show()

def describeGraph(graph):
    """Print node/edge/component counts for *graph*, then draw it.

    After the one-line summary, the whole graph is drawn with ``drawIt``,
    followed by one drawing per connected component, largest component
    first.

    Parameters
    ----------
    graph : networkx graph
        Graph accepted by ``nx.connected_components``.
    """
    components = sorted(nx.connected_components(graph), key=len, reverse=True)
    # Nodes first, then edges, so the values line up with the placeholders
    # in the message below.
    params = (graph.number_of_nodes(), graph.number_of_edges(), len(components))
    print("Graph has %s nodes, %s edges, %s connected components\n" % params)
    drawIt(graph)
    for sub in components:
        drawIt(graph.subgraph(sub), what = 'component')

## Components & connectivity

In [4]:
# list of connected components (sets of nodes), starting with largest
print "List of connected components =", [len(c) for c in sorted(nx.connected_components(ugraph), key=len, reverse=True)]

# generate connected components as subgraphs; Gc is largest component
subgraphs = list(nx.connected_component_subgraphs(ugraph))

List of connected components = [1211, 10, 10, 7, 6, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


### Greatest component

In [5]:
Gc = max(nx.connected_component_subgraphs(ugraph), key=len)
print "Size of greatest component =", len(Gc)

Size of greatest component = 1211


Moody and White provide an algorithm for identifying k-components in a graph. It builds on Kanevsky’s algorithm for finding all minimum-size node cut-sets of a graph, which NetworkX implements as the `all_node_cuts()` function:

1. Compute node connectivity, k, of the input graph G.
2. Identify all k-cutsets at the current level of connectivity using Kanevsky’s algorithm.
3. Generate new graph components based on the removal of these cutsets. Nodes in a cutset belong to both sides of the induced cut.
4. If the graph is neither complete nor trivial, return to 1; else end.

In [10]:
# returns all minimum k cutsets of an undirected graph
# i.e., the set(s) of nodes of cardinality equal to the node connectivity of G
# thus if removed, would break G into two or more connected components
cutsets = list(nx.all_node_cuts(Gc))
print "# of cutsets =", len(cutsets)

# of cutsets = 250


In [9]:
# returns a set of nodes or edges of minimum cardinality that disconnects G
print "Min node cut =", nx.minimum_node_cut(Gc, s='vaccine', t='autism')
print "Min edge cut =", nx.minimum_edge_cut(Gc)

 Min node cut = set([u'measles mortality rate', u'aluminum', u'unvaccinated children', u'vaccine-autism link', u'vaccine industry', u'swine flu vaccine', u'child', u'Matthew Downing', u'children', u'additives'])
Min edge cut = set([(u'monosodium glutamate', u'central nervous system disorders')])


In [10]:
# same 'vaccine' / 'autism' node cut as above, shown as the cell result
vaccine_autism_cut = nx.minimum_node_cut(Gc, s='vaccine', t='autism')
vaccine_autism_cut

{u'Matthew Downing',
 u'additives',
 u'aluminum',
 u'child',
 u'children',
 u'measles mortality rate',
 u'swine flu vaccine',
 u'unvaccinated children',
 u'vaccine industry',
 u'vaccine-autism link'}

In [52]:
# Minimum-cardinality set of edges whose removal separates 'autism' from
# 'vaccine' in Gc; the name `a` is read again by the label-printing cell.
a = nx.minimum_edge_cut(Gc, s='autism', t='vaccine')
a

{(u"GlaxoSmithKine's Energix B", u'vaccine'),
 (u'Matthew Downing', u'vaccine'),
 (u'additives', u'vaccine'),
 (u'autism', u'vaccine'),
 (u'autoimmune disorder', u'vaccine'),
 (u'brain damaged and autistic', u'vaccine'),
 (u'harm', u'vaccine'),
 (u'population', u'vaccine'),
 (u'toxic adjuvants', u'vaccine'),
 (u'vaccine industry', u'vaccine'),
 (u'vaccine-autism link', u'vaccine')}

In [43]:
# value of the 'edge' attribute for every edge of Gc that has one
labels = nx.get_edge_attributes(Gc, 'edge')

# re-key by the first two elements of each edge key only
# (presumably the two endpoints; any further key element is dropped -- verify)
edgelabels = dict(
    (edge_key[0:2], label_text)
    for edge_key, label_text in labels.items()
)
edgelabels

{(u'public', u'autism'): u'wonders why they have',
 (u'hepatitis B vaccine',
  u'infant boys'): u'triples risk of developing ASD in',
 (u'pro-vaccine', u'Rep. Bill Posey'): u'is absolutely, resolutely',
 (u'patients', u'payment'): u'fight for',
 (u'holocaust',
  u'vaccine apologists'): u'relish in aiding and abetting a real-life',
 (u'family member', u'deaths'): u'please talk about it if suffered',
 (u'contrived swine flu panic campaign', u'WHO'): u'launched',
 (u'effective',
  u'studies funded by pharmaceutical companies'): u'have proved influenza vaccine is',
 (u'unconscious', u'cataplexy'): u'causes a person to fall',
 (u'quack science section', u'Washington Post'): u'is',
 (u'newborns', u'hepatitis B vaccine'): u'routine use began 1992 for all',
 (u'government healthcare reform',
  u'mandatory vaccines'): u'may include trying to make',
 (u'outright lies', u'CDC and Big Pharma'): u'has between them',
 (u'higher doses', u'profits'): u'to maximize',
 (u'children', u'mandatory vaccines

In [53]:
# Print the label stored for every edge of the edge cut `a`.
for e in a:
    # edgelabels holds one orientation per edge, so try the pair as given
    # first and then reversed; the orientation that matched is printed.
    for candidate in (e, e[::-1]):
        if candidate in edgelabels:
            print("%s %s" % (candidate, edgelabels[candidate]))
            break
    else:
        # Neither orientation has a label: report it instead of raising
        # KeyError and aborting the remaining edges.
        print("%s %s" % (e, '<no edge label found>'))

(u'brain damaged and autistic', u'vaccine') caused by toxic adjuvants and additives in
(u'autism', u'vaccine') caused
(u'harm', u'vaccine') people ignore
(u'vaccine-autism link', u'vaccine') studies examine just one type of
(u'Matthew Downing', u'vaccine') had been given a vaccine not approved for his age
(u'additives', u'vaccine') has 
(u'autoimmune disorder', u'vaccine') may have been caused by
(u'vaccine industry', u'vaccine') seems to view the 308 million people living in the U.S. as little more than pin cushions for their profitable
(u"GlaxoSmithKine's Energix B", u'vaccine') to be the worst of the bunch
(u'toxic adjuvants', u'vaccine') has 
(u'population', u'vaccine') no vaccine is completely safe in the entire


In [18]:
# this takes forever
# average connectivity k of a graph G is the average of local node connectivity over all pairs of nodes of G

#nx.average_node_connectivity(Gc)

In [12]:
# NEW SUMMARY

print "List of connected components =", [len(c) for c in sorted(nx.connected_components(ugraph), key=len, reverse=True)]
print "Size of greatest component =", len(Gc)
print "# of cutsets =", len(cutsets)
print "Min node cut =", nx.minimum_node_cut(Gc)
print "Min edge cut =", nx.minimum_edge_cut(Gc)

List of connected components = [1211, 10, 10, 7, 6, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Size of greatest component = 1211
# of cutsets = 250
Min node cut = set([u'vaccine industry'])
Min edge cut = set([(u'monosodium glutamate', u'central nervous system disorders')])


- - -

In [34]:
# OLD summary DON'T DELETE YET

print "List of connected components =", [len(c) for c in sorted(nx.connected_components(ugraph), key=len, reverse=True)]
print "Size of greatest component =", len(Gc)
print "# of cutsets =", len(cutsets)
print "Min node cut =", nx.minimum_node_cut(Gc)
print "Min edge cut =", nx.minimum_edge_cut(Gc)

List of connected components = [709, 15, 7, 7, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Size of greatest component = 709
# of cutsets = 165
Min node cut = set([u'Tdap vaccine'])
Min edge cut = set([(u'Tdap vaccine', u'Pregnant women')])
