In [None]:
import networkx as nx 
import igraph as ig
import numpy as np 
import scipy as sp 
import pickle
import pandas as pd 
import collections
import requests 
import math 
import json
import re

Constructing the graphs from the co-occurrence logs in CSV format

In [93]:
def make_graph(path,timestamp,graph_format="gpickle",output_dir="graphs/"):
    graph_file = open(path,"rb")
    co_occurrences = graph_file.readlines()
    co_occurrences = [x.strip() for x in co_occurrences] 
    
    graph = nx.parse_edgelist(co_occurrences, nodetype = str, data=(('weight',int),), delimiter=",")
    
    outfile = output_dir+"graph_"+str(timestamp)+"."+graph_format
    if graph_format == "gpickle":
        nx.write_gpickle(graph,outfile)
    
    elif graph_format == "gexf":
        nx.write_gexf(graph,outfile)
        
    elif graph_format == "gml":
        nx.write_gml(graph,outfile)
        
    elif graph_format == "pajek":
        nx.write_pajek(graph,outfile)
    
    elif graph_format == "edgelist":
        nx.write_weighted_edgelist(graph,outfile)
    
    return {
        "outputfile":outfile,
        "graph":graph
    }


Community detection at each snapshot using iGraph. There is no interface between networkx and iGraph; hence, from the raw_data, the networkx-constructed graph is saved into pajek format to be used by iGraph in community detection.

In [62]:
def detect_communities(graph,timestamp,method="infomap",save_to_file=True,output_dir="graphs/"):
    if method == "infomap":
        results = graph.community_infomap(edge_weights="weight")
        
    elif method == "label_prop":
        results = graph.community_label_propagation(weights="weight")
        
    elif method == "multilevel":
        results = graph.community_multilevel(weights="weight")
        
    elif method == "walktrap":
        results = graph.community_walktrap(weights="weight")
    
    elif method == "louvain":
        results = graph.community_fastgreedy(weights="weight")
    
    graph.vs["community"] = results.membership
    
    if save_to_file: 
        graph.write_pickle(fname=output_dir+"graph_" + str(timestamp) + method + "_comms.pickle")
    
    return graph

def get_assigned_community(node_id,graph):
    assigned_comm = graph.vs[node_id]["community"]
    return assigned_comm

def get_community(community_id,graph):
    nodes_in_comm = graph.vs.select(community_eq=community_id)
    return [v.index for v in nodes_in_comm]

def get_all_communities(graph):
    communities = set(graph.vs["community"])
    all_communities = {}
    for comm in communities:
        nodes_in_comm = graph.vs.select(community_eq=comm)
        all_communities[comm] = [v.index for v in nodes_in_comm]
    
    return all_communities

Utility functions for community matching and event detection across snapshots.
Theta is the similarity threshold for matching a pair of communities. 
Phi is th fluctuation threshold that measures the change in size of a community that has a match.

In [60]:
def compute_similarity(comm1,comm2):
    common = set(comm1).intersection(set(comm2))
    return min(len(common)*1.0/len(comm1), len(common)*1.0/len(comm2))

def find_matches(communities1,communities2,theta=0.20,comm_size=3):
    matches = {}
    
    for id1,nodes1 in communities1.iteritems():  #communities in time n
        similarities = dict()
        
        if len(nodes1)>= comm_size:
            for id2,nodes2 in communities2.iteritems(): #communities in time n+1  
                if len(nodes2) >= comm_size:
                    theta_p = compute_similarity(nodes1,nodes2)  
                    similarities[id2]=theta_p
            
            non_zero = filter(lambda x: x != 0, similarities.values()) 
            match = [item for item in non_zero if item >= theta]

            if len(match) >= 1: 
                matched = similarities.keys()[similarities.values().index(match[0])] 
                matches[id1] = matched

            elif len(match) == 0:
                matches[id1] = -1 
                
    print matches
    return matches

def detect_event(matches,phi=0.10):
    events = {}
    
    return events

Utility functions for extracting features from communities in snapshots

In [None]:
#Feature extraction from communities 

Training and testing the topic evolution prediction model

In [None]:
#Building the topic evolution prediction model 

In [36]:
graph_dir = "graphs/"
data_dir = "data/"

In [107]:
#For snapshot i
timestamp = 1
location = data_dir + "cooc_1.csv"
output = make_graph(location,timestamp,graph_format="edgelist")
graph = output["graph"]
path = output["outputfile"]

#i_graph =  ig.Graph.Read_Pajek(path)
i_graph = ig.read(path, format="ncol", directed=False, names=True)
i_graph_comms = detect_communities(i_graph,timestamp)

#For snapshot i+1 
timestamp = 2
location = data_dir + "cooc_2.csv"
output = make_graph(location,timestamp,graph_format="edgelist")
graph = output["graph"]
path = output["outputfile"]

#i_graph =  ig.Graph.Read_Pajek(path)
i_graph = ig.read(path, format="ncol", directed=False, names=True)
i_graph_comms = detect_communities(i_graph,timestamp)

In [113]:
#Get the set of communities discovered for a pair of network snapshots
graph1 = ig.Graph.Read_Pickle(graph_dir+"graph_1_comms.pickle")
graph1_comms = get_all_communities(graph1) #set of communities from snapshot 1
graph2 = ig.Graph.Read_Pickle(graph_dir+"graph_2_comms.pickle")
graph2_comms = get_all_communities(graph2) #set of communities from snapshot 2

In [119]:
#print get_community(228,graph1)
print graph1.vs[1312]

igraph.Vertex(<igraph.Graph object at 0x118f62050>,1312,{'y': 0.0, 'x': 0.0, 'shape': 'ellipse', 'id': 'C0006030', 'community': 228})


In [114]:
#Find matching communities and detect their corresponding evolutions. 
print "There are %s in snapshot i."%len(graph1_comms)
print "There are %s in snapshot i+1."%len(graph2_comms)

There are 878 in snapshot i.
There are 862 in snapshot i+1.


In [94]:
timestamp = 3
location = data_dir + "cooc_3.csv"
output = make_graph(location,timestamp,graph_format="edgelist")
graph = output["graph"]
path = output["outputfile"]

In [112]:
i_graph = ig.read(data_dir+"cooc_3.csv", format="ncol", directed=False, names=True)

InternalError: Error at foreign.c:243: Parse error in NCOL file, line 1 (syntax error, unexpected NEWLINE, expecting ALNUM), Parse error

In [110]:
all_edges = []
for e in i_graph.es:
    all_edges.append(e.tuple)
    
if (2842, 13638) in all_edges:
    print "Found 1."
    
if (13638, 2842) in all_edges:
    print "Found 2."