In [1]:
from os.path import join
import joblib
import pickle
import sys
sys.path.append('/g/g15/cedre/')
sys.path.append('/g/g15/cedre/cnrg')
sys.path.append('/g/g15/cedre/cnrg/utils')
sys.path.append('/g/g15/cedre/pysparkplug')
sys.path.append('/g/g15/cedre/pysparkplug/pysp')

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
# No seed is passed, so values drawn from this generator differ between kernel restarts.
# It is not referenced by any other cell shown in this notebook.
rng = np.random.RandomState()

In [2]:
sys.path.append('/g/g15/cedre/gonzalez_mrhyde/scripts')
from data_scripts import read_data, create_graphs
from rule_to_rule_scripts import convert_LMG, decompose, ancestor, common_ancestor
from rule_to_rule_scripts import update_grammar_independent, update_rule_case1, update_rule_case2

In [3]:
def stats(graphs, T):
    """Print a node-turnover table for the first ``T`` consecutive snapshots.

    One row is printed for every adjacent pair ``(graphs[idx], graphs[idx + 1])``
    with ``idx < T - 1``: the node count of both snapshots, the number of shared
    nodes, the shared nodes as a floored percentage of the current snapshot's
    nodes, the number of added and removed nodes, their total, and the churn
    percentage. A closing line reports the mean churn over all printed rows.

    Parameters
    ----------
    graphs : sequence of graph objects
        Every element must provide ``nodes()``, ``order()`` and ``size()``.
        At least ``T`` elements are required.
    T : int
        Number of leading snapshots to compare; ``T - 1`` rows are printed.

    Returns
    -------
    None
        The table is only printed.

    Notes
    -----
    Churn is ``100 * changed_nodes // graphs[idx].size()``.
    NOTE(review): for networkx graphs ``size()`` is the number of edges, so node
    changes are normalised by an edge count -- confirm this is the intended
    denominator (``order()`` would normalise by nodes).

    A percentage whose denominator is zero is reported as 0, and when fewer
    than two snapshots are compared the average is reported as ``nan`` instead
    of raising ``ZeroDivisionError``.
    """
    print('this,\tnext,\toverlap,\trounded%,\tadd,\tdel,\tchanges,\tchurn%')

    churn_total = 0
    for idx in range(T - 1):
        current, following = graphs[idx], graphs[idx + 1]
        this_nodes = set(current.nodes())
        next_nodes = set(following.nodes())
        overlap = this_nodes & next_nodes
        additions = next_nodes - this_nodes
        deletions = this_nodes - next_nodes
        changes = additions | deletions

        # Guard both floor divisions: an empty snapshot must not abort the table.
        denominator = current.size()
        churn = 100 * len(changes) // denominator if denominator else 0
        overlap_pct = 100 * len(overlap) // len(this_nodes) if this_nodes else 0

        churn_total += churn
        print(current.order(),
              '\t',
              following.order(),
              '\t',
              len(overlap),
              '\t\t',
              overlap_pct,
              '\t\t',
              len(additions),
              '\t',
              len(deletions),
              '\t',
              len(changes),
              '\t\t',
              churn)

    # With T <= 1 there are no transitions to average over.
    avg_churn = churn_total / (T - 1) if T > 1 else float('nan')
    print(f'average churn: {avg_churn}')

In [4]:
# Root directory with one sub-directory per dataset; the cells below read
# <datadir>/<dataname>/<dataname>.edges.
# NOTE(review): absolute, machine-specific path -- the notebook will not run elsewhere as-is.
datadir = '/g/g15/cedre/gonzalez_mrhyde/data'
# Number of edge brackets (snapshots) each dataset is split into, and the
# snapshot count passed to stats().
T = 10

# nips

In [None]:
# Load the 'nips' dataset through the project helper read_data (imported from data_scripts).
# NOTE(review): the second return value is bound to `years` but is not used in the cells shown here.
dataname = 'nips'
graphs, years = read_data(dataname=dataname)

In [None]:
# Compare every consecutive pair of loaded snapshots (T is the full list length here).
stats(graphs, len(graphs))

# fb-messages

In [5]:
dataname = 'fb-messages'
path = join(datadir, dataname, f'{dataname}.edges')

# Each line is "u,v,time"; the third column is kept as the float edge attribute 'time'.
# NOTE(review): read_edgelist builds a simple Graph by default, so a node pair that
# appears on several lines keeps a single 'time' value -- confirm this is intended.
graph_data = nx.read_edgelist(path,
                              delimiter=',',
                              nodetype=int,
                              data=[('time', float)])

# Chronological edge list; contiguous slices of it are the time brackets.
edges = sorted(((u, v, d['time']) for u, v, d in graph_data.edges(data=True)),
               key=lambda x: x[2])
times = [t for u, v, t in edges]  # edges are already time-ordered, no second sort needed
delta = len(times) // T  # edges per bracket (floored)

# The first T - 1 brackets hold exactly `delta` edges each; the final bracket takes
# everything that is left. Slicing a fixed-width (T + 1)-th bracket and merging it
# would silently drop edges whenever len(edges) % T > delta (e.g. 19 edges, T = 10).
edge_brackets = [edges[b * delta:(b + 1) * delta] for b in range(T - 1)]
edge_brackets.append(edges[(T - 1) * delta:])

# Strip the timestamps: edge_subgraph expects plain (u, v) pairs.
edges_clean = [[(u, v) for u, v, t in bracket]
               for bracket in edge_brackets]

# One subgraph of graph_data per bracket.
graphs = [graph_data.edge_subgraph(clean)
          for clean in edges_clean]

In [6]:
# Start the cumulative sequence with the largest connected component of the first snapshot.
first_snapshot = graphs[0]
first_components = sorted(nx.connected_components(first_snapshot), key=len, reverse=True)
cum_graphs = [first_snapshot.subgraph(first_components[0])]

# Each later entry = previous cumulative graph + the new snapshot's nodes and edges,
# again restricted to its largest connected component.
for snapshot in graphs[1:]:
    merged = cum_graphs[-1].copy()
    merged.add_nodes_from(snapshot.nodes())
    merged.add_edges_from(snapshot.edges())
    merged_components = sorted(nx.connected_components(merged), key=len, reverse=True)
    cum_graphs.append(merged.subgraph(merged_components[0]))

# Sanity check: one True per cumulative graph is expected.
for component_graph in cum_graphs:
    print(nx.is_connected(component_graph))

True
True
True
True
True
True
True
True
True
True


In [7]:
# Node turnover between consecutive cumulative giant-component snapshots.
stats(cum_graphs, T)

this,	next,	overlap,	rounded%,	add,	del,	changes,	churn%
402 	 666 	 402 		 100 		 264 	 0 	 264 		 19
666 	 902 	 666 		 100 		 236 	 0 	 236 		 8
902 	 1057 	 902 		 100 		 155 	 0 	 155 		 3
1057 	 1218 	 1057 		 100 		 161 	 0 	 161 		 2
1218 	 1364 	 1218 		 100 		 146 	 0 	 146 		 1
1364 	 1484 	 1364 		 100 		 120 	 0 	 120 		 1
1484 	 1643 	 1484 		 100 		 159 	 0 	 159 		 1
1643 	 1745 	 1643 		 100 		 102 	 0 	 102 		 0
1745 	 1888 	 1745 		 100 		 143 	 0 	 143 		 1
average churn: 4.0


In [8]:
# Window length in snapshots. Set to the full history, so the window always starts at 0.
k = len(graphs)
test_graphs = []

for last in range(len(graphs)):
    first = max(0, last - k + 1)

    # Union of all nodes and edges seen in snapshots first..last.
    window_nodes, window_edges = set(), set()
    for snapshot in graphs[first:last + 1]:
        window_nodes |= set(snapshot.nodes())
        window_edges |= set(snapshot.edges())

    union_graph = nx.Graph()
    union_graph.add_nodes_from(window_nodes)
    union_graph.add_edges_from(window_edges)

    # Keep only the largest connected component of the union.
    by_size = sorted(nx.connected_components(union_graph), key=len, reverse=True)
    test_graphs.append(union_graph.subgraph(by_size[0]))

In [9]:
# Node turnover between consecutive window-union giant-component snapshots.
stats(test_graphs, T)

this,	next,	overlap,	rounded%,	add,	del,	changes,	churn%
402 	 667 	 402 		 100 		 265 	 0 	 265 		 19
667 	 904 	 667 		 100 		 237 	 0 	 237 		 8
904 	 1061 	 904 		 100 		 157 	 0 	 157 		 3
1061 	 1224 	 1061 		 100 		 163 	 0 	 163 		 2
1224 	 1369 	 1224 		 100 		 145 	 0 	 145 		 1
1369 	 1490 	 1369 		 100 		 121 	 0 	 121 		 1
1490 	 1649 	 1490 		 100 		 159 	 0 	 159 		 1
1649 	 1751 	 1649 		 100 		 102 	 0 	 102 		 0
1751 	 1893 	 1751 		 100 		 142 	 0 	 142 		 1
average churn: 4.0


In [10]:
# Rebind `graphs` to whatever read_data (from data_scripts) returns for fb-messages.
# NOTE(review): presumably cumulative=10 makes read_data build cumulative snapshots
# comparable to the hand-built ones above -- confirm in data_scripts.
graphs, years = read_data(dataname='fb-messages', cumulative=10)

In [11]:
# Node turnover for the snapshots returned by read_data.
stats(graphs, T)

this,	next,	overlap,	rounded%,	add,	del,	changes,	churn%
402 	 663 	 402 		 100 		 261 	 0 	 261 		 18
663 	 899 	 663 		 100 		 236 	 0 	 236 		 8
899 	 1054 	 899 		 100 		 155 	 0 	 155 		 3
1054 	 1211 	 1054 		 100 		 157 	 0 	 157 		 2
1211 	 1352 	 1211 		 100 		 141 	 0 	 141 		 1
1352 	 1473 	 1352 		 100 		 121 	 0 	 121 		 1
1473 	 1626 	 1473 		 100 		 153 	 0 	 153 		 1
1626 	 1730 	 1626 		 100 		 104 	 0 	 104 		 0
1730 	 1870 	 1730 		 100 		 140 	 0 	 140 		 1
average churn: 3.888888888888889


# email-dnc

In [None]:
dataname = 'email-dnc'
path = join(datadir, dataname, f'{dataname}.edges')

# Format check: every line must consist of exactly three comma-separated integers.
# A malformed line raises ValueError from int() or from the tuple unpacking.
with open(path, 'r') as infile:
    for line in infile:
        u, v, t = map(int, line.strip().split(','))

print('confirming that every line holds three comma-separated integers')

In [None]:
dataname = 'email-dnc'
path = join(datadir, dataname, f'{dataname}.edges')

# Each line is "u,v,time"; the third column is kept as the float edge attribute 'time'.
# NOTE(review): read_edgelist builds a simple Graph by default, so a node pair that
# appears on several lines keeps a single 'time' value -- confirm this is intended.
graph_data = nx.read_edgelist(path,
                              delimiter=',',
                              nodetype=int,
                              data=[('time', float)])

# Chronological edge list; contiguous slices of it are the time brackets.
edges = sorted(((u, v, d['time']) for u, v, d in graph_data.edges(data=True)),
               key=lambda x: x[2])
times = [t for u, v, t in edges]  # edges are already time-ordered, no second sort needed
delta = len(times) // T  # edges per bracket (floored)

# The first T - 1 brackets hold exactly `delta` edges each; the final bracket takes
# everything that is left. Slicing a fixed-width (T + 1)-th bracket and merging it
# would silently drop edges whenever len(edges) % T > delta (e.g. 19 edges, T = 10).
edge_brackets = [edges[b * delta:(b + 1) * delta] for b in range(T - 1)]
edge_brackets.append(edges[(T - 1) * delta:])

# Strip the timestamps: edge_subgraph expects plain (u, v) pairs.
edges_clean = [[(u, v) for u, v, t in bracket]
               for bracket in edge_brackets]

# One subgraph of graph_data per bracket.
graphs = [graph_data.edge_subgraph(clean)
          for clean in edges_clean]

In [None]:
# Node turnover between consecutive (non-cumulative) edge-bracket snapshots.
stats(graphs, T)

In [None]:
# Rebind `graphs` to whatever read_data (from data_scripts) returns for email-dnc.
# NOTE(review): the meaning of cumulative=10 is defined in data_scripts, not here -- confirm.
graphs, years = read_data(dataname='email-dnc', cumulative=10)

In [None]:
# Node turnover for the snapshots returned by read_data.
stats(graphs, T)

# ca-cit-HepTh

In [None]:
dataname = 'ca-cit-HepTh'
path = join(datadir, dataname, f'{dataname}.edges')

# Lines are space-separated: "u v <third> time". The third column is parsed as int and
# stored under the empty-string key ''; only 'time' is read by the cells shown here.
graph_data = nx.read_edgelist(path,
                              delimiter=' ',
                              nodetype=int,
                              data=[('', int), ('time', int)])

In [None]:
# Chronological edge list; contiguous slices of it are the time brackets.
edges = sorted(((u, v, d['time']) for u, v, d in graph_data.edges(data=True)),
               key=lambda x: x[2])
times = [t for u, v, t in edges]  # edges are already time-ordered, no second sort needed
delta = len(times) // T  # edges per bracket (floored)

# The first T - 1 brackets hold exactly `delta` edges each; the final bracket takes
# everything that is left. Slicing a fixed-width (T + 1)-th bracket and merging it
# would silently drop edges whenever len(edges) % T > delta (e.g. 19 edges, T = 10).
edge_brackets = [edges[b * delta:(b + 1) * delta] for b in range(T - 1)]
edge_brackets.append(edges[(T - 1) * delta:])

# Strip the timestamps: edge_subgraph expects plain (u, v) pairs.
edges_clean = [[(u, v) for u, v, t in bracket]
               for bracket in edge_brackets]

# One subgraph of graph_data per bracket.
graphs = [graph_data.edge_subgraph(clean)
          for clean in edges_clean]

In [None]:
# Node turnover between consecutive (non-cumulative) edge-bracket snapshots.
stats(graphs, T)

# ca-cit-HepPh

In [None]:
dataname = 'ca-cit-HepPh'
path = join(datadir, dataname, f'{dataname}.edges')

# Lines are space-separated: "u v <third> time". The third column is parsed as int and
# stored under the empty-string key ''; only 'time' is read by the cells shown here.
graph_data = nx.read_edgelist(path,
                              delimiter=' ',
                              nodetype=int,
                              data=[('', int), ('time', int)])

In [None]:
# Chronological edge list; contiguous slices of it are the time brackets.
edges = sorted(((u, v, d['time']) for u, v, d in graph_data.edges(data=True)),
               key=lambda x: x[2])
times = [t for u, v, t in edges]  # edges are already time-ordered, no second sort needed
delta = len(times) // T  # edges per bracket (floored)

# The first T - 1 brackets hold exactly `delta` edges each; the final bracket takes
# everything that is left. Slicing a fixed-width (T + 1)-th bracket and merging it
# would silently drop edges whenever len(edges) % T > delta (e.g. 19 edges, T = 10).
edge_brackets = [edges[b * delta:(b + 1) * delta] for b in range(T - 1)]
edge_brackets.append(edges[(T - 1) * delta:])

# Strip the timestamps: edge_subgraph expects plain (u, v) pairs.
edges_clean = [[(u, v) for u, v, t in bracket]
               for bracket in edge_brackets]

# One subgraph of graph_data per bracket.
graphs = [graph_data.edge_subgraph(clean)
          for clean in edges_clean]

In [None]:
# Node turnover between consecutive (non-cumulative) edge-bracket snapshots.
stats(graphs, T)

# tech-as-topology

In [None]:
dataname = 'tech-as-topology'
path = join(datadir, dataname, f'{dataname}.edges')

# Scan the raw file: every line must be four space-separated integers, and any
# third field that is not 1 is echoed so it can be spotted in the output.
# Note: `k` here rebinds the notebook-level `k` set in the window-union cell.
with open(path, 'r') as infile:
    for line in infile:
        u, v, k, t = [int(field) for field in line.strip().split(' ')]
        if k == 1:
            continue
        print(k)

print('confirming that the third data point on each line is always 1')

In [None]:
dataname = 'tech-as-topology'
path = join(datadir, dataname, f'{dataname}.edges')

# Lines are space-separated: "u v <third> time". The third column is parsed as int and
# stored under the empty-string key ''; only 'time' is read by the cells shown here.
graph_data = nx.read_edgelist(path,
                              delimiter=' ',
                              nodetype=int,
                              data=[('', int), ('time', int)])

In [None]:
# Chronological edge list; contiguous slices of it are the time brackets.
edges = sorted(((u, v, d['time']) for u, v, d in graph_data.edges(data=True)),
               key=lambda x: x[2])
times = [t for u, v, t in edges]  # edges are already time-ordered, no second sort needed
delta = len(times) // T  # edges per bracket (floored)

# The first T - 1 brackets hold exactly `delta` edges each; the final bracket takes
# everything that is left. Slicing a fixed-width (T + 1)-th bracket and merging it
# would silently drop edges whenever len(edges) % T > delta (e.g. 19 edges, T = 10).
edge_brackets = [edges[b * delta:(b + 1) * delta] for b in range(T - 1)]
edge_brackets.append(edges[(T - 1) * delta:])

# Strip the timestamps: edge_subgraph expects plain (u, v) pairs.
edges_clean = [[(u, v) for u, v, t in bracket]
               for bracket in edge_brackets]

# One subgraph of graph_data per bracket.
graphs = [graph_data.edge_subgraph(clean)
          for clean in edges_clean]

In [None]:
# Node turnover between consecutive (non-cumulative) edge-bracket snapshots.
stats(graphs, T)