In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys, os, shutil, random
import numpy as np
import networkx as nx

In [None]:
sys.path.insert(0,"../python/")
import concept_drift.graph_generator as gg
import concept_drift.experiment_utils as ceu
import centrality_utils.temporal_pagerank as tprc
import centrality_utils.temporal_katz_computer as tkc
import simulator_utils.graph_simulator as gsim
from centrality_utils.base_computer import link2str

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
cmap = "coolwarm"

In [None]:
# Display names used as figure titles, keyed by the dataset identifier stored in `mode`.
ds_to_title = dict(
    facebook="Facebook",
    students="Students",
    tumblr="Tumblr",
    enron="Enron",
)

# 1. Set Parameters

In [None]:
# Directory under which the per-dataset experiment output is placed (see `experiment_path`).
centrality_score_dir = "../data/polina_graphs/centrality_measures/"
# Root directory for figures; not referenced by any cell visible in this part of the notebook.
fig_root = "../results/concept_drift/"

In [None]:
# Dataset the experiment runs on; it must be one of the keys of `ds_to_title`,
# because the plot titles are looked up with it. Keep exactly one line active.
mode = "students"
#mode = "enron"
#mode = "facebook"
#mode = "tumblr"

In [None]:
# Experiment size settings.
n = 500          # first argument of gg.weighted_DiGraph
n_sub = 400      # number of nodes drawn with random.sample for each stream segment
delta = 50       # number of stream edges between two consecutive evaluation boundaries
iters = 10000    # `iters` argument of ceu.get_stream for each segment
max_iter = 2000  # iteration cap handed to the Katz computations

# Seed the global generators so that Restart & Run All reproduces the same node
# samples. NOTE(review): the project helpers (gg, ceu) presumably draw from these
# global generators as well -- confirm; if they own a private RNG it must be
# seeded separately.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# 2. Sampling "temporal" edges from a random graph

## i.) Generate random graph

In [None]:
# Build the graph for the selected dataset; the construction itself lives in
# gg.weighted_DiGraph (presumably a weighted networkx DiGraph -- confirm there).
G = gg.weighted_DiGraph(n, mode = mode, data_prefix="../data/polina_graphs/")

In [None]:
# Fixed node ordering reused for every score vector built later in the notebook.
n_order = list(G.nodes())
# Edge count of G; used below to scale the Temporal Katz weighting (c / E).
E = G.number_of_edges()

## ii.) Katz index

### Experiment with Katz-index damping factor

   * If the Katz index fails to converge for a damping factor, that factor is excluded from the experiments.

In [None]:
# Candidate Katz damping factors; each one is passed to networkx as `alpha` in the next cell.
katz_damping = [0.01]

In [None]:
# Compute the static Katz centrality of G for every candidate damping factor and
# keep only the factors for which networkx's power iteration converges.
valid_katz_damping = []
katz_values = []
for kd in katz_damping:
    print(kd)
    try:
        katz = nx.katz_centrality(G, alpha=kd, max_iter=max_iter, tol=0.001)
    except nx.PowerIterationFailedConvergence:
        # Divergent factor: report it and leave it out of the later experiments.
        # Any other exception propagates unchanged.
        print("Convergence failed for beta=%.3f" % kd)
    else:
        # Scores aligned to the fixed node order. The comprehension variable is
        # `node` so it never shadows the notebook-level setting `n`.
        katz_scores = [katz[node] for node in n_order]
        katz_values.append(katz_scores)
        valid_katz_damping.append(kd)
# divergent katz damping is excluded
katz_damping = valid_katz_damping
print(katz_damping)

## iii.) Sampling temporal edges 

### Define number of sampled edges

In [None]:
# Echo the settings that determine the size of the sampled stream.
print(delta,iters, n_sub)

In [None]:
# One result of ceu.get_stream is appended per segment; each entry is later
# unpacked as (stream, pagerank scores, katz scores).
samples = []

### First Sample

In [None]:
# Segment 1: draw n_sub distinct nodes and sample a stream from G with its initial weights.
# NOTE: re-running this cell appends a further entry to `samples`.
nodes_1 = random.sample(n_order, n_sub)
samples.append(ceu.get_stream(G=G, iters=iters, katz_alphas=katz_damping, katz_max_iter=max_iter, node_sample=nodes_1))

### Second Sample

In [None]:
# Segment 2: draw a fresh node subset, then replace G by the result of
# gg.change_weights (presumably this perturbs the edge weights to create the
# drift -- confirm in graph_generator) before sampling the next stream.
# NOTE: re-running this cell changes the weights again and appends another sample.
nodes_2 = random.sample(n_order, n_sub)
G = gg.change_weights(G)
samples.append(ceu.get_stream(G=G, iters=iters, katz_alphas=katz_damping, katz_max_iter=max_iter, node_sample=nodes_2))

### Third Sample

In [None]:
# Segment 3: same procedure as the second segment -- new node subset, G replaced
# by gg.change_weights(G) once more, then a new stream is sampled.
# NOTE: re-running this cell changes the weights again and appends another sample.
nodes_3 = random.sample(n_order, n_sub)
G = gg.change_weights(G)
samples.append(ceu.get_stream(G=G, iters=iters, katz_alphas=katz_damping, katz_max_iter=max_iter, node_sample=nodes_3))

## iv.) Concatenate stream

In [None]:
# Merge the sampled segments into one edge stream and collect, per segment, the
# PageRank and Katz score vectors aligned to the fixed node order `n_order`.
stream = []
pr_items = []
katz_items = [[] for _ in katz_damping]  # one list of segments per damping factor
for segment_edges, segment_pr, segment_katz in samples:
    # extend the global stream with this segment's edges
    stream.extend(segment_edges)
    print(len(segment_edges))
    # PageRank scores of the segment
    pr_items.append((n_order, [segment_pr.get(node, 0.0) for node in n_order]))
    # Katz scores of the segment, one vector per damping factor;
    # a score is 0.0 when the node did not occur in the sampled stream
    for j, per_damping in enumerate(katz_items):
        per_damping.append((n_order, [segment_katz[j].get(node, 0.0) for node in n_order]))
len(stream), len(pr_items), len(katz_items), len(katz_items[0])

# 3.) Simulate models on sampled edges

In [None]:
# Model computers to simulate; filled by the parametrization cells below.
gsim_params = []
# Output directory of this run; show_concept_drift later builds its score paths from it.
experiment_path = "%s/%s" % (centrality_score_dir, mode)
# Passed to OnlineGraphSimulator as `time_type`.
time_type = "index"

In [None]:
# Start from a clean slate: remove whatever a previous run left for this dataset.
# NOTE: this deletes the entire `experiment_path` directory tree.
if os.path.exists(experiment_path):
    print("Deleting former files...")
    shutil.rmtree(experiment_path)
    print("Files were deleted!")

In [None]:
# Turn the edge stream into an array with one row per edge:
# (position of the edge in the stream, source, target).
# Each stream element is unpacked as a (source, target) pair.
src, trg = zip(*stream)
# The position in the stream is the first column (presumably the edge "time",
# matching time_type = "index" -- confirm in graph_simulator).
edge_idx = range(len(stream))
graph_data = np.array(list(zip(edge_idx, src, trg)))

### a.) Parametrize Temporal PageRank

In [None]:
# Temporal PageRank configurations: the first argument is fixed at 0.85 and the
# second is swept over five values (presumably alpha and beta -- confirm in
# temporal_pagerank).
tpr_params = [
    tprc.TemporalPageRankParams(0.85, second_arg)
    for second_arg in (0.0, 0.01, 0.05, 0.5, 0.95)
]

In [None]:
# Register a single Temporal PageRank computer covering all configurations,
# unless no configuration was defined.
if len(tpr_params) > 0:
    gsim_params.append(tprc.TemporalPageRankComputer(n_order,tpr_params))

### b.) Parametrize OnlineRank

In [None]:
# First argument of every TemporalKatzParams built below.
OLR_BETA = 0.01
# Numerators c of the c / E scaling applied to the exponential weighter
# (E is the edge count of G).
c_values = [1.0,10.0,100.0]

In [None]:
# One Temporal Katz configuration per value of c: an exponential weighter with
# base e^-1 whose `norm` is the reciprocal of c / E.
olr_params = []
for c in c_values:
    edge_share = c / E
    weighter = tkc.ExponentialWeighter(base=np.exp(-1), norm=1.0 / edge_share)
    olr_params.append(tkc.TemporalKatzParams(OLR_BETA, weighter))

In [None]:
# Register a single Temporal Katz computer covering all configurations,
# unless no configuration was defined.
if len(olr_params) > 0:
    gsim_params.append(tkc.TemporalKatzComputer(n_order,olr_params))

#### Selected Parameters

In [None]:
# Print the string form of each configuration; the same str() value is used by
# show_concept_drift to build the path of the model's scores.
for olr_item in olr_params:
    print(olr_item)

### c.) Run

In [None]:
# Evaluation schedule: one boundary after every `delta` stream edges, each paired
# with a zero-based snapshot index.
boundaries = list(range(delta, len(stream) + delta, delta))
# Floor division keeps the snapshot indices integers (0, 1, 2, ...); true
# division (`/`) yields floats such as 0.0 under Python 3. Every boundary is an
# exact multiple of delta, so the numeric values are unchanged.
eval_snapshots = [boundary // delta - 1 for boundary in boundaries]
len(boundaries), len(eval_snapshots)

In [None]:
%%time
# Feed the edge array to the simulator and run every registered computer with the
# evaluation boundaries defined above; `experiment_path` is handed over as the
# output location (show_concept_drift reads the scores back from there).
gsim_obj = gsim.OnlineGraphSimulator(graph_data, time_type=time_type, verbose=False)
experiment_graph_stats = gsim_obj.run_with_boundaries(gsim_params,boundaries,experiment_path, max_index=None)

# 4.) Evaluation

In [None]:
def show_concept_drift(ground_truth, tpr_items, olr_items, legends, corr_type="Weighted Kendall-tau"):
    """Plot, per model, its correlation with a ground-truth ranking along the stream.

    One curve is drawn for every entry of ``tpr_items`` followed by every entry of
    ``olr_items``. The scores of a model are addressed by the prefix
    ``<experiment_path>/original/<str(item)>/tpr`` (or ``.../tk``) and compared
    with ``ground_truth`` by ``ceu.get_correlations``. The function reads the
    notebook globals ``experiment_path``, ``eval_snapshots`` and ``delta``.

    Parameters
    ----------
    ground_truth
        Reference scores, forwarded unchanged to ``ceu.get_correlations``.
    tpr_items, olr_items
        Parameter objects whose ``str()`` names the directory of the model.
    legends
        Curve labels, ordered as ``tpr_items`` then ``olr_items``; must contain
        at least one label per curve.
    corr_type
        ``"pearson"``, ``"spearman"`` or ``"kendall"`` plot column 1, 2 or 3 of
        the correlation array; any other value (including the default) plots
        column 4. The string is also used as the y-axis label.

    Returns
    -------
    None
        The curves are drawn on a newly created matplotlib figure.
    """
    line_styles = ["--", "-", "-.", ":"]
    value_column = {"pearson": 1, "spearman": 2, "kendall": 3}.get(corr_type, 4)

    # Same order as `legends`: Temporal PageRank models first, then Temporal Katz.
    sources = [(item, "tpr") for item in tpr_items] + [(item, "tk") for item in olr_items]
    # All correlations are computed before the figure is created, so a failure
    # here does not leave an empty figure behind.
    corrs = []
    for item, file_prefix in sources:
        score_prefix = "%s/original/%s/%s" % (experiment_path, str(item), file_prefix)
        corrs.append(ceu.get_correlations(score_prefix, eval_snapshots, ground_truth))

    plt.figure(figsize=(18, 5))
    for i, corr in enumerate(corrs):
        # Cycle through the line styles so more than four curves do not fail.
        style = line_styles[i % len(line_styles)]
        plt.plot(corr[:, 0], corr[:, value_column], style, label=legends[i])
    plt.ylabel(corr_type)

    # About six evenly spaced ticks, each labelled with the number of edges seen
    # at that snapshot. For 600 snapshots and delta = 50 this gives positions
    # 99, 199, ..., 599 labelled 5000, ..., 30000.
    # Assumes column 0 of the correlation array is the snapshot index -- confirm
    # in ceu.get_correlations.
    n_snapshots = len(eval_snapshots)
    if n_snapshots > 0:
        step = max(1, n_snapshots // 6)
        tick_positions = list(range(step - 1, n_snapshots, step))
        tick_labels = [(pos + 1) * delta for pos in tick_positions]
        plt.xticks(tick_positions, tick_labels)
    plt.xlabel("Number of temporal edges")
    plt.legend(loc=4)

In [None]:
# Models shown in the figures: the fourth Temporal PageRank configuration and
# the Temporal Katz configurations selected by `olr_indices`.
olr_indices = [0, 1, 2]
tpr_items = [tpr_params[3]]
olr_items = [olr_params[idx] for idx in olr_indices]
# Labels follow the plotting order: Temporal PageRank first, then Temporal Katz.
legends = ["Temporal PageRank"] + [
    "Temporal Katz: c=%i/E" % c_values[idx] for idx in olr_indices
]
print(legends)

In [None]:
# seaborn is already imported as `sns` in the plotting-setup cell near the top of
# the notebook; this repeated import is redundant but harmless.
import seaborn as sns
# Global figure styling for the plots below: Palatino font, doubled font scale, white grid.
sns.set(font="Palatino",font_scale = 2.0)
sns.set_style("whitegrid")

## a.) Convergence to PageRank

%%time
# Correlation of the selected models with the per-segment PageRank scores.
show_concept_drift(pr_items, tpr_items, olr_items, legends)
plt.title(ds_to_title[mode])

## b.) Convergence to Katz index

In [None]:
%%time
# Correlation of the selected models with the per-segment Katz scores of the
# first remaining damping factor (katz_items[0]).
show_concept_drift(katz_items[0], tpr_items, olr_items, legends)
plt.title(ds_to_title[mode])