In [None]:
import sys, os, shutil
sys.path.insert(0,"../../python/")
import concept_drift.graph_generator as gg
import concept_drift.experiment_utils as ceu
import centrality_utils.temporal_pagerank as tprc
import centrality_utils.online_rank_computer as olrc
import simulator_utils.graph_simulator as gsim
from centrality_utils.online_rank_computer import link2str

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from scipy.stats import pearsonr, spearmanr

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def get_alfa_for_katz(beta,c,E):
    """Return the Katz damping factor beta / (E * (1 - exp(-c))).

    The expression is the one obtained from the induction proof.
    """
    decay_term = 1 - np.exp(-c)
    denominator = E * decay_term
    return beta / denominator

# 0. Load Parameters

In [None]:
# Selects the graph model variant: forwarded to gg.weighted_DiGraph and reused
# in the experiment output path and the plot title. Keep exactly one line active.
mode = "students"
#mode = "facebook"
#mode = "tumblr"
#mode = "enron"

In [None]:
# First argument of gg.weighted_DiGraph — presumably the number of nodes (TODO confirm)
n = 500
# Beta shared by the Katz damping formula and the OnlineRank parameters
olr_beta = 0.05
# Forwarded as pow_exp to gg.weighted_DiGraph and gg.change_weights
pow_exp = 5.0
# Forwarded as norm_outdegree to ceu.get_stream; also part of the output path
is_outdegree_normed = True

In [None]:
# Forwarded as random_sample to ceu.get_stream; also part of the output path
is_random_sample = True
# Forwarded as handle_sinks to gg.weighted_DiGraph
handle_sinks = False
# True -> ceu.custom_katz, False -> nx.katz_centrality for the reference Katz scores;
# also forwarded as is_custom_katz to ceu.get_stream
use_custom_katz = False
# Forwarded as weight to ceu.get_stream
weight_str = 'weight'

# 1. Sampling "temporal" edges from a random graph

In [None]:
# One ceu.get_stream result per sampling phase; each entry is later unpacked as (stream, pagerank, katz)
samples = []

## i.) Generate random graph

In [None]:
# Base graph for the first sampling phase
G = gg.weighted_DiGraph(
    n,
    seed=1.0,
    mode=mode,
    weights='random',
    handle_sinks=handle_sinks,
    pow_exp=pow_exp,
)

In [None]:
# Distribution of the initial edge weights
weights = [attrs['weight'] for _, _, attrs in G.edges(data=True)]
plt.hist(weights, bins=50)
plt.show()

### Define number of sampled edges

In [None]:
# Step between consecutive snapshot boundaries over the stream
delta = 50
# Second argument of every ceu.get_stream call — per the section title, the number of sampled edges
iters = 10000
print(delta,iters)

### Experiment with Katz-index damping factor

In [None]:
# Node ordering used to align every score vector compared below
n_order = G.nodes()
E = G.number_of_edges()

# Histogram of 1 - power(50) samples, one per edge; x is not referenced again in the visible notebook.
# NOTE(review): the draw is unseeded, so the figure differs between runs.
x = 1-np.random.power(50, size=E)
plt.hist(x,bins=50)
plt.show()

In [None]:
# Decay constants expressed as multiples of 1/E
c_factors = [5.0, 1000.0]
c_list = [factor / E for factor in c_factors]
c_list

In [None]:
# One Katz damping factor per decay constant, all derived from the shared beta
katz_damping = [get_alfa_for_katz(olr_beta,c,E) for c in c_list]
katz_damping

In [None]:
# Katz centrality of the initial graph for every candidate damping factor
katz_values = []
katz_custom_items = []
for kd in katz_damping:
    print(kd)
    katz = (
        ceu.custom_katz(G,alpha=kd,max_iter=1000)
        if use_custom_katz
        else nx.katz_centrality(G,alpha=kd,max_iter=10000,weight=None)
    )
    # scores aligned on the common node ordering
    katz_values.append([katz[node] for node in n_order])
    # (keys, values) form, repeated three times, for later correlation computation
    katz_item = list(zip(*katz.items()))
    katz_custom_items.append([katz_item] * 3)

In [None]:
# pagerank
pr = nx.pagerank(G)
pr_values = [pr[n] for n in n_order]

### Correlation of Katz to PageRank: with stronger damping it increases

In [None]:
# Rank and linear correlation between Katz and PageRank, one pair of lines per damping factor
for kd, katz_scores in zip(katz_damping, katz_values):
    print(kd, spearmanr(katz_scores, pr_values))
    print(kd, pearsonr(katz_scores, pr_values))
    print()

## ii.) First Sample

In [None]:
# Every stream below is generated with the damping factor of the first decay constant
k_alpha = katz_damping[0]
print(k_alpha)

In [None]:
# Phase 1: sample from the initial graph. Re-running this cell appends a duplicate entry to samples.
samples.append(ceu.get_stream(G, iters, katz_alpha=k_alpha, is_custom_katz=use_custom_katz, norm_outdegree=is_outdegree_normed, random_sample=is_random_sample, weight=weight_str))

## iii.) Second Sample (change weights)

In [None]:
# Phase 2: G is rebound to the re-weighted graph, so re-running this cell changes
# the weights again and appends one more entry to samples.
G = gg.change_weights(G, pow_exp=pow_exp)
samples.append(ceu.get_stream(G, iters, katz_alpha=k_alpha, is_custom_katz=use_custom_katz, norm_outdegree=is_outdegree_normed, random_sample=is_random_sample, weight=weight_str))

## iv.) Third Sample (change weights)

In [None]:
# Phase 3: G is rebound to a newly re-weighted graph before sampling; like any
# G = f(G) cell, running it twice applies the change twice and appends twice.
G = gg.change_weights(G, pow_exp=pow_exp)
samples.append(ceu.get_stream(G, iters, katz_alpha=k_alpha, is_custom_katz=use_custom_katz, norm_outdegree=is_outdegree_normed, random_sample=is_random_sample, weight=weight_str))

## v.) Concatenate stream

In [None]:
nodes = G.nodes()
stream, pr_items, katz_items = [], [], []

# Merge the phases into one stream; per-phase scores are kept in (keys, values) form
for stream_item, pr_item, katz_item in samples:
    stream.extend(stream_item)
    pr_items.append(list(zip(*pr_item.items())))
    katz_items.append(list(zip(*katz_item.items())))
    print(len(stream_item))
len(stream)

## Similarity of weighted PageRank and Katz-index

In [None]:
# Number of collected phases; the cells below index phases 0, 1 and 2
len(pr_items)

In [None]:
# Rebuild the node -> PageRank lookups of the three phases
d1, d2, d3 = (dict(zip(*pr_items[phase])) for phase in range(3))
# Score vectors aligned on the common node ordering
values_1 = [d1[node] for node in n_order]
values_2 = [d2[node] for node in n_order]
values_3 = [d3[node] for node in n_order]

In [None]:
# Rank correlation between consecutive phases (1 vs 2, 2 vs 3) for the vectors built in the previous cell
spearmanr(values_1, values_2), spearmanr(values_2, values_3)

In [None]:
# Rebuild the node -> Katz lookups of the three phases (reuses the d*/values_* names)
d1, d2, d3 = (dict(zip(*katz_items[phase])) for phase in range(3))
# Score vectors aligned on the common node ordering
values_1 = [d1[node] for node in n_order]
values_2 = [d2[node] for node in n_order]
values_3 = [d3[node] for node in n_order]

In [None]:
# Same comparison as before, now on whatever values_1..3 currently hold (Katz if the preceding cell ran last)
spearmanr(values_1, values_2), spearmanr(values_2, values_3)

# 2.) Simulate models on sampled edges

In [None]:
# Computer objects handed to the simulator; filled by the parametrization cells below
gsim_params = []
# Output directory, keyed by the graph mode and the two sampling flags
experiment_path = "../../data/centrality_scores/concept_drift_%s_random%s_outdeg%s" % (mode, is_random_sample, is_outdegree_normed)
# Passed to OnlineGraphSimulator as time_type
time_type = "index"

In [None]:
# Start from a clean output directory: anything already stored under
# experiment_path (same mode and flags) is removed irreversibly.
if os.path.exists(experiment_path):
    print("Deleting former files...")
    shutil.rmtree(experiment_path)
    print("Files were deleted!")

In [None]:
# Split the stream into source/target columns and number the edges by position
src, trg = zip(*stream)
edge_idx = range(len(stream))
# rows are (edge index, source, target)
graph_data = np.array(list(zip(edge_idx, src, trg)))
# string form of every (source, target) pair, in stream order
edges = list(map(link2str, graph_data[:,1:3].tolist()))

### a.) Parametrize Temporal PageRank

In [None]:
# One Temporal PageRank variant per value of the second parameter; the first is fixed at 0.85
tpr_params = [tprc.TemporalPageRankParams(0.85,b) for b in [0.0,0.01,0.05,0.5,0.95]]

In [None]:
# Register the Temporal PageRank computer only when at least one variant is configured
if tpr_params:
    gsim_params.append(tprc.TemporalPageRankComputer(nodes,tpr_params))

### b.) Parametrize OnlineRank

In [None]:
# One OnlineRank variant per decay constant c: exponential weighter with base e^-1 and norm 1/c
olr_params = [
    olrc.OnlineRankParams(0.05, olr_beta, olrc.ExponentialWeighter(base=np.exp(-1),norm=1.0/c))
    for c in c_list
]

In [None]:
# Register the OnlineRank computer only when at least one variant is configured
if olr_params:
    gsim_params.append(olrc.OnlineRankComputer(nodes,edges,olr_params))

### c.) Run

In [None]:
# Snapshot boundaries delta, 2*delta, ... ending at the first multiple of delta that covers the whole stream
boundaries = list(range(delta, len(stream) + delta, delta))
print(min(boundaries), max(boundaries))
len(boundaries)

In [None]:
%%time
# graph_data rows are (edge index, source, target); time_type tells the simulator how to read the first column
gsim_obj = gsim.OnlineGraphSimulator(graph_data, time_type=time_type, verbose=False)
# Runs every registered computer over the stream; presumably writes per-boundary scores
# under experiment_path, since the evaluation reads them back from there (TODO confirm)
experiment_graph_stats = gsim_obj.run_with_boundaries(gsim_params,boundaries,experiment_path, max_index=None)

# 3.) Evaluation - Concept drift

In [None]:
# Snapshot indices to evaluate: one per boundary
eval_snapshots = range(len(boundaries))

In [None]:
def show_concept_drift(ground_truth, tpr_items, olr_items, legends):
    """Plot how closely each online model follows a ground-truth ranking over the stream.

    For every parameter object in `tpr_items` and `olr_items`, the results stored
    under `experiment_path` are compared with `ground_truth` through
    `ceu.get_correlations`; column 2 of each result (labelled "spearman") is
    drawn against column 0 on one shared figure.

    Parameters
    ----------
    ground_truth : reference scores, forwarded unchanged to `ceu.get_correlations`
    tpr_items : Temporal PageRank parameter objects; `str(item)` names the result folder
    olr_items : OnlineRank parameter objects, handled the same way
    legends : curve labels ordered as `tpr_items` followed by `olr_items`;
        a curve without a label falls back to its result prefix

    Relies on the notebook globals experiment_path, eval_snapshots, delta, iters and mode.
    """
    line_styles = ["--","-",":"]
    series = []
    # Temporal PageRank curves first, then OnlineRank, matching the order expected in `legends`
    for suffix, items in (("tpr", tpr_items), ("olr", olr_items)):
        for item in items:
            prefix = "%s/original/%s/%s" % (experiment_path, str(item), suffix)
            corr = ceu.get_correlations(prefix, eval_snapshots, delta, iters, ground_truth, visu=False)
            series.append((prefix, corr))
    print(len(series))
    plt.figure(figsize=(18,5))
    # NOTE(review): tick positions and labels are hard-coded; they look tuned for
    # delta=50 and three phases of 10000 edges — confirm when those settings change
    x = [99,199,299,399,499,599]
    ticks = [5000,10000,15000,20000,25000,30000]
    plt.title(mode)
    for i, (prefix, corr) in enumerate(series):
        # cycle the line styles so that more than three curves no longer raise IndexError
        style = line_styles[i % len(line_styles)]
        # a missing label must not abort the plot after the correlations were computed
        label = legends[i] if i < len(legends) else prefix
        plt.plot(corr[:,0],corr[:,2],style,label=label)
    plt.ylim(0.0,1.0)
    plt.ylabel("spearman")
    plt.xticks(x,ticks)
    plt.xlabel("number of temporal edges")
    plt.legend(loc=4)

In [None]:
# Display the snapshot range that the evaluation iterates over
eval_snapshots

In [None]:
# Show the OnlineRank parameter objects in list order; their str() form also
# names the result folders read during evaluation
for p in olr_params:
    print(p)

In [None]:
# Index 3 of tpr_params is the variant built with second argument 0.5
tpr_items = [tpr_params[3]]
# Both OnlineRank variants, i.e. one per entry of c_factors
olr_items = [olr_params[0],olr_params[1]]
# Labels follow the plotting order: Temporal PageRank curves first, then OnlineRank
legends = [
    "Temporal PageRank",
    "Online Centrality: c=%i/E" % c_factors[0],
    "Online Centrality: c=%i/E" % c_factors[1]
]
print(legends)

In [None]:
# Plot styling for the evaluation figures (seaborn is already imported near the top; this import is a repeat)
import seaborn as sns
sns.set(font="Palatino",font_scale = 2.0)
sns.set_style("whitegrid")

## a.) convergence to pagerank

In [None]:
# Ground truth: the per-phase PageRank scores collected while sampling
show_concept_drift(pr_items, tpr_items, olr_items, legends)

## b.) convergence to katz-index

In [None]:
# Candidate Katz damping factors; the first one was used when sampling the streams
katz_damping

In [None]:
# Ground truth: the per-phase Katz scores collected while sampling
show_concept_drift(katz_items, tpr_items, olr_items, legends)