In [None]:
# Stretch the notebook container to the full browser width.
# `IPython.display` is the public location of these helpers; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np
import re, math, itertools

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn styling for every figure below: doubled font scale, white grid
# background, and the "dark" palette mapped onto matplotlib's short color codes.
# NOTE(review): assumes the "Droid Sans" font is installed on the machine — confirm.
sns.set(font="Droid Sans",font_scale = 2)
sns.set_style("whitegrid")
sns.set_color_codes("dark")

In [None]:
import sys, os
# Put ../python/ first on the module search path so the project-local packages
# imported below can be found; the path is relative to the working directory.
sys.path.insert(0,'../python/')
import correlation.correlation_utils as cu
import popularity_model.popularity_model as pm
import popularity_model.popularity_model_utils as pmu

# Choose dataset

In [None]:
# Identifier of the dataset to analyse; it is interpolated into the name of the
# results file that is loaded next.
dataset_id = 'yo_pagerank'

In [None]:
# Load the space-separated statistics file of the selected dataset.
# Presumably one row per day (num_of_days is later derived from its length) — TODO confirm.
dataset_stat_file = "../correlation_experiments/%s_results.csv" % dataset_id
stat_df = pd.read_csv(dataset_stat_file, sep=" ")

#### extract number of users in data

In [None]:
# The third column header is expected to contain "(total=<digits>)".
# Show the header, then fail fast when that pattern is missing.
print(stat_df.columns[2])

total_num_matcher = re.match(
    r'.*\(total=(\d+?)\)',
    stat_df.columns[2],
    re.I | re.M,
)
if total_num_matcher is None:
    raise RuntimeError("Column name does NOT match the regex!")

#### rename a column

In [None]:
# Give the third column a fixed, descriptive name.
# Gotcha: once this has run, the header no longer contains "(total=...)", so
# re-running the regex extraction cell raises its RuntimeError.
cols = list(stat_df.columns)
cols[2] = "fraction_of_active_nodes"
stat_df.columns = cols

In [None]:
# Preview the first rows to check the loaded columns and the renamed header.
stat_df.head()

In [None]:
# NOTE(review): the user count is hard-coded to 2000; the total parsed from the
# column header (total_num_matcher) is commented out and unused — confirm this
# is intentional.
num_of_users = 2000 #int(total_num_matcher.group(1))
# One less than the number of rows in stat_df.
num_of_days = len(stat_df)-1

In [None]:
# Per-row inputs for the model, cut to the first num_of_days entries; since
# num_of_days is len(stat_df)-1, the last row of stat_df is dropped.
p = list(stat_df["fraction_of_active_nodes"])[:num_of_days]
p_overlap = list(stat_df["fraction_of_users_in_2day_intersections"])[:num_of_days]

# Correlations in real data

In [None]:
# Correlations measured on the real data. Only the first num_of_days-1 values
# are kept, which matches the number of consecutive-day pairs the model-side
# correlation helpers evaluate.
data_kendall = list(stat_df["kendall"])[:num_of_days-1]
data_w_kendall = list(stat_df["w_kendall"])[:num_of_days-1]

# Popularity model

In [None]:
# Sanity check: show the dimensions the model is built with.
print num_of_users, num_of_days

**TODO: fit the power-law exponent on the aggregated centrality values of the real data.**

In [None]:
# Build the popularity model from the user and day counts.
model = pm.PopularityModel(num_of_users, num_of_days)

In [None]:
# Run the model with its default lambda_.
# NOTE(review): X_act is not read again in the visible part of the notebook.
X_act = model.get_centrality_with_markov(p, p_overlap)

In [None]:
# Same model, run with lambda_=0.1.
# NOTE(review): X_act_leaders is reassigned in section 2 (lambda_=0.3) before
# it is read anywhere in the visible notebook.
X_act_leaders = model.get_centrality_with_markov(p, p_overlap, lambda_=0.1)

# Kendall utils

In [None]:
import scipy.stats as ss

def tiedrank(vector):
    """Rank the entries of ``vector`` in descending order, averaging ties.

    The largest value receives rank 1 and the smallest receives rank
    ``len(vector)``; equal values share the mean of the ranks they span.
    """
    size = len(vector)
    ascending_ranks = ss.rankdata(vector)
    # Mirror the ascending ranks around (size + 1) to obtain descending ranks.
    offset = np.ones(size) * (size + 1)
    return offset - ascending_ranks

def get_list_for_corr(M,day_idx):
    """Build comparable rank vectors for day ``day_idx`` and the following day.

    Rows ``day_idx`` and ``day_idx + 1`` of ``M`` are rounded up with
    ``np.ceil``. Only the columns that are non-zero on at least one of the two
    days are kept, and each day's surviving values are ranked with ``tiedrank``.

    Returns the pair ``(ranks_of_first_day, ranks_of_second_day)``.
    """
    first_day, second_day = np.ceil(M[day_idx, :]), np.ceil(M[day_idx + 1, :])

    # Sorted union of the column indices that are non-zero on either day.
    active = np.union1d(np.nonzero(first_day)[0], np.nonzero(second_day)[0])

    return tiedrank(first_day[active]), tiedrank(second_day[active])

In [None]:
def findWKendall(rankX,rankY):
    """Compute Kendall's tau and a rank-weighted variant for two rank vectors.

    Every pair ``(i, j)`` with ``i < j`` contributes the product of the signs
    of ``rankX[i] - rankX[j]`` and ``rankY[i] - rankY[j]``. The unweighted
    coefficient divides the sum of these products by the square root of
    (number of pairs untied in X) * (number of pairs untied in Y). The weighted
    coefficient does the same with each pair scaled by
    ``1/rank[i] + 1/rank[j]``, so pairs involving small rank numbers count more.

    Parameters
    ----------
    rankX, rankY : indexable sequences of non-zero numbers
        Rank vectors; the first ``len(rankX)`` entries of both are read.

    Returns
    -------
    list
        ``[Kendall, WKendall]`` as floats. A coefficient is ``nan`` when its
        denominator is zero (fewer than two items, or a vector with no untied
        pair), regardless of whether the inputs are lists or numpy arrays.
    """
    n = len(rankX)
    num = numW = 0.0
    denomX = denomY = 0.0
    denomXW = denomYW = 0.0

    for i in range(n):
        for j in range(i + 1, n):
            # 1.0 forces true division, so integer ranks are not truncated
            # to zero weights under Python 2.
            weightX = 1.0 / rankX[i] + 1.0 / rankX[j]
            weightY = 1.0 / rankY[i] + 1.0 / rankY[j]
            termX = np.sign(rankX[i] - rankX[j])
            termY = np.sign(rankY[i] - rankY[j])
            denomX += termX ** 2
            denomY += termY ** 2
            denomXW += termX ** 2 * weightX
            denomYW += termY ** 2 * weightY
            num += termX * termY
            # The weighted numerator uses the Y-rank weight of the pair.
            numW += termX * termY * weightY

    def _ratio(numerator, denominator):
        # A zero denominator means the coefficient is undefined: report nan
        # instead of raising ZeroDivisionError.
        if denominator == 0:
            return float('nan')
        return float(numerator / denominator)

    Kendall = _ratio(num, math.sqrt(denomX * denomY))
    WKendall = _ratio(numW, math.sqrt(denomXW * denomYW))
    return [Kendall, WKendall]

In [None]:
import scipy.stats as stats
import operator

def get_correlations(A, num_of_days):
    """Kendall's tau (scipy) between the rank lists of consecutive days of ``A``.

    Returns a list with one coefficient per consecutive-day pair: entry ``d``
    compares day ``d`` with day ``d + 1``, for ``d`` in ``0 .. num_of_days - 2``.
    """
    def tau_between(day_idx):
        first, second = get_list_for_corr(A, day_idx)
        return stats.kendalltau(first, second)[0]

    return [tau_between(day_idx) for day_idx in xrange(num_of_days - 1)]

def get_custom_correlations_2(A, num_of_days):
    """Unweighted and weighted Kendall correlations for consecutive days of ``A``.

    Row ``d`` of the returned array is the ``[Kendall, WKendall]`` pair that
    ``findWKendall`` gives for day ``d`` versus day ``d + 1``.
    """
    per_day = [
        findWKendall(*get_list_for_corr(A, day_idx))
        for day_idx in xrange(num_of_days - 1)
    ]
    return np.array(per_day)

# Experiments

## 1.) Model stability related to daily variations

In [None]:
def test_stability_for_alpha(lambdas, num_samples):
    """Measure how closely repeated model runs reproduce the data correlations.

    For every value in ``lambdas`` a fresh ``PopularityModel`` is built and run
    ``num_samples`` times. Each run is scored by the RMSE between the model's
    unweighted/weighted Kendall series and the corresponding data series.

    Reads the notebook globals ``num_of_users``, ``num_of_days``, ``p``,
    ``p_overlap``, ``data_kendall`` and ``data_w_kendall``.

    Parameters
    ----------
    lambdas : iterable of numbers
        Values passed as ``lambda_`` to ``get_centrality_with_markov``.
    num_samples : int
        Number of independent model runs per lambda.

    Returns
    -------
    pandas.DataFrame
        Columns ``lambda`` (numeric), ``type`` ("unweighted" / "weighted") and
        ``rmse`` (float64); two rows per (lambda, sample) combination.
    """
    rmse_arr = []
    for selected_lambda in lambdas:
        for i in xrange(num_samples):
            sample_model = pm.PopularityModel(num_of_users, num_of_days)
            x_act_leaders = sample_model.get_centrality_with_markov(p, p_overlap, lambda_=selected_lambda)
            res = get_custom_correlations_2(x_act_leaders, num_of_days)
            kendall, w_kendall = list(res[:,0]), list(res[:,1])
            rmse_arr.append([selected_lambda, 'unweighted', pmu.rmse(data_kendall, kendall)])
            rmse_arr.append([selected_lambda, 'weighted', pmu.rmse(data_w_kendall, w_kendall)])
            # Progress output: index of the finished sample.
            print(i)
        print("lambda=%f finished" % selected_lambda)
    # Build the frame directly from the row list. Going through np.array would
    # coerce the mixed float/str rows to strings, leaving "lambda" as text and
    # round-tripping "rmse" through a string representation.
    rmse_df = pd.DataFrame(rmse_arr, columns=["lambda","type","rmse"])
    rmse_df["rmse"] = rmse_df["rmse"].astype("float64")
    return rmse_df

In [None]:
# 5 model runs for each of 11 evenly spaced lambda values in [0, 1].
num_of_samples = 5
lambdas = np.linspace(0.0, 1.0, num=11)
# NOTE(review): this is the expensive cell — every run evaluates the pairwise
# (quadratic) loop of findWKendall for each pair of consecutive days.
rmse_corr = test_stability_for_alpha(lambdas,num_of_samples)

In [None]:
# Aggregate the per-sample RMSE values by lambda, separately for the weighted
# and unweighted correlation, using GraphLab SFrames.
# NOTE(review): graphlab is imported here, mid-notebook, only for this groupby.
import graphlab
data = graphlab.SFrame(rmse_corr)
# Split the rows by correlation type.
unweighted_rmse_corr = data[data["type"] == "unweighted"]
weighted_rmse_corr = data[data["type"] == "weighted"]
# Weighted: average and standard deviation of rmse per lambda, joined on lambda.
weighted_rmse_avg = weighted_rmse_corr.groupby("lambda",{'rmse_avg':graphlab.aggregate.AVG('rmse')})
weighted_rmse_std = weighted_rmse_corr.groupby("lambda",{'rmse_std':graphlab.aggregate.STD('rmse')})
weighted_rmse = weighted_rmse_avg.join(weighted_rmse_std,"lambda")
# Unweighted: the same aggregation.
unweighted_rmse_avg = unweighted_rmse_corr.groupby("lambda",{'rmse_avg':graphlab.aggregate.AVG('rmse')})
unweighted_rmse_std = unweighted_rmse_corr.groupby("lambda",{'rmse_std':graphlab.aggregate.STD('rmse')})
unweighted_rmse = unweighted_rmse_avg.join(unweighted_rmse_std,"lambda")
# Convert back to pandas for plotting.
weighted_rmse = weighted_rmse.to_dataframe()
unweighted_rmse = unweighted_rmse.to_dataframe()

In [None]:
# Average RMSE per lambda with the per-lambda standard deviation as error bars,
# one series per correlation type.
marker = itertools.cycle(("o", "s", "^", "v", ">", "<", "D","*"))
fig, axes = plt.subplots(figsize=(12,8))
# A marker-only format string selects the marker and draws no connecting line.
axes.errorbar(unweighted_rmse["lambda"],unweighted_rmse["rmse_avg"],yerr=unweighted_rmse["rmse_std"],fmt=next(marker),label="unweighted",elinewidth=3,markersize=10)
axes.errorbar(weighted_rmse["lambda"],weighted_rmse["rmse_avg"],yerr=weighted_rmse["rmse_std"],fmt=next(marker),label="weighted",elinewidth=3,markersize=10)
axes.set_xlabel("lambda")
axes.set_ylabel("RMSE (model vs. data correlation)")
axes.legend()

In [None]:
# Save the raw per-sample RMSE table as a semicolon-separated file.
# NOTE(review): the output folder "yo-0.2" is hard-coded and not derived from
# dataset_id — confirm it matches the selected dataset.
rmse_corr.to_csv('../final_plot_data/yo-0.2/kendall_stability.csv',sep=";",index=False)

## 2.) Optimal _lambda_ parameter visualization

In [None]:
# Fresh model run with lambda_=0.3; this overwrites the earlier X_act_leaders.
# NOTE(review): 0.3 is presumably the value picked from the stability plot — confirm.
model_opt = pm.PopularityModel(num_of_users, num_of_days)
X_act_leaders = model_opt.get_centrality_with_markov(p, p_overlap, lambda_=0.3)

In [None]:
# Correlations of the lambda_=0.3 run: column 0 is the unweighted Kendall
# series, column 1 the weighted one.
res = get_custom_correlations_2(X_act_leaders, num_of_days)
opt_kendall, opt_w_kendall = list(res[:,0]), list(res[:,1])

In [None]:
marker = itertools.cycle(("o", "s", "^", "v", ">", "<", "D","*"))
def plot_correlations(num_of_days, values, labels, caption, figsize=(12,8)):
    """Plot several per-day correlation series on one figure.

    Parameters
    ----------
    num_of_days : int
        Every series is plotted against ``range(num_of_days - 1)``.
    values : sequence of sequences
        The series to draw, each with ``num_of_days - 1`` entries.
    labels : sequence of str
        Legend label for each series, in the same order as ``values``.
    caption : str
        Title shown above the figure.
    figsize : tuple
        Figure size passed to ``plt.figure``.

    Markers come from the module-level ``marker`` cycle, so consecutive calls
    continue the marker sequence instead of restarting it.
    """
    plt.figure(figsize=figsize)
    days = range(num_of_days-1)
    for i, series in enumerate(values):
        # Line style and marker are given as separate keywords so the marker
        # is specified exactly once.
        plt.plot(days,series,linestyle='-',marker=next(marker),label=labels[i],markersize=10)
    plt.title(caption)
    plt.ylim(-1.0,1.1)
    plt.legend()
    plt.show()

In [None]:
# Compare the model (lambda_=0.3 run) with the real data for both correlation
# types; label_list follows the order of the series passed below.
label_list = ["kendall model","kendall data","weighted kendall model","weighted kendall data"]
plot_correlations(num_of_days,[opt_kendall,data_kendall,opt_w_kendall,data_w_kendall],label_list,"Kendall's tau")

rmse_corr.to_csv('../final_plot_data/yo-0.2/kendall_stability.csv',sep=";",index=False)

In [None]:
# Print the unweighted model correlations, one value per line.
for val in opt_kendall:
    print val

In [None]:
# Print the weighted model correlations, one value per line.
for val in opt_w_kendall:
    print val