In [None]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing them
# from `IPython.core.display` is deprecated in newer IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np
import re, math, itertools

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn defaults: "Droid Sans" font, font elements scaled by a factor of 2.
sns.set(font="Droid Sans",font_scale = 2)
# White background with grid lines.
# NOTE(review): set_style runs after sns.set(font=...); confirm the font chosen above still takes effect.
sns.set_style("whitegrid")
# Interpret matplotlib's single-letter color shorthands using the "dark" seaborn palette.
sns.set_color_codes("dark")

In [None]:
import sys, os
sys.path.insert(0,'../python/')
import correlation.correlation_utils as cu
import popularity_model.popularity_model as pm
import popularity_model.popularity_model_utils as pmu

# Choose dataset

In [None]:
# Identifier of the dataset to analyse; it is interpolated into the input CSV path
# and into the file name of the exported stability results.
dataset_id = '15o_pagerank'

In [None]:
# Read the statistics table of the selected dataset (values are separated by single spaces).
dataset_stat_file = "../correlation_experiments/%s_results.csv" % (dataset_id,)
stat_df = pd.read_csv(dataset_stat_file, delimiter=" ")

#### extract number of users in data

In [None]:
# Show the raw header of the third column; the regex below expects it to contain "(total=<N>)".
# The call form of print with one argument behaves the same on Python 2 and Python 3.
print(stat_df.columns[2])

# Capture the digits inside "(total=...)"; the match object is read again when
# the user count is extracted further down.
total_num_matcher = re.match(r'.*\(total=(\d+?)\)', stat_df.columns[2], re.M|re.I)
if not total_num_matcher:
    # Fail early: without a match the user count cannot be determined.
    raise RuntimeError("Column name does NOT match the regex!")

#### rename a column

In [None]:
# Give the third column a fixed, easy-to-reference name; all other headers are kept.
cols = stat_df.columns.tolist()
cols[2] = "fraction_of_active_nodes"
stat_df.columns = cols

In [None]:
# Preview the first rows of the table after the column rename.
stat_df.head()

In [None]:
# The user count is the number captured from "(total=<N>)" in the column header.
num_of_users = int(total_num_matcher.group(1))
# One less than the number of rows in the statistics table.
num_of_days = stat_df.shape[0] - 1

In [None]:
# Keep only the first `num_of_days` entries of both per-row series.
p = list(stat_df["fraction_of_active_nodes"].iloc[:num_of_days])
p_overlap = list(stat_df["fraction_of_users_in_2day_intersections"].iloc[:num_of_days])

# Correlations in real data

In [None]:
# Correlation series measured on the real data, cut to `num_of_days - 1` values each.
data_spearman = list(stat_df["spearman"].iloc[:num_of_days-1])
data_w_spearman = list(stat_df["w_spearman"].iloc[:num_of_days-1])

# Popularity model

In [None]:
# Show the model dimensions. Formatting into one string keeps the output
# ("<users> <days>") identical on Python 2 and Python 3.
print("%s %s" % (num_of_users, num_of_days))

**TODO: fit the power-law exponent on the aggregated centrality values of the real data!**

In [None]:
# Build the popularity model from the user count and day count extracted from the data.
model = pm.PopularityModel(num_of_users, num_of_days)

In [None]:
# Centrality output of the model for the measured daily series, without passing `lambda_`
# (whatever default get_centrality_with_markov defines is used).
X_act = model.get_centrality_with_markov(p, p_overlap)

In [None]:
# Same call with an explicit lambda_ of 0.1.
# Note: `X_act_leaders` is assigned again by the lambda search further down.
X_act_leaders = model.get_centrality_with_markov(p, p_overlap, lambda_=0.1)

# Experiments

### Search for optimal _lambda_ parameter

In [None]:
# Candidate lambda values: 11 evenly spaced points covering [0, 1].
lambdas = np.linspace(0.0, 1.0, num=11)
print(lambdas)

# Search against the unweighted correlation series of the real data.
(opt_lambda, opt_spearman, opt_diff, X_act_leaders) = pmu.get_opt_lambda_for_model(lambdas, model, p, p_overlap, num_of_days, data_spearman, for_weighted=False)

# Same search against the weighted correlation series.
(opt_w_lambda, opt_w_spearman, opt_w_diff, w_X_act_leaders) = pmu.get_opt_lambda_for_model(lambdas, model, p, p_overlap, num_of_days, data_w_spearman, for_weighted=True)

# Single formatted argument: identical "<a> <b>" output on Python 2 and Python 3.
print("%s %s" % (opt_lambda, opt_w_lambda))

In [None]:
# Module-level marker cycle shared by successive plot_correlations calls.
marker = itertools.cycle(("o", "s", "^", "v", ">", "<", "D","*"))


def plot_correlations(num_of_days, values, labels, caption, figsize=(12,8)):
    """Plot several correlation series over the day index on one figure.

    Parameters
    ----------
    num_of_days : int
        Every series is drawn against ``range(num_of_days - 1)``.
    values : sequence of sequences
        One series per curve; each must hold ``num_of_days - 1`` values.
    labels : sequence of str
        Legend label for each entry of ``values`` (same order).
    caption : str
        Text shown as the figure title.
    figsize : tuple, optional
        Figure size forwarded to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    day_index = range(num_of_days - 1)
    for i, series in enumerate(values):
        # The marker is given once (via `marker=`) together with a solid line;
        # `next(marker)` works on both Python 2 and Python 3 iterators.
        plt.plot(day_index, series, linestyle='-', marker=next(marker),
                 markersize=10, label=labels[i])
    plt.ylim(-1.0,1.1)
    # Use the caption passed by the caller instead of dropping it.
    plt.title(caption)
    plt.legend()
    plt.show()

# Model and data curves, listed in the same order as their legend labels.
correlation_series = [opt_spearman, data_spearman, opt_w_spearman, data_w_spearman]
label_list = ["spearman model","spearman data","weighted spearman model","weighted spearman data"]
plot_correlations(num_of_days, correlation_series, label_list, caption="Spearman's rho")

## 1.) Model stability related to daily variations

In [None]:
def test_stability_for_alpha(lambdas, num_samples):
    """Measure how much the model's fit to the data varies between runs for each lambda.

    For every value in ``lambdas`` a fresh ``pm.PopularityModel`` is built
    ``num_samples`` times from the notebook-level ``num_of_users``, ``num_of_days``,
    ``p`` and ``p_overlap``. The correlations of each sample are compared with
    ``data_spearman`` / ``data_w_spearman`` through ``pmu.rmse``.

    Parameters
    ----------
    lambdas : iterable of float
        Values passed as ``lambda_`` to ``get_centrality_with_markov``.
    num_samples : int
        Number of independent models generated per lambda value.

    Returns
    -------
    pandas.DataFrame
        Columns ``lambda`` (numeric), ``type`` (``'unweighted'`` or ``'weighted'``)
        and ``rmse`` (float64); two rows per generated sample.
    """
    rmse_arr = []
    for selected_lambda in lambdas:
        for _ in range(num_samples):
            sample_model = pm.PopularityModel(num_of_users, num_of_days)
            x_act_leaders = sample_model.get_centrality_with_markov(p, p_overlap, lambda_=selected_lambda)
            res = pmu.get_custom_correlations(x_act_leaders, num_of_days)
            # Column 0 is compared with the unweighted data series, column 1 with the weighted one.
            spearman, w_spearman = list(res[:,0]), list(res[:,1])
            rmse_arr.append([selected_lambda, 'unweighted', pmu.rmse(data_spearman, spearman)])
            rmse_arr.append([selected_lambda, 'weighted', pmu.rmse(data_w_spearman, w_spearman)])
        print("lambda=%f finished" % selected_lambda)
    # Build the frame straight from the records: routing them through np.array would
    # coerce every cell (including the numeric lambda column) to a string.
    rmse_df = pd.DataFrame(rmse_arr, columns=["lambda","type","rmse"])
    rmse_df["rmse"] = rmse_df["rmse"].astype("float64")
    return rmse_df

In [None]:
# Number of models generated per lambda value in the stability experiment.
num_of_samples = 10
rmse_corr = test_stability_for_alpha(lambdas, num_samples=num_of_samples)

In [None]:
# Persist the per-sample RMSE table as a semicolon-separated file.
# The output directory is created first so the export also works on a fresh checkout.
stability_dir = './stability_spearman'
if not os.path.isdir(stability_dir):
    os.makedirs(stability_dir)
# NOTE(review): the file name says "kendall" although the values come from the
# spearman / w_spearman comparison — confirm the name is intended.
rmse_corr.to_csv('./stability_spearman/%s_%i_kendall_stability_%i.csv' % (dataset_id, num_of_users, num_of_samples),sep=";")

In [None]:
# Aggregate the per-sample RMSE values into a mean and a standard deviation per lambda,
# separately for the 'weighted' and the 'unweighted' rows.
# NOTE(review): this import sits in the middle of the notebook; consider moving it to the import cells.
import graphlab
# Convert the pandas frame to an SFrame so the graphlab aggregators can be used.
data = graphlab.SFrame(rmse_corr)
# Split the rows by correlation type.
unweighted_rmse_corr = data[data["type"] == "unweighted"]
weighted_rmse_corr = data[data["type"] == "weighted"]
# Weighted rows: average and standard deviation of rmse per lambda, joined on "lambda".
# NOTE(review): check whether graphlab.aggregate.STD is the population or the sample deviation.
weighted_rmse_avg = weighted_rmse_corr.groupby("lambda",{'rmse_avg':graphlab.aggregate.AVG('rmse')})
weighted_rmse_std = weighted_rmse_corr.groupby("lambda",{'rmse_std':graphlab.aggregate.STD('rmse')})
weighted_rmse = weighted_rmse_avg.join(weighted_rmse_std,"lambda")
# Unweighted rows: same two aggregations and join.
unweighted_rmse_avg = unweighted_rmse_corr.groupby("lambda",{'rmse_avg':graphlab.aggregate.AVG('rmse')})
unweighted_rmse_std = unweighted_rmse_corr.groupby("lambda",{'rmse_std':graphlab.aggregate.STD('rmse')})
unweighted_rmse = unweighted_rmse_avg.join(unweighted_rmse_std,"lambda")
# Back to pandas for plotting.
weighted_rmse = weighted_rmse.to_dataframe()
unweighted_rmse = unweighted_rmse.to_dataframe()

In [None]:
# Mean RMSE per lambda with the standard deviation as error bar, one series per correlation type.
# The marker cycle is re-created so this figure always starts from the first marker.
marker = itertools.cycle(("o", "s", "^", "v", ">", "<", "D","*"))
fig, axes = plt.subplots(figsize=(12,8))
# The marker symbol is passed as the format string: this draws markers without a
# connecting line and avoids specifying the marker twice. `next(marker)` works on
# both Python 2 and Python 3.
axes.errorbar(unweighted_rmse["lambda"], unweighted_rmse["rmse_avg"], yerr=unweighted_rmse["rmse_std"],
              fmt=next(marker), label="unweighted", elinewidth=3, markersize=10)
axes.errorbar(weighted_rmse["lambda"], weighted_rmse["rmse_avg"], yerr=weighted_rmse["rmse_std"],
              fmt=next(marker), label="weighted", elinewidth=3, markersize=10)
axes.set_xlabel("lambda")
axes.set_ylabel("mean RMSE")
axes.legend()
# Render the figure explicitly so the cell does not end with the legend object's repr.
plt.show()

### RESULTS:

   * The model is stable for the unweighted Spearman correlation. The mean RMSE is very similar for all lambda values (so far the average is computed from only 10 samples).
   * For lambda=0 the weighted Spearman correlation is unstable. For positive lambdas the mean RMSE is less than 0.1.

# Distribution of the per-sample RMSE for each lambda, split by correlation type.
box_fig, box_ax = plt.subplots(figsize=(20,10))
sns.boxplot(x="lambda", y="rmse", hue="type", data=rmse_corr, ax=box_ax)
plt.show()

## 2.) Model stability related to number of users

def test_stability_for_user_count(user_ratios, original_num_users, fixed_lambda, num_samples):
    """Measure how the model's fit to the data changes with the number of simulated users.

    For every ratio a fresh ``pm.PopularityModel`` with
    ``ceil(original_num_users * ratio)`` users is built ``num_samples`` times from the
    notebook-level ``num_of_days``, ``p`` and ``p_overlap``. The correlations of each
    sample are compared with ``data_spearman`` / ``data_w_spearman`` through ``pmu.rmse``.

    Parameters
    ----------
    user_ratios : iterable of float
        Multipliers applied to ``original_num_users``.
    original_num_users : int
        Reference user count that the ratios scale.
    fixed_lambda : float
        Value passed as ``lambda_`` to ``get_centrality_with_markov`` for every sample.
    num_samples : int
        Number of independent models generated per ratio.

    Returns
    -------
    pandas.DataFrame
        Columns ``ratio`` (numeric), ``type`` (``'unweighted'`` or ``'weighted'``)
        and ``rmse`` (float64); two rows per generated sample.
    """
    rmse_arr = []
    for ratio in user_ratios:
        # The scaled user count does not change between samples, so compute it once per ratio.
        scaled_num_users = int(math.ceil(original_num_users*ratio))
        for _ in range(num_samples):
            sample_model = pm.PopularityModel(scaled_num_users, num_of_days)
            x_act_leaders = sample_model.get_centrality_with_markov(p, p_overlap, lambda_=fixed_lambda)
            res = pmu.get_custom_correlations(x_act_leaders, num_of_days)
            # Column 0 is compared with the unweighted data series, column 1 with the weighted one.
            spearman, w_spearman = list(res[:,0]), list(res[:,1])
            rmse_arr.append([ratio, 'unweighted', pmu.rmse(data_spearman, spearman)])
            rmse_arr.append([ratio, 'weighted', pmu.rmse(data_w_spearman, w_spearman)])
        print("ratio=%f finished" % ratio)
    # Build the frame straight from the records: routing them through np.array would
    # coerce every cell (including the numeric ratio column) to a string.
    rmse_df = pd.DataFrame(rmse_arr, columns=["ratio","type","rmse"])
    rmse_df["rmse"] = rmse_df["rmse"].astype("float64")
    return rmse_df

# Stability experiment over user-count ratios with lambda fixed at 0.1 and 10 samples per ratio.
# Note: this assigns `rmse_corr` again, replacing the frame from the lambda experiment.
rmse_corr = test_stability_for_user_count(
    user_ratios=[0.1, 0.5, 1.0, 10],
    original_num_users=num_of_users,
    fixed_lambda=0.1,
    num_samples=10,
)

### RESULTS:

   * The model does not perform well for small user counts
   * The weighted Spearman correlation in particular gives incorrect results. I think the reason is the very small number of leaders (e.g. for maidan there are only **4 leaders** on the first day - see the following computation...)
   * For large number of users the model is stable for both unweighted and weighted spearman.
   * Later I will do experiments with _ratio_>10 too...

# Back-of-the-envelope count for the smallest user ratio (N * q_1, per the trailing comment):
# num_of_users is scaled by 0.1, the smallest ratio used in the experiment above.
# NOTE(review): 0.018416 is hard-coded — presumably the first-day active fraction of the dataset
# named in the results text — and the second 0.1 is presumably the lambda used there; confirm both.
(num_of_users * 0.1) * (0.018416 * 0.1) # N * q_1

# Distribution of the per-sample RMSE for each user-count ratio, split by correlation type.
ratio_fig, ratio_ax = plt.subplots(figsize=(20,10))
sns.boxplot(x="ratio", y="rmse", hue="type", data=rmse_corr, ax=ratio_ax)
plt.show()