In [None]:
# Widen the notebook container to the full browser width.
# `IPython.display` is the public import path; `IPython.core.display` is deprecated for these names.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np
import re, math, itertools

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn styling applied to every figure drawn in this notebook.
sns.set(font="Droid Sans",font_scale = 2)
# NOTE(review): set_style() runs after set(font=...); confirm it does not reset the font choice.
sns.set_style("whitegrid")
sns.set_color_codes("dark")

In [None]:
import sys, os
# Put ../python/ first on the module search path so the project packages imported below resolve.
sys.path.insert(0,'../python/')
import correlation.correlation_utils as cu
import popularity_model.popularity_model as pm
import popularity_model.popularity_model_utils as pmu

# Choose dataset

In [None]:
# Identifier used to build the path of the per-dataset results CSV.
dataset_id = 'yo_pagerank'
# Human-readable dataset name used as the title of the figures.
dataset_title = "Yosoy"

In [None]:
# Results file for the selected dataset; it is parsed as space-separated.
dataset_stat_file = "../correlation_experiments/%s_results.csv" % dataset_id
stat_df = pd.read_csv(dataset_stat_file, sep=" ")

#### Extract the total number of users from the third column's header

In [None]:
# Show the raw header of the third column so a regex mismatch is easy to diagnose.
# A single-argument print(...) call behaves the same on Python 2 and Python 3.
print(stat_df.columns[2])

# Capture the digits inside "(total=<N>)". The match object is kept as a notebook
# variable because a later cell reads its first capture group.
total_num_matcher = re.match(r'.*\(total=(\d+?)\)', stat_df.columns[2], re.M|re.I)
if total_num_matcher is None:
    raise RuntimeError("Column name does NOT match the regex!")

#### Rename the third column to `fraction_of_active_nodes`

In [None]:
# Give the third column a fixed, easy-to-reference name.
# NOTE(review): after this runs, the header no longer contains "(total=N)", so re-running
# the regex cell above will raise; confirm the intended execution order.
cols = list(stat_df.columns)
cols[2] = "fraction_of_active_nodes"
stat_df.columns = cols

In [None]:
# Preview the first rows of the statistics table.
stat_df.head()

In [None]:
# Total number of users, taken from the "(total=N)" capture group of the header regex.
num_of_users = int(total_num_matcher.group(1))
# NOTE(review): one less than the number of rows — presumably the last row is not a full day; confirm.
num_of_days = len(stat_df)-1

In [None]:
# Keep only the first num_of_days entries of each per-row series (positional slice, then list).
p = list(stat_df["fraction_of_active_nodes"].iloc[:num_of_days])
p_overlap = list(stat_df["fraction_of_users_in_2day_intersections"].iloc[:num_of_days])

# Correlations in real data

In [None]:
# Correlation columns measured on the real data, truncated to the first num_of_days-1 entries.
data_spearman = list(stat_df["spearman"].iloc[:num_of_days-1])
data_kendall = list(stat_df["kendall"].iloc[:num_of_days-1])
data_w_kendall = list(stat_df["w_kendall"].iloc[:num_of_days-1])

## Stability plot

In [None]:
# Semicolon-separated stability files for the Spearman and Kendall correlations.
spearman_rmse_df = pd.read_csv('../final_plot_data/yo-0.2/spearman_stability.csv',sep=";")
kendall_rmse_df = pd.read_csv('../final_plot_data/yo-0.2/kendall_stability.csv',sep=";")

In [None]:
import graphlab
# Kendall stability data: split the rows by "type", then aggregate rmse per lambda.
data = graphlab.SFrame(kendall_rmse_df)
unweighted_rmse_corr = data[data["type"] == "unweighted"]
weighted_rmse_corr = data[data["type"] == "weighted"]
# Weighted rows: AVG and STD of rmse grouped by lambda, joined on lambda into one table.
weighted_rmse_avg = weighted_rmse_corr.groupby("lambda",{'rmse_avg':graphlab.aggregate.AVG('rmse')})
weighted_rmse_std = weighted_rmse_corr.groupby("lambda",{'rmse_std':graphlab.aggregate.STD('rmse')})
weighted_rmse = weighted_rmse_avg.join(weighted_rmse_std,"lambda")
# Unweighted rows: the same aggregation.
unweighted_rmse_avg = unweighted_rmse_corr.groupby("lambda",{'rmse_avg':graphlab.aggregate.AVG('rmse')})
unweighted_rmse_std = unweighted_rmse_corr.groupby("lambda",{'rmse_std':graphlab.aggregate.STD('rmse')})
unweighted_rmse = unweighted_rmse_avg.join(unweighted_rmse_std,"lambda")
# Converted with to_dataframe(); these two results feed the stability plot.
kendall_w_rmse = weighted_rmse.to_dataframe()
kendall_rmse = unweighted_rmse.to_dataframe()

# NOTE(review): a cell only displays its last bare expression, so the first line below shows nothing.
spearman_rmse_df
kendall_rmse_df

In [None]:
import graphlab
# Spearman stability data: only the "unweighted" rows are aggregated here.
# NOTE: this rebinds `data` and the `unweighted_rmse*` names that the Kendall cell also assigns.
data = graphlab.SFrame(spearman_rmse_df)
unweighted_rmse_corr = data[data["type"] == "unweighted"]
# The weighted variant is disabled for Spearman (kept commented out below).
#weighted_rmse_corr = data[data["type"] == "weighted"]
#weighted_rmse_avg = weighted_rmse_corr.groupby("lambda",{'rmse_avg':graphlab.aggregate.AVG('rmse')})
#weighted_rmse_std = weighted_rmse_corr.groupby("lambda",{'rmse_std':graphlab.aggregate.STD('rmse')})
#weighted_rmse = weighted_rmse_avg.join(weighted_rmse_std,"lambda")
# AVG and STD of rmse grouped by lambda, joined on lambda into one table.
unweighted_rmse_avg = unweighted_rmse_corr.groupby("lambda",{'rmse_avg':graphlab.aggregate.AVG('rmse')})
unweighted_rmse_std = unweighted_rmse_corr.groupby("lambda",{'rmse_std':graphlab.aggregate.STD('rmse')})
unweighted_rmse = unweighted_rmse_avg.join(unweighted_rmse_std,"lambda")
#spearman_w_rmse = weighted_rmse.to_dataframe()
# Converted with to_dataframe(); this result feeds the stability plot.
spearman_rmse = unweighted_rmse.to_dataframe()

In [None]:
# Stability plot: rmse_avg per lambda with rmse_std as error bars, one series per correlation.
marker = itertools.cycle(("o", "s", "^", "v", ">", "<", "D","*"))
fig, axes = plt.subplots(figsize=(12,8))
axes.set_title(dataset_title)
for rmse_frame, series_label in (
    (spearman_rmse, "unweighted spearman"),
    (kendall_rmse, "unweighted kendall"),
    (kendall_w_rmse, "weighted kendall"),
):
    # The marker is passed as the format string: a marker-only fmt draws points without a
    # connecting line and avoids defining the marker twice (fmt + keyword).
    # next(marker) replaces the Python 2-only marker.next().
    axes.errorbar(
        rmse_frame["lambda"],
        rmse_frame["rmse_avg"],
        yerr=rmse_frame["rmse_std"],
        fmt=next(marker),
        label=series_label,
        elinewidth=3,
        markersize=10,
    )
axes.set_xlim(-0.03,1.0)
# The x values are the "lambda" column, not days.
axes.set_xlabel('lambda')
axes.set_ylabel('RMSE')
axes.legend()
fig.savefig('../final_plot_data/yo-0.2/yosoy_stability.png')

# Utils

In [None]:
def read_corr_file(f_name):
    """Read a text file that holds one correlation value per line.

    Parameters
    ----------
    f_name : str
        Path of the file to read.

    Returns
    -------
    list of float
        The parsed values, in file order. Blank lines are ignored.

    Raises
    ------
    IOError
        If the file cannot be opened.
    ValueError
        If a non-blank line cannot be parsed as a float.
    """
    # The context manager closes the file even when float() raises on a bad line.
    with open(f_name) as f:
        # float() tolerates surrounding whitespace; blank lines (e.g. a trailing
        # empty line at the end of the file) are skipped instead of crashing.
        return [float(line) for line in f if line.strip()]

# Popularity model

In [None]:
# Spearman values read from file; plotted later under the label "spearman model".
opt_spearman = read_corr_file('../final_plot_data/yo-0.2/spearman_0.2.csv')

In [None]:
# Kendall values read from file; plotted later under the label "kendall model".
opt_kendall = read_corr_file('../final_plot_data/yo-0.2/kendall_0.2.csv')

In [None]:
# Weighted Kendall values read from file; plotted later under the label "weighted kendall model".
opt_w_kendall = read_corr_file('../final_plot_data/yo-0.2/w_kendall_0.2.csv')

In [None]:
# Shared marker cycle: it advances once per plotted series, so successive calls of
# plot_correlations keep cycling instead of restarting from the first marker.
marker = itertools.cycle(("o", "s", "^", "v", ">", "<", "D","*"))
def plot_correlations(fig_path, num_of_days, values, labels, ylabel, ylim=(-1.0,1.1), figsize=(12,8)):
    """Plot several per-day series on one axes, save the figure and show it.

    Parameters
    ----------
    fig_path : str
        Path the figure is saved to.
    num_of_days : int
        Each series is drawn against x = 0 .. num_of_days-2, so every series
        must contain num_of_days-1 points.
    values : sequence of sequences
        The series to plot.
    labels : sequence of str
        Legend labels; labels[i] belongs to values[i].
    ylabel : str
        Label of the y axis.
    ylim : tuple, optional
        (lower, upper) limits of the y axis.
    figsize : tuple, optional
        Figure size passed to matplotlib.

    Notes
    -----
    Reads the notebook-level `dataset_title` for the title and advances the
    notebook-level `marker` cycle once per series.
    """
    fig, ax = plt.subplots(figsize=figsize)
    ax.set_title(dataset_title)
    days = range(num_of_days-1)
    for i, series in enumerate(values):
        # Line style and marker are given as keywords only, so the marker is not defined
        # twice (format string + keyword). next(marker) replaces the Python 2-only marker.next().
        ax.plot(days, series, linestyle='-', marker=next(marker), markersize=10, label=labels[i])
    ax.set_xlim(-0.5,num_of_days-1)
    ax.set_ylim(*ylim)
    ax.set_xlabel('Days')
    ax.set_ylabel(ylabel)
    ax.legend(loc='lower center',bbox_to_anchor=(0.5, 0.0),ncol=3,fancybox=True,shadow=True)
    fig.savefig(fig_path)
    plt.show()

In [None]:
# Labels are interleaved model/data so that each one lines up with the value list passed below.
label_list = ["spearman model","spearman data","kendall model","kendall data","weighted kendall model","weighted kendall data"]
plot_correlations('../final_plot_data/yo-0.2/yo_0.2.png',num_of_days,[opt_spearman,data_spearman,opt_kendall,data_kendall,opt_w_kendall,data_w_kendall],label_list,'Correlation measure')

# Difference plot

In [None]:
# Element-wise (model - data) gap for each correlation measure, stored as plain lists.
spearman_diff = list(np.subtract(opt_spearman, data_spearman))
kendall_diff = list(np.subtract(opt_kendall, data_kendall))
w_kendall_diff = list(np.subtract(opt_w_kendall, data_w_kendall))

In [None]:
# Plot the model-minus-data differences with the same helper; the default y-limits are used
# (a tighter ylim is left commented out at the end of the call).
label_list = ["spearman","kendall","weighted kendall"]
plot_correlations('../final_plot_data/yo-0.2//yo_0.2_diff.png',num_of_days,[spearman_diff,kendall_diff,w_kendall_diff],label_list,'Correlation difference')#,ylim=(-0.6,0.6))