In [None]:
import sys

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import math

In [None]:
sys.path.insert(0,'../../python/')
import correlation_new.correlation_computer as cc
import correlation_new.correlation_utils_new as cu
import popularity_model.popularity_model as pm
import popularity_model.popularity_model_utils as pmu

In [None]:
from datawand.parametrization import ParamHelper
# Load the pipeline parameters for this notebook from the JSON pipeline description.
# sys.argv is passed along as well — presumably so values can be overridden from the
# command line when the notebook is run as a script (TODO confirm against datawand).
ph = ParamHelper("../../pipelines/PopularityModelScores.json",sys.argv)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Choose dataset

In [None]:
# Run configuration, all taken from the pipeline parameters.
experiment_folder = ph.get("experiment_folder")
dataset_id = ph.get("dataset_id")
measure_id = ph.get("measure_id")
# Passed as n_threads to the correlation and stability helpers further down.
N_THREADS = ph.get("num_workers")
# NOTE(review): LAMBDA is not referenced by any other cell visible in this notebook.
LAMBDA = 0.1

# Folder holding the precomputed statistics. The value ends with "/" and the cells
# below append names that start with "/", so the resulting paths contain "//".
input_prefix = "%s/results/corr_and_stats/" % experiment_folder

In [None]:
# Statistics table for the chosen dataset/measure pair; the file is space-separated.
# Later cells read its "prev_day_frac", "overlap_frac" and "kendall" columns and use
# its row count as the number of days.
dataset_stat_file = input_prefix + "/%s_%s.csv" % (dataset_id, measure_id)
stat_df = pd.read_csv(dataset_stat_file, sep=" ")

In [None]:
# Quick look at the first rows of the loaded statistics table.
stat_df.head()

#### Extract dataset sizes

In [None]:
# One row of the statistics table per day.
num_of_days = len(stat_df)

# The total vertex (user) count is stored as a single integer on the first line
# of a small text file next to the statistics table.
num_of_users = 0
vertex_count_file = input_prefix + "/%s_total_vertex_count.txt" % (dataset_id)
with open(vertex_count_file) as f:
    num_of_users = int(f.readline())

# A zero or negative count cannot describe a real dataset; fail early instead of
# letting a nonsensical size reach the model.
if num_of_users <= 0:
    raise RuntimeError('Invalid total vertex count!')

#### Experiment with only 10% of the total user count

In [None]:
# Fraction of the total user count that the experiment is run with.
USER_SAMPLE_FRACTION = 0.1

# NOTE: num_of_users is overwritten in place, so running this cell twice without
# re-running the cell that reads the vertex count shrinks the value a second time.
# print(...) with a single argument gives the same output on Python 2 and Python 3.
print(num_of_users)  # total user count
num_of_users = int(np.ceil(num_of_users*USER_SAMPLE_FRACTION))
print(num_of_users)  # reduced user count (rounded up)

In [None]:
# Show the model dimensions (users, days). A formatted string is used so that the
# output ("<users> <days>") is identical on Python 2 and Python 3.
print("%s %s" % (num_of_users, num_of_days))

In [None]:
# Per-day input series for the popularity model, taken from the statistics table.
# num_of_days equals len(stat_df), so the slice keeps every row.
p = list(stat_df["prev_day_frac"])[:num_of_days]
p_overlap = list(stat_df["overlap_frac"])[:num_of_days]

# Correlations in real data

In [None]:
# Kendall correlations measured on the real data. Only the first num_of_days-1
# values are kept, which matches the x range used by plot_correlations.
data_kendall = list(stat_df["kendall"])[:num_of_days-1]

In [None]:
# Sanity check: the series should hold num_of_days-1 values.
len(data_kendall)

# Stability Experiments

In [None]:
def plot_correlations(num_of_days, values, labels, caption, figsize=(10,5)):
    """Plot several correlation series on one figure and display it.

    Parameters
    ----------
    num_of_days : int
        Each series is drawn against the x positions ``range(num_of_days-1)``.
    values : indexable collection of series
        ``values[i]`` is the i-th curve.
    labels : indexable collection
        ``labels[i]`` is the legend entry for ``values[i]``.
    caption : str
        Title of the figure.
    figsize : tuple, optional
        Forwarded to ``plt.figure``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    n_points = num_of_days-1

    plt.figure(figsize=figsize)
    plt.title(caption)

    for series_idx in range(len(values)):
        series = values[series_idx]
        series_label = labels[series_idx]
        plt.plot(range(n_points), series, '-o', label=series_label)

    # Fixed y range so that figures from different runs are directly comparable.
    plt.ylim(-1.0,1.1)
    plt.legend()
    plt.show()

In [None]:
# Legend labels; the order matches the series list built in
# get_result_for_active_nodes (model correlations first, data correlations second).
label_list = ['kendall_model','kendall_data']

In [None]:
def get_result_for_active_nodes(M):
    """Compare model Kendall correlations with the ones measured on the data.

    The Kendall correlations for ``M`` are computed with
    ``cu.get_correlations_from_matrix_for_act`` and plotted together with the
    notebook-level ``data_kendall`` series.

    Parameters
    ----------
    M
        Model output handed to ``cu.get_correlations_from_matrix_for_act``; the
        notebook passes the result of ``PopularityModel.get_centrality_with_markov``.

    Returns
    -------
    None
        The comparison is shown as a figure.
    """
    model_kendall = cu.get_correlations_from_matrix_for_act(
        M,
        num_of_days,
        corr_type='kendall',
        n_threads=N_THREADS,
    )
    series_to_plot = [model_kendall, data_kendall]
    plot_correlations(num_of_days, series_to_plot, label_list, "Correlations (active)")

### Search for optimal _lambda_ parameter

In [None]:
# Candidate lambda values: 11 evenly spaced points covering [0.0, 1.0] (step 0.1).
lambdas = np.linspace(0.0, 1.0, num=11)
# print(...) with a single argument gives the same output on Python 2 and Python 3.
print(lambdas)

In [None]:
# Inputs shared by every popularity-model run of the stability test.
pop_model_parameters = (num_of_users, num_of_days, p, p_overlap)

# Prefix handed to the stability helper as output_prefix.
out_prefix = "%s/results/model_stability/%s_%s" % (experiment_folder, dataset_id, measure_id)
# Uncomment to pass None instead (presumably disables writing results — TODO confirm):
#out_prefix = None

# Run the stability test over all candidate lambdas.
rmse_df = pmu.test_stability_for_lambdas(
    lambdas,
    5,                     # NOTE(review): presumably the number of repetitions per lambda — confirm
    "kendall",
    measure_id,
    pop_model_parameters,
    data_kendall,
    n_threads=N_THREADS,
    output_prefix=out_prefix,
)

In [None]:
# Summarise the stability results; the cells below read the "lambda" and
# "rmse_mean" columns of this summary.
rmse_stats = pmu.summarize_stability_results(rmse_df)
rmse_stats

In [None]:
# Plot the "rmse_mean" column of the summary against the summary's index.
rmse_stats["rmse_mean"].plot()

In [None]:
# Pick the lambda with the smallest mean RMSE. The value is extracted with
# .iloc[0] because calling float() directly on a one-element Series is
# deprecated in recent pandas releases.
best_row_first = rmse_stats.sort_values("rmse_mean")
opt_lambda = float(best_row_first["lambda"].iloc[0])
# print(...) with a single argument gives the same output on Python 2 and Python 3.
print(opt_lambda)

In [None]:
# Build the popularity model for the (reduced) user count and the number of days.
model = pm.PopularityModel(num_of_users, num_of_days)
# Run without an explicit lambda_ (the method's default applies).
# NOTE(review): X_act is not referenced by any other cell visible in this notebook.
X_act = model.get_centrality_with_markov(p, p_overlap)
# Run with the lambda that gave the lowest mean RMSE in the stability test.
X_act_leaders = model.get_centrality_with_markov(p, p_overlap, lambda_=opt_lambda)

In [None]:
# Plot the model's Kendall correlations (optimal lambda) against the data's.
get_result_for_active_nodes(X_act_leaders)