In [None]:
# Stretch the notebook container to the full browser width so wide tables and
# plots are not clipped.  `display` is imported from the public
# `IPython.display` module; importing it from `IPython.core.display` is
# deprecated in newer IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import sys
from datawand.parametrization import ParamHelper

In [None]:
# Load the pipeline parameters from the JSON config.  sys.argv is forwarded to
# ParamHelper -- presumably so values can be overridden from the command line
# (confirm in datawand.parametrization).
ph = ParamHelper("../pipelines/PopularityModelScores.json",sys.argv)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re, math

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import sys

# Put ../python/ (relative to the current working directory) at the front of
# the import path so the project-local packages below can be imported.
sys.path.insert(0,'../python/')
import correlation_new.correlation_computer as cc
import popularity_model.popularity_model as pm

# Choose dataset

In [None]:
# Root folder of the experiment: the input statistics are read from it and the
# generated scores are written below it.
experiment_folder = ph.get("experiment_folder")

In [None]:
# Dataset identifier: selects the statistics file and names the output folders.
dataset_id = ph.get("dataset_id")

In [None]:
# Per-day statistics of the chosen dataset are stored as a space-separated
# file under <experiment_folder>/centrality_data/.
dataset_stat_file = "%s/centrality_data/%s_results.csv" % (
    experiment_folder,
    dataset_id,
)
stat_df = pd.read_csv(
    dataset_stat_file,
    sep=" ",
)

#### extract number of users in data

In [None]:
# Show the raw header of the third column; it is expected to embed the total
# number of users as "...(total=N)".
# print(...) with a single argument behaves identically on Python 2 and 3.
print(stat_df.columns[2])

# Capture N from the header and fail early if the file layout is not the
# expected one.
total_num_matcher = re.match(r'.*\(total=(\d+?)\)', stat_df.columns[2], re.M|re.I)
if not total_num_matcher:
    raise RuntimeError("Column name does NOT match the regex!")

#### rename a column

In [None]:
# Give the third column a short, stable name (its original header carries the
# "(total=N)" suffix); all other column names are kept as they are.
cols = stat_df.columns.tolist()
cols[2] = "fraction_of_active_nodes"
stat_df.columns = cols

In [None]:
# Preview the first rows to check the renamed column.
stat_df.head()

#### Kendall's Tau is computationally intensive, so only a small sample is taken

In [None]:
# Fixed sample size used for the model.  The full population size would be
# int(total_num_matcher.group(1)), i.e. the total parsed from the column header.
num_of_users = 5000
# One day fewer than there are rows in the statistics table.
num_of_days = stat_df.shape[0] - 1

In [None]:
# Daily inputs of the Markov variants, limited to the first num_of_days rows:
#   p         -- fraction of active nodes per day
#   p_overlap -- fraction of users in the 2-day intersections
p = list(stat_df["fraction_of_active_nodes"].iloc[:num_of_days])
p_overlap = list(stat_df["fraction_of_users_in_2day_intersections"].iloc[:num_of_days])

# Correlations in real data

In [None]:
# Correlation values measured on the real data (plain and weighted Kendall),
# limited to the first num_of_days - 1 rows.
data_kendall = list(stat_df["kendall"].iloc[:num_of_days - 1])
data_w_kendall = list(stat_df["w_kendall"].iloc[:num_of_days - 1])

# Popularity model

In [None]:
# Report the model dimensions.  The formatted form prints the same
# "<users> <days>" line on Python 2 and Python 3.
print("%s %s" % (num_of_users, num_of_days))

**TODO: fit the power-law exponent on the aggregated centrality values of the real data!**

In [None]:
# Build the popularity model for num_of_users users over num_of_days days.
# NOTE(review): if PopularityModel draws random numbers, no seed is set in this
# notebook, so results may differ between runs -- confirm.
model = pm.PopularityModel(num_of_users, num_of_days)

### I. popularity of users

In [None]:
# Distribution of model.U (the user popularity values).
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; switch to
# histplot/displot once the environment is upgraded.
ax = sns.distplot(model.U)

### II. daily variations

In [None]:
# Distribution of the first column of model.alpha (the daily variations).
# NOTE(review): which axis is users and which is days is not visible here -- confirm.
ax = sns.distplot(model.alpha[:,0])

### III. calculate daily centrality scores (without Markov model)

In [None]:
# Distribution of the first row of model.X, i.e. the scores of day 0 (rows are
# indexed by day when the matrix is exported further below).
ax = sns.distplot(model.X[0,:])

### IV. Introducing Markov model without leaders

In [None]:
# Markov-model variant without leaders (no lambda_ given); the inputs are the
# per-day active-node fractions and the 2-day overlap fractions from stat_df.
X_act = model.get_centrality_with_markov(p, p_overlap)

### V. Introducing Markov model with leaders

In [None]:
# Markov-model variant with leaders: same inputs as above plus lambda_=0.1.
# NOTE(review): the exact meaning of lambda_ is defined in popularity_model -- confirm.
X_act_leaders = model.get_centrality_with_markov(p, p_overlap, lambda_=0.1)

#### Rounding of scores is needed because:
   * in original datasets there are many ties for centrality scores
   * there are very few ties among the originally generated scores

In [None]:
# Round every score up to the next integer so that the generated scores
# contain ties, as the real centrality scores do.
X_act_leaders = np.ceil(X_act_leaders)

In [None]:
# Inspect the rounded scores in the first row.
X_act_leaders[0,:]

In [None]:
import scipy.stats as stats

In [None]:
# Derive the two lists to be correlated from the first two rows of the rounded
# leader-model scores.
# NOTE(review): presumably restricted to nodes active in at least one of the
# two rows -- confirm in correlation_computer.
rank_list_0, rank_list_1 = cc.get_union_of_active_nodes(X_act_leaders[0,:],X_act_leaders[1,:])

In [None]:
%%time
print stats.kendalltau(rank_list_0, rank_list_1)

In [None]:
%%time
# Time the project's weighted Kendall computation on the same two lists.
cc.computeWKendall(rank_list_0, rank_list_1,ranked_input=True)

# Export centrality scores

In [None]:
import os

def export_daily_scores(output_folder, M, n_days=None, n_users=None):
    """Write one ``centrality_scores_<day>.txt`` file per day into ``output_folder``.

    Each file lists the users whose score on that day is strictly positive,
    one ``"<user index> <score>"`` pair per line (score formatted with ``%f``).
    The folder is created first if it does not exist.

    Parameters
    ----------
    output_folder : str
        Destination directory.
    M : 2-D array indexable as ``M[day, user]``
        Score matrix to export.
    n_days, n_users : int, optional
        Number of rows / columns of ``M`` to export.  When omitted they fall
        back to the notebook-level ``num_of_days`` / ``num_of_users``, which is
        what the function used before these parameters existed.
    """
    # Resolve the loop bounds lazily so the notebook globals are only needed
    # when the caller does not pass explicit sizes.
    if n_days is None:
        n_days = num_of_days
    if n_users is None:
        n_users = num_of_users

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for i in range(n_days):
        day_file = os.path.join(output_folder, 'centrality_scores_%i.txt' % i)
        # `with` closes the handle even if a write raises.
        with open(day_file, 'w') as f:
            for j in range(n_users):
                if M[i,j] > 0.0:
                    f.write('%i %f\n' % (j,M[i,j]))
    print('Daily scores were exported to files.')

In [None]:
# Common parent folder for all exported popularity-model scores.
score_folder = "%s/popularity_scores" % experiment_folder

In [None]:
# Export the plain popularity-model scores (model.X, no Markov model).
export_daily_scores('%s/%s_pop_model/centrality_scores/' % (score_folder,dataset_id), model.X)

In [None]:
# Export the scores of the Markov model without leaders (X_act).
export_daily_scores('%s/%s_pop_model_markov/centrality_scores/' % (score_folder,dataset_id), X_act)

In [None]:
# Export the scores of the Markov model with leaders (X_act_leaders, already
# rounded up with np.ceil).
export_daily_scores('%s/%s_pop_model_leaders/centrality_scores/' % (score_folder,dataset_id), X_act_leaders)