In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import math

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import sys

sys.path.insert(0,'../../python/')
import correlation_new.correlation_computer as cc
import correlation_new.correlation_utils_new as cu
import popularity_model.popularity_model as pm

# Choose dataset

In [None]:
# Dataset and centrality measure to analyse; both are substituted into the
# name of the statistics file that is read from input_prefix.
dataset_id = 'maidan'
measure_id = 'in_degree'
# Value later passed as lambda_ to model.get_centrality_with_markov (the "with leaders" run).
LAMBDA = 0.1

# Directory the statistics CSV and the total-vertex-count file are read from.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
input_prefix = "/mnt/idms/fberes/NETWORK/andreas_article/results/corr_and_stats/"

In [None]:
# Statistics for the selected dataset and measure, read from a space-separated
# file named "<dataset_id>_<measure_id>.csv" under input_prefix.
dataset_stat_file = "".join((input_prefix, "/%s_%s.csv" % (dataset_id, measure_id)))
stat_df = pd.read_csv(dataset_stat_file, sep=" ")

In [None]:
# Preview the first rows to sanity-check the parsed columns.
stat_df.head()

#### extract dataset sizes

In [None]:
# The number of rows in stat_df is taken as the number of days.
num_of_days = len(stat_df)

# The total vertex (user) count is the first line of a separate text file;
# a count of zero is treated as invalid.
num_of_users = 0
with open(input_prefix + "/%s_total_vertex_count.txt" % dataset_id) as count_file:
    num_of_users = int(count_file.readline())
if not num_of_users:
    raise RuntimeError('Invalid total vertex count!')

In [None]:
# Pre-formatted, parenthesised print: emits the same "users days" line on
# Python 2 and is also valid syntax on Python 3.
print("%s %s" % (num_of_users, num_of_days))

In [None]:
# Per-row "prev_day_frac" and "overlap_frac" values (first num_of_days rows);
# these are the two probability inputs handed to get_centrality_with_markov.
p = list(stat_df["prev_day_frac"].iloc[:num_of_days])
p_overlap = list(stat_df["overlap_frac"].iloc[:num_of_days])

# Correlations in real data

In [None]:
# Correlations measured on the real data: only the first num_of_days-1 rows are
# kept, matching the num_of_days-1 points drawn by plot_correlations.
data_pearson = list(stat_df["pearson"].iloc[:num_of_days - 1])
data_spearman = list(stat_df["spearman"].iloc[:num_of_days - 1])

In [None]:
# Sanity check: number of real-data correlation values (expected num_of_days-1 after the slice above).
len(data_pearson)

# Popularity model

In [None]:
# Build the popularity model from the user and day counts extracted above.
model = pm.PopularityModel(num_of_users, num_of_days)

### I. popularity of users

In [None]:
# Distribution plot of model.U (the "popularity of users" term per the heading).
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 — revisit when upgrading.
ax = sns.distplot(model.U)

### II. daily variations

In [None]:
# Distribution plot of the first column of model.alpha (the "daily variations" term per the heading).
# NOTE(review): which axis indexes days vs. users is not visible here — confirm in PopularityModel.
ax = sns.distplot(model.alpha[:,0])

### III. calculate daily centrality scores (without Markov model)

In [None]:
# Distribution plot of the first row of model.X (the centrality scores without the Markov model, per the heading).
# NOTE(review): which axis indexes days vs. users is not visible here — confirm in PopularityModel.
ax = sns.distplot(model.X[0,:])

### IV. Introducing Markov model without leaders

In [None]:
# Scores from the Markov variant of the model, driven by the prev_day_frac (p)
# and overlap_frac (p_overlap) columns; no lambda_ is passed ("without leaders").
X_act = model.get_centrality_with_markov(p, p_overlap)

### V. Introducing Markov model with leaders

In [None]:
# Same Markov variant, now with lambda_ set to the configured LAMBDA ("with leaders").
# Note: X_act_leaders is reassigned later by the lambda search in section 3.
X_act_leaders = model.get_centrality_with_markov(p, p_overlap, lambda_=LAMBDA)

# Experiments

In [None]:
def plot_correlations(num_of_days, values, labels, caption, figsize=(10,5)):
    """Draw several correlation series on one figure and show it.

    Args:
        num_of_days: number of days; every series is plotted against
            ``range(num_of_days-1)``, so each must hold ``num_of_days-1`` points.
        values: list of series (one sequence of numbers per curve).
        labels: legend labels, ``labels[i]`` belonging to ``values[i]``.
        caption: figure title.
        figsize: figure size forwarded to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    plt.title(caption)
    for idx, series in enumerate(values):
        # '-o' = solid line with circle markers.
        plt.plot(range(num_of_days - 1), series, '-o', label=labels[idx])
    # Fixed y-range so that plots from different experiments are comparable.
    plt.ylim(-1.0, 1.1)
    plt.legend()
    plt.show()

## 1.) correlations without Markov model

#### correlations for all vertices

   * there is no fit to real data (spearman)
   * pearson is quite unstable

In [None]:
# Correlation series computed from the plain model scores (model.X), compared
# with the correlations measured on the real data.
model_pearson = cu.get_correlations_from_matrix(model.X, num_of_days, corr_type='pearson')
model_spearman = cu.get_correlations_from_matrix(model.X, num_of_days, corr_type='spearman')
label_list = ['pearson_model', 'pearson_data', 'spearman_model', 'spearman_data']
plot_correlations(
    num_of_days,
    values=[model_pearson, data_pearson, model_spearman, data_spearman],
    labels=label_list,
    caption="Correlations (all)"
)

#### correlations for only active vertices

   * results are the same as in the "all vertices" scenario

In [None]:
# Same comparison for the plain model scores, using the "_for_act" helper
# (restricted to active vertices, per the section heading).
act_model_pearson = cu.get_correlations_from_matrix_for_act(model.X, num_of_days, corr_type='pearson')
act_model_spearman = cu.get_correlations_from_matrix_for_act(model.X, num_of_days, corr_type='spearman')
label_list = ['pearson_model', 'pearson_data', 'spearman_model', 'spearman_data']
plot_correlations(
    num_of_days,
    values=[act_model_pearson, data_pearson, act_model_spearman, data_spearman],
    labels=label_list,
    caption="Correlations (active)"
)

## 2.) correlations with Markov model

#### correlations for all vertices

   * there is still no fit for spearman

In [None]:
# Correlation series computed from the Markov-model scores without leaders
# (X_act), compared with the correlations measured on the real data.
model_pearson = cu.get_correlations_from_matrix(X_act, num_of_days, corr_type='pearson')
model_spearman = cu.get_correlations_from_matrix(X_act, num_of_days, corr_type='spearman')
label_list = ['pearson_model', 'pearson_data', 'spearman_model', 'spearman_data']
plot_correlations(
    num_of_days,
    values=[model_pearson, data_pearson, model_spearman, data_spearman],
    labels=label_list,
    caption="Correlations (all)"
)

#### correlations for only active vertices

   * there is a good fit for spearman

In [None]:
# Same comparison for the Markov-model scores without leaders (X_act), using
# the "_for_act" helper (restricted to active vertices, per the section heading).
act_model_pearson = cu.get_correlations_from_matrix_for_act(X_act, num_of_days, corr_type='pearson')
act_model_spearman = cu.get_correlations_from_matrix_for_act(X_act, num_of_days, corr_type='spearman')
label_list = ['pearson_model', 'pearson_data', 'spearman_model', 'spearman_data']
plot_correlations(
    num_of_days,
    values=[act_model_pearson, data_pearson, act_model_spearman, data_spearman],
    labels=label_list,
    caption="Correlations (active)"
)

## 3.) correlations with Markov model and Leaders

In [None]:
from sklearn.metrics import mean_squared_error
import math

def rmse(original_values, model_values):
    """Return the root-mean-square error between two series.

    Args:
        original_values: reference values (e.g. correlations from real data).
        model_values: values to compare against the reference.

    Returns:
        Square root of ``mean_squared_error(original_values, model_values)``.
    """
    mse = mean_squared_error(original_values, model_values)
    return math.sqrt(mse)

def get_result_for_lambda(proposed_lambda, pop_model, prob, prob_overlap, number_of_days, corr_type):
    """Run the Markov model with one ``lambda_`` value and correlate its scores.

    Args:
        proposed_lambda: value forwarded as ``lambda_`` to the model.
        pop_model: object providing ``get_centrality_with_markov``.
        prob, prob_overlap: probability inputs forwarded to the model unchanged.
        number_of_days: forwarded to the correlation helper.
        corr_type: correlation type, forwarded positionally to the helper.

    Returns:
        Tuple ``(correlations, scores)``: the helper's output for the scores,
        and the scores returned by the model.
    """
    leader_scores = pop_model.get_centrality_with_markov(
        prob, prob_overlap, lambda_=proposed_lambda
    )
    correlations = cu.get_correlations_from_matrix_for_act(
        leader_scores, number_of_days, corr_type
    )
    return correlations, leader_scores

def get_opt_lambda_for_model(proposed_lambdas, pop_model, prob, prob_overlap, number_of_days, original_corr_values, corr_type):
    """Pick the candidate ``lambda_`` whose model correlations best match the data.

    Every candidate is evaluated with ``get_result_for_lambda`` and scored by
    the RMSE between ``original_corr_values`` and the resulting correlations.
    One ``"<lambda> <rmse>"`` line is printed per candidate.

    Args:
        proposed_lambdas: iterable of candidate ``lambda_`` values.
        pop_model: model object forwarded to ``get_result_for_lambda``.
        prob, prob_overlap: probability inputs forwarded to the model.
        number_of_days: forwarded to the correlation helper.
        original_corr_values: reference correlations from the real data.
        corr_type: correlation type forwarded to the correlation helper.

    Returns:
        Tuple ``(opt_lambda, opt_corr, opt_diff, opt_scores)`` for the candidate
        with the smallest RMSE; on ties the earliest candidate wins.

    Raises:
        ValueError: if ``proposed_lambdas`` contains no candidates.
    """
    best = None
    for current_lambda in proposed_lambdas:
        current_corr, current_scores = get_result_for_lambda(
            current_lambda, pop_model, prob, prob_overlap, number_of_days, corr_type
        )
        current_diff = rmse(original_corr_values, current_corr)
        # Pre-formatted, parenthesised print: identical output on Python 2, valid on Python 3.
        print("%s %s" % (current_lambda, current_diff))
        # Strict '<' keeps the first candidate on ties.
        if best is None or current_diff < best[2]:
            best = (current_lambda, current_corr, current_diff, current_scores)
    if best is None:
        raise ValueError("proposed_lambdas must contain at least one candidate")
    return best

### Search for optimal _lambda_ parameter

In [None]:
# Candidate lambda_ values: 11 evenly spaced points from 0.0 to 1.0 (step 0.1).
lambdas = np.linspace(0.0, 1.0, num=11)
# Parenthesised print: same output on Python 2, valid syntax on Python 3.
print(lambdas)

In [None]:
# Search the lambda grid against the real-data Spearman series and keep the
# winning lambda, its correlation series, its RMSE and its scores.
# Note: this replaces the X_act_leaders computed earlier with the fixed LAMBDA.
opt_lambda, opt_spearman, opt_diff, X_act_leaders = get_opt_lambda_for_model(
    proposed_lambdas=lambdas,
    pop_model=model,
    prob=p,
    prob_overlap=p_overlap,
    number_of_days=num_of_days,
    original_corr_values=data_spearman,
    corr_type='spearman'
)

In [None]:
# Parenthesised print: same output on Python 2, valid syntax on Python 3.
print(opt_lambda)

#### correlations for all vertices

In [None]:
# Correlation series computed from the "with leaders" scores (X_act_leaders),
# compared with the correlations measured on the real data.
model_pearson = cu.get_correlations_from_matrix(X_act_leaders, num_of_days, corr_type='pearson')
model_spearman = cu.get_correlations_from_matrix(X_act_leaders, num_of_days, corr_type='spearman')
label_list = ['pearson_model', 'pearson_data', 'spearman_model', 'spearman_data']
plot_correlations(
    num_of_days,
    values=[model_pearson, data_pearson, model_spearman, data_spearman],
    labels=label_list,
    caption="Correlations (all)"
)

#### correlations for only active vertices

In [None]:
# Same comparison for the "with leaders" scores (X_act_leaders), using the
# "_for_act" helper (restricted to active vertices, per the section heading).
act_model_pearson = cu.get_correlations_from_matrix_for_act(X_act_leaders, num_of_days, corr_type='pearson')
act_model_spearman = cu.get_correlations_from_matrix_for_act(X_act_leaders, num_of_days, corr_type='spearman')
label_list = ['pearson_model', 'pearson_data', 'spearman_model', 'spearman_data']
plot_correlations(
    num_of_days,
    values=[act_model_pearson, data_pearson, act_model_spearman, data_spearman],
    labels=label_list,
    caption="Correlations (active)"
)