In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re, math

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import sys

sys.path.insert(0,'../python/')
import correlation.correlation_utils as cu
import popularity_model.popularity_model as pm

# Choose dataset

In [None]:
# Identifier of the dataset under study; it is interpolated into the input CSV path below.
dataset_id = 'maidan_pagerank'

In [None]:
# Per-dataset results table; the file is read as space-separated values.
dataset_stat_file = "../correlation_experiments/%s_results.csv" % dataset_id
stat_df = pd.read_csv(dataset_stat_file, sep=" ")

#### extract number of users in data

In [None]:
# The third column's original name is expected to embed the total number of
# users as "(total=N)". Capture N here; this must run before that column is
# renamed, otherwise the regex no longer matches.
# print() with a single argument behaves identically on Python 2 and Python 3.
print(stat_df.columns[2])

total_num_matcher = re.match(r'.*\(total=(\d+?)\)', stat_df.columns[2], re.M|re.I)
if not total_num_matcher:
    raise RuntimeError("Column name does NOT match the regex!")

#### rename a column

In [None]:
# Replace the third column's name (which carries the "(total=N)" suffix parsed
# earlier) with a fixed, easy-to-reference name. All other names are kept.
cols = list(stat_df.columns)
cols[2] = "fraction_of_active_nodes"
stat_df.columns = cols

In [None]:
# Preview the first rows to check the renamed column.
stat_df.head()

In [None]:
# Total user count captured from the original name of the third column.
num_of_users = int(total_num_matcher.group(1))
# NOTE(review): one row of stat_df is left out of the day count -- presumably
# the last row is not a complete day; confirm against the results file.
num_of_days = len(stat_df)-1

In [None]:
# Per-day series restricted to the first num_of_days rows. Both lists are
# passed to model.get_centrality_with_markov further down.
p = list(stat_df["fraction_of_active_nodes"])[:num_of_days]
p_overlap = list(stat_df["fraction_of_users_in_2day_intersections"])[:num_of_days]

# Correlations in real data

In [None]:
# Reference correlations measured on the real data. They are cut to
# num_of_days-1 values so they line up with the consecutive-day series that
# plot_correlations draws over range(num_of_days-1).
data_spearman = list(stat_df["spearman"])[:num_of_days-1]
data_w_spearman = list(stat_df["w_spearman"])[:num_of_days-1]

# Popularity model

In [None]:
# Show the model dimensions. A single pre-formatted string prints the same
# "users days" text on Python 2 and Python 3.
print("%s %s" % (num_of_users, num_of_days))

**TODO: fit powerlaw exponent on real data aggregated centrality values!!!**

In [None]:
# Build the popularity model sized to the dataset (user count, day count).
# The cells below read its attributes U, alpha and X.
model = pm.PopularityModel(num_of_users, num_of_days)

### I. popularity of users

In [None]:
# Distribution of model.U.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot
# replace it); revisit this call if the environment is upgraded.
ax = sns.distplot(model.U)

### II. daily variations

In [None]:
# Distribution of the first column of model.alpha.
ax = sns.distplot(model.alpha[:,0])

### III. calculate daily centrality scores (without Markov model)

In [None]:
# Distribution of the first row of model.X (rows are indexed by day in the
# correlation helpers defined later in this notebook).
ax = sns.distplot(model.X[0,:])

### IV. Introducing Markov model without leaders

In [None]:
# Centrality scores from the Markov variant of the model, driven by the daily
# activity fractions (p) and the 2-day overlap fractions (p_overlap).
# lambda_ is not passed here, so the method's own default applies.
X_act = model.get_centrality_with_markov(p, p_overlap)

### V. Introducing Markov model with leaders

In [None]:
# Same Markov variant with lambda_ set explicitly. 0.1 is a hand-picked value;
# a grid search over lambda is run later in the notebook and reassigns X_act_leaders.
X_act_leaders = model.get_centrality_with_markov(p, p_overlap, lambda_=0.1)

# Export centrality scores

In [None]:
import os

def export_daily_scores(output_folder, M, n_days=None, n_users=None):
    """Write one text file of strictly positive centrality scores per day.

    For every day ``i`` a file ``centrality_scores_<i>.txt`` is written into
    ``output_folder`` (created if it does not exist). Each line has the form
    ``"<user_index> <score>"`` and only entries with ``M[i, j] > 0.0`` are
    written.

    Parameters
    ----------
    output_folder : str
        Target directory for the per-day files.
    M : 2-D array-like indexed as ``M[day, user]``
        Score matrix to export.
    n_days, n_users : int, optional
        Number of rows / columns of ``M`` to export. When omitted they fall
        back to the notebook globals ``num_of_days`` / ``num_of_users``, which
        is the original behaviour.
    """
    if n_days is None:
        n_days = num_of_days
    if n_users is None:
        n_users = num_of_users
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for i in range(n_days):
        day_path = os.path.join(output_folder, 'centrality_scores_%i.txt' % i)
        # 'with' guarantees the handle is closed even if a write fails.
        with open(day_path, 'w') as f:
            for j in range(n_users):
                if M[i,j] > 0.0:
                    f.write('%i %f\n' % (j,M[i,j]))
    print('Daily scores were exported to files.')

In [None]:
#export_daily_scores('../correlation_experiments/%s_nelly_model/centrality_scores/' % dataset_id, model.X)

In [None]:
#export_daily_scores('../correlation_experiments/%s_nelly_model_markov/centrality_scores/' % dataset_id, X_act)

In [None]:
#export_daily_scores('../correlation_experiments/%s_nelly_model_leaders/centrality_scores/' % dataset_id, X_act_leaders)

# Experiments

In [None]:
def plot_correlations(num_of_days, values, labels, caption, figsize=(10,5)):
    """Draw several correlation series on one figure and show it.

    Each entry of ``values`` is plotted against ``range(num_of_days-1)`` with
    the label at the same position in ``labels``. The y-axis is fixed to
    [-1.0, 1.1] and ``caption`` is used as the figure title.
    """
    x_positions = range(num_of_days - 1)
    plt.figure(figsize=figsize)
    plt.title(caption)
    for idx, series in enumerate(values):
        plt.plot(x_positions, series, '-o', label=labels[idx])
    plt.ylim(-1.0, 1.1)
    plt.legend()
    plt.show()

In [None]:
import scipy.stats as stats
import operator

def get_correlations(A, num_of_days):
    """Pearson and Spearman correlation between each pair of consecutive days.

    Parameters
    ----------
    A : 2-D array indexed as ``A[day, user]``
    num_of_days : int
        Number of leading rows of ``A`` to use.

    Returns
    -------
    (pearson, spearman) : two lists with ``num_of_days - 1`` coefficients
        each; entry ``k`` compares row ``k`` with row ``k + 1``. Only the
        coefficient (first element of the scipy result) is kept.
    """
    pearson, spearman = [], []
    # range() instead of xrange() so the helper also runs on Python 3.
    for day in range(1, num_of_days):
        previous_day, current_day = A[day-1,:], A[day,:]
        pearson.append(stats.pearsonr(previous_day, current_day)[0])
        spearman.append(stats.spearmanr(previous_day, current_day)[0])
    return pearson, spearman
    
def filter_active_users(A, num_of_days):
    """Collect, for each day, the users whose score is strictly positive.

    Returns a list with one dict per day (rows ``0 .. num_of_days-1`` of
    ``A``); each dict maps a column index to its score and contains only the
    entries with ``A[day, user] > 0.0``.
    """
    user_count = A.shape[1]
    return [
        {user: A[day, user] for user in range(user_count) if A[day, user] > 0.0}
        for day in range(num_of_days)
    ]

def get_custom_correlations(A, num_of_days, corr_type='spearman'):
    """Run the project's sequential correlation routine on per-day active users.

    The per-day score dicts produced by filter_active_users are handed to
    cu.compute_correlation_sequential and its result is returned unchanged.
    Callers in this notebook read column 0 as the unweighted and column 1 as
    the weighted correlation.
    """
    active_scores_per_day = filter_active_users(A, num_of_days)
    return cu.compute_correlation_sequential(active_scores_per_day, corr_type=corr_type)

# Quick sanity check of filter_active_users on a tiny 3x3 matrix: zeros must
# be dropped and each remaining score keyed by its column index.
# print() with a single argument behaves identically on Python 2 and Python 3.
arr = np.array([[1.2,0.0,3.4],[0.0,1.1,0.0],[0.0,6.7,11.0]])
print(filter_active_users(arr,3))

## 1.) correlations without Markov model

#### scipy correlation code

   * there is no fit to real data

In [None]:
# Consecutive-day correlations of the raw model scores (no Markov step),
# computed with scipy and plotted against the Spearman values of the real data.
sci_pearson, sci_spearman = get_correlations(model.X, num_of_days)
#plot_correlations(num_of_days,[sci_spearman,sci_pearson],['spearman','pearson'],"Correlations")
plot_correlations(num_of_days,[sci_spearman,data_spearman],['model','data'],"Spearman's rho")

#### custom correlation code

   * there is no fit to real data (YET)
   * we have the same results as the scipy implementation for unweighted spearman (this is good!)

In [None]:
# Same comparison using the project's correlation code on the raw model scores.
# Column 0 of the result is used as the unweighted series, column 1 as the weighted one.
res = get_custom_correlations(model.X, num_of_days)
spearman, w_spearman = list(res[:,0]), list(res[:,1])
label_list = ['model_spearman','data_spearman','model_w_spearman','data_w_spearman']
plot_correlations(num_of_days,[spearman,data_spearman,w_spearman,data_w_spearman],label_list,"Spearman's rho")

## 2.) correlations with Markov model

#### scipy correlation code

   * scipy correlation code gives bad results! Ties are not handled properly?
   * **TODO: investigate the custom correlation code (which gives good results... because of average tie position resolution?)**

In [None]:
# scipy correlations on the Markov-model scores (X_act); this overwrites the
# sci_pearson / sci_spearman values computed for the raw model above.
sci_pearson, sci_spearman = get_correlations(X_act, num_of_days)
#plot_correlations(num_of_days,[sci_spearman,sci_pearson],['spearman','pearson'],"Correlations")
plot_correlations(num_of_days,[sci_spearman,data_spearman],['model','data'],"Spearman's rho")

#### custom correlation code

   * there is a match between the model and the data for normal Spearman's correlation
   * the weighted Spearman's correlation does not fit real data - the reason for this is the absence of **leaders**

In [None]:
# Project correlation code on the Markov-model scores (X_act); res, spearman
# and w_spearman from the raw-model cell are overwritten here.
res = get_custom_correlations(X_act, num_of_days)
spearman, w_spearman = list(res[:,0]), list(res[:,1])
label_list = ['model_spearman','data_spearman','model_w_spearman','data_w_spearman']
plot_correlations(num_of_days,[spearman,data_spearman,w_spearman,data_w_spearman],label_list,"Spearman's rho")

## 3.) correlations with Markov model and Leaders

In [None]:
from sklearn.metrics import mean_squared_error
import math

def rmse(original_values, model_values):
    """Root mean squared error between the reference and the model values."""
    squared_error_mean = mean_squared_error(original_values, model_values)
    return math.sqrt(squared_error_mean)

def get_result_for_lambda(proposed_lambda, pop_model, prob, prob_overlap, number_of_days, for_weighted):
    """Evaluate the Markov model with leaders for a single lambda value.

    Scores are generated with ``pop_model.get_centrality_with_markov`` using
    ``lambda_=proposed_lambda`` and then passed through
    get_custom_correlations.

    Returns ``(correlations, scores)`` where ``correlations`` is a list taken
    from column 1 of the correlation result when ``for_weighted`` is truthy
    and from column 0 otherwise, and ``scores`` is the generated score matrix.
    """
    scores_with_leaders = pop_model.get_centrality_with_markov(prob, prob_overlap, lambda_=proposed_lambda)
    correlations = get_custom_correlations(scores_with_leaders, number_of_days)
    column = 1 if for_weighted else 0
    return list(correlations[:, column]), scores_with_leaders

def get_opt_lambda_for_model(proposed_lambdas, pop_model, prob, prob_overlap, number_of_days, original_corr_values, for_weighted=False):
    """Pick the lambda whose model correlations are closest to the reference.

    Every candidate in ``proposed_lambdas`` is evaluated with
    get_result_for_lambda and scored by the RMSE between
    ``original_corr_values`` and the resulting correlation series. The first
    candidate is the initial best; a later one replaces it only when its RMSE
    is strictly smaller, so ties keep the earliest candidate.

    Returns ``(opt_lambda, opt_spearman, opt_diff, opt_scores)``.
    """
    def evaluate(candidate):
        # Bundle everything the caller gets back for one lambda value.
        corr_series, scores = get_result_for_lambda(candidate, pop_model, prob, prob_overlap, number_of_days, for_weighted)
        return (candidate, corr_series, rmse(original_corr_values, corr_series), scores)

    best = evaluate(proposed_lambdas[0])
    for candidate in proposed_lambdas[1:]:
        contender = evaluate(candidate)
        if contender[2] < best[2]:
            best = contender
    return best

# NOTE(review): this repeats the lambda_=0.1 assignment made in section V above,
# and X_act_leaders is reassigned again by the lambda search below, so this
# call looks redundant -- confirm before removing.
X_act_leaders = model.get_centrality_with_markov(p, p_overlap, lambda_=0.1)

### Search for optimal _lambda_ parameter

In [None]:
# Candidate lambda values: 11 evenly spaced points covering [0.0, 1.0] (step 0.1).
# print() with a single argument behaves identically on Python 2 and Python 3.
lambdas = np.linspace(0.0, 1.0, num=11)
print(lambdas)

In [None]:
# Grid search against the unweighted Spearman series of the real data.
# X_act_leaders is replaced by the scores generated with the selected lambda.
(opt_lambda, opt_spearman, opt_diff, X_act_leaders) = get_opt_lambda_for_model(lambdas, model, p, p_overlap, num_of_days, data_spearman, for_weighted=False)

In [None]:
# Same grid search, fitted against the weighted Spearman series of the real data.
(opt_w_lambda, opt_w_spearman, opt_w_diff, w_X_act_leaders) = get_opt_lambda_for_model(lambdas, model, p, p_overlap, num_of_days, data_w_spearman, for_weighted=True)

In [None]:
# Selected lambda for the unweighted and the weighted fit. A single
# pre-formatted string prints the same text on Python 2 and Python 3.
print("%s %s" % (opt_lambda, opt_w_lambda))

#### scipy correlation code

   * there is no effect of the leaders
   * there are still _positive correlations_ (because of too many 0.0 centralities)

In [None]:
# scipy correlations on the scores generated with the lambda selected for the
# unweighted fit (X_act_leaders as returned by the search above).
sci_pearson, sci_spearman = get_correlations(X_act_leaders, num_of_days)
#plot_correlations(num_of_days,[sci_spearman,sci_pearson],['spearman','pearson'],"Correlations")
plot_correlations(num_of_days,[sci_spearman,data_spearman],['model','data'],"Spearman's rho")

#### custom correlation code

   * even better results than without leaders
   * unweighted spearman is an almost perfect match between the model and the real data
   * weighted spearman fits much better to real data with the introduction of leaders (as expected)

In [None]:
# Best-fit model series against the real data; the legend labels carry the
# lambda selected for the unweighted and for the weighted fit respectively.
label_list = ['model_spearman: %f' % opt_lambda,'data_spearman','model_w_spearman: %f' % opt_w_lambda,'data_w_spearman']
plot_correlations(num_of_days,[opt_spearman,data_spearman,opt_w_spearman,data_w_spearman],label_list,"Spearman's rho")