In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re, math

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import sys

# Put ../python/ first on the module search path so the two imports below can
# be resolved from it (presumably project-local packages). The path is
# relative, so it depends on the kernel's working directory.
sys.path.insert(0,'../python/')
import correlation.correlation_utils as cu
import popularity_model.popularity_model as pm

# Choose dataset

In [None]:
# Dataset identifier; it is interpolated into the results-file path in the next cell.
dataset_id = 'yo_pagerank'

In [None]:
# Per-dataset results table; the path is relative to the kernel's working directory.
dataset_stat_file = "../correlation_experiments/%s_results.csv" % dataset_id
# Despite the .csv extension the file is parsed as space-separated.
stat_df = pd.read_csv(dataset_stat_file, sep=" ")

#### extract number of users in data

In [None]:
# Show the raw name of the third column; it is expected to contain "(total=N)".
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(stat_df.columns[2])

# Capture the digits N of the "(total=N)" part of the column name.
total_num_matcher = re.match(r'.*\(total=(\d+?)\)', stat_df.columns[2], re.M|re.I)
if not total_num_matcher:
    # Fail early: the cells below assume this column layout.
    raise RuntimeError("Column name does NOT match the regex!")

#### rename a column

In [None]:
# Give the third column a fixed name that later cells can reference.
# Note: once this has run, the column name no longer contains "(total=N)", so
# re-running the regex cell above raises its RuntimeError until stat_df is reloaded.
cols = list(stat_df.columns)
cols[2] = "fraction_of_active_nodes"
stat_df.columns = cols

In [None]:
# Preview the first rows to check the renamed column.
stat_df.head()

#### Kendall's Tau is computation intensive: so only a small sample is taken

In [None]:
# Hard-coded sample of 1000 users instead of the total parsed from the column
# header (that alternative is kept in the trailing comment).
num_of_users = 1000 #int(total_num_matcher.group(1))
# One less than the number of rows in stat_df.
# NOTE(review): the reason for dropping one row is not visible here — confirm.
num_of_days = len(stat_df)-1

In [None]:
# First num_of_days entries of the two activity columns, as plain lists.
p = list(stat_df["fraction_of_active_nodes"].iloc[:num_of_days])
p_overlap = list(stat_df["fraction_of_users_in_2day_intersections"].iloc[:num_of_days])

# Correlations in real data

In [None]:
# Correlation columns from the results table, cut to num_of_days-1 entries
# (the same length that get_correlations returns for num_of_days rows).
data_kendall = list(stat_df["kendall"].iloc[:num_of_days-1])
data_w_kendall = list(stat_df["w_kendall"].iloc[:num_of_days-1])

# Popularity model

In [None]:
# Same "<users> <days>" output as the Python 2 print statement, but written as
# a single formatted string so the line also runs on Python 3.
print('%s %s' % (num_of_users, num_of_days))

**TODO: fit powerlaw exponent on real data aggregated centrality values!!!**

In [None]:
# Build the popularity model for the chosen number of users and days.
# NOTE(review): if PopularityModel draws random numbers, seed the generator
# before this cell so the plots below are reproducible — confirm.
model = pm.PopularityModel(num_of_users, num_of_days)

### I. popularity of users

In [None]:
# Distribution plot of model.U (per the heading above: the popularity of users).
ax = sns.distplot(model.U)

### II. daily variations

In [None]:
# Distribution plot of the first column of model.alpha.
ax = sns.distplot(model.alpha[:,0])

### III. calculate daily centrality scores (without Markov model)

In [None]:
# Distribution plot of row 0 of model.X (rows are indexed by day in get_correlations below).
ax = sns.distplot(model.X[0,:])

### IV. Introducing Markov model without leaders

In [None]:
# Centrality scores from the model's Markov variant, driven by the two activity
# series read from the data; lambda_ is not passed here.
X_act = model.get_centrality_with_markov(p, p_overlap)

### V. Introducing Markov model with leaders

In [None]:
# Same call with lambda_=0.1 (the "with leaders" variant per the heading above).
# NOTE(review): X_act_leaders is reassigned with lambda_=0.2 in section 3 before
# any active code reads it, so this result is only used if it is exported first.
X_act_leaders = model.get_centrality_with_markov(p, p_overlap, lambda_=0.1)

# Export centrality scores

In [None]:
import os

def export_daily_scores(output_folder, M):
    """Write one text file of positive scores per row of ``M``.

    For every row index ``i`` a file ``centrality_scores_<i>.txt`` is written
    into ``output_folder`` (the folder is created first if it does not exist).
    Each line has the form ``"<column index> <score>"``; only entries strictly
    greater than 0.0 are written.

    Parameters
    ----------
    output_folder : str
        Directory that receives the files.
    M : 2-D array with a ``shape`` attribute
        Score matrix indexed as ``M[row, column]`` (presumably day by user).

    Notes
    -----
    The loop bounds are taken from ``M.shape`` instead of the notebook-level
    ``num_of_days`` / ``num_of_users`` variables, so the function does not
    depend on hidden kernel state and cannot index past the matrix.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    num_rows, num_cols = M.shape
    for i in range(num_rows):
        file_path = os.path.join(output_folder, 'centrality_scores_%i.txt' % i)
        # The context manager closes the file even if a write fails.
        with open(file_path, 'w') as f:
            for j in range(num_cols):
                if M[i,j] > 0.0:
                    f.write('%i %f\n' % (j,M[i,j]))
    print('Daily scores were exported to files.')

In [None]:
#export_daily_scores('../correlation_experiments/%s_nelly_model/centrality_scores/' % dataset_id, model.X)

In [None]:
#export_daily_scores('../correlation_experiments/%s_nelly_model_markov/centrality_scores/' % dataset_id, X_act)

In [None]:
#export_daily_scores('../correlation_experiments/%s_nelly_model_leaders/centrality_scores/' % dataset_id, X_act_leaders)

# Experiments

In [None]:
def plot_correlations(num_of_days, values, labels, caption, figsize=(10,5)):
    """Draw several correlation series as marked lines in one figure.

    ``values[k]`` is plotted against ``range(num_of_days-1)`` and labelled with
    ``labels[k]``. ``caption`` becomes the title and the y-axis is fixed to
    [-1.0, 1.1].
    """
    plt.figure(figsize=figsize)
    plt.title(caption)
    for idx, series in enumerate(values):
        plt.plot(range(num_of_days-1), series, '-o', label=labels[idx])
    plt.ylim(-1.0, 1.1)
    plt.legend()
    plt.show()

In [None]:
import scipy.stats as stats
import operator

def get_correlations(A, num_of_days):
    """Return Kendall's Tau between each pair of consecutive rows of ``A``.

    Parameters
    ----------
    A : 2-D array indexed as ``A[row, :]``
        One row per day.
    num_of_days : int
        Number of leading rows to use.

    Returns
    -------
    list
        ``num_of_days - 1`` correlation values (the first element of each
        ``scipy.stats.kendalltau`` result); empty when ``num_of_days`` <= 1.
    """
    # range() replaces the Python-2-only xrange(); the iteration is identical
    # and the function now also runs on Python 3.
    return [stats.kendalltau(A[i-1,:], A[i,:])[0] for i in range(1, num_of_days)]
    
def filter_active_users(A, num_of_days):
    """Return, for each of the first ``num_of_days`` rows of ``A``, a dict
    mapping column index to value for the entries strictly greater than 0.0."""
    user_ids = range(A.shape[1])
    per_day = []
    for day in range(num_of_days):
        scores = ((user, A[day, user]) for user in user_ids)
        per_day.append({user: score for user, score in scores if score > 0.0})
    return per_day

def get_custom_correlations(A, num_of_days):
    """Return unweighted and weighted Kendall correlations.

    The positive entries of each row are collected with ``filter_active_users``
    and passed to ``cu.compute_correlation_sequential`` with
    ``corr_type="kendall"``.
    """
    active_scores_per_day = filter_active_users(A, num_of_days)
    return cu.compute_correlation_sequential(active_scores_per_day, corr_type="kendall")

# Smoke check on a small 3x3 matrix: zero entries must be missing from each row's dict.
arr = np.array([[1.2,0.0,3.4],[0.0,1.1,0.0],[0.0,6.7,11.0]])
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(filter_active_users(arr,3))

## 1.) correlations without Markov model

#### scipy correlation code

   * there is no fit to real data

In [None]:
# scipy Kendall's Tau on the plain model scores, plotted against the data series.
kendall = get_correlations(model.X, num_of_days)
plot_correlations(
    num_of_days,
    values=[kendall, data_kendall],
    labels=['model', 'data'],
    caption="Kendall's Tau",
)

#### custom correlation code

   * there is no fit to real data (YET)
   * we have the same results as the scipy implementation for unweighted Kendall's Tau (this is good!)

In [None]:
# Column 0 of the result is the unweighted correlation, column 1 the weighted one.
res = get_custom_correlations(model.X, num_of_days)
kendall = list(res[:,0])
w_kendall = list(res[:,1])
label_list = ['model_kendall','data_kendall','model_w_kendall','data_w_kendall']
plot_correlations(
    num_of_days,
    values=[kendall, data_kendall, w_kendall, data_w_kendall],
    labels=label_list,
    caption="Kendall's Tau",
)

## 2.) correlations with Markov model

#### scipy correlation code

   * scipy correlation code gives bad results! Ties are not handled properly?
   * **TODO: investigate the custom correlation code (which gives good results... because of average tie position resolution?)**

In [None]:
# scipy Kendall's Tau on the Markov-model scores, plotted against the data series.
kendall = get_correlations(X_act, num_of_days)
plot_correlations(
    num_of_days,
    values=[kendall, data_kendall],
    labels=['model', 'data'],
    caption="Kendall's Tau",
)

#### custom correlation code

   * there is NO match between the model and the data for normal Kendall's Tau
   * the weighted Kendall's correlation is close to real data

In [None]:
# Column 0 of the result is the unweighted correlation, column 1 the weighted one.
res = get_custom_correlations(X_act, num_of_days)
kendall = list(res[:,0])
w_kendall = list(res[:,1])
label_list = ['model_kendall','data_kendall','model_w_kendall','data_w_kendall']
plot_correlations(
    num_of_days,
    values=[kendall, data_kendall, w_kendall, data_w_kendall],
    labels=label_list,
    caption="Kendall's Tau",
)

## 3.) correlations with Markov model and Leaders

In [None]:
# NOTE(review): this overwrites the X_act_leaders computed earlier with
# lambda_=0.1, so the correlations below use lambda_=0.2 — confirm which value is intended.
X_act_leaders = model.get_centrality_with_markov(p, p_overlap, lambda_=0.2)

#### scipy correlation code

   * there is no effect of the leaders
   * there are still _positive correlations_ (because of too many 0.0 centralities)

In [None]:
# scipy Kendall's Tau on the Markov-with-leaders scores, plotted against the data series.
kendall = get_correlations(X_act_leaders, num_of_days)
plot_correlations(
    num_of_days,
    values=[kendall, data_kendall],
    labels=['model', 'data'],
    caption="Kendall's Tau",
)

#### custom correlation code

   * even better results than without leaders
   * unweighted Kendall's Tau is almost a perfect match between the model and the real data
   * weighted Kendall's Tau fits the real data much better with the introduction of leaders (as expected)

In [None]:
# Column 0 of the result is the unweighted correlation, column 1 the weighted one.
res = get_custom_correlations(X_act_leaders, num_of_days)
kendall = list(res[:,0])
w_kendall = list(res[:,1])
label_list = ['model_kendall','data_kendall','model_w_kendall','data_w_kendall']
plot_correlations(
    num_of_days,
    values=[kendall, data_kendall, w_kendall, data_w_kendall],
    labels=label_list,
    caption="Kendall's Tau",
)