In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import re, math

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import sys

sys.path.insert(0,'../python/')
import correlation.correlation_utils as cu

# Choose dataset

In [None]:
# Name of the dataset to analyse; it is interpolated into the path of the
# results file that is loaded and into the export folder names further down.
dataset_id = 'yo_pagerank'

In [None]:
# The per-dataset results table is read as a space-separated file.
dataset_stat_file = "../correlation_experiments/%s_results.csv" % dataset_id
stat_df = pd.read_csv(dataset_stat_file, sep=" ")

#### Extract the total number of users from the header of the third column

In [None]:
# Show the raw header of the third column; the regex below expects it to
# contain the total user count in the form "(total=<digits>)".
# print(...) with a single argument behaves the same on Python 2 and 3.
print(stat_df.columns[2])

# Capture the digits that follow "total=" inside the parentheses.
total_num_matcher = re.match(r'.*\(total=(\d+?)\)', stat_df.columns[2], re.M|re.I)
if not total_num_matcher:
    # Fail early: the user count is parsed from this match in a later cell.
    raise RuntimeError("Column name does NOT match the regex!")

#### Rename the third column to `fraction_of_active_nodes`

In [None]:
# Replace the header of the third column (which embeds the total user count)
# with a fixed name so later cells can address the column directly.
cols = list(stat_df.columns)
cols[2] = "fraction_of_active_nodes"
stat_df.columns = cols

In [None]:
# Preview the first rows of the table after the rename.
stat_df.head()

In [None]:
# Total user count: the digits captured from the original column header.
num_of_users = int(total_num_matcher.group(1))
# One less than the number of rows in the results table.
# NOTE(review): presumably the last row is not a full day of data -- confirm.
num_of_days = len(stat_df)-1

In [None]:
# Per-row values of the two fraction columns, truncated to num_of_days entries.
# Both lists are later passed to get_centrality_with_markov as p and p_overlap.
p = list(stat_df["fraction_of_active_nodes"])[:num_of_days]
p_overlap = list(stat_df["fraction_of_users_in_2day_intersections"])[:num_of_days]

# Correlations in real data

In [None]:
# Correlation columns of the results table. Only the first num_of_days-1
# values are kept, which matches the num_of_days-1 x-positions that
# plot_correlations draws for every series.
data_spearman = list(stat_df["spearman"])[:num_of_days-1]
data_w_spearman = list(stat_df["w_spearman"])[:num_of_days-1]
data_kendall = list(stat_df["kendall"])[:num_of_days-1]
data_w_kendall = list(stat_df["w_kendall"])[:num_of_days-1]

# Popularity model

num_of_users = 10000
num_of_days = 10

In [None]:
# Report the dimensions used by the simulation below. The explicit format
# string prints "<users> <days>" identically on Python 2 and Python 3.
print('%s %s' % (num_of_users, num_of_days))

In [None]:
def reverse_sort(a):
    """Return the values of `a` sorted in descending order along axis 0.

    The input is left untouched: sorting is done on a copy, so a caller that
    keeps a reference to `a` does not see it silently reordered.

    Parameters
    ----------
    a : array-like
        Values to sort; anything ``np.sort`` accepts (ndarray, list, ...).

    Returns
    -------
    numpy.ndarray
        The values of `a` sorted along axis 0, largest first.
    """
    # np.sort returns a sorted copy, whereas ndarray.sort sorts in place.
    return np.sort(a, axis=0)[::-1]

### I. user popularity

In [None]:
# Seed NumPy's global generator so that Restart & Run All reproduces the same
# simulated popularities and the same random draws in the cells that follow
# (they all use the np.random module-level functions).
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Shape parameter handed to np.random.pareto.
coef = 1.5
# One popularity value per user, then ordered so that index 0 is the largest.
U = np.random.pareto(coef, size=num_of_users)
U = reverse_sort(U)

In [None]:
# Distribution of the simulated user popularity values.
ax = sns.distplot(U)

### II. daily variations

In [None]:
# One exponential(scale=1) multiplier for every (day, user) pair.
alpha = np.random.exponential(scale=1.0, size=(num_of_days, num_of_users))

# Sanity check: total number of entries vs. how many of them are non-zero.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(num_of_users * num_of_days)
print(np.count_nonzero(alpha))

In [None]:
# Distribution of the daily multipliers of the first user (column 0) over all days.
ax = sns.distplot(alpha[:,0])

### III. calculate daily centrality scores (without Markov model)

In [None]:
# U holds one value per user and alpha one row per day, so the product
# broadcasts U across the rows: X[day, user] = alpha[day, user] * U[user].
X = alpha * U

In [None]:
# Distribution of the scores in the first row of X (the first day).
ax = sns.distplot(X[0,:])

### IV. Introducing Markov model with leaders

In [None]:
def list_subset(l, idx, exclusive=False):
    """Select the items of `l` at positions `idx`, or everything except them.

    Parameters
    ----------
    l : sequence
        Source items; must support ``len`` and integer indexing.
    idx : iterable of int
        Positions into `l`. Negative positions count from the end; a position
        outside `l` raises IndexError.
    exclusive : bool, optional
        False (default): return the items at `idx`, in the order of `idx`.
        True: return the items of `l` whose position is NOT in `idx`, in
        their original order.

    Returns
    -------
    list
    """
    positions = list(idx)
    # Building the selection first also validates every position.
    picked = [l[i] for i in positions]
    if not exclusive:
        return picked

    size = len(l)
    skipped = set(i if i >= 0 else i + size for i in positions)
    # Filter by position instead of taking a set difference of the values:
    # a set difference returns the remainder in arbitrary order, collapses
    # duplicate items and requires the items to be hashable.
    return [item for pos, item in enumerate(l) if pos not in skipped]

In [None]:
def binornd(n, p):
    """Draw `n` independent 0/1 values, each equal to 1.0 with probability `p`.

    Parameters
    ----------
    n : int
        Number of draws.
    p : float or one-element array
        Success probability shared by all draws.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n,) containing only 0.0 and 1.0.
    """
    # Callers may pass a one-element array (e.g. one row of a column vector);
    # reduce it to a scalar so the comparison below yields exactly shape (n,).
    prob = np.asarray(p, dtype=float).item()
    # A single vectorised draw replaces a Python-level loop of n single draws.
    return (np.random.rand(n) < prob).astype(float)

In [None]:
def get_centrality_with_markov(lambda_, X, p, p_overlap, num_of_days, num_of_users):
    """Simulate day-by-day user activity and mask the score matrix `X` with it.

    A 0/1 activity matrix of shape (num_of_days, num_of_users) is filled one
    pair of consecutive days (i, i+1) at a time:

    * the first ``ceil(num_of_users * p_overlap[i] * lambda_)`` user indices
      are treated as "leaders" and set active on day i+1 (and on day 0 when
      i == 0);
    * every other user is active on day i+1 with probability ``p_mix1`` if
      active on day i and with probability ``p_mix0`` otherwise, drawn with
      ``binornd``.

    Parameters
    ----------
    lambda_ : float
        Multiplier applied to ``p_overlap[i]`` to obtain the leader fraction.
    X : array
        Multiplied element-wise with the activity matrix -- presumably of
        shape (num_of_days, num_of_users); confirm against the caller.
    p : sequence of float
        ``p[i]`` is read for i in 0..num_of_days-1 -- presumably the fraction
        of users active on day i; confirm.
    p_overlap : sequence of float
        ``p_overlap[i]`` is read for i in 0..num_of_days-2 -- presumably the
        fraction of users active on both day i and day i+1; confirm.
    num_of_days, num_of_users : int
        Dimensions of the activity matrix.

    Returns
    -------
    tuple
        ``(X * prob_set, prob_set)`` where ``prob_set`` is the 0/1 activity
        matrix.
    """
    leader_index = lambda_
    m = num_of_days
    n = num_of_users
    intersection = p_overlap

    # Activity matrix: one row per day, one column per user.
    prob_set = np.zeros((m,n))
    # NOTE(review): Jaccard is filled inside the loop but never read again in
    # this function.
    Jaccard = np.zeros((m-1,1));
    q=np.zeros((m-1,1));
    u_index = range(num_of_users)
    intersection1 = np.zeros((m-1,1));

    for i in xrange(0,m-1):
        Jaccard[i] = intersection[i] / (p[i]+p[i+1]-intersection[i])

        # Remove the leader share from both the overlap and the day-i fraction.
        leader_fraction = intersection[i] * leader_index
        intersection1[i] = intersection[i] - leader_fraction
        q[i] = p[i] - leader_fraction

        # Leaders are the first ceil(n * leader_fraction) user indices; the
        # call below returns every index that is not a leader.
        leader_set = u_index[:int(math.ceil(n*leader_fraction))]
        user_set = list_subset(u_index,leader_set,exclusive=True)
        n1=len(user_set)

        # Activity on the very first day (only initialised once, for i == 0).
        if i==0:
            prob_set[i,leader_set] = np.ones(len(leader_set))
            prob_set[i,user_set] = binornd(n1,q[i]) # the earlier ("Nelly") code used p(i) here; q[i] is the correct value

        # Activity on day i+1.
        # NOTE(review): q[i] == 0 or q[i] == 1 makes one of the two divisions
        # below divide by zero; this function has no guard for that.
        p_mix1 = intersection1[i] / q[i] # probability that a non-leader active on day i is active again on day i+1
        p_mix0 = (p[i+1]-intersection[i]) / (1-q[i]) # probability applied to non-leaders that were inactive on day i
        prob_set[i+1,leader_set] = np.ones(len(leader_set))
        # Users active on day i keep the p_mix1 draw, inactive ones the p_mix0 draw.
        prob_set[i+1,user_set] = prob_set[i,user_set] * binornd(n1,p_mix1) + (1-prob_set[i,user_set]) * binornd(n1,p_mix0);

    return X * prob_set, prob_set

# Experiments

In [None]:
def plot_correlations(num_of_days, values, labels, caption, figsize=(10,5)):
    """Draw several correlation series into one line chart and show it.

    Parameters
    ----------
    num_of_days : int
        Every series is plotted against the x-positions 0 .. num_of_days-2.
    values : sequence of sequences
        The series to draw, one line each.
    labels : sequence of str
        Legend entry for each series, matched to `values` by position.
    caption : str
        Figure title.
    figsize : tuple, optional
        Figure size forwarded to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    plt.title(caption)
    x_positions = range(num_of_days - 1)
    for series_no, series in enumerate(values):
        plt.plot(x_positions, series, '-o', label=labels[series_no])
    # Fixed y-range so that separate figures are directly comparable.
    plt.ylim(-0.55, 1.1)
    plt.legend()
    plt.show()

In [None]:
import scipy.stats as stats

def get_correlations(A, num_of_days):
    """Correlate each of the first `num_of_days` rows of `A` with the row before it.

    Parameters
    ----------
    A : 2-D array
        Indexed as ``A[day, :]``.
    num_of_days : int
        Number of leading rows of `A` to use.

    Returns
    -------
    (list, list)
        ``(pearson, spearman)``: for every day 1 .. num_of_days-1 the Pearson
        and Spearman coefficient between row day-1 and row day. Both lists
        are empty when ``num_of_days <= 1``.
    """
    pearson, spearman = [], []
    # range instead of the Python-2-only xrange: same iteration, but the
    # function also runs on Python 3.
    for day in range(1, num_of_days):
        previous, current = A[day-1, :], A[day, :]
        # Both scipy calls return (coefficient, p-value); keep the coefficient.
        pearson.append(stats.pearsonr(previous, current)[0])
        spearman.append(stats.spearmanr(previous, current)[0])
    return pearson, spearman

## 1.) correlations without Markov model

In [None]:
# Day-to-day correlations of the raw model scores X, plotted once against
# each other and once against the Spearman values taken from the data.
pearson, spearman = get_correlations(X, num_of_days)
plot_correlations(num_of_days,[spearman,pearson],['spearman','pearson'],"Correlations")
plot_correlations(num_of_days,[spearman,data_spearman],['model','data'],"Spearman's rho")

## 2.) correlations with Markov model

**Problems:**

   * Spearman correlation is still positive
   * Pearson correlation indicates independence between consecutive days!

In [None]:
# lambda_ = 0.0 gives a leader fraction of zero, i.e. no leaders. [0] keeps
# the masked score matrix and drops the activity matrix.
X_act = get_centrality_with_markov(0.0, X, p, p_overlap, num_of_days, num_of_users)[0]

# Peek at the first 50 users on the first two days (transposed: users as rows).
X_act.T[:50,:2]

In [None]:
# Same two plots as before, now for the scores masked by the simulated activity.
pearson, spearman = get_correlations(X_act, num_of_days)
plot_correlations(num_of_days,[spearman,pearson],['spearman','pearson'],"Correlations")
plot_correlations(num_of_days,[spearman,data_spearman],['model','data'],"Spearman's rho")

## 3.) correlations with Markov model and Leaders

**Problems:**

   * Spearman correlation is still positive


In [None]:
# lambda_ = 0.1: a tenth of each two-day overlap fraction is assigned to
# leaders. [0] keeps the masked score matrix only.
X_act_leaders = get_centrality_with_markov(0.1, X, p, p_overlap, num_of_days, num_of_users)[0]

In [None]:
# Same two plots again, for the variant of the model that includes leaders.
pearson, spearman = get_correlations(X_act_leaders, num_of_days)
plot_correlations(num_of_days,[spearman,pearson],['spearman','pearson'],"Correlations")
plot_correlations(num_of_days,[spearman,data_spearman],['model','data'],"Spearman's rho")

# Export centrality scores

In [None]:
import os

def export_daily_scores(output_folder, M):
    """Write one text file per day listing the users with a positive score.

    For every row ``day`` of `M` the file ``centrality_scores_<day>.txt`` is
    written into `output_folder`. It contains one line
    ``"<user index> <score>"`` for each entry of that row that is > 0.
    The folder is created if it does not exist yet.

    Parameters
    ----------
    output_folder : str
        Target directory.
    M : 2-D array-like
        Scores indexed as ``M[day, user]``. The number of days and users is
        taken from the shape of `M`, not from notebook-level variables.

    Raises
    ------
    ValueError
        If `M` is not two-dimensional.
    """
    scores = np.asarray(M)
    if scores.ndim != 2:
        raise ValueError("M must be a 2-D (days x users) array")

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for day, row in enumerate(scores):
        path = os.path.join(output_folder, 'centrality_scores_%i.txt' % day)
        # The with-block closes the file even if a write fails.
        with open(path, 'w') as f:
            # Only users with a strictly positive score are exported.
            for user in np.flatnonzero(row > 0.0):
                f.write('%i %f\n' % (user, row[user]))
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('Daily scores were exported to files.')

In [None]:
# Export the raw model scores X.
export_daily_scores('../correlation_experiments/%s_nelly_model/centrality_scores/' % dataset_id, X)

In [None]:
# Export the scores of the Markov variant without leaders (X_act).
export_daily_scores('../correlation_experiments/%s_nelly_model_markov/centrality_scores/' % dataset_id, X_act)

In [None]:
# Export the scores of the Markov variant with leaders (X_act_leaders).
export_daily_scores('../correlation_experiments/%s_nelly_model_leaders/centrality_scores/' % dataset_id, X_act_leaders)

In [None]:
# Load a space-separated file without header row; the three columns are named here.
# NOTE(review): this file is not written by any cell visible in this notebook --
# presumably it comes from an external recomputation step; confirm.
recomp_df = pd.read_csv('../nelly_corr_markov_leader.txt',sep=" ", names=["idx","spearman","w_spearman"])
recomp_df.head()

In [None]:
# Spearman column of the loaded file as a plain list, ready for plotting.
recomp_spearman = list(recomp_df["spearman"])

In [None]:
# Compare the loaded Spearman values with those taken from the data.
# plot_correlations draws each series against num_of_days-1 x-positions, so
# recomp_spearman must have exactly that many entries.
plot_correlations(num_of_days,[recomp_spearman,data_spearman],['model','data'],"Spearman's rho")