In [None]:
# Widen the notebook's page container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np
import re, math, itertools

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Global seaborn look for the figures drawn below: "Droid Sans" font with the
# font elements scaled by 2, the "whitegrid" axes style, and matplotlib's
# shorthand colour codes mapped to seaborn's "dark" palette.
# The order matters: sns.set() installs the defaults that the next two calls refine.
sns.set(font="Droid Sans",font_scale = 2)
sns.set_style("whitegrid")
sns.set_color_codes("dark")

In [None]:
import sys, os
# Put ../python/ (relative to the working directory) at the front of the module
# search path before the imports below.
# NOTE(review): presumably this is where the `correlation` and
# `popularity_model` packages live — confirm.
sys.path.insert(0,'../python/')
import correlation.correlation_utils as cu
import popularity_model.popularity_model as pm
import popularity_model.popularity_model_utils as pmu

# Read score files

dataset_id = "maidan"
plot_title = "Maidan"

dataset_id = "15o"
plot_title = "15o"

dataset_id = "yo"
plot_title = "Yosoy"

In [None]:
# Active dataset: `dataset_id` is substituted into the input/output paths used
# below, `plot_title` is the heading of the final correlation figure.
dataset_id, plot_title = "oc", "Occupy"

In [None]:
# Number of consecutive daily score files to load (file indices 0 .. num_of_days-1).
num_of_days = 21

In [None]:
# Directory from which the per-day score files of the selected dataset are read.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
score_dir = '/mnt/idms/rank_correlation_common/results/new_experiments/centrality_output_for_datasets_normalized/%s/centrality_scores/' % dataset_id

In [None]:
# Load one score table per day and collect every user id seen on any day.
# Each file is read as space-separated text without a header row; the two
# columns are named "id" and "score".
daily_scores = []
user_ids = set()
for i in range(num_of_days):
    day_scores = pd.read_csv(score_dir + '/pagerank_scores_%i.txt_s' % i, sep=" ", names=["id", "score"])
    daily_scores.append(day_scores)
    # Grow the id set in place instead of rebuilding it with union() every day.
    user_ids.update(day_scores['id'])
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(len(day_scores))
user_ids = list(user_ids)

In [None]:
# Sanity check: number of loaded days and number of distinct user ids.
# The line is formatted explicitly so the output ("<days> <users>") is the same
# on Python 2 and Python 3.
print("%d %d" % (len(daily_scores), len(user_ids)))

# 1. Merge daily active users

In [None]:
def join_consecutive_days(day_1, day_2):
    """Outer-join two score tables on user id, filling missing scores with 0.0.

    Parameters
    ----------
    day_1 : DataFrame that is already indexed by user id.
    day_2 : DataFrame with an "id" column; it is indexed by "id" before joining.

    Returns
    -------
    DataFrame whose index is the union of both id sets, with the columns of
    `day_1` followed by those of `day_2`; ids absent from one side get 0.0.
    """
    right = day_2.set_index("id")
    return pd.concat([day_1, right], axis=1, join='outer').fillna(0.0)

def merge_all_days(score_list):
    """Merge per-day score tables into one table indexed by user id.

    Parameters
    ----------
    score_list : sequence of DataFrames, one per day, each with an "id" column
        and exactly one score column.

    Returns
    -------
    DataFrame indexed by id with float64 columns ``score_0 .. score_{n-1}``
    (0.0 where a user has no score on that day) plus ``AGGR_SCORE``, the row
    sum of the daily scores. Rows are ordered by ``AGGR_SCORE``, largest first.

    Raises
    ------
    ValueError : if `score_list` is empty.
    """
    if len(score_list) == 0:
        raise ValueError("score_list must contain at least one daily score table")

    # One outer join over all days at once; joining day by day would copy the
    # growing table on every step.
    indexed_days = [day.set_index("id") for day in score_list]
    merged = pd.concat(indexed_days, join='outer', axis=1).fillna(0.0)

    merged.columns = ['score_%i' % i for i in range(len(score_list))]
    merged = merged.astype('float64')
    merged['AGGR_SCORE'] = merged.sum(axis=1)

    # DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
    return merged.sort_values('AGGR_SCORE', ascending=False)

# Toy example for merge_all_days: three small "days" with partially overlapping ids.
d1 = pd.DataFrame(np.array([[10,2.3],[13,0.2],[17,4.3]]), columns=["id","score"])
d2 = pd.DataFrame(np.array([[12,5.3],[10,0.1],[13,3.9]]), columns=["id","score"])
d3 = pd.DataFrame(np.array([[18,5.5],[10,0.1],[14,3.9]]), columns=["id","score"])
merged = merge_all_days([d1,d2,d3])
# Single-argument print(...) produces the same output on Python 2 and 3.
print(merged)

In [None]:
# Combine all loaded days into one table indexed by user id: one score column
# per day plus the AGGR_SCORE row sum, ordered by AGGR_SCORE descending.
merged_daily_scores = merge_all_days(daily_scores)

In [None]:
# Preview the first rows, i.e. the users with the largest aggregated score.
merged_daily_scores.head()

## Export aggregated score

In [None]:
# Per-user sum of the daily scores.
aggr_score = merged_daily_scores['AGGR_SCORE']

# Distribution plot of the aggregated scores on a new 7x7 figure
# (distplot is not given an axes, so it draws on the current one).
f, ax = plt.subplots(figsize=(7, 7))
sns.distplot(aggr_score)

# Export only the AGGR_SCORE column (plus the user id index) for this dataset.
merged_daily_scores.to_csv('../../correlation_experiments/aggr_pagerank/%s_aggregated_pagerank.csv' % dataset_id, columns=["AGGR_SCORE"])

# 2. Compute Pearson correlation and Jaccard similarity

In [None]:
import scipy.stats as stats

def get_list_pair(M,day_idx):
    """Return the scores of two consecutive days restricted to active rows.

    A row is "active" on a day when its entry in that day's column is non-zero.

    Parameters
    ----------
    M : 2-D array, one row per user and one column per day.
    day_idx : column index of the first day; the second day is ``day_idx + 1``.

    Returns
    -------
    (first, second, jaccard) where `first` / `second` are the two columns
    restricted to rows active on at least one of the days, and `jaccard` is
    (#rows active on both days) / (#rows active on either day).
    """
    first_day = M[:, day_idx]
    second_day = M[:, day_idx + 1]

    active_first = np.nonzero(first_day)[0]
    active_second = np.nonzero(second_day)[0]
    active_any = np.union1d(active_first, active_second)
    active_both = np.intersect1d(active_first, active_second)

    overlap = float(len(active_both)) / len(active_any)
    return first_day[active_any], second_day[active_any], overlap

def get_pearson_and_jacard(A, num_days):
    """Compute Pearson correlation and Jaccard overlap for consecutive day pairs.

    Parameters
    ----------
    A : 2-D array, one row per user and one column per day.
    num_days : number of day columns to use; pairs (0,1) .. (num_days-2, num_days-1)
        are evaluated.

    Returns
    -------
    Float array of shape ``(num_days - 1, 2)``; row k holds
    ``[pearson, jaccard]`` for days k and k+1. With fewer than two days the
    result has shape ``(0, 2)``, so column slicing still works.
    """
    res = []
    # range() instead of xrange() so the function also runs on Python 3.
    for i in range(1, num_days):
        list_0, list_1, jaccard = get_list_pair(A, i - 1)
        pearson_corr = stats.pearsonr(list_0, list_1)[0]
        res.append([pearson_corr, jaccard])
    # reshape keeps the two-column layout even when no pair was evaluated.
    return np.array(res, dtype=float).reshape(-1, 2)

In [None]:
# Keep only the per-day score columns (the aggregate column is dropped) and
# expose them as a plain 2-D numpy array: one row per user, one column per day.
only_scores = merged_daily_scores.drop('AGGR_SCORE', axis=1)
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
score_mx = only_scores.values
only_scores.head()

In [None]:
# Day-over-day measures: row k of `res` compares day k with day k+1;
# column 0 is the Pearson correlation, column 1 the Jaccard overlap.
res = get_pearson_and_jacard(score_mx, num_of_days)
pearson, jaccard = res[:, 0], res[:, 1]

# 3. Visualize correlation measures

In [None]:
# Load the correlation results file of the selected dataset
# (space-separated; the first row supplies the column names used below).
dataset_stat_file = "../../correlation_experiments/%s_pagerank_results.csv" % dataset_id
stat_df = pd.read_csv(dataset_stat_file, sep=" ")

In [None]:
# Keep one value per consecutive-day pair (num_of_days - 1 entries) from each series.
# Each measure is read from its own column; reading the Spearman series from the
# "kendall" column would plot two identical lines under different labels.
# NOTE(review): assumes the results file has a "spearman" column — confirm.
spearman = list(stat_df["spearman"])[:num_of_days-1]
kendall = list(stat_df["kendall"])[:num_of_days-1]
w_kendall = list(stat_df["w_kendall"])[:num_of_days-1]

# Module-level marker cycle: every plotted series takes the next marker shape,
# and the cycle keeps advancing across calls.
marker = itertools.cycle(("o", "s", "^", "v", ">", "<", "D","*"))
def plot_correlations(num_of_days, values, labels, caption, figsize=(12,8)):
    """Plot each series against the day-pair index, save the figure and show it.

    Parameters
    ----------
    num_of_days : number of days; the x axis runs over ``range(num_of_days - 1)``.
    values : sequence of series, each with ``num_of_days - 1`` entries.
    labels : legend labels, in the same order as `values`.
    caption : accepted for interface compatibility; not used.
    figsize : figure size passed to matplotlib.

    The image is written to ``../../correlation_experiments/<dataset_id>.png``,
    where `dataset_id` is the notebook-level variable.

    NOTE: a later cell defines `plot_correlations` again (adding a title and a
    bottom-centred legend); once that cell has run it replaces this definition.
    """
    plt.figure(figsize=figsize)
    for i, series in enumerate(values):
        # next(marker) works on Python 2.6+ and 3 (marker.next() is Python 2 only).
        # The marker shape is given only by the keyword, so the format string
        # carries just the line style.
        plt.plot(range(num_of_days-1), series, '-', label=labels[i], marker=next(marker), markersize=10)
    plt.ylim(-1.0,1.1)
    plt.legend()
    plt.savefig('../../correlation_experiments/%s.png' % dataset_id)
    plt.show()

In [None]:
# Module-level marker cycle: every plotted series takes the next marker shape,
# and the cycle keeps advancing across calls.
marker = itertools.cycle(("o", "s", "^", "v", ">", "<", "D","*"))
def plot_correlations(num_of_days, values, labels, caption, figsize=(12,8)):
    """Plot each series against the day-pair index with a title and bottom legend.

    Parameters
    ----------
    num_of_days : number of days; the x axis runs over ``range(num_of_days - 1)``.
    values : sequence of series, each with ``num_of_days - 1`` entries.
    labels : legend labels, in the same order as `values`.
    caption : accepted for interface compatibility; not used (the title comes
        from the notebook-level `plot_title`).
    figsize : figure size passed to matplotlib.

    The image is written to ``../../correlation_experiments/<dataset_id>.png``,
    where `dataset_id` is the notebook-level variable, and then shown.
    """
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    # Set the title on the axes that is actually drawn on, rather than through
    # pyplot state before the axes exists.
    ax.set_title(plot_title)
    for i, series in enumerate(values):
        # next(marker) works on Python 2.6+ and 3 (marker.next() is Python 2 only).
        # The marker shape is given only by the keyword, so the format string
        # carries just the line style.
        ax.plot(range(num_of_days-1), series, '-', label=labels[i], marker=next(marker), markersize=10)
    ax.set_ylim(-1.0, 1.1)
    ax.legend(loc='lower center',bbox_to_anchor=(0.5, 0.0),ncol=3,fancybox=True,shadow=True)
    fig.savefig('../../correlation_experiments/%s.png' % dataset_id)
    plt.show()

In [None]:
# Plot all five day-over-day measures on one figure; the labels are listed in
# the same order as the series. The last argument fills the `caption`
# parameter, which plot_correlations does not use.
label_list = ["pearson","spearman","kendall","weighted kendall","jaccard"]
plot_correlations(num_of_days,[pearson,spearman,kendall,w_kendall,jaccard],label_list,dataset_id)