In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Inject a CSS rule that sets the width of ".container" elements to 100%.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import pandas as pd
import numpy as np

In [None]:
import sys
sys.path.insert(0,'../python/')

import correlation_new.correlation_computer as cc

# Utility functions

In [None]:
def load_score_map(input_prefix, day, measure):
    """Load the score file of `measure` for `day` as a DataFrame indexed by "id".

    The file read is ``<input_prefix>/<measure>_scores_<day>.txt_s``. It is
    parsed as space-separated text without a header row; the two columns are
    named "id" and "score", and "id" becomes the index.

    Note from the original author: the centrality maps were pre-sorted in
    decreasing order. This function does not sort anything itself.

    Parameters
    ----------
    input_prefix : str
        Directory that contains the score files.
    day : int
        Day number used in the file name (formatted with ``%i``).
    measure : str
        Measure name used in the file name.

    Returns
    -------
    pandas.DataFrame
        One "score" column, indexed by "id".
    """
    file_name = '%s_scores_%i.txt_s' % (measure, day)
    return pd.read_csv(
        input_prefix + '/' + file_name,
        sep=" ",
        names=["id", "score"],
    ).set_index("id")

In [None]:
def result2file(result_list,file_name):
    """Write one "<index> <value>" line per result to `file_name`, then print 'Done'.

    Parameters
    ----------
    result_list : iterable of numbers
        Values to write; each is formatted with ``%f`` (six decimals).
    file_name : str
        Path of the output file. It is opened in 'w' mode, so an existing
        file is overwritten.

    Notes
    -----
    No header line is written; the file contains data rows only.
    """
    with open(file_name, 'w') as f:
        # enumerate() replaces xrange(len(...)) + indexing: the output is the
        # same, it also accepts iterables without len(), and it does not rely
        # on the Python-2-only `xrange`.
        for i, value in enumerate(result_list):
            f.write('%i %f\n' % (i, value))
    # Single-argument print(...) behaves identically on Python 2 and Python 3.
    print('Done')

In [None]:
def calculate_corr_for_a_day(input_prefix, day, corr_type, measure):
    """Correlate the `measure` scores of `day` with those of the previous day.

    The score maps of ``day - 1`` and ``day`` are loaded with
    `load_score_map` and passed to the function of the `cc` module selected
    by `corr_type`.

    Parameters
    ----------
    input_prefix : str
        Directory that contains the score files.
    day : int
        The current day; ``day - 1`` is used as the previous day.
    corr_type : str
        One of "pearson", "spearman", "kendall" or "w_kendall".
    measure : str
        Measure name, forwarded to `load_score_map`.

    Returns
    -------
    Element ``[0]`` of the value returned by the selected `cc` function
    (presumably the correlation coefficient -- confirm against the `cc` module).

    Raises
    ------
    RuntimeError
        If `corr_type` is not one of the supported names. The score files are
        loaded before this check.
    """
    # Supported correlation type -> name of the matching function in `cc`.
    corr_function_names = (
        ("pearson", "corr_pearson"),
        ("spearman", "corr_spearman"),
        ("kendall", "corr_kendalltau"),
        ("w_kendall", "corr_weighted_kendalltau"),
    )
    previous_scores = load_score_map(input_prefix, day-1, measure)
    current_scores = load_score_map(input_prefix, day, measure)
    for known_type, function_name in corr_function_names:
        if corr_type == known_type:
            # Only the selected function is looked up on `cc`.
            corr_function = getattr(cc, function_name)
            return corr_function(previous_scores, current_scores)[0]
    raise RuntimeError("Invalid correlation type: %s!" % corr_type)
        
def calculate_corr_for_days(input_prefix, days, corr_type, measure_type):
    """Run `calculate_corr_for_a_day` for every day in `days`.

    Parameters
    ----------
    input_prefix : str
        Directory that contains the score files.
    days : iterable of int
        Days to process; each one is compared with its previous day.
    corr_type : str
        Correlation type, forwarded to `calculate_corr_for_a_day`.
    measure_type : str
        Measure name, forwarded as `measure`.

    Returns
    -------
    list
        One correlation value per entry of `days`, in the same order.
    """
    # A list comprehension always yields a list. map() would return a lazy
    # iterator on Python 3, which breaks len()/indexing and printing downstream.
    return [
        calculate_corr_for_a_day(input_prefix, day, corr_type=corr_type, measure=measure_type)
        for day in days
    ]

In [None]:
def calculate_stats_for_a_day(input_prefix, day, measure, only_positive=False):
    """Count the ids scored on `day`, on the previous day, and on both days.

    Parameters
    ----------
    input_prefix : str
        Directory that contains the score files.
    day : int
        The current day; ``day - 1`` is used as the previous day.
    measure : str
        Measure name, forwarded to `load_score_map`.
    only_positive : bool, optional
        If true, only rows with ``score > 0`` are kept before counting.

    Returns
    -------
    list of int
        ``[current day row count, previous day row count, overlap size]``,
        where the overlap is the number of distinct ids present in both
        score maps.
    """
    # The previous day is loaded first, then the current day.
    previous_scores, current_scores = [
        load_score_map(input_prefix, which_day, measure)
        for which_day in (day-1, day)
    ]
    if only_positive:
        previous_scores, current_scores = [
            frame.query("score>0") for frame in (previous_scores, current_scores)
        ]
    shared_ids = set(previous_scores.index) & set(current_scores.index)
    return [len(current_scores), len(previous_scores), len(shared_ids)]

def calculate_stats_for_days(input_prefix, days, measure_type, only_positive=False):
    """Run `calculate_stats_for_a_day` for every day in `days`.

    Parameters
    ----------
    input_prefix : str
        Directory that contains the score files.
    days : iterable of int
        Days to process; each one is compared with its previous day.
    measure_type : str
        Measure name, forwarded as `measure`.
    only_positive : bool, optional
        Forwarded to `calculate_stats_for_a_day`. Defaults to False, which
        matches the behavior before this option was exposed here.

    Returns
    -------
    list of list of int
        One ``[current count, previous count, overlap count]`` entry per day,
        in the same order as `days`.
    """
    # A list comprehension always yields a list. map() would return a lazy
    # iterator on Python 3, which np.array() cannot turn into a 2-D array.
    return [
        calculate_stats_for_a_day(input_prefix, day, measure=measure_type, only_positive=only_positive)
        for day in days
    ]

# Define dataset and experiment parameters

In [None]:
# Identifier of the dataset and of the centrality measure to analyse.
dataset_id = '15o'
measure_id = 'pagerank'
# Directory the per-day score files are read from, and directory results are written to.
data_path = '/mnt/idms/fberes/NETWORK/DATA/temporal_centralities/centrality_output_for_datasets/%s/centrality_scores/' % dataset_id
output_path = '/mnt/idms/fberes/NETWORK/andreas_article/correlations/'

# Days 1..21. The per-day helpers also load the score file of day - 1, so
# score files for days 0..21 are read.
# list(range(...)) is a real list on both Python 2 and Python 3.
days = list(range(1,22))
print(days)

# Active vertices and overlaps

In [None]:
# Per-day [current count, previous count, overlap count] rows for the selected measure.
stat_rows = calculate_stats_for_days(data_path, days, measure_id)
stat_columns = ["curr_day_count","prev_day_count","overlap_count"]
stat_df = pd.DataFrame(np.array(stat_rows), columns=stat_columns)

# Persist the table as a space-separated file named <dataset>_<measure>.stats.
stats_file = output_path + "/%s_%s.stats" % (dataset_id,measure_id)
stat_df.to_csv(stats_file, sep=" ")

# Show the first rows as the cell output.
stat_df.head()

# Daily correlations

In [None]:
%%time
pearson_result = calculate_corr_for_days(data_path, days,corr_type="pearson",measure_type=measure_id)
result2file(pearson_result, output_path + "/%s_%s.pearson" % (dataset_id,measure_id))
print pearson_result

In [None]:
%%time
spearman_result = calculate_corr_for_days(data_path, days,corr_type="spearman",measure_type=measure_id)
result2file(spearman_result, output_path + "/%s_%s.spearman" % (dataset_id,measure_id))
print spearman_result

In [None]:
%%time
kendall_result = calculate_corr_for_days(data_path, days,corr_type="kendall",measure_type=measure_id)
result2file(kendall_result, output_path + "/%s_%s.kendall" % (dataset_id,measure_id))
print kendall_result

%%time
calculate_corr_for_days(days,corr_type="w_kendall")