In [None]:
# Stretch the notebook container to the full browser width so wide tables fit.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated in newer IPython releases.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import sys
from datawand.parametrization import ParamHelper
# Build the parameter helper from the pipeline description file and the
# command-line arguments; every `ph.get(...)` call in this notebook reads from it.
# NOTE(review): presumably the command-line arguments select/override the
# parameters stored in the JSON file — confirm against the datawand documentation.
ph = ParamHelper("../../pipelines/CorrelationComputation.json",sys.argv)

In [None]:
import pandas as pd
import os

# Define dataset and experiment parameters

In [None]:
# Root folder of the experiment; the correlation input folder and the result
# output folder used later in this notebook are derived from it.
experiment_folder = ph.get("experiment_folder")

In [None]:
# --- values read from the parameter helper ---
dataset_id = ph.get("dataset_id")
measure_id = ph.get("measure_id")
corr_types = ph.get("corr_types")

# --- locations derived from the values above ---
# Per-day centrality score files of the selected dataset.
# NOTE(review): this is a hard-coded absolute path on one specific machine.
data_path = '/mnt/idms/fberes/network/DATA/temporal_centralities/centrality_output_for_datasets/%s/centrality_scores/' % dataset_id
# Input folder with the precomputed statistics and correlation files.
correlation_and_stat_path = '%s/correlations/' % experiment_folder
# Output folder for the files exported by this notebook.
result_folder = '%s/results/corr_and_stats/' % experiment_folder

# Collect statistics about the number of active users and overlaps for consecutive days

In [None]:
# Load the statistics file of the selected dataset/measure. It is read as a
# space-separated table whose first line is the header; the columns
# curr_day_count, prev_day_count and overlap_count must be present.
result_df = pd.read_csv(correlation_and_stat_path + "/%s_%s.stats" % (dataset_id, measure_id), sep=" ")
# Number the rows as days, starting from 1, in file order.
result_df['day'] = range(1,len(result_df)+1)
# Keep only the columns used in the rest of the notebook, with `day` first.
result_df = result_df[["day","curr_day_count","prev_day_count","overlap_count"]]
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the former `print x` statement is a syntax error on Python 3.
print(result_df.index)
result_df.head()

# Normalization with total vertex count

In [None]:
def load_score_map_indices(input_prefix, day, measure):
    """Return the ids listed in the score file of one day.

    The file `<input_prefix>/<measure>_scores_<day>.txt_s` is read as a
    header-less, space-separated table with the two columns `id` and `score`.
    The result is the pandas index built from the `id` column, in file order.

    Note from the original author: the centrality maps were pre-sorted in
    decreasing order.
    """
    score_file = input_prefix + '/%s_scores_%i.txt_s' % (measure,day)
    score_table = pd.read_csv(score_file, sep=" ", names=["id","score"])
    return score_table.set_index("id").index

def get_total_vertex_count(input_prefix, days, measure):
    """Count the distinct ids that occur in the score files of the given days.

    Parameters:
        input_prefix: folder that contains the per-day score files.
        days: iterable of day identifiers; one score file is loaded per entry.
        measure: name of the centrality measure (part of the file name).

    Returns:
        The size of the union of the ids over all given days. An empty `days`
        yields 0 (the previous implementation raised IndexError on `days[0]`).
    """
    all_indices = set()
    # Iterate over the days directly instead of indexing with xrange, which
    # also keeps the function usable on both Python 2 and Python 3.
    for day in days:
        all_indices.update(load_score_map_indices(input_prefix, day, measure))
    return len(all_indices)

In [None]:
# Union the ids over the day identifiers 0..len(result_df) inclusive, i.e. one
# more score file than there are rows in the statistics table.
# NOTE(review): presumably because each statistics row describes a pair of
# consecutive days — confirm against the job that produces the .stats file.
total_vertex_count = get_total_vertex_count(data_path, range(len(result_df)+1), measure_id)

In [None]:
# Show the number of distinct vertices found over all days.
# print(...) with a single argument works on both Python 2 and Python 3.
print(total_vertex_count)

In [None]:
# Persist the total vertex count as plain text next to the other results.
# The result folder is created first if it is missing; otherwise open() would
# fail with an IOError on a fresh experiment folder.
if not os.path.isdir(result_folder):
    os.makedirs(result_folder)
with open(result_folder + '/%s_total_vertex_count.txt' % dataset_id, 'w') as f:
    f.write(str(total_vertex_count))

In [None]:
# Normalize the current-day count by the number of distinct vertices seen on any day.
result_df['curr_day_frac'] = result_df['curr_day_count'] / total_vertex_count

In [None]:
# Normalize the previous-day count by the number of distinct vertices seen on any day.
result_df['prev_day_frac'] = result_df['prev_day_count'] / total_vertex_count

In [None]:
# Normalize the overlap count by the number of distinct vertices seen on any day.
result_df['overlap_frac'] = result_df['overlap_count'] / total_vertex_count

# Collect weighted kendall results

   * calculating weighted kendall is time consuming
   * that is why we compute weighted kendall for consecutive days in parallel
   * the weighted kendall scores for different day pairs are collected here

In [None]:
# Folder that holds the partial weighted Kendall result files (one file per row index).
w_kendall_tmp_folder = "%s/correlations_tmp" % experiment_folder

In [None]:
# Merge the partial weighted Kendall files into a single "<idx> <score>" file
# in the correlations folder, one output row per row of `result_df`.
if "w_kendall" in corr_types:
    with open("%s/correlations/%s_%s.w_kendall" % (experiment_folder,dataset_id,measure_id),"w") as f_out:
        # range() works on both Python 2 and Python 3 (xrange is Python 2 only).
        for idx in range(len(result_df)):
            f_partial = "%s/%s_%s_%i.w_kendall" % (w_kendall_tmp_folder,dataset_id,measure_id,idx)
            partial_values = []
            if os.path.exists(f_partial):
                with open(f_partial) as f:
                    # Skip blank lines: float() raises ValueError on them.
                    partial_values = [float(line) for line in f if line.strip()]
            if partial_values:
                for value in partial_values:
                    f_out.write("%i %f\n" % (idx,value))
            else:
                # Placeholder for a missing or empty partial result, so every
                # index still gets a row and later rows keep their position.
                # NOTE(review): -2 was presumably chosen because it lies outside
                # the valid [-1, 1] range of a correlation — confirm.
                f_out.write("%i %f\n" % (idx,-2))
    # Reported after the output file has been closed.
    print("Partial w_kendall results were collected!")

# Collect correlation results

In [None]:
# Append one column per correlation type to the statistics table.
# Each correlation file is a header-less, space-separated "<idx> <value>" table;
# only the value column is kept and it is joined on the row index.
for corr_type in corr_types:
    corr_file = correlation_and_stat_path + "/%s_%s.%s" % (dataset_id, measure_id,corr_type)
    corr_table = pd.read_csv(corr_file, sep=" ", names=["idx",corr_type])
    corr_df = corr_table[corr_type]
    result_df = result_df.join(corr_df)

# Export results

In [None]:
# Write the combined table as a space-separated file, without the index column.
result_df.to_csv(result_folder + '/%s_%s.csv' % (dataset_id, measure_id), sep=" ", index=False)

In [None]:
# Display the final table (statistics, fractions and correlation columns).
result_df