In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from the internal IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule so the notebook container spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import sys
from datawand.parametrization import ParamHelper
# Build the parameter helper from the pipeline JSON file and this process's
# argv; the cells below read their settings through ph.get(...).
# NOTE(review): the JSON path is relative, so it is resolved against the
# kernel's working directory.
ph = ParamHelper("../../pipelines/CorrelationComputation.json",sys.argv)

In [None]:
import pandas as pd
import numpy as np

In [None]:
import sys
# Put '../../python/' first on the module search path so the project package
# imported below can be resolved. The path is relative, so it is resolved
# against the kernel's working directory.
sys.path.insert(0,'../../python/')

# Project-local correlation utilities, used below as `cu`.
import correlation_new.correlation_utils_new as cu

# Util functions

## Notes

   * Formerly, I kept only the users with a positive score on each day.
   * For PageRank this did not make any difference, as all nodes have a positive PageRank (by definition).
   * But for Indegree this filter excluded many nodes on a given day.
   * From now on, I do NOT filter out nodes with a zero daily score, because despite their zero score they are all part of the daily graph snapshot.

In [None]:
def calculate_stats_for_a_day(input_prefix, day, measure, only_positive=False):
    """Count the entries of two consecutive daily score maps and their overlap.

    The score maps of `day - 1` and `day` are loaded with
    `cu.load_score_map(input_prefix, <day>, measure)`.

    Parameters
    ----------
    input_prefix : passed through unchanged to `cu.load_score_map`.
    day : the current day; the preceding day is `day - 1`.
    measure : passed through unchanged to `cu.load_score_map`.
    only_positive : when true, both score maps are reduced with
        `.query("score>0")` before anything is counted.

    Returns
    -------
    list
        `[len(current), len(previous), n_shared]`, where `n_shared` is the
        number of distinct index labels present in both score maps.
    """
    previous, current = [
        cu.load_score_map(input_prefix, which_day, measure)
        for which_day in (day - 1, day)
    ]
    if only_positive:
        previous, current = [
            scores.query("score>0") for scores in (previous, current)
        ]
    shared_labels = set(previous.index) & set(current.index)
    return [len(current), len(previous), len(shared_labels)]

def calculate_stats_for_days(input_prefix, days, measure_type):
    """Collect the daily count statistics for every day in `days`.

    Each day is handed to `calculate_stats_for_a_day` with
    `measure=measure_type` (and that function's default `only_positive`).

    Returns
    -------
    list
        One `[current_count, previous_count, overlap_count]` row per day, in
        the iteration order of `days`.
    """
    # A list comprehension is always eager. `map` is lazy on Python 3, and the
    # caller wraps this result in `np.array(...)`, which needs a real sequence
    # to build the 2-D table. On Python 2 the result is the same list as before.
    return [
        calculate_stats_for_a_day(input_prefix, day, measure=measure_type)
        for day in days
    ]

# Define dataset and experiment parameters

In [None]:
# Folder of the current experiment, read from the pipeline parameters via `ph`.
experiment_folder = ph.get("experiment_folder")

In [None]:
# Dataset and centrality measure to process, read from the pipeline parameters.
dataset_id = ph.get("dataset_id")
measure_id = ph.get("measure_id")
# NOTE(review): absolute, machine-specific input location; consider moving it
# into the pipeline parameters so the notebook can run elsewhere.
data_path = '/mnt/idms/fberes/network/DATA/temporal_centralities/centrality_output_for_datasets/%s/centrality_scores/' % dataset_id
# Result files are written below the experiment folder.
output_path = '%s/correlations/' % experiment_folder
corr_types = ph.get("corr_types")
N_THREADS = ph.get("num_workers")
# Days 1..21 inclusive. `list(...)` keeps `days` a real list on Python 3 as
# well; on Python 2 `range` already returns the same list.
days = list(range(1,22))
# Function-call form of print: valid on Python 3, and prints the same text on
# Python 2 for a single argument.
print(days)

# Active vertices and overlaps

In [None]:
# One row per day: node count of the day, of the previous day, and of their overlap.
# `list(...)` materialises the helper's result before it reaches `np.array`,
# so the table is built correctly even if the helper returns a lazy iterator
# (as `map` does on Python 3).
stat_df = pd.DataFrame(
    np.array(list(calculate_stats_for_days(data_path, days, measure_id))),
    columns=["curr_day_count","prev_day_count","overlap_count"],
)
# Persist the table as a space-separated file next to the correlation results.
stat_df.to_csv(output_path + "/%s_%s.stats" % (dataset_id,measure_id), sep=" ")
stat_df.head()

# Daily correlations

In [None]:
%%time
# Runs only when 'pearson' is among the requested correlation types.
if 'pearson' in corr_types:
    pearson_result = cu.calculate_corr_for_days(data_path, days,corr_type="pearson", measure_type=measure_id, n_threads=N_THREADS)
    # Write the result to <output_path>/<dataset>_<measure>.pearson.
    cu.result2file(pearson_result, output_path + "/%s_%s.pearson" % (dataset_id,measure_id))
    # Function-call form of print: valid on Python 3, same output on Python 2.
    print(pearson_result)

In [None]:
%%time
# Runs only when 'spearman' is among the requested correlation types.
if 'spearman' in corr_types:
    # NOTE(review): unlike the pearson cell, n_threads=N_THREADS is not passed
    # here, so the helper's own default applies; confirm this is intentional.
    spearman_result = cu.calculate_corr_for_days(data_path, days,corr_type="spearman", measure_type=measure_id)
    # Write the result to <output_path>/<dataset>_<measure>.spearman.
    cu.result2file(spearman_result, output_path + "/%s_%s.spearman" % (dataset_id,measure_id))
    # Function-call form of print: valid on Python 3, same output on Python 2.
    print(spearman_result)

In [None]:
%%time
# Runs only when 'kendall' is among the requested correlation types.
if 'kendall' in corr_types:
    # NOTE(review): unlike the pearson cell, n_threads=N_THREADS is not passed
    # here, so the helper's own default applies; confirm this is intentional.
    kendall_result = cu.calculate_corr_for_days(data_path, days,corr_type="kendall", measure_type=measure_id)
    # Write the result to <output_path>/<dataset>_<measure>.kendall.
    cu.result2file(kendall_result, output_path + "/%s_%s.kendall" % (dataset_id,measure_id))
    # Function-call form of print: valid on Python 3, same output on Python 2.
    print(kendall_result)

In [None]:
%%time
# Runs only when 'w_kendall' is among the requested correlation types.
if 'w_kendall' in corr_types:
    # NOTE(review): unlike the pearson cell, n_threads=N_THREADS is not passed
    # here, so the helper's own default applies; confirm this is intentional.
    w_kendall_result = cu.calculate_corr_for_days(data_path, days,corr_type="w_kendall", measure_type=measure_id)
    # Write the result to <output_path>/<dataset>_<measure>.w_kendall.
    cu.result2file(w_kendall_result, output_path + "/%s_%s.w_kendall" % (dataset_id,measure_id))
    # Function-call form of print: valid on Python 3, same output on Python 2.
    print(w_kendall_result)