In [None]:
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [None]:
import sys
sys.path.insert(0,"../../python/")
from parametrization import ParamHelper

In [None]:
import os, shutil
import numpy as np
import pandas as pd

In [None]:
sys.path.append('../../python/')
import prediction_utils.rg_visu as rgv

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
paper_rc = {'lines.linewidth': 5,'lines.markersize': 20}              
sns.set_context("paper", rc = paper_rc, font_scale = 4.25)
sns.set_style("whitegrid")

In [None]:
# Three hand-picked colours first, followed by eight colours of the "Set2" palette.
m_palette = sns.color_palette(['#5cd65c','#ff6666','#ff944d'])
custom_palette = m_palette + sns.color_palette("Set2", 8)
#sns.palplot(custom_palette)
# Register the combined palette as the default colour cycle. `custom_palette`
# already starts with `m_palette`; prepending it again would make the 4th-6th
# plotted lines reuse the colours of the 1st-3rd.
sns.set_palette(custom_palette)

# 1. Load experiment parameters

In [None]:
ph = ParamHelper("../../pipelines/USOpen.json", "ipython/experiments/uso_predict_player.ipynb")

In [None]:
exclude_media_nodes = ph.get("exclude_media_nodes")
include_only_players = ph.get("include_only_players")

In [None]:
rg_root_dir = ph.get("uso_root_dir")
use_binary_labels = ph.get("use_binary_labels")
FIRST_SNAPSHOT = ph.get("eval_first_snapshot")
LAST_SNAPSHOT = ph.get("eval_last_snapshot")
use_c_ndcg = ph.get("use_cumulative_ndcg")
N_THREADS = ph.get("num_of_threads")

In [None]:
dir_postfix = "_binary%s" % use_binary_labels
experiment_path = rg_root_dir + "/daily_in_advance/"
tennis_players_source_path = rg_root_dir + "/tennis_players%s/" % dir_postfix
original_experiment_path = experiment_path + "/tennis_players%s_copied" % dir_postfix
similarity_root = experiment_path + "similarity_metrics%s" % dir_postfix
prediction_experiment_path = "../../data/centrality_scores/usopen_epoch_t505_d3600/original/"

In [None]:
lookback_size = 30
num_of_days = ph.get("num_of_days")
num_of_intervals = num_of_days*24

In [None]:
print(tennis_players_source_path,original_experiment_path,prediction_experiment_path)

### Load included/excluded nodes (media accounts)

In [None]:
media_accounts_file_path = "%s/recoded_media_accounts.txt" % rg_root_dir
player_accounts_file_path = "%s/recoded_player_accounts.txt" % tennis_players_source_path

In [None]:
# Only the "keep every node" configuration is handled by this notebook;
# both filtering options abort the run.
if include_only_players or exclude_media_nodes:
    raise RuntimeError("Not supported option!")

sim_res_folder = similarity_root + "/results_with_media_nodes/"
excluded_accounts = None
included_accounts = None

# Figures are stored per evaluated snapshot range.
img_dir = "%s/img_from%i_to%i" % (sim_res_folder, FIRST_SNAPSHOT, LAST_SNAPSHOT)
print(img_dir)

In [None]:
# Create the output folders that do not exist yet and report each creation.
for p in (experiment_path, sim_res_folder, img_dir):
    if os.path.exists(p):
        continue
    os.makedirs(p)
    print("Directory was created: %s" % p)

### Set other parameters for similarity computation

In [None]:
score_folders = ph.get("score_folders")
print(score_folders)

In [None]:
days = range(0,num_of_days) # eval all days
print(days)
print(num_of_intervals, num_of_days)

### Duplicate the daily tennis player label files into per-day directories (one copy per snapshot)

In [None]:
def get_interval_bounds(lookback_size=2*24, day_indices=None):
    """Compute the snapshot window that precedes the end of every evaluated day.

    Day ``d`` ends at snapshot ``(d + 1) * 24`` (24 snapshots per day). Its
    window reaches back ``lookback_size`` snapshots from there and is clipped
    at snapshot 0.

    Parameters
    ----------
    lookback_size : int
        Number of snapshots to look back from the end of each day.
    day_indices : iterable of int, optional
        Day indices to build windows for. When omitted, the notebook-level
        ``days`` is used, so existing calls behave exactly as before.

    Returns
    -------
    list of (int, [int, int])
        ``(day_idx, [lower_bound, upper_bound])`` pairs. Callers in this
        notebook consume the bounds as ``range(lower_bound, upper_bound)``.
    """
    if day_indices is None:
        day_indices = days  # fall back to the notebook-level day range
    interval_bounds = []
    for day_idx in day_indices:
        upper_bound = (day_idx + 1) * 24
        # the first days have fewer than `lookback_size` preceding snapshots
        lower_bound = max(0, upper_bound - lookback_size)
        interval_bounds.append((day_idx, [lower_bound, upper_bound]))
    return interval_bounds

def duplicate_label_files(interval_bounds):
    """Copy each day's player label file once per snapshot of that day's window.

    For every ``(day_idx, [lower, upper])`` entry, the file
    ``players_<day_idx>.csv`` found under ``tennis_players_source_path`` is
    copied to ``<original_experiment_path>/<day_idx>/players_<i>.csv`` for each
    ``i`` in ``range(lower, upper)``. A missing day folder is created first.
    """
    for day_idx, bounds in interval_bounds:
        day_folder = "%s/%i" % (original_experiment_path, day_idx)
        if not os.path.exists(day_folder):
            os.makedirs(day_folder)
        label_file = "%s/players_%i.csv" % (tennis_players_source_path, day_idx)
        first_snapshot, end_snapshot = bounds[0], bounds[1]
        for snapshot_idx in range(first_snapshot, end_snapshot):
            # the same daily labels are reused for every snapshot in the window
            shutil.copy(label_file, "%s/players_%i.csv" % (day_folder, snapshot_idx))
        print("Labels for the %ith day were duplicated!" % day_idx)

In [None]:
interval_bounds = get_interval_bounds(lookback_size)

In [None]:
interval_bounds

In [None]:
duplicate_label_files(interval_bounds)

# 2. Calculate similarity metrics

In [None]:
metric_id = ph.get("metric_id")

In [None]:
### calculate prediction similarities ###

import evaluation_utils.eval_utils as eu

def is_enabled_by_filter(score, filter_keys):
    """Tell whether a score folder name passes the optional substring filter.

    Parameters
    ----------
    score : str
        Name of the score folder.
    filter_keys : iterable of str or None
        Substrings to look for. ``None`` disables filtering.

    Returns
    -------
    bool
        ``True`` when no filter is given or when at least one key occurs in
        ``score``; ``False`` otherwise (including for an empty filter list).
    """
    if filter_keys is None:
        return True
    return any(f_key in score for f_key in filter_keys)

def load_or_calculate_prediction_result(input_path_prefixes, score, met, intervals, similarity_result_folder, excluded_indices, restricted_indices, n_threads):
    """Return the per-interval values of measure ``met``, using a file cache.

    The values are cached in ``<similarity_result_folder>/<met>.txt``. When
    that file exists it is loaded; otherwise the values are computed with
    ``eu.calculate_measure_for_days`` and written to the file.

    Parameters
    ----------
    input_path_prefixes : list of str
        Path prefixes forwarded to ``eu.calculate_measure_for_days``.
    score : str
        Score folder name; only used in the progress message.
    met : str
        Measure identifier; also the stem of the cache file name.
    intervals : iterable of int
        Interval indices, forwarded as ``days``.
    similarity_result_folder : str
        Folder of the cache file; created when missing.
    excluded_indices, restricted_indices
        Forwarded unchanged to ``eu.calculate_measure_for_days``.
    n_threads : int
        Forwarded unchanged to ``eu.calculate_measure_for_days``.

    Returns
    -------
    A list of the cached values, or whatever
    ``eu.calculate_measure_for_days`` returned when freshly computed.
    """
    if not os.path.exists(similarity_result_folder):
        os.makedirs(similarity_result_folder)
    similarity_result_file = "%s/%s.txt" % (similarity_result_folder,met)
    if os.path.exists(similarity_result_file):
        # np.loadtxt returns a 0-d array for a file holding a single value,
        # which cannot be iterated; force at least one dimension first.
        res = list(np.atleast_1d(np.loadtxt(similarity_result_file)))
        print("Results were loaded from file: %s" % similarity_result_file)
    else:
        res = eu.calculate_measure_for_days(input_path_prefixes, measure_type=met, days=intervals, is_sequential=False, excluded_indices=excluded_indices, restricted_indices=restricted_indices, n_threads=n_threads)
        np.savetxt(similarity_result_file,res)
        print("%s: '%s' was calculated." % (score, met))
    return res

def calculate_metrics_for_prediction(similarity_map, measure_id, score_folders, interval_bounds, experiment_paths, similarity_result_folder, excluded_indices=None, restricted_indices=None, filter_keys=None, n_threads=1, metric=None):
    """Evaluate every score folder of one centrality family for every day.

    ``similarity_map[measure_id]`` is (re)initialised to ``{day_idx: {}}`` and
    then filled so that ``similarity_map[measure_id][day_idx][score]`` holds
    the values returned by ``load_or_calculate_prediction_result``.

    Parameters
    ----------
    similarity_map : dict
        Result container, modified in place.
    measure_id : str
        Family identifier; a score folder belongs to the family when the part
        of its name before the first ``"_"`` equals ``measure_id``.
    score_folders : iterable of str
        Candidate score folder names.
    interval_bounds : list of (int, [int, int])
        ``(day_idx, [lower, upper])`` pairs; each day is evaluated on
        ``range(lower, upper)``.
    experiment_paths : sequence of str
        ``[label_root, prediction_root]``.
    similarity_result_folder : str
        Root folder for cached results; results are stored under
        ``<root>/<day_idx>/<score>``.
    excluded_indices, restricted_indices
        Forwarded unchanged to ``load_or_calculate_prediction_result``.
    filter_keys : iterable of str or None
        Optional substring filter applied through ``is_enabled_by_filter``.
    n_threads : int
        Forwarded unchanged to ``load_or_calculate_prediction_result``.
    metric : str, optional
        Evaluation metric to compute. Defaults to the notebook-level
        ``metric_id``, which is what earlier versions always used.
    """
    if metric is None:
        metric = metric_id  # notebook-level evaluation metric
    similarity_map[measure_id] = {}
    for day_idx, _ in interval_bounds:
        similarity_map[measure_id][day_idx] = {}
    # prediction files of the "nbm" family carry the "ndm" prefix
    prediction_file_prefix = measure_id if measure_id != "nbm" else "ndm"
    for score in score_folders:
        if measure_id != score.split("_")[0]:
            continue
        if not is_enabled_by_filter(score, filter_keys):
            continue
        if not os.path.exists(similarity_result_folder):
            os.makedirs(similarity_result_folder)
        for day_idx, bound in interval_bounds:
            input_path_prefixes = [
                "%s/%i/players" % (experiment_paths[0], day_idx),  # label prefix
                "%s/%s/%s" % (experiment_paths[1], score, prediction_file_prefix),  # prediction file prefix
            ]
            similarity_res_dir = "%s/%i/%s" % (similarity_result_folder, day_idx, score)
            similarity_map[measure_id][day_idx][score] = load_or_calculate_prediction_result(input_path_prefixes, score, metric, range(bound[0],bound[1]), similarity_res_dir, excluded_indices, restricted_indices, n_threads)
    print("prediction analysis was FINISHED")


In [None]:
prediction_results, score_stat_results = {}, {}

In [None]:
experiment_paths = [original_experiment_path, prediction_experiment_path]

## OnlineRank

In [None]:
olr_filters = None

In [None]:
%%time
calculate_metrics_for_prediction(prediction_results, "olr", score_folders, interval_bounds, experiment_paths, sim_res_folder, filter_keys=olr_filters, excluded_indices=excluded_accounts, restricted_indices=included_accounts, n_threads=N_THREADS)

## Online Indegree

In [None]:
%%time
calculate_metrics_for_prediction(prediction_results, "olid", score_folders, interval_bounds, experiment_paths, sim_res_folder, filter_keys=olr_filters, excluded_indices=excluded_accounts, restricted_indices=included_accounts, n_threads=N_THREADS)

## Temporal PageRank

In [None]:
%%time
calculate_metrics_for_prediction(prediction_results, "tpr", score_folders, interval_bounds, experiment_paths, sim_res_folder, excluded_indices=excluded_accounts, restricted_indices=included_accounts, n_threads=N_THREADS)

## Static PageRank

In [None]:
%%time
calculate_metrics_for_prediction(prediction_results, "spr", score_folders, interval_bounds, experiment_paths, sim_res_folder, excluded_indices=excluded_accounts, restricted_indices=included_accounts, n_threads=N_THREADS)

## Static Indegree

In [None]:
%%time
calculate_metrics_for_prediction(prediction_results, "indeg", score_folders, interval_bounds, experiment_paths, sim_res_folder, excluded_indices=excluded_accounts, restricted_indices=included_accounts, n_threads=N_THREADS)

## Static Negative beta-measure

In [None]:
%%time
calculate_metrics_for_prediction(prediction_results, "nbm", score_folders, interval_bounds, experiment_paths, sim_res_folder, excluded_indices=excluded_accounts, restricted_indices=included_accounts, n_threads=N_THREADS)

## Static Harmonic centrality

In [None]:
%%time
calculate_metrics_for_prediction(prediction_results, "hc", score_folders, interval_bounds, experiment_paths, sim_res_folder, excluded_indices=excluded_accounts, restricted_indices=included_accounts, n_threads=N_THREADS)

# 3. Visualization

In [None]:
visu_metric_id = metric_id
visu_metric_id = visu_metric_id.upper()
visu_metric_id

In [None]:
markers = ["s","*","o","^","v",">","D",]

def pred_perf_plot(score_visu_list,day_idx,offset=0):
    """Plot the per-snapshot performance of several scores for one day.

    One line is drawn per entry of ``score_visu_list`` on the current
    matplotlib axes. The values are read from
    ``prediction_results[<score prefix>][day_idx][score]``, and markers cycle
    through the notebook-level ``markers`` list.

    The x axis is labelled with snapshot offsets counted back from the end of
    the window (position ``i`` of an ``n``-snapshot window is ``i - n``). This
    is the same numbering ``visu_mean_behaviour`` uses for its ``snapshot``
    column. A tick is placed at every offset that is a multiple of 5.

    Parameters
    ----------
    score_visu_list : iterable of str
        Score folder names to draw.
    day_idx : int
        Day whose results are drawn.
    offset : int
        Subtracted from ``day_idx`` when looking up the window in the
        notebook-level ``interval_bounds`` list.
        NOTE(review): the lookup is positional, so it presumably expects
        ``interval_bounds`` to be ordered by day — confirm.

    Returns
    -------
    list
        The line objects returned by ``plt.plot``.
    """
    l_bound, u_bound = interval_bounds[day_idx-offset][1][0], interval_bounds[day_idx-offset][1][1]
    num_snapshots = u_bound - l_bound
    x = range(0, num_snapshots)
    visu_args = []
    for i,score in enumerate(score_visu_list):
        score_pref = score.split("_")[0]
        m = markers[i % len(markers)]
        y = prediction_results[score_pref][day_idx][score]
        visu_args += [x,y,"%s-" % m]
    res = plt.plot(*visu_args)
    # Derive the labels from the tick positions themselves, so both sequences
    # always have the same length and every label names the snapshot it sits on.
    tick_step = 5
    tick_positions = list(range(num_snapshots % tick_step, num_snapshots, tick_step))
    tick_labels = [position - num_snapshots for position in tick_positions]
    plt.xticks(tick_positions, tick_labels)
    return res

def get_cumulative_performance(df, metric_id, first_snapshot=None, last_snapshot=None):
    """Turn per-snapshot metric values into running averages per score and day.

    For every snapshot ``s`` in ``first_snapshot..last_snapshot`` the values of
    ``metric_id`` with ``snapshot <= s`` are summed per ``(score, day)`` and
    divided by ``s - first_snapshot + 1``. The divisor is the number of
    snapshots in the range, not the number of rows actually present.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``"score"``, ``"day"``, ``"snapshot"`` and
        ``metric_id``.
    metric_id : str
        Name of the column to accumulate.
    first_snapshot, last_snapshot : int, optional
        Inclusive snapshot range. A missing bound defaults to the minimum /
        maximum snapshot of ``df``; a given bound also filters ``df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"score"``, ``"day"``, ``metric_id`` and ``"snapshot"``,
        with one row per (score, day, snapshot) combination.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the snapshot range is empty.
    """
    print(len(df))
    if first_snapshot is None:
        first_snapshot = df["snapshot"].min()
    else:
        df = df[df["snapshot"] >= first_snapshot]
    if last_snapshot is None:
        last_snapshot = df["snapshot"].max()
    else:
        df = df[df["snapshot"] <= last_snapshot]
    print(len(df))
    cumulative_parts = []
    for max_snapshot in range(first_snapshot, last_snapshot+1):
        seen_so_far = df[df["snapshot"] <= max_snapshot]
        aggr_perf_df = seen_so_far.groupby(by=["score","day"])[metric_id].sum().reset_index()
        # 1.0 keeps the division floating point on every Python version
        aggr_perf_df[metric_id] = aggr_perf_df[metric_id] / (max_snapshot - first_snapshot + 1.0)
        aggr_perf_df["snapshot"] = max_snapshot
        cumulative_parts.append(aggr_perf_df)
    return pd.concat(cumulative_parts)

def visu_pred_perf_per_day(score_visu_list, days, offset=0):
    """Draw one performance subplot per day and save the figure as a PNG.

    The subplots are laid out in two columns; each is produced by
    ``pred_perf_plot`` and titled with the matching entry of the
    notebook-level ``dates`` list (``dates[i]`` for the i-th plotted day).
    The figure is written to
    ``<img_dir>/<ph "img_dir">/detailed_<metric_id>.png``, and the folder is
    created when missing.

    Parameters
    ----------
    score_visu_list : iterable of str
        Score folder names to draw in every subplot.
    days : sequence of int
        Day indices to plot (shadows the notebook-level ``days``).
    offset : int
        Forwarded to ``pred_perf_plot``.
    """
    print(days)
    num_plots = len(days)
    # NOTE(review): for an even number of plots this leaves an empty last row,
    # presumably as room for the figure legend placed below — confirm.
    n_rows, n_cols = num_plots // 2 + 1, 2
    print(n_rows, n_cols, num_plots)
    fig = plt.figure(figsize=(n_cols*10,n_rows*5))
    lines = None
    for i in range(num_plots):
        plt.subplot(n_rows,n_cols,i+1)
        lines = pred_perf_plot(score_visu_list,days[i],offset=offset)
        plt.ylim((0.0,1.0))
        plt.ylabel(visu_metric_id)
        plt.title(dates[i])
    if lines:
        # a legend only makes sense when at least one line was drawn
        detailed_relabel = ph.get("is_detailed_relabel")
        fig.legend(lines,tuple([rgv.relabel(score,detailed_relabel) for score in score_visu_list]),loc=(0.55,0.065))
    # do not rely on visu_mean_behaviour having created the folder earlier
    out_dir = img_dir + "/%s" % ph.get("img_dir")
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    plt.savefig(out_dir + "/detailed_%s.png" % metric_id)
        
def visu_mean_behaviour(visu_index_list,day_indexes,first_snapshot=FIRST_SNAPSHOT,last_snapshot=LAST_SNAPSHOT,pref="mixed",metric=visu_metric_id,title_text="",ci_val=0.5):
    """Plot the mean metric curve of the selected scores and export the data.

    Steps:

    1. Collect ``prediction_results`` of every selected score for every
       notebook-level day (``days``) into a long table with the columns
       ``score``, ``day``, ``snapshot`` and ``metric``. Snapshots are numbered
       ``-n .. -1`` counting back from the end of each day's window.
    2. Write that table to ``full_table_<pref>_<metric>.csv``.
    3. Either switch to running averages (when the notebook-level
       ``use_c_ndcg`` is set) or restrict the table to
       ``first_snapshot..last_snapshot``.
    4. Keep only the days in ``day_indexes``, draw one curve per score and
       save the figure as ``mean_<pref>_<metric column>.png``.
    5. Without ``use_c_ndcg``, also write and return the per-score mean.

    All files go to ``<img_dir>/<ph "img_dir">``, which is created if missing.

    Parameters
    ----------
    visu_index_list : iterable of str
        Candidate score folder names.
    day_indexes : iterable of int
        Days that enter the figure and the mean table.
    first_snapshot, last_snapshot : int
        Inclusive snapshot range to evaluate.
    pref : str
        ``"mixed"`` keeps every score; any other value keeps the scores whose
        name contains it.
    metric : str
        Name of the metric column.
    title_text : str
        Accepted for compatibility with existing calls; not used here.
    ci_val : float
        Forwarded as ``ci`` to ``sns.tsplot``.

    Returns
    -------
    pandas.DataFrame or None
        Per-score mean sorted in descending order when ``use_c_ndcg`` is
        false; ``None`` otherwise or when there is nothing to plot.
    """
    time_series = []
    dir_name = img_dir + "/" + ph.get("img_dir")
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    for score in visu_index_list:
        if pref != "mixed" and pref not in score:
            continue
        score_pref = score.split("_")[0]
        for day_idx in days:
            perf_values = prediction_results[score_pref][day_idx][score]
            # count snapshots back from the end of the window: -n, ..., -1
            snapshot_ids = range(-len(perf_values), 0)
            time_series += [(score, day_idx, snapshot, value) for snapshot, value in zip(snapshot_ids, perf_values)]
    if len(time_series) == 0:
        print("No data to visualize!")
        return None
    time_series_df = pd.DataFrame(time_series,columns=["score","day","snapshot",metric])
    rgv.extract_params(time_series_df)
    # name the export after the `metric` argument, like the other output files
    time_series_df.to_csv("%s/full_table_%s_%s.csv" % (dir_name,pref,metric), sep=";", index=False)
    if use_c_ndcg:
        time_series_df = get_cumulative_performance(time_series_df, metric, first_snapshot=first_snapshot, last_snapshot=last_snapshot)
        tmp_metric = "cumulative_%s" % metric
        time_series_df[tmp_metric] = time_series_df[metric]
    else:
        in_snapshot_range = (time_series_df["snapshot"] >= first_snapshot) & (time_series_df["snapshot"] <= last_snapshot)
        # copy so the column assignment below does not write into a slice
        time_series_df = time_series_df[in_snapshot_range].copy()
        tmp_metric = metric
    detailed_relabel = False#ph.get("is_detailed_relabel")
    time_series_df["score"] = time_series_df["score"].apply(lambda x: rgv.relabel(x,detailed_relabel))
    print(len(time_series_df))
    time_series_df = time_series_df[time_series_df["day"].isin(day_indexes)]
    print(len(time_series_df))
    plt.figure(figsize=(22,14))
    score_vals = time_series_df["score"].unique()
    for i,val in enumerate(score_vals):
        c, m = custom_palette[i % len(custom_palette)], markers[i % len(markers)]
        # NOTE(review): sns.tsplot is not available in recent seaborn releases — confirm the pinned version
        sns.tsplot(data=time_series_df[time_series_df["score"]==val], time="snapshot", unit="day", condition="score", value=tmp_metric, ci=ci_val, color=c, marker=m)
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.savefig("%s/mean_%s_%s.png" % (dir_name,pref,tmp_metric))
    if not use_c_ndcg:
        out_df = time_series_df.groupby("score")[tmp_metric].mean().reset_index().sort_values(tmp_metric, ascending=False)
        out_df.to_csv("%s/mean_%s_%s.csv" % (dir_name,pref,tmp_metric), sep="\t", index=False, header=False)
        return out_df

## Select subset of days for evaluation

#first_day_idx = 1 # 2017-08-22
#last_day_index = num_of_days # 2017-09-10
day_indexes = list(range(1,5)) + list(range(7,num_of_days)) # from 2017-08-22 to 2017-09-10 (excluding: 2017-08-26 and 2017-08-27)
print(day_indexes)

In [None]:
# Selection of the evaluated days and of the matching subplot titles.
# NOTE(review): `day_indexes` and `dates` are each assigned twice below; when
# these statements run top to bottom only the second pair of assignments
# survives — confirm which selection is intended.
#from 2017-08-22 to 2017-09-10 (excluding: 2017-08-26 and 2017-08-27)
day_indexes = list(range(15,num_of_days)) # last 6 days
#day_indexes = list(range(1,5)) # first 4 days (qualifications)
print(day_indexes)

# 21 calendar dates (2017-08-21 .. 2017-09-10), reduced to the selected days
dates = ["2017-08-%.2i" % i for i in range(21,32)]
dates += ["2017-09-%.2i" % i for i in range(1,11)]
dates = [dates[i] for i in day_indexes]
dates

# select every day of the experiment
day_indexes = list(range(0,num_of_days))
print(day_indexes)

# 14 calendar dates (2017-08-28 .. 2017-09-10); indexing with `day_indexes`
# raises IndexError when num_of_days is larger than 14
dates = ["2017-08-%.2i" % i for i in range(28,32)]
dates += ["2017-09-%.2i" % i for i in range(1,11)]
dates = [dates[i] for i in day_indexes]
dates

## OnlineRank

In [None]:
# NOTE(review): "title_text" looks like an optional pipeline parameter — confirm
# which exception ph.get raises for a missing key and narrow the clause further.
try:
    title_text = ph.get("title_text")
except Exception:
    # a bare `except:` would also swallow KeyboardInterrupt / SystemExit
    title_text = ""

visu_mean_behaviour(score_folders, day_indexes, pref="olr", title_text=title_text)

## Online Indegree

In [None]:
visu_mean_behaviour(score_folders, day_indexes, pref="olid", title_text=title_text)

## Temporal PageRank

In [None]:
visu_mean_behaviour(score_folders, day_indexes, pref="tpr")

## PageRank

In [None]:
visu_mean_behaviour(score_folders, day_indexes, pref="spr", title_text="of Static PageRank models")

## Indegree

In [None]:
visu_mean_behaviour(score_folders, day_indexes, pref="indeg", title_text="of Static Indegree models")

## Negative beta-measure

In [None]:
visu_mean_behaviour(score_folders, day_indexes, pref="nbm", title_text="of Static Negative beta-measure models")

## Harmonic centrality

In [None]:
visu_mean_behaviour(score_folders, day_indexes, pref="hc", title_text="of Static Harmonic centrality models")

import matplotlib

SMALL_SIZE = 26
MEDIUM_SIZE = 26
BIGGER_SIZE = 28

matplotlib.rc('font', size=SMALL_SIZE)          # controls default text sizes
matplotlib.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
matplotlib.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
matplotlib.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
matplotlib.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
matplotlib.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
matplotlib.rc('figure', titlesize=BIGGER_SIZE) 

## Mixed

In [None]:
visu_mean_behaviour(score_folders, day_indexes)

### By Day

In [None]:
paper_rc = {'lines.linewidth': 3,'lines.markersize': 10}              
sns.set_context("paper", rc = paper_rc, font_scale = 3)
sns.set_style("whitegrid")

In [None]:
visu_pred_perf_per_day(score_folders, day_indexes)