In [None]:
# Widen the notebook container so wide tables and figures use the full browser width.
# HTML is imported from the public IPython.display module rather than the internal
# IPython.core.display path.
from IPython.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [None]:
import os, shutil
import numpy as np
import pandas as pd

In [None]:
import sys
sys.path.insert(0,"../../python/")
from parametrization import ParamHelper

In [None]:
import prediction_utils.rg_prediction_sim as rgps
import prediction_utils.rg_prediction_visu as rgpv

# 1. Load experiment parameters

In [None]:
# Load parameters from the shared pipeline configuration file.
# NOTE(review): the second argument looks like this notebook's own path, presumably
# used by ParamHelper to select the notebook-specific parameter section - confirm.
ph = ParamHelper("../../pipelines/Evaluation.json", "ipython/experiments/roland_garros_olr_const_ratios.ipynb")

In [None]:
# Experiment configuration: values come from the parameter helper, except for the
# prediction score directory, which is hard-coded.
rg_root_dir = ph.get("rg_root_dir")
# NOTE(review): "conts_ratios" looks like a typo for "const_ratios"; it is kept
# unchanged so that results already written under this directory stay reachable.
experiment_path = rg_root_dir + "/conts_ratios/"
# os.path.join avoids the doubled "//" that plain concatenation produced after the
# trailing slash of experiment_path.
original_experiment_path = os.path.join(experiment_path, "tennis_players_copied")
tennis_players_source_path = ph.get("tennis_players_source_path")
prediction_experiment_path = "../../data/centrality_scores/rg17_epoch_t457_d3600/original/"
num_of_intervals = ph.get("num_of_intervals")
img_dir = os.path.join(experiment_path, "img")
N_THREADS = ph.get("num_of_threads")

In [None]:
# Create the figure output directory (including missing parents) on first run.
if not os.path.exists(img_dir):
    os.makedirs(img_dir)

In [None]:
# Show the resolved input/output locations as a sanity check before any files are touched.
print(tennis_players_source_path,original_experiment_path,prediction_experiment_path)

### Set other parameters for similarity computation

In [None]:
# Names of the score sub-folders to evaluate.
# NOTE(review): presumably these live under prediction_experiment_path and encode the
# OnlineRank settings (alpha 0.05; Rayleigh vs. constant decay) - confirm against the
# directory layout.
score_folders = [
    "olr_a0.05_Ray(s1.000,n:7200.000)",
    "olr_a0.05_Ray(s1.000,n:18000.000)",
    "olr_a0.05_Ray(s1.000,n:36000.000)",
    "olr_a0.05_Const(1.00)"
]

In [None]:
# Interval indices 0 .. num_of_intervals - 1, used by the copy, scoring and plotting steps.
intervals = range(0,num_of_intervals)
# Call form of print: valid on both Python 2 and Python 3, and consistent with the
# other print(...) calls in this notebook (the bare statement form is a SyntaxError on 3).
print(num_of_intervals)

# 2. Copy tennis player score files into the experiment directory (one per interval)

In [None]:
# The source directory holds one player file per day; each day has to be split into
# the same whole number of intervals so that every interval maps onto one daily file.
num_of_days = 19
if num_of_intervals % num_of_days != 0:
    raise RuntimeError("The number of intervals per day is not consistent!")
# Floor division keeps the factor an int on every Python version (it is exact here
# because divisibility was checked above); "/" would yield a float under Python 3.
duplication_factor = num_of_intervals // num_of_days
print("Duplication factor: %i" % duplication_factor)

# Populate the directory only when it does not exist yet; an existing directory is
# left untouched, so delete it manually to force a fresh copy.
if not os.path.exists(original_experiment_path):
    os.makedirs(original_experiment_path)
    for i in intervals:
        # Interval i belongs to day i // duplication_factor (the identity when the factor is 1).
        src_file_idx = i // duplication_factor
        full_src_file = "%s/players_%i.csv" % (tennis_players_source_path, src_file_idx)
        dest = "%s/players_%i.csv" % (original_experiment_path, i)
        shutil.copy(full_src_file, dest)
    print("Label files were duplicated!")

In [None]:
# Display the directory that now holds the per-interval player files.
original_experiment_path

# 3. Count the number of nodes with non-constant olr score

In [None]:
# Shared result container handed to the statistics helper and later to the plotting calls.
score_stat_results = {}

## OnlineRank

In [None]:
%%time
# NOTE(review): presumably fills score_stat_results in place under the "olr" key with,
# per score folder and interval, the ratio of nodes whose score exceeds `const` - confirm
# in rg_prediction_sim. The 0.051 threshold looks chosen to sit just above the 0.05
# value encoded in the folder names ("a0.05") - confirm.
rgps.calculate_bigger_than_const_ratio(score_stat_results, "olr", score_folders, intervals, prediction_experiment_path, const=0.051, n_threads=N_THREADS)

# 4. Analyze Mention Data

In [None]:
# Raw mention records: space-separated, no header row, so column names are supplied here.
mention_data_path = "../../data/raw/rg17_mentions.csv"
mentions_df = pd.read_csv(mention_data_path, sep=" ", names=["time","src","trg"])
# Number of mention records loaded.
len(mentions_df)

In [None]:
# Epoch timestamp (seconds) that marks the start of snapshot 0.
min_time = 1495576800  # 2017 May 24  0:00:00 GMT+2 time

In [None]:
# Hourly snapshot index: whole hours (3600 s) elapsed since min_time.
# Vectorised Series arithmetic replaces the element-wise .apply(lambda ...) call and
# produces the same values.
mentions_df["snapshot_id"] = (mentions_df["time"] - min_time) // 3600

In [None]:
# Preview the first rows, including the derived snapshot_id column.
mentions_df.head()

#### Exclude data after June 11

In [None]:
# Keep hourly snapshots 0..455 only: 456 = 19 days x 24 hourly snapshots.
# NOTE(review): records before min_time (negative snapshot_id) are not removed here - confirm
# that none exist or that keeping them is intended.
mentions_df = mentions_df[mentions_df["snapshot_id"] < 456]
# Number of mention records remaining after the cut-off.
len(mentions_df)

## Extract number of nodes in mention network

In [None]:
def get_node_count(df):
    """Return the number of distinct nodes in a mention edge list.

    A node is any value that occurs in the "src" or the "trg" column of `df`;
    a value present in both columns is counted once.
    """
    endpoints = set(df["src"])
    endpoints.update(df["trg"])
    return len(endpoints)

In [None]:
# Cumulative network size: entry k is the number of distinct nodes that appear in any
# mention with snapshot_id <= k, for the 456 hourly snapshots.
num_nodes_so_far = [
    get_node_count(mentions_df[mentions_df["snapshot_id"] <= last_snapshot])
    for last_snapshot in range(456)
]

## Extract mention volume

In [None]:
# Number of mention records per hourly snapshot, indexed by snapshot_id.
# Snapshots without any mention do not appear in the result.
hourly_num_mentions = mentions_df["snapshot_id"].value_counts()

In [None]:
# Mention volume aligned with `intervals`. value_counts() has no entry for a snapshot
# without mentions, so plain indexing would raise KeyError there; .get(..., 0) reports
# such quiet intervals as zero instead.
num_of_mentions = [hourly_num_mentions.get(i, 0) for i in intervals]

# 5. Visualization

In [None]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Three hand-picked colours first, followed by eight colours from the "Set2" palette.
m_palette = sns.color_palette(['#5cd65c','#ff6666','#ff944d'])
custom_palette = m_palette + sns.color_palette("Set2", 8)
# Install exactly the palette that is passed to the plotting helpers. The previous call
# prepended m_palette a second time, so the global colour cycle repeated the first
# three colours and no longer matched custom_palette.
sns.set_palette(custom_palette)

In [None]:
# Figure styling: thick lines, large markers and a large font scale on a white grid.
paper_rc = {
    'lines.linewidth': 5,
    'lines.markersize': 20,
}
sns.set_context("paper", font_scale=4.25, rc=paper_rc)
sns.set_style("whitegrid")

## Select subset of days for evaluation

In [None]:
# Day labels for the evaluated period: May 24-31 followed by June 1-11 (19 days).
may_labels = ["2017-05-%.2i" % day for day in range(24, 32)]
june_labels = ["2017-06-%.2i" % day for day in range(1, 12)]
dates = may_labels + june_labels
dates

## OnlineRank

In [None]:
# Sanity check: the configured interval count and the length of the interval index should match.
num_of_intervals, len(intervals)

In [None]:
# Plot the score statistics together with the cumulative node count and save the figure.
# NOTE(review): "m" is presumably a matplotlib colour code (magenta) for the secondary
# series and "Number of nodes" its label - confirm in rg_prediction_visu.
rgpv.plot_non_const_ratio(intervals, dates, score_stat_results,num_nodes_so_far,"m","Number of nodes", custom_palette)
plt.savefig("%s/non_const_with_node_count.png" % img_dir)

In [None]:
# Same plot, with the hourly mention volume as the secondary series, saved to its own file.
# NOTE(review): "b" is presumably a matplotlib colour code (blue) for the secondary
# series - confirm in rg_prediction_visu.
rgpv.plot_non_const_ratio(intervals, dates, score_stat_results,num_of_mentions,"b","Mention volume", custom_palette)
plt.savefig("%s/non_const_with_mention_volume.png" % img_dir)