In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
import sys
sys.path.insert(0,"../../python/")
from rg17 import evaluate_toplist as et

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font="DejaVu Sans")

# Load Player Accounts

In [None]:
with open("/mnt/idms/fberes/network/online_ranker/roland_garros_updated_schedule/filtered_true_matches.json") as f:
    player_account_map = json.load(f)

## !!! Select main account for players: later we should decide whether we will enable multiple accounts in the evaluation

In [None]:
player_account_map["Stan Wawrinka"] = ["stanwawrinka"]
player_account_map["Novak Djokovic"] = ["djokernole"]
player_account_map["Caroline Garcia"] = ["carogarcia"]
player_account_map["Caroline Wozniacki"] = ["carowozniacki"]
player_account_map["Marin Cilic"] = ["cilicmarin"]
player_account_map["Kristina Mladenovic"] = ["kikimladenovic"]
player_account_map["Dominic Thiem"] = ["thiemdomi"]
player_account_map["Rafael Nadal"] = ["rafaelnadal"]
player_account_map["Timea Bacsinszky"] = ["timeaofficial"]
player_account_map["Pablo Carreno Busta"] = ["pablocarreno91"]
player_account_map["Simona Halep"] = ["simonahalep"]
player_account_map["Andy Murray"] = ["andymurray"]

#### with underscore

player_account_map["Stan Wawrinka"] = ["stanwawrinka"]
player_account_map["Novak Djokovic"] = ["djokernole"]
player_account_map["Caroline Garcia"] = ["carogarcia"]
player_account_map["Caroline Wozniacki"] = ["carowozniacki"]
player_account_map["Marin Cilic"] = ["cilic_marin"]
player_account_map["Kristina Mladenovic"] = ["kikimladenovic"]
player_account_map["Dominic Thiem"] = ["thiemdomi"]
player_account_map["Rafael Nadal"] = ["rafaelnadal"]
player_account_map["Timea Bacsinszky"] = ["timeaofficial"]
player_account_map["Pablo Carreno Busta"] = ["pablocarreno91"]
player_account_map["Simona Halep"] = ["simona_halep"]
player_account_map["Andy Murray"] = ["andy_murray"]

# Load Schedule

In [None]:
schedule_df = pd.read_csv("/mnt/idms/fberes/network/roland_garros/data/schedule_new_df.csv", sep="|")

In [None]:
excluded_categories = ["boy", "girl", "wheelchair", "legends over 45"]

# Filter Schedule

   * only Single matches are kept
   * only important categories are kept (Men's, Women's, Legends under 45)

In [None]:
def filter_categories(match_cat, excluded_cats=excluded_categories):
    """Tell whether a schedule header describes a singles final worth keeping.

    The check is a case-insensitive substring match on the header text.

    Parameters
    ----------
    match_cat : str
        Header text of the match (the notebook passes ``matchHeader`` values).
    excluded_cats : iterable of str
        Lower-case fragments; a header containing any of them is rejected.

    Returns
    -------
    bool
        True only if the header contains both "final" and "single" and
        none of the excluded fragments.
    """
    header = match_cat.lower()
    is_excluded = any(fragment in header for fragment in excluded_cats)
    is_single_final = "final" in header and "single" in header
    return is_single_final and not is_excluded

In [None]:
finals_df = schedule_df[schedule_df["matchHeader"].apply(filter_categories)]

In [None]:
len(schedule_df), len(finals_df)

## Single finals

   * **canceled** matches are not excluded because people may talk about these events as well

In [None]:
finals_df

# Player name parts

In [None]:
players = list(set(finals_df["playerName active"]).union(finals_df["playerName opponent"]))

In [None]:
len(players)

# TODO: later do NOT remove underscore

In [None]:
# For every finalist collect the lower-cased parts of the name and the
# "@"-prefixed account handles. Handles are normalised with
# et.transform_account_name (digits kept, underscores removed).
player_info_map = {}
for player in players:
    name_parts = [part.lower() for part in player.split()]
    handles = []
    for account in player_account_map[player]:
        normalized = et.transform_account_name(account, remove_digits=False, remove_under_score=True)
        handles.append("@" + normalized)
    player_info_map[player] = dict(name_parts=name_parts, accounts=handles)

## Show multi-account players

In [None]:
for player, info in player_info_map.items():
    if len(info["accounts"]) > 1:
        print(player, info["accounts"])

In [None]:
player_info_map

# Co-occurrences

In [None]:
pair_occs_df = pd.read_csv("/mnt/idms/fberes/network/roland_garros/occs_with_scores.csv", sep="|")

In [None]:
pair_occs_df.head(2)

In [None]:
word_2_set = set(pair_occs_df["word_2"].unique())
word_1_set = set(pair_occs_df["word_1"].unique())
len(word_1_set), len(word_2_set)

In [None]:
def show_missing_words(info_key, word_set):
    """Print, per player, the words stored under ``info_key`` that are absent from ``word_set``.

    Players for which every word is present produce no output.

    Parameters
    ----------
    info_key : str
        Key of the per-player entry in ``player_info_map``
        (the notebook uses "name_parts" and "accounts").
    word_set : iterable of str
        Vocabulary to check against.
    """
    for name, details in player_info_map.items():
        absent = list(set(details[info_key]).difference(word_set))
        if absent:
            print("%s: %s missing!" % (name, absent))
            
def show_matching_words(info_key, word_set):
    """Print, per player, the words stored under ``info_key`` that also occur in ``word_set``.

    A line is printed for every player, even when nothing matches.

    Parameters
    ----------
    info_key : str
        Key of the per-player entry in ``player_info_map``.
    word_set : iterable of str
        Vocabulary to check against.
    """
    for name, details in player_info_map.items():
        found = set(details[info_key]).intersection(word_set)
        print("%s: %s" % (name, list(found)))

## a.) Checking names (All names are present)

In [None]:
show_missing_words("name_parts", word_2_set)

In [None]:
show_missing_words("name_parts", word_1_set)

show_matching_words("name_parts", word_1_set)

## b.) Checking account names

#### After dropping the additional player accounts only "@cilicmarin" is missing!!!

In [None]:
show_missing_words("accounts", word_2_set)

In [None]:
show_missing_words("accounts", word_1_set)

show_matching_words("accounts", word_1_set)

# Relevant player words

## TODO: add players who play at the same time (only with smaller relevance!!!)

In [None]:
finals_df.head(2)

In [None]:
finals_df[finals_df["matchScore"] == "Cancelled"]

In [None]:
# Build one record per (match, player, key word). The key word is one of the
# player's own name parts or account handles; the record stores the player's
# remaining words plus a gender word, and the opponent's words.
relevant_infos = []
for idx, row in finals_df.iterrows():
    date, time, winner, loser = row["date"], row["startDate"], row["playerName active"], row["playerName opponent"]
    category, score = row["matchHeader"], row["matchScore"]
    # A cancelled match has no winner, so the outcome flag is None for both players.
    if score == "Cancelled":
        to_iterate_on = [(winner, loser, None), (loser, winner, None)]
    else:
        to_iterate_on = [(winner, loser, 1), (loser, winner, 0)]
    # "women" contains "men" as a substring, so a plain `"men" in ...` test is
    # true for women's categories as well. Rule "women" out explicitly.
    # Headers that mention neither keep the original fallback ("women").
    category_lower = category.lower()
    if "men" in category_lower and "women" not in category_lower:
        gender_word = "men"
    else:
        gender_word = "women"
    for p1, p2, is_winner in to_iterate_on:
        n1, n2 = player_info_map[p1]["name_parts"], player_info_map[p2]["name_parts"]
        acc1, acc2 = player_info_map[p1]["accounts"], player_info_map[p2]["accounts"]
        own_words = n1 + acc1
        for w in own_words:
            # Fresh list per record: the player's words plus the gender word ...
            player_relevant = own_words + [gender_word]
            # ... minus the key word that is being examined.
            player_relevant.remove(w)
            relevant_infos.append([date, time, p1, is_winner, w, player_relevant, p2, n2 + acc2])
relevant_df = pd.DataFrame(relevant_infos, columns=["date","time","player","is_winner","key_word", "key_relevant_words", "opponent","opp_relevant_words"])

In [None]:
relevant_df.head(5)

# TODO: winner evaluation

   * for words like (win, won, triumph, victory etc.)
   * see whether winner name parts are present or not!!!

# NDCG calculator depends on the fact that we have binary relevances!!!

## Random shuffle is not implemented yet for ties!!!

In [None]:
def dcg(relevant_set, pred_order, k=None):
    """Discounted cumulative gain of ``pred_order`` with binary relevance.

    Every predicted word that is contained in ``relevant_set`` contributes
    ``1 / ln(rank + 1)`` (rank starting at 1). The natural logarithm is used;
    the base only scales the value and cancels out in :func:`ndcg`.

    Parameters
    ----------
    relevant_set : container of str
        Words considered relevant (anything supporting ``in``).
    pred_order : sequence of str
        Predicted words, best first.
    k : int or None
        Cut-off. ``None`` means the whole list; a value larger than
        ``len(pred_order)`` is clamped instead of raising ``IndexError``.

    Returns
    -------
    float
        The DCG score; 0.0 if nothing relevant is found in the top ``k``.
    """
    n_pred = len(pred_order)
    if k is None or k > n_pred:
        k = n_pred
    dcg_score = 0.0
    for i in range(k):
        if pred_order[i] in relevant_set:
            # every relevance score is 1.0 here
            dcg_score += 1.0 / np.log(i + 2)
    return dcg_score

def ndcg(relevant_set, pred_order, k=None):
    """Normalised DCG of ``pred_order`` with binary relevance.

    The ideal DCG is the DCG of the relevant words themselves, cut at the
    same ``k``.

    Parameters
    ----------
    relevant_set : iterable of str
        Words considered relevant.
    pred_order : sequence of str
        Predicted words, best first.
    k : int or None
        Cut-off passed to :func:`dcg` for both the predicted and ideal list.

    Returns
    -------
    float
        ``dcg / ideal_dcg``. Returns 0.0 when there are no predictions or
        when the ideal DCG is 0 (no relevant words, or ``k == 0``), instead
        of dividing by zero.
    """
    if len(pred_order) == 0:
        return 0.0
    # The ideal ranking lists the relevant words first; materialise it so
    # that it is indexable even if a set was passed in.
    ideal_order = list(relevant_set)
    idcg_val = dcg(relevant_set, ideal_order, k=k)
    if idcg_val == 0.0:
        return 0.0
    dcg_val = dcg(relevant_set, pred_order, k=k)
    return float(dcg_val) / idcg_val

print(ndcg(["alma","korte"], ["szilva","alma"], k=1))
print(ndcg(["alma","korte"], ["korte","alma"], k=1))

In [None]:
def get_ndcg_for_relevant_record(rel_rec, time_id, score_col, top_k=None, general_words=None):
    """Compute the NDCG of the top list of one key word at one snapshot.

    Only the opponent's words (plus the optional general words) are treated
    as relevant; the player's own "key_relevant_words" are not used.

    Parameters
    ----------
    rel_rec : mapping
        Record with the keys "date", "key_word" and "opp_relevant_words"
        (a row of ``relevant_df`` in this notebook).
    time_id : str
        Time of day; combined with the record's date into the snapshot id
        "<date>T<time_id>".
    score_col : str
        Column of ``pair_occs_df`` handed to ``et.get_toplist`` as ``score_col``.
    top_k : int or None
        Cut-off forwarded to ``ndcg``.
    general_words : list of str or None
        Extra words counted as relevant for every record.

    Returns
    -------
    tuple
        ``(snapshot_id, date, time_id, score_col, key_word, ndcg_score)``.
    """
    snapshot_id = "%sT%s" % (rel_rec["date"], time_id)
    # Work on a copy: extending the list stored in rel_rec in place would make
    # the general words pile up across repeated calls on the same record.
    relevant_words = list(rel_rec["opp_relevant_words"])
    if general_words is not None:
        relevant_words.extend(general_words)
    # NOTE(review): presumably get_toplist returns the rows for the key word at
    # this snapshot ordered by score_col - confirm against rg17.evaluate_toplist.
    pred_words = list(et.get_toplist(pair_occs_df, [rel_rec["key_word"]], [snapshot_id], score_col=score_col)["word_2"])
    ndcg_score = ndcg(relevant_words, pred_words, k=top_k)
    return (snapshot_id, rel_rec["date"], time_id, score_col, rel_rec["key_word"], ndcg_score)

# Experimental Scores

In [None]:
pair_occs_df.head(2)

## a.) occ_score_3

In [None]:
pair_occs_df["occ_score_3"] = pair_occs_df["count"] + pair_occs_df["occ_score"]

## b.) occ_score_4

In [None]:
def rayleigh(x, s=1.0):
    """Rayleigh probability density with scale ``s`` evaluated at ``x``.

    f(x; s) = x / s**2 * exp(-x**2 / (2 * s**2)), which peaks at ``x == s``.

    Parameters
    ----------
    x : float or numpy array
        Point(s) at which to evaluate the density (expected to be >= 0).
    s : float
        Scale parameter of the distribution.

    Returns
    -------
    float or numpy array
        Density value(s), same shape as ``x``.
    """
    var = s**2
    # The exponent is quadratic in x; a linear exponent is not a Rayleigh density.
    return x / var * np.exp(-(x ** 2) / (2.0 * var))

In [None]:
x = np.arange(0,1,0.01)
y = rayleigh(x, s=0.35)
plt.plot(x,y)

### Average fraction for word_2

In [None]:
word_2_counts = pair_occs_df.groupby(by=["word_2"])["count"].mean()

In [None]:
word_2_counts.mean()

In [None]:
word_2_counts["@simonahalep"], word_2_counts["simona"], word_2_counts["halep"]

In [None]:
word_2_counts["@rafaelnadal"], word_2_counts["rafael"], word_2_counts["nadal"]

In [None]:
word_2_counts["win"], word_2_counts["champion"], word_2_counts["defeat"]

In [None]:
word_2_counts["men"], word_2_counts["women"]

#### Based on the values above, s=0.35 seems to be a reasonable choice

In [None]:
# Scale parameter passed to rayleigh(); it is also embedded in the name of the new column.
s_val = 0.35
# For every pair, look up the mean "count" of its word_2 (word_2_counts is indexed by word_2)
# and store rayleigh() of that mean as the pair's weight.
pair_occs_df["rayleigh_%.2f" % s_val] = pair_occs_df["word_2"].apply(lambda x: rayleigh(word_2_counts[x], s=s_val))

### occ_score_4 computation

In [None]:
# Derive the weight column name from s_val, the same way it was created, so that
# changing s_val cannot silently pick up a stale "rayleigh_0.35" column.
rayleigh_col = "rayleigh_%.2f" % s_val
# occ_score_4: co-occurrence count multiplied by the weight of word_2.
pair_occs_df["occ_score_4"] = pair_occs_df["count"] * pair_occs_df[rayleigh_col]
# occ_score_5: co-occurrence count plus the weight of word_2.
pair_occs_df["occ_score_5"] = pair_occs_df["count"] + pair_occs_df[rayleigh_col]

# Results

In [None]:
import sys, os, multiprocessing, functools

#### General words can improve results

In [None]:
def get_ndcg_single_thread(top_k, row, general_words, param_tuple):
    """Worker entry point: score one relevant record for one (time, score column) pair.

    The first three arguments are meant to be bound with ``functools.partial``
    so that the function can be mapped over the parameter tuples.

    NOTE: ``top_k`` is accepted but not forwarded, so the NDCG is computed
    without a cut-off.

    Parameters
    ----------
    top_k : int or None
        Currently unused (see note above).
    row : mapping
        Relevant record handed to ``get_ndcg_for_relevant_record``.
    general_words : list of str or None
        Extra words counted as relevant.
    param_tuple : tuple
        ``(time_id, score_col)``.

    Returns
    -------
    tuple
        Whatever ``get_ndcg_for_relevant_record`` returns.
    """
    (time_of_day, score_name) = param_tuple
    return get_ndcg_for_relevant_record(
        row,
        time_of_day,
        score_name,
        general_words=general_words,
    )

def get_ndcg_from_threads(top_k, time_ids, score_cols, general_words, n_threads=1):
    """Evaluate every record of ``relevant_df`` for every (time, score column) pair.

    Parameters
    ----------
    top_k : int or None
        Passed on to ``get_ndcg_single_thread``.
    time_ids : list of str
        Times of day to evaluate.
    score_cols : list of str
        Score columns to evaluate.
    general_words : list of str or None
        Extra words counted as relevant for every record.
    n_threads : int
        Number of worker processes in the pool.

    Returns
    -------
    pandas.DataFrame
        One row per (record, time, score column) with the columns
        "snapshot_id", "date", "time", "score_id", "key_word" and "ndcg".
    """
    param_tuples = [(time_id, score_col) for time_id in time_ids for score_col in score_cols]
    ndcg_info_list = []
    # One pool serves all rows: starting and joining a pool per row is pure overhead.
    pool = multiprocessing.Pool(processes=n_threads)
    try:
        for idx, row in relevant_df.iterrows():
            f_partial = functools.partial(get_ndcg_single_thread, top_k, row, general_words)
            ndcg_info_list.extend(pool.map(f_partial, param_tuples))
    finally:
        # Release the worker processes even if one of the tasks raised.
        pool.close()
        pool.join()
    ndcg_df = pd.DataFrame(ndcg_info_list, columns=["snapshot_id","date","time","score_id","key_word","ndcg"])
    return ndcg_df

In [None]:
%%time
# Evaluate every relevant record at eight times of day (every 3 hours) for all score columns.
# NOTE: the first argument (top_k=20) currently has no effect, because
# get_ndcg_single_thread does not forward it; NDCG is computed without a cut-off.
general_words = ["play","match"]
time_ids = ["%.2i:00" % t for t in [0,3,6,9,12,15,18,21]]
score_cols = ["count","occ_score","occ_score_2","occ_score_3","occ_score_4","occ_score_5"]
ndcg_df = get_ndcg_from_threads(20, time_ids, score_cols, general_words, n_threads=20)

In [None]:
len(ndcg_df)

In [None]:
ndcg_df.head()

## i.) Compare co-occurrence scores for date

   * the baseline score is the best (the fraction of co-occurrences)
   * occ_score_2 gives similar results...

In [None]:
sns.factorplot(data=ndcg_df, x="date", y="ndcg", hue="score_id", size=8)

In [None]:
#score_col = "occ_score_5"
score_col = "count"
et.get_toplist(pair_occs_df, ["simona"], ["2017-06-10T12:00"], score_col=score_col)[["word_1","word_2",score_col]]

pair_occs_df[pair_occs_df["date"] == "2017-06-10"]["word_1"].value_counts()

## ii.)  Compare co-occurrence scores for time of day

In [None]:
sns.factorplot(data=ndcg_df, x="time", y="ndcg", hue="score_id", size=8)

### We should evaluate at 9:00 AM (UTC -> 11:00 AM UTC+2) - it gives the most stable performance

score_2 = ndcg_df[ndcg_df["score_id"] == "occ_score_2"]
sns.factorplot(data=score_2, x="date", y="ndcg", hue="time", size=8)

# difference between players - Huge differences :(

In [None]:
key_words = ["nadal","wawrinka","ostapenko","halep","murray","djokovic"]
score_filtered = ndcg_df[ndcg_df["key_word"].isin(key_words)]
sns.factorplot(data=score_filtered, x="date", y="ndcg", hue="key_word", size=8)

### Some user accounts have very low performance - Maybe we should only examine name parts???

In [None]:
key_words = ["@rafaelnadal","@stanwawrinka","@ostapenkofc","@simonahalep","@andymurray","@djokernole"]
score_filtered = ndcg_df[ndcg_df["key_word"].isin(key_words)]
sns.factorplot(data=score_filtered, x="date", y="ndcg", hue="key_word", size=8)