In [None]:
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
import sys
sys.path.insert(0,"../../python/")
from rg17 import evaluate_toplist as et

In [None]:
%matplotlib inline
import seaborn as sns

In [None]:
from datawand.parametrization import ParamHelper
ph = ParamHelper("../../pipelines/TrendApproximation.json", sys.argv)

In [None]:
# Pipeline parameters, all read through the ParamHelper instance `ph`.
player_name_with_account_file_path = ph.get("player_name_with_accounts_file_path")
schedule_file_path = ph.get("schedule_file_path")
w2v_model_dir = ph.get("w2v_root_folder")
experiment_id = ph.get("experiment_id")
TIME_HOUR_VALS = ph.get("time_hour_vals")
RELEVANCE_TYPE = ph.get("relevance_type")
# NOTE: hard-coded value; the pipeline parameter lookup is disabled (kept in the trailing comment).
RELEVANCE_SUBSET = "positive" #ph.get("relevance_subset")

# Load Player Accounts

In [None]:
with open(player_name_with_account_file_path) as f:
    player_account_map = json.load(f)

In [None]:
# Manual overrides of the loaded player -> account list mapping.
# Each assignment replaces whatever the JSON file contained for that player.

# Players needed for the finals
player_account_map["Stan Wawrinka"] = ["stanwawrinka"]
player_account_map["Novak Djokovic"] = ["DjokerNole"]
player_account_map["Caroline Garcia"] = ["CaroGarcia"]
player_account_map["Caroline Wozniacki"] = ["CaroWozniacki"]
player_account_map["Marin Cilic"] = ["cilic_marin"]
player_account_map["Kristina Mladenovic"] = ["KikiMladenovic"]
player_account_map["Dominic Thiem"] = ["ThiemDomi"]
player_account_map["Rafael Nadal"] = ["RafaelNadal"]
player_account_map["Timea Bacsinszky"] = ["TimeaOfficial"]
player_account_map["Pablo Carreno Busta"] = ["pablocarreno91"]
player_account_map["Simona Halep"] = ["Simona_Halep"]
player_account_map["Andy Murray"] = ["andy_murray"]
# All other players
player_account_map["Tommy Robredo"] = ['TRobredo']
player_account_map["Sebastien Grosjean"] = ['sebboca29']
player_account_map["Mona Barthel"] = ['BarthelMona']
player_account_map["Arnaud Clement"] = ['arnaudclement']
player_account_map["Anett Kontaveit"] = ['Vamosanett']#'@AnettKontaveit'
player_account_map["David Goffin"] = ['David__Goffin']
player_account_map["Audrey Albie"] = ['DreyAlbie']
player_account_map["Jo-Wilfried Tsonga"] = ['tsonga7']

# TODO: add this entry to the player matches .json files; alternative handles are in the trailing comment.
player_account_map["Ernests Gulbis"] = ["egulbisfans"]#['@ernestgulbis', '@ErnestsGulbisFC']

player_account_map["Petra Martic"] = ['PetraMartic1991']
player_account_map["Venus Williams"] = ['Venuseswilliams']
player_account_map["Marion Bartoli"] = ['bartoli_marion']
player_account_map["Francesca Schiavone"] = ['Schiavone_Fra']
player_account_map["Garbiñe Muguruza"] = ['GarbiMuguruza']
player_account_map["Fabio Fognini"] = ['fabiofogna']
player_account_map["Elise Mertens"] = ['elise_mertens']
player_account_map["Borna Coric"] = ['borna_coric']
player_account_map["Camila Giorgi"] = ['CamilaGiorgi_it']
player_account_map["Nikoloz Basilashvili"] = ['NikaBasil']

# TODO: add this entry to the player matches .json files; alternative handles are in the trailing comment.
player_account_map["Alexander Zverev"] = ["FanZverev"] #['@saschazverev123', '@AlexZverev123', '@zverevtennis']

player_account_map["Dustin Brown"] = ['DreddyTennis']

# TODO: add this entry to the player matches .json files; alternative handles are in the trailing comment.
player_account_map["Donald Young"] = ['Yimlife1313'] #['@DonaldYoungUSA', '@DonaldYoungATP', '@DonaldYoung']

player_account_map["Martina Hingis"] = ['mhingis']

# No account assigned (empty list); unclear whether one exists -- candidate handles are in the trailing comment.
player_account_map["Andrey Kuznetsov"] = []#['@AKandreyln', '@AndreyKuznetsov']

player_account_map["Frances Tiafoe"] = ['FTiafoe']
player_account_map["Gael Monfils"] = ['Gael_Monfils']#, '@gmonfils']

# No account assigned (empty list); unclear whether one exists -- candidate handles are in the trailing comment.
player_account_map["Bernard Tomic"] = []#['@BTomicOfficial', '@BernardTomicAU', '@BernardTomicFC']

player_account_map["Benoit Paire"] = ['benoitpaire']
player_account_map["Angelique Kerber"] = ['AngeliqueKerber']

# Load Schedule

In [None]:
schedule_df = pd.read_csv(schedule_file_path, sep="|")

In [None]:
excluded_categories = ["boy", "girl", "wheelchair", "legends over 45"]

## Convert start times to UTC hours for proper evaluation

In [None]:
schedule_df["startDate"].value_counts()

In [None]:
# Maps each start-time label found in schedule_df["startDate"] to a UTC hour.
# Every entry equals the labelled hour minus two; labels that are not on the
# full hour are rounded up to the next hour after the shift.
utc_hour_map = {
    "11:00 AM" : 9,
    "10:00 AM" : 8,
    "12:00 PM" : 10,
    "2:00 PM" : 12,
    "11:30 AM" : 10, # hour was rounded up
    "3:00 PM" : 13,
    "12:45 PM" : 11 # hour was rounded up
}

In [None]:
schedule_df["utc_start_hour"] = schedule_df["startDate"].apply(lambda x: utc_hour_map[x])

In [None]:
schedule_df["utc_start_hour"].value_counts()

In [None]:
schedule_df.head()

# Filter Schedule

   * only singles matches are kept
   * only important categories are kept (Men's, Women's, Legends under 45)

In [None]:
def filter_categories(match_cat, excluded_cats=excluded_categories):
    """Tell whether a match header should be kept.

    Returns False as soon as any entry of `excluded_cats` occurs as a substring
    of the lower-cased `match_cat`, and True otherwise.
    """
    header = match_cat.lower()
    return not any(excluded in header for excluded in excluded_cats)

In [None]:
matches_df = schedule_df[schedule_df["matchHeader"].apply(filter_categories)]

In [None]:
matches_df = matches_df[matches_df["date"] > "2017-05-27"]

In [None]:
# For every match day, collect the distinct players that appear on either side of a match.
dates = matches_df["date"].unique()
daily_tennis_players = {}
for d in dates:
    daily_df = matches_df.loc[matches_df["date"] == d]
    active_side = set(daily_df["playerName active"])
    opponent_side = set(daily_df["playerName opponent"])
    daily_players = list(active_side | opponent_side)
    daily_tennis_players[d] = daily_players

In [None]:
# Row counts before and after the category/date filtering.
# (The original displayed len(matches_df) twice, which carries no information.)
len(schedule_df), len(matches_df)

# Player name parts

In [None]:
players = list(set(matches_df["playerName active"]).union(matches_df["playerName opponent"]))

In [None]:
len(players)

In [None]:
import re

# Names are split on runs of whitespace, commas and hyphens.
# A raw string is used because "\s" in a plain literal is an invalid escape
# sequence that newer Python versions warn about; the pattern is unchanged.
name_splitter = re.compile(r"[\s,-]+")

# player name -> {"name_parts": lower-cased name tokens,
#                 "accounts": "@"-prefixed account names, or None when the
#                             player is absent from player_account_map}
player_info_map = {}
players_without_account = []
for player in players:
    name_parts = [p.lower() for p in name_splitter.split(player)]
    if player in player_account_map:
        # NOTE(review): a player mapped to an empty list gets accounts == [] and is
        # NOT added to players_without_account -- confirm that this is intended.
        accounts = ["@" + et.transform_account_name(a, remove_digits=False, remove_under_score=False, to_lower=False) for a in player_account_map[player]]
    else:
        accounts = None
        players_without_account.append(player)
    player_info_map[player] = {"name_parts": name_parts, "accounts": accounts}

In [None]:
len(players_without_account)

# TODO: Should we exclude player accounts from evaluation now???

for key in player_info_map:
    print(key)

In [None]:
player_info_map["Garbiñe Muguruza"]["name_parts"] = ['garbine', 'muguruza']

## Show multi-account players

#### Due to the pre-filtering there is no duplication

In [None]:
# Print every player that is mapped to more than one account.
for player, info in player_info_map.items():
    player_accounts = info["accounts"]
    if player_accounts is None:
        continue
    if len(player_accounts) > 1:
        print('player_account_map["%s"] =' % player, player_accounts)

# Relevant players per day

In [None]:
def list2relevance(values, relevance):
    """Build a {value: relevance} dictionary assigning the same relevance to every value.

    The relevance is stored as the product of `relevance` and a NumPy array of
    ones, so the dictionary values are NumPy scalars. Duplicate values
    collapse into a single key.
    """
    weights = relevance * np.ones(len(values))
    return {word: weight for word, weight in zip(values, weights)}

In [None]:
key_words = ["play","match"]
time_hour_vals = ph.get('time_hour_vals')

In [None]:
# One record per (date, hour, key word); every record of a day references the
# same two relevance dictionaries (name parts and accounts of that day's players).
relevant_arr = []
for d in dates:
    daily_player_names, daily_player_accounts = [], []
    for p in daily_tennis_players[d]:
        p_info = player_info_map[p]
        daily_player_names.extend(p_info["name_parts"])
        if p_info["accounts"] is not None:
            daily_player_accounts.extend(p_info["accounts"])
    daily_player_names_map = list2relevance(daily_player_names, 1)
    daily_player_accounts_map = list2relevance(daily_player_accounts, 1)
    relevant_arr.extend(
        [d, "%.2i:00" % h, kw, daily_player_names_map, daily_player_accounts_map]
        for kw in key_words
        for h in time_hour_vals
    )
relevant_df = pd.DataFrame(relevant_arr, columns=["date", "time", "key_word", "names_parts", "accounts"])

In [None]:
relevant_df.head()

# Co-occurrences

In [None]:
# Load the "|"-separated co-occurrence score table of the current experiment.
# NOTE(review): the directory is an absolute, machine-specific path; only the
# experiment id is parameterized.
pair_occs_df = pd.read_csv("/mnt/idms/fberes/network/combined_occ/occ_scores/%s_with_scores.csv" % experiment_id, sep="|")

et.get_toplist(pair_occs_df, ["play"], ["2017-06-06T13:00"], score_col="rel_count_0")

In [None]:
pair_occs_df.head()

In [None]:
word_2_set = set(pair_occs_df["word_2"].unique())
word_1_set = set(pair_occs_df["word_1"].unique())
len(word_1_set), len(word_2_set)

In [None]:
def show_missing_words(info_key, word_set):
    """Report which entries of each player's `info_key` list are absent from `word_set`.

    Prints one line per player that has at least one missing entry, followed
    by the number of such players. Reads the global `player_info_map`.

    Parameters
    ----------
    info_key : str
        Key of the per-player info dictionary (e.g. "name_parts" or "accounts").
    word_set : set
        Collection of known words to compare against.
    """
    num_missing = 0
    for player, info in player_info_map.items():
        # "accounts" is None for players without a known account; treat that as
        # "no entries" instead of failing on set(None).
        entries = info[info_key] or []
        diff = list(set(entries).difference(word_set))
        if len(diff) != 0:
            num_missing += 1
            print("%s: %s missing!" % (player, diff))
    print(num_missing)
            
def set_matching_words(info_key, word_set):
    """Restrict each player's `info_key` list to the entries contained in `word_set`.

    Overwrites `player_info_map[player][info_key]` in place with the matching
    entries and prints the number of players left without any match.

    Parameters
    ----------
    info_key : str
        Key of the per-player info dictionary (e.g. "name_parts" or "accounts").
    word_set : set
        Collection of known words to intersect with.
    """
    num_missing = 0
    for player, info in player_info_map.items():
        entries = info[info_key]
        if entries is None:
            # "accounts" is None for players without a known account: keep the
            # None marker (other cells test for it) and count the player as
            # missing instead of failing on set(None).
            num_missing += 1
            continue
        match = list(set(entries).intersection(word_set))
        player_info_map[player][info_key] = match
        if len(match) == 0:
            num_missing += 1
    print(num_missing)

## a.) Checking names (All names are present)

### Formerly

   * 160 players are missing some words
   * Most of them are missing only the first name, because name parts were not included in the top important words!
   * Only 72 players have no name part mentioned.
   
### Updated

   * 67 players are missing some words
   * 17 players have no name part mentioned.

In [None]:
show_missing_words("name_parts", word_2_set)

In [None]:
player_info_map["Katerina Siniakova"]["accounts"] = ["@SiniakovaSquad"] # "Siniakova" is in rg17_tweets_eng.csv AND eng_stemmed !!!
player_info_map["Matwe Middelkoop"]["accounts"] = ["@Mside83"] # !!! 
player_info_map["Edouard Roger-Vasselin"]["accounts"] = ["@ERogerVasselin"] # 'edouard' not found BUT 'Edouard' is present!!! WHY???
# Nikola Mektic 'mektic' word is in the data!!!

In [None]:
player_info_map

In [None]:
set_matching_words("name_parts", word_2_set)

In [None]:
et.get_toplist(pair_occs_df, ["match"], ["2017-06-01T07:00"], score_col="snapshot_val")

# Experimental Scores

In [None]:
len(pair_occs_df)

### word_2 frequencies

In [None]:
stat_cols = ["global_val","snapshot_val"]

In [None]:
word_2_stats = pair_occs_df.groupby(by=["word_2"])[stat_cols].mean()
word_2_counts = pair_occs_df.groupby(by=["word_2"])["date"].count()
word_2_counts_norm = word_2_counts / word_2_counts.max()

In [None]:
# Order of magnitude of (count / 5): floor(log10(count / 5)).
# np.log10 is used instead of np.log(x) / np.log(10) because the quotient can
# land just below an exact integer (e.g. x = 1000 gives 2.9999999999999996),
# which makes floor() return one less than intended.
freq_factor = np.floor(np.log10(word_2_counts / 5))

In [None]:
# frequency_val of a row is 1 / freq_factor of its word_2, or 0.0 when that
# factor is below 1 (which also avoids dividing by zero or a negative factor).
pair_occs_df["frequency_val"] = pair_occs_df["word_2"].apply(lambda x: 0.0 if freq_factor[x] < 1 else 1.0 / freq_factor[x])

### Calculate normalization coefficient

In [None]:
snapshot_weight = ph.get("snapshot_weight")
frequency_weight = ph.get("frequency_weight")
print(snapshot_weight, frequency_weight)

In [None]:
# "r" is a weighted sum of the three scores: snapshot_val and frequency_val get
# their configured weights and global_val gets the remainder (1 - both weights).
pair_occs_df["r"] = (1.0 - (snapshot_weight + frequency_weight)) * pair_occs_df["global_val"] + snapshot_weight * pair_occs_df["snapshot_val"] + frequency_weight * pair_occs_df["frequency_val"]

## a.) rel_count_c

In [None]:
score_vals = ph.get("score_c_vals")
print(score_vals)

In [None]:
def rel_count_calculator(row, c=0.0):
    """Return the smoothed relative count of one record.

    Computes (word_2_count + r * c) / (word_1_count + c), where the three
    quantities are read from `row` by key and `c` is the smoothing constant.
    """
    smoothed_count = row["word_2_count"] + row["r"] * c
    smoothed_total = row["word_1_count"] + c
    return smoothed_count / smoothed_total

In [None]:
%%time
for c in score_vals:
    pair_occs_df["rel_count_c%i" % c] = pair_occs_df.apply(lambda x: rel_count_calculator(x, c=c), axis=1)
    print(c)

In [None]:
pair_occs_df["rel_count_c5"].hist(bins=100)

## b.) norm_c

In [None]:
%%time
for c in score_vals:
    pair_occs_df["norm_c%i" % c] = pair_occs_df["rel_count_c%i" % c] / pair_occs_df["r"]
    print(c)

In [None]:
pair_occs_df["norm_c5"].hist(bins=100)

# Load Word2Vec models

In [None]:
w2v_models = et.load_w2v_models("%s/dim_%i/" % (w2v_model_dir, ph.get("w2v_model_dim")))

# Load Jaccard and Cosine distances

In [None]:
distance_root_folder = ph.get("distance_root_folder")
jaccard_distances = et.load_distance_model("%s/jaccard.dist" % distance_root_folder)
cosine_distances = et.load_distance_model("%s/cosine.dist" % distance_root_folder)

# Results

In [None]:
def get_ndcg_for_relevant_record(rel_rec, rel_cols, score_col, exclude_player_words=True, top_k=None, general_words=None, verbose=False):
    """Compute the NDCG of one predicted toplist against one relevance record.

    Parameters
    ----------
    rel_rec : mapping (e.g. a row of `relevant_df`)
        Must provide "date", "time" and "key_word", plus one relevance
        dictionary under every name listed in `rel_cols`. An optional
        "key_exclude_words" entry lists words to drop from the toplist.
    rel_cols : iterable of str
        Keys of `rel_rec` whose relevance dictionaries are merged.
    score_col : str
        "word_2_vec", "jaccard" or "cosine" select the corresponding model;
        any other value is used as a score column of `pair_occs_df`.
    exclude_player_words : bool
        When True and `rel_rec` has "key_exclude_words", those words are
        passed to the toplist function as excluded words.
    top_k : int or None
        Toplist length and NDCG cut-off.
    general_words : dict or None
        Additional relevance dictionary; it is merged last, so it overrides
        the relevance of words that also occur in `rel_cols`.
    verbose : bool
        Print the predicted and the relevant words.

    Returns
    -------
    tuple
        (snapshot_id, date, time, score_col, key_word, ndcg_score)
    """
    time_id, key_word = rel_rec["time"], rel_rec["key_word"]
    snapshot_id = "%sT%s" % (rel_rec["date"], time_id)
    # define relevant words
    relevant_words = dict()
    for rc in rel_cols:
        relevant_words.update(rel_rec[rc])
    if general_words is not None:
        relevant_words.update(general_words)
    # define words to be excluded from the toplist
    if exclude_player_words and "key_exclude_words" in rel_rec:
        to_be_excluded = rel_rec["key_exclude_words"]
    else:
        to_be_excluded = None
    # get toplist
    if score_col == "word_2_vec":
        pred_words = list(et.get_w2v_toplist(w2v_models, [key_word], [snapshot_id], top_k=top_k, excluded_words=to_be_excluded)["word_2"])
    elif score_col == "jaccard":
        pred_words = list(et.get_distance_toplist(jaccard_distances, [key_word], [snapshot_id], top_k=top_k, excluded_words=to_be_excluded)["word_2"])
    elif score_col == "cosine":
        pred_words = list(et.get_distance_toplist(cosine_distances, [key_word], [snapshot_id], top_k=top_k, excluded_words=to_be_excluded)["word_2"])
    else:
        # NOTE(review): unlike the three branches above, top_k is not forwarded
        # here -- confirm whether et.get_toplist is meant to be truncated as well.
        pred_words = list(et.get_toplist(pair_occs_df, [key_word], [snapshot_id], score_col=score_col, excluded_words=to_be_excluded)["word_2"])
    if verbose:
        print(pred_words)
        print(relevant_words)
    ndcg_score = et.ndcg(relevant_words, pred_words, k=top_k)
    return (snapshot_id, rel_rec["date"], time_id, score_col, key_word, ndcg_score)

In [None]:
import multiprocessing, functools

def get_ndcg_single_thread(top_k, row, rel_cols, general_words, score_col):
    """Evaluate one relevance record for one score column.

    `score_col` is the last positional parameter so that all other arguments
    can be pre-bound with functools.partial and the result mapped over a list
    of score columns.
    """
    return get_ndcg_for_relevant_record(
        rel_rec=row,
        rel_cols=rel_cols,
        score_col=score_col,
        top_k=top_k,
        general_words=general_words,
    )

def get_ndcg_from_threads(top_k, rel_cols, relevant_df, time_ids, score_cols, general_words, n_threads=1):
    """Compute NDCG for every (relevance record, score column) pair.

    Parameters
    ----------
    top_k : int or None
        Toplist length / NDCG cut-off passed on to the per-record evaluation.
    rel_cols : list of str
        Columns of `relevant_df` that hold relevance dictionaries.
    relevant_df : pandas.DataFrame
        Relevance records; only rows whose "time" is in `time_ids` are used.
    time_ids : list of str
        Accepted values of the "time" column.
    score_cols : list of str
        Score identifiers to evaluate for each record.
    general_words : dict or None
        Extra relevance dictionary applied to every record.
    n_threads : int
        Number of worker processes. Values of 1 or less run serially.

    Returns
    -------
    pandas.DataFrame
        Columns: snapshot_id, date, time, score_id, key_word, ndcg.
    """
    print(len(relevant_df))
    filtered_relevant_df = relevant_df[relevant_df["time"].isin(time_ids)]
    print(len(filtered_relevant_df))
    ndcg_info_list = []
    if n_threads <= 1:
        for _, row in filtered_relevant_df.iterrows():
            for score_col in score_cols:
                ndcg_info_list.append(get_ndcg_single_thread(top_k, row, rel_cols, general_words, score_col))
    else:
        print("Calculating NDCG on %i threads" % n_threads)
        # The pool is created once and reused for all rows; spawning a new
        # pool per row only adds process start-up cost.
        pool = multiprocessing.Pool(processes=n_threads)
        try:
            for _, row in filtered_relevant_df.iterrows():
                f_partial = functools.partial(get_ndcg_single_thread, top_k, row, rel_cols, general_words)
                ndcg_info_list += pool.map(f_partial, score_cols)
        finally:
            # Release the worker processes even if an evaluation fails.
            pool.close()
            pool.join()
    ndcg_df = pd.DataFrame(ndcg_info_list, columns=["snapshot_id","date","time","score_id","key_word","ndcg"])
    return ndcg_df

## Setting general words

In [None]:
# Relevance assigned to tournament-generic words in the "discriminative" setting.
COMMON_WORD_RELEVANCE = -1.0
general_words = {}

if RELEVANCE_SUBSET == "discriminative":
    common_words = [
        "rg17", "rg2017", "rolandgarros", "roland", "garros", "rolandgarros2017",
        "frenchopen", "french", "open", "clay", "slam", "set", "round",
    ]
    general_words.update(dict.fromkeys(common_words, COMMON_WORD_RELEVANCE))
print(general_words)

## Setting score types

In [None]:
# Baseline models first, then one rel_count and one norm column per configured c value.
# Disabled variants, kept for reference:
#score_cols += ["rel_count_c%i_plus_ray" % i for i in [0,1,2,5,10]]
#score_cols += ["norm_c%i_plus_ray" % i for i in [0,1,2,5,10]]
score_cols = (
    ["word_2_vec", "jaccard", "cosine"]
    + ["rel_count_c%i" % c_val for c_val in score_vals]
    + ["norm_c%i" % c_val for c_val in score_vals]
)
print(score_cols)

## Setting times of day

In [None]:
#time_ids = ["%.2i:00" % t for t in TIME_HOUR_VALS]
time_ids = ["%.2i:00" % t for t in [4,7,10,13,16,19]]
time_ids

## Calculate NDCG in parallel

In [None]:
%%time
ndcg_df = get_ndcg_from_threads(100, ["names_parts","accounts"], relevant_df, time_ids, score_cols, general_words, n_threads=10)

In [None]:
len(ndcg_df)

In [None]:
ndcg_df.head()

# Mean NDCG performance for score types

In [None]:
ndcg_df.groupby(by="score_id")["ndcg"].mean().sort_values(ascending=False)

In [None]:
ndcg_for_plots = ndcg_df[ndcg_df["score_id"].isin(["word_2_vec","cosine","jaccard","rel_count_c5","norm_c5"])]

In [None]:
paper_rc = {'lines.linewidth': 5,'lines.markersize': 20}              
sns.set_context("paper", rc = paper_rc, font_scale = 4.25)
#sns.set_style("whitegrid")
#sns.set(font="DejaVu Sans")

## i.) Compare co-occurrence scores by date

In [None]:
g = sns.factorplot(data=ndcg_for_plots, x="date", y="ndcg", hue="score_id", size=10, aspect=3)
g.set_xticklabels(rotation=90)

## ii.)  Compare co-occurrence scores by time of day

In [None]:
sns.factorplot(data=ndcg_for_plots, x="time", y="ndcg", hue="score_id", size=10, aspect=3)

## iii.) Difference between the key words

In [None]:
def show_player_perf(key_words, score_id="norm_c5", dates=None):
    """Plot NDCG per snapshot for the given key words, one line per key word.

    Reads the global `ndcg_for_plots` frame and draws a seaborn factor plot
    with rotated x tick labels.

    Parameters
    ----------
    key_words : list of str
        Values of the "key_word" column to compare.
    score_id : str
        Value of the "score_id" column to plot (default "norm_c5").
    dates : list of str or None
        Values of the "date" column to include. None selects the default range
        2017-05-28 .. 2017-06-03; an alternative used before was
        ["2017-06-08","2017-06-09","2017-06-10","2017-06-11"].
    """
    if dates is None:
        dates = ["2017-05-28","2017-05-29","2017-05-30","2017-05-31","2017-06-01","2017-06-02","2017-06-03"]
    selected = (
        ndcg_for_plots["key_word"].isin(key_words)
        & (ndcg_for_plots["score_id"] == score_id)
        & ndcg_for_plots["date"].isin(dates)
    )
    score_filtered = ndcg_for_plots[selected]
    g = sns.factorplot(data=score_filtered, x="snapshot_id", y="ndcg", hue="key_word", size=10, aspect=3)
    g.set_xticklabels(rotation=90)

In [None]:
show_player_perf(["match","play"])