In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
import sys
sys.path.insert(0,"../../python/")
from rg17 import evaluate_toplist as et

In [None]:
from datawand.parametrization import ParamHelper
ph = ParamHelper("../../pipelines/TrendApproximation.json", sys.argv)

In [None]:
# Resolve every notebook input from the pipeline parametrization in one pass.
# The assignment targets are listed in the same order as the parameter keys.
(
    player_name_with_account_file_path,
    schedule_file_path,
    w2v_model_dir,
    experiment_id,
    TIME_HOUR_VALS,
) = [
    ph.get(param_key)
    for param_key in (
        "player_name_with_accounts_file_path",
        "schedule_file_path",
        "w2v_root_folder",
        "experiment_id",
        "time_hour_vals",
    )
]

# Load Player Accounts

In [None]:
# Load the player name -> account list mapping.
# The encoding is pinned to UTF-8 (the standard encoding for JSON text) so that
# names with non-ASCII characters decode the same way on every platform instead
# of depending on the locale's default encoding.
with open(player_name_with_account_file_path, encoding="utf-8") as f:
    player_account_map = json.load(f)

In [None]:
# Manual account overrides, applied on top of the mapping loaded from JSON.
# Each value is a list of account handles (the active entries carry no leading
# "@"); an empty list is used where no account could be settled on.
# Alternative handles that are not in use are kept next to the entry as comments.
player_account_map.update({
    # for finals
    "Stan Wawrinka": ["stanwawrinka"],
    "Novak Djokovic": ["DjokerNole"],
    "Caroline Garcia": ["CaroGarcia"],
    "Caroline Wozniacki": ["CaroWozniacki"],
    "Marin Cilic": ["cilic_marin"],
    "Kristina Mladenovic": ["KikiMladenovic"],
    "Dominic Thiem": ["ThiemDomi"],
    "Rafael Nadal": ["RafaelNadal"],
    "Timea Bacsinszky": ["TimeaOfficial"],
    "Pablo Carreno Busta": ["pablocarreno91"],
    "Simona Halep": ["Simona_Halep"],
    "Andy Murray": ["andy_murray"],

    # for others
    "Tommy Robredo": ["TRobredo"],
    "Sebastien Grosjean": ["sebboca29"],
    "Mona Barthel": ["BarthelMona"],
    "Arnaud Clement": ["arnaudclement"],
    "Anett Kontaveit": ["Vamosanett"],  # unused alternative: '@AnettKontaveit'
    "David Goffin": ["David__Goffin"],
    "Audrey Albie": ["DreyAlbie"],
    "Jo-Wilfried Tsonga": ["tsonga7"],

    # TODO: include in player matches .json files!!!
    # unused alternatives: '@ernestgulbis', '@ErnestsGulbisFC'
    "Ernests Gulbis": ["egulbisfans"],

    "Petra Martic": ["PetraMartic1991"],
    "Venus Williams": ["Venuseswilliams"],
    "Marion Bartoli": ["bartoli_marion"],
    "Francesca Schiavone": ["Schiavone_Fra"],
    "Garbiñe Muguruza": ["GarbiMuguruza"],
    "Fabio Fognini": ["fabiofogna"],
    "Elise Mertens": ["elise_mertens"],
    "Borna Coric": ["borna_coric"],
    "Camila Giorgi": ["CamilaGiorgi_it"],
    "Nikoloz Basilashvili": ["NikaBasil"],

    # TODO: include in player matches .json files!!!
    # unused alternatives: '@saschazverev123', '@AlexZverev123', '@zverevtennis'
    "Alexander Zverev": ["FanZverev"],

    "Dustin Brown": ["DreddyTennis"],

    # TODO: include in player matches .json files!!!
    # unused alternatives: '@DonaldYoungUSA', '@DonaldYoungATP', '@DonaldYoung'
    "Donald Young": ["Yimlife1313"],

    "Martina Hingis": ["mhingis"],

    # there is no account ???
    # candidates considered: '@AKandreyln', '@AndreyKuznetsov'
    "Andrey Kuznetsov": [],

    "Frances Tiafoe": ["FTiafoe"],
    "Gael Monfils": ["Gael_Monfils"],  # unused alternative: '@gmonfils'

    # there is no account ???
    # candidates considered: '@BTomicOfficial', '@BernardTomicAU', '@BernardTomicFC'
    "Bernard Tomic": [],

    "Benoit Paire": ["benoitpaire"],
    "Angelique Kerber": ["AngeliqueKerber"],
})

# Load Schedule

In [None]:
schedule_df = pd.read_csv(schedule_file_path, sep="|")

In [None]:
excluded_categories = ["boy", "girl", "wheelchair", "legends over 45"]

## Convert start times to UTC hours for proper evaluation

In [None]:
schedule_df["startDate"].value_counts()

In [None]:
# Start-time label (as found in the schedule's "startDate" column) -> UTC start hour.
# On-the-hour labels map to the labelled hour minus two; labels that are not on
# the hour are rounded up to the next full UTC hour.
utc_hour_map = {
    "10:00 AM": 8,
    "11:00 AM": 9,
    "11:30 AM": 10,  # 09:30 UTC, rounded up
    "12:00 PM": 10,
    "12:45 PM": 11,  # 10:45 UTC, rounded up
    "2:00 PM": 12,
    "3:00 PM": 13,
}

In [None]:
schedule_df["utc_start_hour"] = schedule_df["startDate"].apply(lambda x: utc_hour_map[x])

In [None]:
schedule_df["utc_start_hour"].value_counts()

In [None]:
schedule_df.head()

# Filter Schedule

   * only Single matches are kept
   * only important categories are kept (Men's, Women's, Legends under 45)

In [None]:
def filter_categories(match_cat, excluded_cats=excluded_categories):
    """Decide whether a match category should be kept.

    The category text is lower-cased and each entry of ``excluded_cats`` is
    looked for as a substring of it. The entries themselves are used as given
    (they are not lower-cased here).

    Args:
        match_cat: Category text of a match (a string).
        excluded_cats: Iterable of substrings that disqualify a category.

    Returns:
        ``False`` if any excluded entry occurs in the lower-cased category,
        ``True`` otherwise.
    """
    lowered_category = match_cat.lower()
    return not any(excluded in lowered_category for excluded in excluded_cats)

In [None]:
matches_df = schedule_df[schedule_df["matchHeader"].apply(filter_categories)]

In [None]:
matches_df = matches_df[matches_df["date"] > "2017-05-27"]

In [None]:
matches_df["date"].unique()

In [None]:
# Row counts before and after the category/date filters, so the effect of the
# filtering is visible (showing the filtered length twice carries no information).
len(schedule_df), len(matches_df)

# Player name parts

In [None]:
# Every distinct player name appearing on either side of a kept match.
# Sorted so that the order of `players` (and of everything built by iterating
# over it) is identical on every run instead of following set iteration order.
players = sorted(set(matches_df["playerName active"]).union(matches_df["playerName opponent"]))

In [None]:
len(players)

In [None]:
import re

# Separator between name parts: any run of whitespace, commas or hyphens.
# Written as a raw string because "\s" in a plain literal is an invalid escape
# sequence (a warning today, an error in future Python versions), and compiled
# once instead of on every loop iteration.
name_part_separator = re.compile(r"[\s,-]+")

# player name -> {"name_parts": lower-cased fragments of the name,
#                 "accounts": "@"-prefixed account names, or None if unknown}
player_info_map = {}
# Players that have no entry at all in player_account_map.
players_without_account = []
for player in players:
    player_info_map[player] = {
        # Empty fragments (produced when the name starts or ends with a
        # separator) are dropped, since they are not real name parts.
        "name_parts": [p.lower() for p in name_part_separator.split(player) if p],
        # Account names are passed through et.transform_account_name with all
        # of its normalisation switches turned off, then prefixed with "@".
        "accounts": (
            ["@" + et.transform_account_name(a, remove_digits=False, remove_under_score=False, to_lower=False) for a in player_account_map[player]]
            if player in player_account_map
            else None
        ),
    }
    if player not in player_account_map:
        players_without_account.append(player)

In [None]:
len(players_without_account)

# TODO: Should we exclude player accounts from evaluation now???

for key in player_info_map:
    print(key)

In [None]:
# Replace the name parts derived from the accented spelling with an ASCII-only
# form ('garbine' instead of 'garbiñe').
# NOTE(review): presumably so the name matches text written without the accent — confirm.
# Raises KeyError if this player is not among the keys of player_info_map.
player_info_map["Garbiñe Muguruza"]["name_parts"] = ['garbine', 'muguruza']

## Show multi-account players

#### Due to the pre-filtering there is no duplication

In [None]:
# Print every player that is mapped to more than one account, formatted like
# an override assignment so the line can be pasted back and edited.
for player, info in player_info_map.items():
    # "accounts" is None for players without a known account; None is checked
    # by identity (`is not None`) rather than with `!=`.
    if info["accounts"] is not None and len(info["accounts"]) > 1:
        print('player_account_map["%s"] =' % player, info["accounts"])

# Co-occurrences

In [None]:
# Load the '|'-separated score file whose name is derived from experiment_id.
# NOTE(review): the directory is an absolute, machine-specific path; unlike the
# other inputs it is not read from the pipeline parameters — consider moving it there.
pair_occs_df = pd.read_csv("/mnt/idms/fberes/network/combined_occ/occ_scores/%s_with_scores.csv" % experiment_id, sep="|")

In [None]:
et.get_toplist(pair_occs_df, ["play"], ["2017-06-06T13:00"], score_col="rel_count_0")