In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
import sys
sys.path.insert(0,"../../python/")
from rg17 import evaluate_toplist as et

In [None]:
from datawand.parametrization import ParamHelper
# Parameter helper for this notebook: constructed from the pipeline
# definition file and the command-line arguments of the current process.
ph = ParamHelper(
    "../../pipelines/TrendApproximation.json",
    sys.argv,
)

In [None]:
# Fetch both input file locations from the parameter helper
# (player -> Twitter accounts JSON, and the match schedule CSV).
player_name_with_account_file_path, schedule_file_path = (
    ph.get(param_key)
    for param_key in ("player_name_with_accounts_file_path", "schedule_file_path")
)

# Load Player Accounts

In [None]:
# Read the JSON file that maps each player name to a list of account names.
with open(player_name_with_account_file_path) as accounts_file:
    player_account_map = json.loads(accounts_file.read())

In [None]:
# Manual overrides: set (or replace) the account list of these players.
# Each player gets exactly one hand-picked account name.
player_account_map.update({
    "Stan Wawrinka": ["stanwawrinka"],
    "Novak Djokovic": ["DjokerNole"],
    "Caroline Garcia": ["CaroGarcia"],
    "Caroline Wozniacki": ["CaroWozniacki"],
    "Marin Cilic": ["cilic_marin"],
    "Kristina Mladenovic": ["KikiMladenovic"],
    "Dominic Thiem": ["ThiemDomi"],
    "Rafael Nadal": ["RafaelNadal"],
    "Timea Bacsinszky": ["TimeaOfficial"],
    "Pablo Carreno Busta": ["pablocarreno91"],
    "Simona Halep": ["Simona_Halep"],
    "Andy Murray": ["andy_murray"],
})

# Load Schedule

In [None]:
# The schedule file is pipe-separated rather than comma-separated.
schedule_df = pd.read_csv(
    schedule_file_path,
    delimiter="|",
)

In [None]:
# Lower-case fragments of match headers; a match whose header contains any
# of these fragments is dropped by the schedule filter.
excluded_categories = [
    "boy",
    "girl",
    "wheelchair",
    "legends over 45",
]

## Convert start dates to UTC for the proper evaluation

In [None]:
# Inspect which distinct start-time labels occur in the schedule and how often;
# every label listed here needs an entry in utc_hour_map, otherwise the
# conversion lookup raises a KeyError.
schedule_df["startDate"].value_counts()

In [None]:
# Start-time label (as it appears in the schedule) -> UTC start hour.
# Every value is the label's clock hour minus two; labels that are not on the
# full hour are rounded up to the next hour after the shift.
# NOTE(review): the two-hour shift presumably reflects the venue's local
# time zone offset -- confirm.
utc_hour_map = dict([
    ("11:00 AM", 9),
    ("10:00 AM", 8),
    ("12:00 PM", 10),
    ("2:00 PM", 12),
    ("11:30 AM", 10),  # hour was rounded up
    ("3:00 PM", 13),
    ("12:45 PM", 11),  # hour was rounded up
])

In [None]:
# Translate every start-time label into its UTC hour. The lookup is strict:
# a label that is missing from utc_hour_map raises a KeyError.
schedule_df["utc_start_hour"] = schedule_df["startDate"].apply(utc_hour_map.__getitem__)

In [None]:
# Sanity check: how many matches start in each converted UTC hour.
schedule_df["utc_start_hour"].value_counts()

# Filter Schedule

   * only Singles matches whose header contains "final" are kept
   * only important categories are kept (Men's, Women's, Legends under 45)

In [None]:
def filter_categories(match_cat, excluded_cats=excluded_categories):
    """Decide whether a match header belongs to the kept part of the schedule.

    The comparison is case-insensitive and purely substring based.

    Parameters
    ----------
    match_cat : str
        Match header text.
    excluded_cats : iterable of str
        Lower-case fragments; a header containing any of them is rejected.

    Returns
    -------
    bool
        True only if the header contains no excluded fragment and contains
        both "final" and "single".
    """
    header = match_cat.lower()
    # Reject as soon as one excluded fragment is found in the header.
    if any(fragment in header for fragment in excluded_cats):
        return False
    # Substring test: any header that merely contains "final" qualifies.
    return "final" in header and "single" in header

In [None]:
# Evaluate the header filter once per row, then keep only the accepted rows.
is_kept_match = schedule_df["matchHeader"].apply(filter_categories)
finals_df = schedule_df[is_kept_match]

In [None]:
# Row counts before and after filtering: (all scheduled matches, kept matches).
len(schedule_df), len(finals_df)

# Single finals

   * **canceled** matches are not excluded because people may talk about these events as well

In [None]:
# Display the matches that passed the filter for manual inspection.
finals_df

# Player name parts

In [None]:
# Distinct player names taken from both sides of every kept match.
# The names are sorted because the iteration order of a set of strings is not
# stable between interpreter runs (string hashing is randomised), which would
# make the order of `players` -- and of everything built from it -- differ on
# every kernel restart.
players = sorted(
    set(finals_df["playerName active"]).union(finals_df["playerName opponent"])
)

In [None]:
# Number of distinct players appearing in the kept matches.
len(players)

In [None]:
# Per-player lookup with two entries:
#   "name_parts": the whitespace-separated parts of the name, lower-cased
#   "accounts":   the player's account names, each passed through
#                 et.transform_account_name (digits, underscores and letter
#                 case are all kept) and prefixed with "@"
# A player without an entry in player_account_map raises a KeyError.
player_info_map = {
    player: {
        "name_parts": [name_part.lower() for name_part in player.split()],
        "accounts": [
            "@" + et.transform_account_name(
                account,
                remove_digits=False,
                remove_under_score=False,
                to_lower=False,
            )
            for account in player_account_map[player]
        ],
    }
    for player in players
}

## Show multi-account players

#### Due to the pre-filtering there is no duplication

In [None]:
# Print every player that has more than one account attached.
for player_name, player_info in player_info_map.items():
    accounts = player_info["accounts"]
    if len(accounts) <= 1:
        continue
    print(player_name, accounts)

In [None]:
# Show the final lookup: player name -> {"name_parts": [...], "accounts": [...]}.
player_info_map