In [None]:
import numpy as np
import pandas as pd
import json, datetime, sys, os
from collections import Counter

In [None]:
# Put the project's python/ directory (two levels up from this notebook) first on the
# import path so that the centrality_utils package can be imported.
sys.path.insert(0,"../../python")
# scores2file is called in the export section with a list of (user_id, label) pairs
# and an output file path.
from centrality_utils.base_computer import scores2file

# 1. Parse HTML schedule files

In [None]:
%%bash
# Run the schedule parser from two directories above the notebook; pushd/popd
# restore the original working directory afterwards.
pushd ../../
# Make sure the folder for preprocessed data exists before the parser runs.
mkdir -p ./data/preprocessed
echo "Parsing HTML files..."
# NOTE(review): presumably this produces data/preprocessed/schedule_df.csv, which is
# loaded further below — confirm against the script.
python ./scripts/roland_garross_schedule_parser.py
popd

# 2. Load data

In [None]:
# Base folder for the files written in the export section. The value already ends
# with "/", and the export cells append "/<name>" to it, so the generated paths
# contain a doubled slash.
output_prefix = "../../data/preprocessed/"

## a.) Load mention events

In [None]:
# Mention events: a space-separated file without a header row; the three columns
# are named epoch, src and trg on load.
mentions_df = pd.read_csv(
    "../../data/raw/rg17_mentions.csv",
    sep=" ",
    names=["epoch", "src", "trg"],
)

In [None]:
# Preview the first three mention events to check that the columns were parsed.
mentions_df.head(3)

In [None]:
# Number of mention events loaded.
len(mentions_df)

## b.) Load schedule events

In [None]:
# Schedule table written to the preprocessed folder; the file is pipe-delimited.
schedule_df = pd.read_csv(
    "../../data/preprocessed/schedule_df.csv",
    sep="|",
)

In [None]:
# Preview the first three schedule rows.
schedule_df.head(3)

## c.) Load tennis player matches

In [None]:
# Player-to-account table; the file is pipe-delimited. Later cells read its
# player_name, screen_name and generated_id columns.
tennis_player_matches = pd.read_csv(
    "../../data/raw/tennis_player_matches.csv",
    sep="|",
)

In [None]:
# Preview the first rows of the player/account table.
tennis_player_matches.head()

In [None]:
# Group screen names by player: player_name -> [screen_name, ...].
# A player with several accounts ends up with a list longer than one.
players_dict = {}
for _, account_row in tennis_player_matches.iterrows():
    accounts_of_player = players_dict.setdefault(account_row["player_name"], [])
    accounts_of_player.append(account_row["screen_name"])

In [None]:
# Lookup from an account's screen name to the player it belongs to.
screen_name_to_player = {
    screen_name: player_name
    for screen_name, player_name in zip(
        tennis_player_matches["screen_name"], tennis_player_matches["player_name"]
    )
}

In [None]:
# Lookup from an account's generated id to the player it belongs to.
generated_id_to_player = {
    generated_id: player_name
    for generated_id, player_name in zip(
        tennis_player_matches["generated_id"], tennis_player_matches["player_name"]
    )
}

#### Sanity check (VERIFIED - there are 61 additional accounts belonging to players with multiple accounts)

In [None]:
# Count the surplus accounts, i.e. every account beyond the first one of a player.
multiple_acc_count = sum(
    len(accounts) - 1
    for accounts in players_dict.values()
    if len(accounts) > 1
)
# Prints the surplus count and the total number of accounts (players + surplus).
print(multiple_acc_count, len(players_dict) + multiple_acc_count)

# 3. Extract found player accounts for each day

In [None]:
# daily_players: date -> {player name -> "<courtName>_<orderNumber>" match id}.
# When a player appears in several schedule rows on the same date, the id of the
# last such row is kept.
daily_players = {}
for _, sched_row in schedule_df.iterrows():
    match_id = "%s_%i" % (sched_row["courtName"], sched_row["orderNumber"])
    players_of_day = daily_players.setdefault(sched_row["date"], {})
    # Both participants of a match are registered under the same match id.
    players_of_day[sched_row["playerName active"]] = match_id
    players_of_day[sched_row["playerName opponent"]] = match_id

# One row per date holding the list of player names scheduled on that date.
daily_players_grouped = [(day, list(names.keys())) for day, names in daily_players.items()]
daily_players_df = pd.DataFrame(
    daily_players_grouped, columns=["date", "players"]
).sort_values("date")

### Filter for found player accounts

In [None]:
# Keep only the scheduled players for which an account is known, i.e. whose name
# is a key of players_dict.
daily_players_df["found_players"] = daily_players_df["players"].apply(
    lambda scheduled: [player for player in scheduled if player in players_dict]
)

In [None]:
# Preview the per-day player lists next to the filtered found_players lists.
daily_players_df.head()

In [None]:
# date -> list of scheduled players with a known account.
daily_found_player_dict = {
    day: found
    for day, found in zip(daily_players_df["date"], daily_players_df["found_players"])
}

In [None]:
# Spot check one day; raises KeyError if the schedule has no row for this date.
daily_found_player_dict["2017-06-06"]

# 4. Extract daily active users

In [None]:
def epoch2date(epoch, tz=None):
    """Convert a Unix timestamp (seconds) to a "YYYY-MM-DD" date string.

    Parameters
    ----------
    epoch : int or float
        Seconds since the Unix epoch.
    tz : datetime.tzinfo, optional
        Timezone in which the calendar day is determined. The default (None)
        keeps the original behaviour: the local timezone of the machine running
        the notebook is used, so the same epoch can map to different dates on
        different machines. Pass an explicit timezone (e.g.
        ``datetime.timezone.utc``) for reproducible results.

    Returns
    -------
    str
        The date with zero-padded month and day, e.g. "2017-06-25".
    """
    dt = datetime.datetime.fromtimestamp(epoch, tz)
    return "%i-%.2i-%.2i" % (dt.year, dt.month, dt.day)

# Example: this epoch is 23:01:54 on 2017-06-25 in UTC, so the displayed date
# depends on the local timezone (it is already 2017-06-26 at UTC+1 and later).
epoch2date(1498431714)

In [None]:
# Add a "date" column (string) derived from each event's epoch. epoch2date converts
# with datetime.fromtimestamp, so the day boundaries follow the local timezone of
# the machine running the notebook.
mentions_df["date"] = mentions_df["epoch"].apply(epoch2date)

In [None]:
# Preview the mention events with the newly added date column.
mentions_df.head()

In [None]:
# Distinct date strings present in the mention data, in order of first appearance
# (the loops below sort them before use).
collected_dates = mentions_df["date"].unique()

In [None]:
def get_active_users(df):
    """Return the distinct ids occurring in the "src" or "trg" column of df.

    The result is a list without duplicates; its order is that of the underlying
    set and carries no meaning.
    """
    sources = set(df["src"])
    targets = set(df["trg"])
    return list(sources | targets)

In [None]:
# date -> list of users that appear as source or target of a mention on that date.
daily_active_users_dict = {}
for date in sorted(collected_dates):
    print(date)
    on_this_date = mentions_df["date"] == date
    daily_active_users_dict[date] = get_active_users(mentions_df[on_this_date])
    # 2017-06-11 is the last date processed; any later date is ignored.
    if date == "2017-06-11":
        break

### Note: There was no tennis match on 2017-05-27!

In [None]:
# date -> {user_id -> label}. The label is 1 when the user's generated id belongs to
# a player who is scheduled on that date and has a known account, otherwise 0.
daily_label_dicts = {}
for date in sorted(collected_dates):
    print(date)
    daily_actives = daily_active_users_dict[date]
    if date == "2017-05-27":
        # No tennis match on this day: every active user is labelled 0.
        todays_players = frozenset()
    else:
        # A date missing from the schedule is treated like a match-free day instead
        # of raising KeyError. The frozenset makes each membership test O(1).
        todays_players = frozenset(daily_found_player_dict.get(date, ()))
    label_dict = {}
    for user_id in daily_actives:
        is_scheduled_player = (
            user_id in generated_id_to_player
            and generated_id_to_player[user_id] in todays_players
        )
        label_dict[user_id] = 1 if is_scheduled_player else 0
    daily_label_dicts[date] = label_dict
    # 2017-06-11 is the last date processed; any later date is ignored.
    if date == "2017-06-11":
        break

# 5. Export files

In [None]:
# Folder that receives the per-day label files. os.path.join avoids the doubled
# slash that plain string formatting produces when output_prefix ends with "/".
full_path = os.path.join(output_prefix, "tennis_players")
folder_is_new = not os.path.isdir(full_path)
# exist_ok=True keeps the call safe if the folder appears between the check above
# and the creation.
os.makedirs(full_path, exist_ok=True)
if folder_is_new:
    print("%s folder was created." % full_path)

## i.) Export daily found players (binary encoding)

In [None]:
# Write one label file per processed date. Files are numbered from 0 in sorted date
# order: players_0.csv, players_1.csv, ...
for file_idx, date in enumerate(sorted(collected_dates)):
    day_labels = list(daily_label_dicts[date].items())
    scores2file(day_labels, "%s/tennis_players/players_%i.csv" % (output_prefix, file_idx))
    # 2017-06-11 is the last date exported.
    if date == "2017-06-11":
        break

## ii.) Export the ids of the found players

In [None]:
# Write every generated account id (the keys of generated_id_to_player), one per line.
accounts_path = "%s/recoded_player_accounts.txt" % output_prefix
with open(accounts_path,'w') as ids_file:
    ids_file.writelines("%i\n" % generated_id for generated_id in generated_id_to_player)