In [None]:
import numpy as np
import pandas as pd
import json, datetime, sys, os, pytz
from collections import Counter

In [None]:
sys.path.insert(0,"../../python/")
from parametrization import ParamHelper

In [None]:
sys.path.insert(0,"../../python")
import data_processing.player_labeling as pl
import data_processing.tennis_player_processing as tpp

# 1. Load data

In [None]:
# Identifier of the dataset to process; the configuration cell that follows accepts "uo17" and "rg17".
dataset_id = "uo17"

In [None]:
# Per-dataset settings, stored in the order:
# (min_epoch, last_date, missing_dates, time zone name, schedule CSV separator)
dataset_settings = {
    # There was no tennis match on 2017-08-26,27!
    "uo17": (
        1503288000,
        "2017-09-10",
        ["2017-08-21","2017-08-26","2017-08-27"],
        'America/New_York',
        ";",
    ),
    # There was no tennis match on 2017-05-27!
    "rg17": (
        1495584000,
        "2017-06-11",
        ["2017-05-27"],
        'Europe/Paris',
        "|",
    ),
}
if dataset_id not in dataset_settings:
    raise RuntimeError("Invalid dataset!")
min_epoch, last_date, missing_dates, tz_name, sep = dataset_settings[dataset_id]
# Only the time zone of the selected dataset is instantiated.
time_zone = pytz.timezone(tz_name)
# Print min_epoch as converted by pl.epoch2date in the dataset's time zone.
print(pl.epoch2date(min_epoch, time_zone))

In [None]:
# Dataset-specific root (relative path) under which the export directory is created later.
output_prefix = "../../experiments/%s" % dataset_id

## a.) Load mention events

In [None]:
# The mention file is space-separated; column names are supplied explicitly,
# so every line of the file is read as data.
mentions_path = "../../data/%s_data/raw/%s_mentions.csv" % (dataset_id, dataset_id)
mentions_df = pd.read_csv(mentions_path, sep=" ", names=["epoch","src","trg"])

In [None]:
# Keep only mention events with epoch >= min_epoch; the row count is printed
# before and after the filter.
print(len(mentions_df))
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column added to it later is not written into a slice (avoids
# SettingWithCopyWarning).
mentions_df = mentions_df[mentions_df["epoch"] >= min_epoch].copy()
print(len(mentions_df))

In [None]:
# Preview the first three mention rows.
mentions_df.head(3)

## b.) Load schedule events

"Canceled" events are kept because users may mention them anyway

In [None]:
# The schedule file uses the dataset-specific separator chosen in the configuration cell.
schedule_path = "../../data/%s_data/raw/%s_schedule_df.csv" % (dataset_id, dataset_id)
schedule_df = pd.read_csv(schedule_path, sep=sep)

In [None]:
# Preview the first three schedule rows.
schedule_df.head(3)

## c.) Load tennis player matches

In [None]:
# The player/account match file is always "|"-separated, regardless of the dataset.
player_matches_path = "../../data/%s_data/raw/%s_tennis_player_matches.csv" % (dataset_id, dataset_id)
tennis_player_matches = pd.read_csv(player_matches_path, sep="|")

In [None]:
# Preview the first rows of the player/account match table.
tennis_player_matches.head()

In [None]:
# Group screen names by player: player_name -> list of screen names, in the
# order the rows appear in tennis_player_matches.
players_dict = {}
# Walk the two columns directly rather than using iterrows(), which builds a
# Series for every row; setdefault creates the list on first sight of a player.
for player_name, screen_name in zip(tennis_player_matches["player_name"], tennis_player_matches["screen_name"]):
    players_dict.setdefault(player_name, []).append(screen_name)

In [None]:
# Lookup table: screen name -> generated id (a later row wins if a screen name repeats).
screen_name_to_id = {
    screen_name: generated_id
    for screen_name, generated_id in zip(
        tennis_player_matches["screen_name"], tennis_player_matches["generated_id"]
    )
}

In [None]:
# Lookup table: screen name -> player name (a later row wins if a screen name repeats).
screen_name_to_player = {
    screen_name: player_name
    for screen_name, player_name in zip(
        tennis_player_matches["screen_name"], tennis_player_matches["player_name"]
    )
}

## d.) Load recoded user ids

**FAKE step:** in this version, already recoded user ids are used, so the dictionary below is an identity mapping.

In [None]:
# Identity mapping over the generated ids: every id is mapped to itself.
recoder_dict = {
    generated_id: generated_id
    for generated_id in tennis_player_matches["generated_id"]
}

# 2. Extract found player accounts for each day

In [None]:
# get_daily_players returns two values; the first is discarded and only the
# second is kept (its "date" and "found_players" columns are used further down).
_, daily_players_df = tpp.get_daily_players(schedule_df, players_dict)

In [None]:
# Preview the first rows of the per-day players table.
daily_players_df.head()

## Set empty list for days without games

In [None]:
# Map each date to its "found_players" entry.
daily_found_player_dict = {
    date: found_players
    for date, found_players in zip(daily_players_df["date"], daily_players_df["found_players"])
}
# Dates listed in missing_dates each get their own fresh empty list
# (replacing any existing entry for that date).
daily_found_player_dict.update({missing_date: [] for missing_date in missing_dates})

In [None]:
# Inspect the entry stored for the configured last date.
daily_found_player_dict[last_date]

# 3. Setting date based on timezone

In [None]:
def epoch_to_local_date(epoch):
    """Convert one epoch value with pl.epoch2date using the dataset's time zone."""
    return pl.epoch2date(epoch, time_zone)

# Add a "date" column derived from each row's epoch.
mentions_df["date"] = mentions_df["epoch"].apply(epoch_to_local_date)

In [None]:
# Preview the mentions again, now including the "date" column.
mentions_df.head()

In [None]:
# Distinct dates present in the mention data, in ascending order.
collected_dates = list(mentions_df["date"].unique())
collected_dates.sort()

In [None]:
# Display the sorted list of dates covered by the mention data.
collected_dates

# 4. Label daily active users

   * both players and other users from the Twitter mention dataset
   * binary labeling: daily active players get 1 and others 0

In [None]:
# Toggle between binary labels (only "current" is non-zero) and graded labels
# ("current" scores 2.0, "previous" and "next" score 1.0).
use_binary_labels = True
label_value_dict = (
    {"current":1.0, "previous":0.0, "next":0.0}
    if use_binary_labels
    else {"current":2.0, "previous":1.0, "next":1.0}
)

In [None]:
# Bundle the lookup tables in the positional order passed to get_daily_label_dicts:
# (recoder_dict, screen_name_to_player, screen_name_to_id, daily_found_player_dict).
mapper_dicts = (recoder_dict, screen_name_to_player, screen_name_to_id, daily_found_player_dict)
# Compute the per-day labels; the result is consumed by the label-file export.
# NOTE(review): the structure of the returned value is defined in player_labeling — confirm there.
daily_label_dicts = pl.get_daily_label_dicts(label_value_dict, collected_dates, mentions_df, mapper_dicts, last_date, missing_dates)

# 5. Export files

## i.) Export daily found players (binary encoding)

In [None]:
# The directory name embeds the labeling mode, e.g. "tennis_players_binaryTrue".
dir_name = "tennis_players_binary%s" % use_binary_labels
# Full export directory: <output_prefix>/<dir_name>.
player_output_dir = "%s/%s" % (output_prefix, dir_name)
print(player_output_dir)

In [None]:
# Export the daily labels into player_output_dir.
# NOTE(review): only_pos_label=True presumably limits the export to positively labeled users — confirm in player_labeling.
pl.export_label_files(player_output_dir, collected_dates, daily_label_dicts, last_date, only_pos_label=True)

## ii.) Export the ids of the found players

### Filtering player accounts to only those that are present in the dataset:

   * many players who participated in Roland Garros 2017 are not present in USOpen17

In [None]:
# Export the player ids into player_output_dir using the three lookup tables.
# NOTE(review): the output file name and format are defined in player_labeling — confirm there.
pl.export_recoded_player_ids(player_output_dir, screen_name_to_id, screen_name_to_player, recoder_dict)

## iii.) Export dictionaries

# Dump recoder_dict as one "<key> <value>" line per entry, ordered by key.
with open("%s/recoder_dict.txt" % player_output_dir, 'w') as recoder_file:
    recoder_file.writelines(
        "%i %i\n" % (key, recoder_dict[key]) for key in sorted(recoder_dict)
    )

# Dump screen_name_to_player as one "<screen name> <player name>" line per entry,
# ordered by screen name.
with open("%s/screen_name_to_player.txt" % player_output_dir, 'w') as mapping_file:
    mapping_file.writelines(
        "%s %s\n" % (screen_name, screen_name_to_player[screen_name])
        for screen_name in sorted(screen_name_to_player)
    )