In [None]:
# Stretch the notebook container to the full browser width.
# HTML is taken from the public IPython.display namespace instead of the
# internal IPython.core.display module.
from IPython.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as sns
# Apply seaborn's default styling with DejaVu Sans as the plot font.
sns.set(font="DejaVu Sans")
import matplotlib.pyplot as plt

In [None]:
import sys
sys.path.insert(0,"../../python/")
from rg17 import evaluate_toplist as et

In [None]:
from datawand.parametrization import ParamHelper
# Helper from which the notebook parameters are read below.
# NOTE(review): the arguments look like (project root, pipeline name, CLI args) — confirm against datawand.
ph = ParamHelper("../../", "TrendApproximation", sys.argv)

In [None]:
# Directory holding the occurrence table; the derived tables are written there too.
experiment_dir = ph.get("experiment_dir")
# Path of the match schedule file. The variable name is misspelt ("shedule");
# it is left as is because the schedule-loading cell refers to it by this name.
shedule_file_path = ph.get("schedule_file_path")
# Path of the file listing the players' screen names.
screen_names_file_path = ph.get("player_screen_names_file_path")

# Load player accounts

In [None]:
# Load the player accounts. Judging by the flag names, digits and underscores
# are kept and the names are lower-cased — confirm against
# rg17.evaluate_toplist.load_player_accounts.
player_accounts = et.load_player_accounts(screen_names_file_path, remove_digits=False, remove_under_score=False, to_lower=True)

In [None]:
# Preview the first ten loaded accounts.
player_accounts[:10]

# Load occurrences file

In [None]:
# Read the '|'-separated occurrence table from the experiment directory
# and show how many rows it has.
occ_table_path = "%s/occ_table.csv" % experiment_dir
occurences_pd = pd.read_csv(occ_table_path, sep="|")
len(occurences_pd)

In [None]:
# Preview the first rows of the occurrence table.
occurences_pd.head()

## Dropping irrelevant snapshots

In [None]:
# The date of a snapshot is the part of its "start_time" stamp before the "T".
occurences_pd["date"] = occurences_pd["start_time"].map(lambda stamp: stamp.partition("T")[0])
# Snapshots taken on these days are dropped.
irrelevant_dates = ['2017-06-12','2017-06-13','2017-06-14','2017-06-15','2017-06-16']
is_irrelevant = occurences_pd["date"].isin(irrelevant_dates)
occurences_pd = occurences_pd[~is_irrelevant]
len(occurences_pd)

# TODO: a lot of missing players? Duplicates?

#### Simona Halep
   * @Simona_Halep previously had no occurrences - now it does
   * but @simonahalep still has occurrences as well...

In [None]:
# Number of loaded player accounts.
len(player_accounts)

In [None]:
# How many player accounts show up as a key word in the occurrence table.
observed_key_words = set(occurences_pd["key_word"].unique())
len(observed_key_words & set(player_accounts))

In [None]:
# Player accounts that never show up as a key word in the occurrence table.
set(player_accounts) - set(occurences_pd["key_word"].unique())

In [None]:
# Column names of the occurrence table: odd numbers "1".."199" are selected as
# the co-occurring word columns, even numbers "2".."200" as their count columns.
word_cols = list(map(str, range(1, 201, 2)))
count_cols = list(map(str, range(2, 201, 2)))

# Additional words to examine

In [None]:
# Extra (non-player) key words to examine, grouped by theme.
special_words = [
    # tournament names
    "rolandgarros", "frenchopen",
    # general match vocabulary
    "match", "play", "game", "fight", "face", "result", "court", "now", "today", "tomorrow", "injury", "shock",
    # word stems
    "qualifi", "surviv", "elimin", "domin",
    "birthday",
    # winning
    "wins", "win", "won", "champion", "champ", "king", "beat", "trophy", "triumph", "overcom",
    # losing
    "lose", "loss", "lost", "defeat", "beaten", "broken", "break",
]

### Men's and women's finalists in the singles tournaments

In [None]:
# Per player: name fragments followed by the account handle. Every entry is
# lower-cased when appended (the name fragments already are lower case).
# Re-running this cell appends the same words again.
player_words = [
    ("rafa", "nadal", "rafael", "@RafaelNadal"),
    ("stan", "wawrinka", "@stanwawrinka"),
    ("andy", "murray", "@andy_murray"),
    ("thiem", "dominic", "@ThiemDomi"),
    ("novak", "djokovic", "@DjokerNole"),
    ('svitolina', 'elina', "@ElinaSvitolina"),
    ('carreno', 'busta', 'pablo', "@pablocarreno91"),
    ('timea', 'bacsinszky', "@TimeaOfficial"),
    # NOTE(review): the only handle written without a leading "@" — confirm this is intended
    ('jelena', 'ostapenko', "OstapenkoFC"),
    ('halep', 'simona', "@Simona_Halep"),
    ('karolina', 'pliskova', "@KaPliskova"),
    ('mladenovic', 'kristina', "@KikiMladenovic"),
    ('caroline', 'wozniacki', "@CaroWozniacki"),
    ('caroline', 'garcia', "@CaroGarcia"),
    ('nishikori', 'kei', "@keinishikori"),
    ('cilic', 'marin', "@cilic_marin"),
]
for words in player_words:
    special_words += [word.lower() for word in words]

In [None]:
# Keep only the rows whose key word is a player account or a special word.
tracked_words = player_accounts + special_words
occurences_pd = occurences_pd[occurences_pd["key_word"].isin(tracked_words)]

In [None]:
# Number of rows left after restricting to the tracked key words.
len(occurences_pd)

In [None]:
# Show the last rows of the filtered occurrence table.
occurences_pd.tail()

# Restructure data

In [None]:
# Expand every occurrence row (one key word plus its co-occurring words stored
# in paired word/count columns) into one row per (key word, co-occurring word).
#
# The per-row frames are collected in a list and concatenated once at the end:
# concatenating onto a growing frame inside the loop re-copies all rows
# gathered so far on every iteration.
cols = ["time","hour","word_1", "word_1_count", "word_2", "word_2_count"]
size = len(word_cols)
pair_frames = []
for idx, row in occurences_pd.iterrows():
    start_time, key_word, key_word_count = row["start_time"], row["key_word"], row["count"]
    # first two characters after the "T" of the snapshot stamp
    hour = start_time.split("T")[1][:2]
    # the snapshot / key-word fields are repeated once per word slot
    values = [
        [start_time] * size,
        [hour] * size,
        [key_word] * size,
        [key_word_count] * size,
        row[word_cols],
        row[count_cols],
    ]
    tmp_df = pd.DataFrame(list(zip(*values)), columns=cols)
    # exclude no hits
    tmp_df = tmp_df[tmp_df["word_2_count"] > 0]
    # exclude self occurrences
    tmp_df = tmp_df[tmp_df["word_2"] != key_word]
    pair_frames.append(tmp_df)
# pd.concat raises on an empty list, so keep the empty-frame result in that case
pair_occs_df = pd.concat(pair_frames) if pair_frames else pd.DataFrame()

In [None]:
# Replace the row labels inherited from the per-row frames with a fresh
# 0..n-1 index, then report the number of pair rows.
pair_occs_df = pair_occs_df.reset_index(drop=True)
print(len(pair_occs_df))

In [None]:
# Save the full pair table ('|'-separated, without the index) in the experiment directory.
pair_occs_df.to_csv("%s/full_pair_occs.csv" % experiment_dir, sep="|", index=False)

# Stops execution here on purpose. The raise is unconditional, so the reload
# below only runs when it is executed on its own.
raise RuntimeError("Reading from file!!!")

# Reload the pair table from full_pair_occs.csv in the experiment directory.
pair_occs_df = pd.read_csv("%s/full_pair_occs.csv" % experiment_dir, sep="|")

# TODO: Dropping excluded words: @rolandgarros?

In [None]:
# Remove the pairs whose co-occurring word is on the exclusion list.
excluded_words = ["@rolandgarros"]
keep_mask = ~pair_occs_df["word_2"].isin(excluded_words)
pair_occs_df = pair_occs_df[keep_mask]
print(len(pair_occs_df))

# Calculate co-occurrence statistics for snapshots

In [None]:
# The date of a pair row is the part of its "time" stamp before the "T".
pair_occs_df["date"] = pair_occs_df["time"].map(lambda stamp: stamp.partition("T")[0])

In [None]:
# Frame the statistics below are computed from. With the date filter commented
# out this is the very same object as pair_occs_df (no copy), so all dates are included.
pair_occs_for_stats_df = pair_occs_df#[~pair_occs_df["date"].isin(['2017-06-11'])]
print(len(pair_occs_for_stats_df))

# TODO: Actually I should compute the occ_scores temporally (only taking into account the past occurrences...)

## a.) Calculate global mean

In [None]:
# Peek at the first two rows of the statistics input.
pair_occs_for_stats_df.head(2)

In [None]:
# Number of rows with a non-null "time" for every (word_1, word_2) pair.
global_count_occs = pair_occs_for_stats_df.groupby(by=["word_1","word_2"])["time"].count().reset_index()

### Calculating the mean of the fractions for all snapshots

In [None]:
# The two count columns of the pair table that are summed per group below.
# NOTE(review): this rebinds count_cols, which an earlier cell defines as the
# even-numbered column names used by the restructuring loop; re-running that
# loop after this cell would pick up these two names instead.
count_cols = ["word_1_count","word_2_count"]

In [None]:
# Sum both count columns per (word_1, word_2) pair over all rows, then take
# the ratio of the summed word_2 count to the summed word_1 count.
# NOTE(review): this is a ratio of sums, not a mean of per-snapshot fractions.
pair_sums = pair_occs_for_stats_df.groupby(by=["word_1","word_2"])[count_cols].sum()
global_mean_occs = pair_sums.reset_index()
global_mean_occs["global"] = global_mean_occs["word_2_count"].div(global_mean_occs["word_1_count"])

# TODO: If (word_1, word_2) pairs occurring in only 2-3 snapshots are not excluded, then the global distribution has spikes at 1.0, 0.5, 1/3 etc. - we get a clean distribution only if I remove infrequent word_2-s

# Frequency of each distinct global ratio value.
global_mean_occs["global"].value_counts()

In [None]:
# Histogram of the global ratios (100 bins).
global_mean_occs["global"].hist(bins=100)

In [None]:
# Lookup table: (word_1, word_2) -> global ratio.
global_pair_keys = list(zip(global_mean_occs["word_1"], global_mean_occs["word_2"]))
global_mean_occs["key"] = global_pair_keys
GLOBAL_MEANS = dict(zip(global_mean_occs["key"], global_mean_occs["global"]))

In [None]:
# Row count of the table next to the size of the lookup dict built from it.
len(global_mean_occs), len(GLOBAL_MEANS)

# Displays the whole lookup dict (one entry per word pair).
GLOBAL_MEANS

## b.) Calculate snapshot mean

In [None]:
# Same ratio as the global one, but grouped additionally by the hour of the snapshot.
hourly_sums = pair_occs_for_stats_df.groupby(by=["word_1","word_2","hour"])[count_cols].sum()
snapshot_mean_occs = hourly_sums.reset_index()
snapshot_mean_occs["snapshot"] = snapshot_mean_occs["word_2_count"].div(snapshot_mean_occs["word_1_count"])

# The distribution is not clean because I do not filter for pairs occurring in at least 2-3 snapshots taken at the same time of day!

In [None]:
# Histogram of the per-hour ratios (100 bins).
snapshot_mean_occs["snapshot"].hist(bins=100)

In [None]:
# Lookup table: (word_1, word_2, hour) -> per-hour ratio.
snapshot_keys = list(zip(
    snapshot_mean_occs["word_1"],
    snapshot_mean_occs["word_2"],
    snapshot_mean_occs["hour"],
))
snapshot_mean_occs["key"] = snapshot_keys
SNAPSHOT_MEANS = dict(zip(snapshot_mean_occs["key"], snapshot_mean_occs["snapshot"]))

# Displays the whole per-hour lookup dict.
SNAPSHOT_MEANS

# Calculate occurrence score

In [None]:
def fill_with_global_score(row):
    """Return the global ratio of the row's (word_1, word_2) pair.

    The value is looked up in the module-level GLOBAL_MEANS dict; pairs that
    are not in the dict get 0.0.
    """
    pair = (row["word_1"], row["word_2"])
    return GLOBAL_MEANS.get(pair, 0.0)

In [None]:
# Attach the global ratio to every pair row (row-wise dict lookup).
pair_occs_df["global_val"] = pair_occs_df.apply(fill_with_global_score, axis=1)

In [None]:
def fill_with_snapshot_score(row):
    """Return the per-hour ratio of the row's (word_1, word_2, hour) triple.

    The value is looked up in the module-level SNAPSHOT_MEANS dict; triples
    that are not in the dict get 0.0.
    """
    triple = (row["word_1"], row["word_2"], row["hour"])
    return SNAPSHOT_MEANS.get(triple, 0.0)

In [None]:
# Attach the per-hour ratio to every pair row (row-wise dict lookup).
pair_occs_df["snapshot_val"] = pair_occs_df.apply(fill_with_snapshot_score, axis=1)

In [None]:
# Column-wise minimum of the pair table.
pair_occs_df.min()

# Export data

In [None]:
# Save the pair table, including both score columns, '|'-separated and without the index.
pair_occs_df.to_csv("%s/occ_pairs_with_scores.csv" % experiment_dir, index=False, sep="|")

In [None]:
# Preview the exported table.
pair_occs_df.head()

# Analyze results

def calculate_norm_score(row, c_val, alpha=0, eps=0.0):
    """Normalise the row's ``rel_count_<c_val>`` value by its global and snapshot norms.

    Computes ``(eps + (2 + alpha) * value) / (eps + global_norm + snapshot_norm)``.
    Each norm is read as ``row[<source>]["rel_count_<c_val>"]``, or taken as 0
    when ``row[<source>]`` itself equals 0. Returns 0.0 when both norms are 0.

    NOTE(review): this indexes ``global_val`` / ``snapshot_val`` by key, whereas
    the fill_with_*_score cells of this notebook store plain numbers in those
    columns — confirm which layout this function is meant for.
    """
    val_key = "rel_count_%i" % c_val
    norms = []
    for source in ("global_val", "snapshot_val"):
        # a literal 0 marks a missing normalisation entry
        norms.append(row[source][val_key] if row[source] != 0 else 0)
    global_norm, snapshot_norm = norms
    # both normalization constants are missing
    if global_norm == 0 and snapshot_norm == 0:
        return 0.0
    numerator = eps + (2.0 + alpha) * row[val_key]
    return numerator / (eps + global_norm + snapshot_norm)

# Compute the normalised score column for the chosen (c_val, alpha_val).
# c_val is passed through to the scorer so that the values always correspond
# to the column name; it used to be hard-coded to 1 in the call.
c_val, alpha_val = 1, 0
score_col = "norm_c%i_a%i" % (c_val, alpha_val)
pair_occs_df[score_col] = pair_occs_df.apply(lambda x: calculate_norm_score(x, c_val=c_val, alpha=alpha_val), axis=1)

# Day of the snapshot: the part of the "time" stamp before the "T".
pair_occs_df["day"] = pair_occs_df["time"].map(lambda stamp: stamp.partition("T")[0])

# Preview the table with the added score and day columns.
pair_occs_df.head()

def show_score(w1, w2, score_name="norm_c1_a0"):
    """Draw an hour-by-day heatmap of one score for the word pair ``w1 -> w2``.

    Rows are selected from the module-level ``pair_occs_df``. Both words are
    matched case-insensitively: the key words are lower-cased before they
    reach the pair table, so a handle typed in its original casing would
    otherwise select nothing.

    Parameters
    ----------
    w1, w2 : str
        Key word (``word_1``) and co-occurring word (``word_2``).
    score_name : str
        Column of ``pair_occs_df`` to plot.

    When no row matches, only the title line (with a record count of 0) is
    printed and no figure is drawn.
    """
    is_pair = (
        (pair_occs_df["word_1"].str.lower() == w1.lower())
        & (pair_occs_df["word_2"].str.lower() == w2.lower())
    )
    filtered_df = pair_occs_df[is_pair]
    title = "%s->%s: %i record" % (w1, w2, len(filtered_df))
    if filtered_df.empty:
        # there is nothing to pivot, and a heatmap of an empty table cannot be drawn
        print(title)
        return
    # mean score per (hour, day) cell
    pivot_scores = pd.pivot_table(filtered_df, values=score_name, index="hour", columns="day")
    fig, ax = plt.subplots(figsize=(30,5))
    ax.set_title(title)
    sns.heatmap(pivot_scores, ax=ax)
    plt.show()

## Load schedule

# Load the '|'-separated schedule table and preview it. The path variable is
# spelt "shedule_file_path" where it is read from the parameters.
schedule_df = pd.read_csv(shedule_file_path, sep="|")
schedule_df.head()

## Rafael Nadal's matches

   * the first few matches of Nadal have high scores

# Schedule rows in which the player appears on either side of the match.
name = "Rafael Nadal"
player_cols = ["playerName active", "playerName opponent"]
schedule_df[schedule_df[player_cols].eq(name).any(axis=1)]

# Key words are stored lower-cased (handles are added to special_words via
# .lower()), so the handle is lower-cased here to match them.
show_score("@RafaelNadal".lower(),"match")

show_score("@RafaelNadal".lower(),"win")

## How often are the two finalists mentioned together (Men's singles final on 06-11)

# Key words are stored lower-cased, so the handle is lower-cased to match them.
show_score("@RafaelNadal".lower(),"@stanwawrinka")

## The winner is Nadal (Men's single final on 06-11)

# Key words are stored lower-cased, so the handle is lower-cased to match them.
show_score("@RafaelNadal".lower(),"champion")

show_score("@stanwawrinka","champion")

## The loser is Wawrinka (Men's single final on 06-11)

show_score("@stanwawrinka","beat")

# Key words are stored lower-cased, so the handle is lower-cased to match them.
show_score("@RafaelNadal".lower(),"beat")

## Nadal birthday: June 3

# Key words are stored lower-cased, so the handle is lower-cased to match them.
show_score("@RafaelNadal".lower(),"birthday")

## Novak Djokovic lost on 06-07 - the occurrence score diminishes after this day

# Schedule rows in which the player appears on either side of the match.
name = "Novak Djokovic"
player_cols = ["playerName active", "playerName opponent"]
schedule_df[schedule_df[player_cols].eq(name).any(axis=1)]

# Key words are stored lower-cased, so the handle is lower-cased to match them.
show_score("@DjokerNole".lower(),"match")

# Toplists

# Preview the pair table the toplists are built from.
pair_occs_df.head()

def get_toplist(w1, snapshot_id, score_name="norm_c1_a0"):
    """Return the pair rows of key word ``w1`` in one snapshot, best score first.

    Rows are taken from the module-level ``pair_occs_df``. ``w1`` is matched
    case-insensitively against ``word_1``: the key words are lower-cased before
    they reach the pair table, so a handle typed in its original casing would
    otherwise select nothing.

    Parameters
    ----------
    w1 : str
        Key word to build the toplist for.
    snapshot_id : str
        Exact value of the ``time`` column identifying the snapshot.
    score_name : str
        Column to sort by, in descending order.
    """
    is_key_word = pair_occs_df["word_1"].str.lower() == w1.lower()
    is_snapshot = pair_occs_df["time"] == snapshot_id
    return pair_occs_df[is_key_word & is_snapshot].sort_values(score_name, ascending=False)

#### "@stanwawrinka" and "final" are in the top 5

# Key words are stored lower-cased, so the handle is lower-cased to match them.
get_toplist("@RafaelNadal".lower(),"2017-06-11T07:00").head(20)

get_toplist("@rafaelnadal","2017-06-10T18:00").head(20)

#### "@stanwawrinka" is ranked first

# Key words are stored lower-cased, so the handle is lower-cased to match them.
get_toplist("@RafaelNadal".lower(),"2017-06-11T10:00").head(20)

#### "win", "title", "champion" and "congrat" are among the top words

# Key words are stored lower-cased, so the handle is lower-cased to match them.
get_toplist("@RafaelNadal".lower(),"2017-06-11T13:00").head(20)

get_toplist("@RafaelNadal".lower(),"2017-06-11T16:00").head(20)

#### Nadal's birthday

# Key words are stored lower-cased, so the handle is lower-cased to match them.
get_toplist("@RafaelNadal".lower(),"2017-06-03T16:00").head(20)

get_toplist("play","2017-06-04T16:00").head(20)