In [None]:
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [None]:
import pandas as pd
import numpy as np

In [None]:
%matplotlib inline
import seaborn as sns
sns.set(font="DejaVu Sans")
import matplotlib.pyplot as plt

In [None]:
import sys
sys.path.insert(0,"../../python/")
from rg17 import evaluate_toplist as et

In [None]:
from datawand.parametrization import ParamHelper
ph = ParamHelper("../../pipelines/TrendApproximation.json", sys.argv)

In [None]:
# Pipeline parameters, looked up by key through the ParamHelper instance `ph`.
experiment_id = ph.get("experiment_id")
# NOTE(review): the variable name is spelled "shedule" (sic) while the parameter key is
# "schedule_file_path"; the schedule CSV is later read through this exact variable name.
shedule_file_path = ph.get("schedule_file_path")
screen_names_file_path = ph.get("player_screen_names_file_path")

# Load player accounts

In [None]:
player_accounts = et.load_player_accounts(screen_names_file_path, remove_digits=False, remove_under_score=False, to_lower=False)

In [None]:
player_accounts[:10]

# Load occurrences file

In [None]:
occurences_pd = pd.read_csv("/mnt/idms/fberes/network/combined_occ/occ_tables/%s.csv" % experiment_id, sep="|")
len(occurences_pd)

In [None]:
occurences_pd.head()

## Dropping irrelevant snapshots

In [None]:
# The calendar date is the part of the snapshot timestamp that precedes "T".
# The vectorized .str accessor replaces the per-row Python lambda.
occurences_pd["date"] = occurences_pd["start_time"].str.split("T").str[0]
# Snapshots taken on these days are excluded from the analysis.
irrelevant_dates = ['2017-06-12', '2017-06-13', '2017-06-14', '2017-06-15', '2017-06-16']
occurences_pd = occurences_pd[~occurences_pd["date"].isin(irrelevant_dates)]
len(occurences_pd)

# TODO: lot of missing players? duplications?

#### Simona Halep
   * @Simona_Halep previously had no occurrences - now it does
   * but @simonahalep still has occurrences as well...

In [None]:
len(player_accounts)

In [None]:
len(set(occurences_pd["key_word"].unique()).intersection(set(player_accounts)))

In [None]:
set(player_accounts).difference(set(occurences_pd["key_word"].unique()))

In [None]:
# Column labels of the occurrence table: odd labels "1", "3", ..., "199" are read as
# the co-occurring words, even labels "2", "4", ..., "200" as the matching counts.
word_cols = list(map(str, range(1, 201, 2)))
count_cols = list(map(str, range(2, 201, 2)))

# Additional words to examine

In [None]:
# Extra (non-account) key words to keep, grouped by theme.
special_words = [
    # tournament names
    "rolandgarros", "frenchopen",
    # generic match vocabulary
    "match", "play", "fight", "face", "result", "court", "now", "today", "tomorrow", "injury", "shock",
    # word stems
    "qualifi", "surviv", "elimin", "domin",
    "birthday",
    # winning
    "wins", "win", "won", "champion", "champ", "king", "beat", "trophy", "triumph", "overcom",
    # losing
    "lose", "loss", "lost", "defeat", "beaten", "broken", "break",
]

### Men, Women Finalists in Single tournaments

In [None]:
# Player first names, last names and nicknames, appended in place to special_words.
# 'caroline' is listed twice (Wozniacki and Garcia), exactly as before.
special_words.extend([
    "rafa", "nadal", "rafael", "rafaelnadal",
    "stan", "wawrinka", "stantheman",
    "andy", "murray", "andymurray",
    "thiem", "dominic",
    "novak", "djokovic",
    "svitolina", "elina",
    "carreno", "busta", "pablo",
    "timea", "bacsinszky",
    "jelena", "ostapenko",
    "halep", "simona",
    "karolina", "pliskova",
    "mladenovic", "kristina",
    "caroline", "wozniacki",
    "caroline", "garcia",
    "nishikori", "kei",
    "cilic", "marin",
])

In [None]:
occurences_pd = occurences_pd[occurences_pd["key_word"].isin(player_accounts+special_words)]

In [None]:
len(occurences_pd)

# Restructure data

In [None]:
def get_rel_count_with_offset(counts, kw_count, c):
    """Return the offset-smoothed ratio of each count to the key word count.

    Every element of `counts` and the scalar `kw_count` are shifted by the
    same constant `c` before dividing: (c + counts) / (c + kw_count).

    counts   -- sequence of co-occurrence counts (converted with np.array)
    kw_count -- count of the key word itself
    c        -- additive offset applied to numerator and denominator
    """
    shifted_counts = np.array(counts) + c
    denominator = kw_count + c
    return shifted_counts / denominator

# TODO: use float column names rather!!!

In [None]:
# Smoothing offsets tried for the relative counts.
c_const_list = [0.001, 1.0, 2.0, 5.0, 10.0]
# The column name keeps only the integer part of the offset, so 0.001 maps to "rel_count_0".
rel_count_cols = ["rel_count_{:d}".format(int(c)) for c in c_const_list]

In [None]:
# Turn every wide occurrence row (key word + up to len(word_cols) co-occurring words)
# into long format: one record per (snapshot time, key word, co-occurring word).
cols = ["time","hour","word_1","word_2"] + rel_count_cols
size = len(word_cols)
# Per-row frames are collected and concatenated once after the loop; calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
pair_frames = []
for idx, row in occurences_pd.iterrows():
    start_time, key_word, key_word_count = row["start_time"], row["key_word"], row["count"]
    # two characters following the "T" separator of the timestamp
    hour = start_time.split("T")[1][:2]
    values = [[start_time] * size, [hour] * size, [key_word] * size, row[word_cols]]
    # one relative-count column per smoothing offset
    values += [get_rel_count_with_offset(row[count_cols], key_word_count, c) for c in c_const_list]
    tmp_df = pd.DataFrame(list(zip(*values)), columns=cols)
    # exclude no hits
    tmp_df = tmp_df[~tmp_df["word_2"].isnull()]
    # exclude self occurences
    tmp_df = tmp_df[tmp_df["word_2"] != key_word]
    pair_frames.append(tmp_df)
# pd.concat raises on an empty list, so fall back to an empty frame with the expected columns
pair_occs_df = pd.concat(pair_frames) if pair_frames else pd.DataFrame(columns=cols)

In [None]:
# Replace the concatenated index with a fresh 0..n-1 range, discarding the old index values.
pair_occs_df = pair_occs_df.reset_index(drop=True)
print(len(pair_occs_df))

In [None]:
pair_occs_df.head()

## Drop words that did not occur in more than 2 snapshots

In [None]:
word2_freqs = pair_occs_df.groupby(by=["word_2"])["time"].count()

In [None]:
word2_freqs.sort_values(ascending=False)

In [None]:
(word2_freqs > 2).value_counts()

In [None]:
filtered_word_2 = list(word2_freqs[word2_freqs > 2].index)

In [None]:
word2_freqs.mean()

In [None]:
pair_occs_df = pair_occs_df[pair_occs_df["word_2"].isin(filtered_word_2)]
print(len(pair_occs_df))

# TODO: Dropping excluded words: @rolandgarros?

In [None]:
pair_occs_df = pair_occs_df[~pair_occs_df["word_2"].isin(["@rolandgarros"])]
print(len(pair_occs_df))

# Calculate co-occurence statistics for snapshots

### We don't want to include the last day in the statistics

In [None]:
# pair_occs_df is a filtered slice at this point; assign() returns a new frame, which
# avoids pandas' SettingWithCopyWarning and keeps the cell idempotent on re-run.
# The date is the part of the timestamp before "T", extracted with the vectorized .str accessor.
pair_occs_df = pair_occs_df.assign(date=pair_occs_df["time"].str.split("T").str[0])
# The last day is left out of the statistics.
pair_occs_for_stats_df = pair_occs_df[~pair_occs_df["date"].isin(['2017-06-11'])]
print(len(pair_occs_for_stats_df))

# TODO: Actually I should compute the occ_scores temporally (only taking into account the past occurences...)

## a.) Calculate global mean

In [None]:
global_count_occs = pair_occs_for_stats_df.groupby(by=["word_1","word_2"])[rel_count_cols[0]].count().reset_index()

### Drop occurences that occur in only one snapshot

In [None]:
print(len(global_count_occs))
count_tmp_df = global_count_occs[global_count_occs[rel_count_cols[0]] > 1]
print(len(count_tmp_df))

In [None]:
filtered_word_pairs = set(zip(count_tmp_df["word_1"],count_tmp_df["word_2"]))

In [None]:
pair_occs_df.head()

In [None]:
# Keep only the rows whose (word_1, word_2) pair is in filtered_word_pairs.
# Zipping the two columns avoids DataFrame.apply(axis=1), which builds a Series for every row.
# The explicit boolean array also selects rows correctly when the frame is empty.
pair_is_kept = np.array(
    [pair in filtered_word_pairs
     for pair in zip(pair_occs_for_stats_df["word_1"], pair_occs_for_stats_df["word_2"])],
    dtype=bool,
)
filtered_pair_occs_df = pair_occs_for_stats_df[pair_is_kept]

In [None]:
len(filtered_pair_occs_df)

### Calculating the mean of the fractions for all snapshots

In [None]:
global_mean_occs = filtered_pair_occs_df.groupby(by=["word_1","word_2"])[rel_count_cols].mean().reset_index()

In [None]:
len(global_mean_occs) == len(count_tmp_df)

In [None]:
global_mean_occs[rel_count_cols].hist()

In [None]:
# Key every aggregated row by its (word_1, word_2) tuple.
global_mean_occs["key"] = list(zip(global_mean_occs["word_1"],global_mean_occs["word_2"]))
# Nested lookup table: {(word_1, word_2): {rel_count column name: value from global_mean_occs}}.
GLOBAL_MEANS = global_mean_occs[rel_count_cols+["key"]].set_index("key").T.to_dict()

GLOBAL_MEANS

## b.) Calculate snapshot mean

#### Still using only (word1,word2) pairs that occurred in multiple snapshots

In [None]:
snapshot_mean_occs = filtered_pair_occs_df.groupby(by=["word_1","word_2","hour"])[rel_count_cols].mean().reset_index()

In [None]:
# Key every aggregated row by its (word_1, word_2, hour) tuple.
snapshot_mean_occs["key"] = list(zip(snapshot_mean_occs["word_1"],snapshot_mean_occs["word_2"],snapshot_mean_occs["hour"]))
# Nested lookup table: {(word_1, word_2, hour): {rel_count column name: value from snapshot_mean_occs}}.
SNAPSHOT_MEANS = snapshot_mean_occs[rel_count_cols+["key"]].set_index("key").T.to_dict()

SNAPSHOT_MEANS

# Calculate occurence score

In [None]:
def fill_with_global_score(row):
    """Look up the GLOBAL_MEANS entry for the row's (word_1, word_2) pair.

    Returns the stored entry when the pair is a key of GLOBAL_MEANS,
    otherwise the float 0.0.
    """
    pair_key = (row["word_1"], row["word_2"])
    return GLOBAL_MEANS.get(pair_key, 0.0)

In [None]:
pair_occs_df["global_val"] = pair_occs_df.apply(fill_with_global_score, axis=1)

In [None]:
def fill_with_snapshot_score(row):
    """Look up the SNAPSHOT_MEANS entry for the row's (word_1, word_2, hour) triple.

    Returns the stored entry when the triple is a key of SNAPSHOT_MEANS,
    otherwise the float 0.0.
    """
    snapshot_key = (row["word_1"], row["word_2"], row["hour"])
    return SNAPSHOT_MEANS.get(snapshot_key, 0.0)

In [None]:
pair_occs_df["snapshot_val"] = pair_occs_df.apply(fill_with_snapshot_score, axis=1)

# Export data

In [None]:
pair_occs_df.to_csv("/mnt/idms/fberes/network/combined_occ/occ_scores/%s_with_scores.csv" % experiment_id, index=False, sep="|")

In [None]:
pair_occs_df

# Analyze results

In [None]:
def calculate_norm_score(row, c_val, alpha=0, eps=0.0):
    """Normalize a row's relative count by its global and snapshot entries.

    row   -- mapping with "global_val", "snapshot_val" and the "rel_count_<c_val>" value;
             the two *_val entries are either 0 (missing) or a mapping keyed by the
             rel_count column name
    c_val -- offset constant selecting the "rel_count_%i" column
    alpha -- added to the factor 2.0 that multiplies the row's own value
    eps   -- added to both numerator and denominator

    Returns 0.0 when both normalization constants are zero, otherwise
    (eps + (2.0 + alpha) * value) / (eps + global_norm + snapshot_norm).
    """
    col = "rel_count_%i" % c_val
    global_norm = 0 if row["global_val"] == 0 else row["global_val"][col]
    snapshot_norm = 0 if row["snapshot_val"] == 0 else row["snapshot_val"][col]
    # neither a global nor a snapshot normalization constant is available
    if global_norm == 0 and snapshot_norm == 0:
        return 0.0
    numerator = eps + (2.0 + alpha) * row[col]
    return numerator / (eps + global_norm + snapshot_norm)

In [None]:
# Offset constant and alpha used for the score; both also appear in the column name.
c_val, alpha_val = 1, 0
score_col = "norm_c%i_a%i" % (c_val, alpha_val)
# Pass c_val through instead of a hard-coded 1, so the column name and the
# computed score cannot drift apart when c_val is changed.
pair_occs_df[score_col] = pair_occs_df.apply(lambda x: calculate_norm_score(x, c_val=c_val, alpha=alpha_val), axis=1)

In [None]:
pair_occs_df["day"] = pair_occs_df["time"].apply(lambda x: x.split("T")[0])

In [None]:
pair_occs_df.head()

In [None]:
def show_score(w1, w2, score_name="norm_c1_a0"):
    """Draw an hour-by-day heatmap of a score for one (word_1, word_2) pair.

    w1, w2     -- values matched against the "word_1" and "word_2" columns of pair_occs_df
    score_name -- column of pair_occs_df holding the score to plot

    Prints a short message and draws nothing when the pair has no records
    (or no plottable values), instead of failing inside seaborn on an empty matrix.
    """
    filtered_df = pair_occs_df[(pair_occs_df["word_1"] == w1) & (pair_occs_df["word_2"] == w2)]
    pivot_scores = None
    if len(filtered_df) > 0:
        pivot_scores = pd.pivot_table(filtered_df, values=score_name, index="hour", columns="day")
    if pivot_scores is None or pivot_scores.empty:
        print("%s->%s: no records to plot" % (w1, w2))
        return
    fig, ax = plt.subplots(figsize=(30,5))
    # title goes on the explicit axes rather than through the pyplot state machine
    ax.set_title("%s->%s: %i record" % (w1, w2, len(filtered_df)))
    sns.heatmap(pivot_scores, ax=ax)
    plt.show()

## Load schedule

In [None]:
schedule_df = pd.read_csv(shedule_file_path, sep="|")
schedule_df.head()

## Rafa nadal matches

   * the first few matches of Nadal have high scores

In [None]:
name = "Rafael Nadal"
schedule_df[(schedule_df["playerName active"] == name) | (schedule_df["playerName opponent"] == name)]

In [None]:
show_score("@RafaelNadal","match")

In [None]:
show_score("@RafaelNadal","win")

## How often are the two finalists mentioned together (Men's single final on 06-11)

In [None]:
show_score("@RafaelNadal","@stanwawrinka")

## The winner is Nadal (Men's single final on 06-11)

In [None]:
show_score("@RafaelNadal","champion")

In [None]:
show_score("@stanwawrinka","champion")

## The loser is Wawrinka (Men's single final on 06-11)

In [None]:
show_score("@stanwawrinka","beat")

In [None]:
show_score("@RafaelNadal","beat")

## Nadal birthday: June 3

In [None]:
show_score("@RafaelNadal","birthday")

## Novak Djokovic lost on 06-07 - occurences score diminishes after this day

In [None]:
name = "Novak Djokovic"
schedule_df[(schedule_df["playerName active"] == name) | (schedule_df["playerName opponent"] == name)]

In [None]:
show_score("@DjokerNole","match")

# Toplists

In [None]:
pair_occs_df.head()

In [None]:
def get_toplist(w1, snapshot_id, score_name="norm_c1_a0"):
    """Return the rows of pair_occs_df for one key word in one snapshot, best score first.

    w1          -- value matched against the "word_1" column
    snapshot_id -- value matched against the "time" column
    score_name  -- column used for the descending sort
    """
    is_key_word = pair_occs_df["word_1"] == w1
    is_snapshot = pair_occs_df["time"] == snapshot_id
    return pair_occs_df[is_key_word & is_snapshot].sort_values(score_name, ascending=False)

#### "@stanwawrinka" and "final" is in top5

In [None]:
get_toplist("@RafaelNadal","2017-06-11T07:00").head(20)

get_toplist("@rafaelnadal","2017-06-10T18:00").head(20)

#### "@stanwawrinka" is in top1

In [None]:
get_toplist("@RafaelNadal","2017-06-11T10:00").head(20)

#### "win", "title" and "champion" is in top words + "congrat"

In [None]:
get_toplist("@RafaelNadal","2017-06-11T13:00").head(20)

In [None]:
get_toplist("@RafaelNadal","2017-06-11T16:00").head(20)

#### nadal birthday

In [None]:
get_toplist("@RafaelNadal","2017-06-03T16:00").head(20)

In [None]:
len(pair_occs_df)

In [None]:
get_toplist("play","2017-06-04T16:00").head(20)