In [None]:
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [None]:
import pandas as pd
import numpy as np

In [None]:
%matplotlib inline
import seaborn as sns
sns.set(font="DejaVu Sans")
import matplotlib.pyplot as plt

In [None]:
import sys
sys.path.insert(0,"../../python/")
from rg17 import evaluate_toplist as et

# Load player accounts

   * to lowercase
   * '_' character removed
   * numerical characters are **not yet** removed (`remove_digits=False`)

In [None]:
# Load the player account names through the project helper, keeping digits
# (remove_digits=False).
# NOTE(review): the normalisation applied to the names is defined in
# rg17.evaluate_toplist, which is not visible here — confirm it there.
player_accounts = et.load_player_accounts(remove_digits=False)

In [None]:
player_accounts[:10]

# Load occurrences file

In [None]:
# Read the pipe-separated occurrences file and display its number of rows.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
occurences_pd = pd.read_csv("/mnt/idms/fberes/network/roland_garros/rg17_20171003_8000_en-players_3h_accurate.csv",sep="|")
len(occurences_pd)

In [None]:
occurences_pd.head()

# TODO: a lot of players are missing from the occurrences file

In [None]:
len(player_accounts)

In [None]:
len(set(occurences_pd["key_word"].unique()).intersection(set(player_accounts)))

In [None]:
set(player_accounts).difference(set(occurences_pd["key_word"].unique()))

In [None]:
# Column labels alternate: the odd labels "1", "3", ..., "199" are read as the
# co-occurring words and the even labels "2", "4", ..., "200" as their counts.
word_cols = list(map(str, range(1, 201, 2)))
count_cols = list(map(str, range(2, 201, 2)))

# Additional words to examine

In [None]:
# Extra (non-player) key words to track, grouped by theme.
special_words = [
    # tournament and match vocabulary
    "rolandgarros", "frenchopen", "match", "play", "fight", "face", "result",
    "court", "now", "today", "tomorrow", "injury", "shock",
    # truncated forms (presumably stems matching the tokenised text — TODO confirm)
    "qualifi", "surviv", "elimin", "domin",
    "birthday",
    # winning
    "wins", "win", "won", "champion", "champ", "king", "beat", "trophy",
    "triumph", "overcom",
    # losing
    "lose", "loss", "lost", "defeat", "beaten", "broken", "break",
]

In [None]:
# Name variants of the top players, appended to the tracked words.
# NOTE(review): "dominique" may be a misspelling of "dominic" (Thiem) — confirm against the data.
# NOTE: re-running this cell appends the same words again.
special_words.extend([
    "rafa", "nadal", "rafael", "rafaelnadal",
    "stan", "wawrinka", "stantheman",
    "andy", "murray", "andymurray",
    "thiem", "dominique",
    "novak", "djokovic",
])

In [None]:
# Keep only the rows whose key word is a player account or one of the special words.
occurences_pd = occurences_pd.loc[
    occurences_pd["key_word"].isin(player_accounts + special_words)
]

In [None]:
len(occurences_pd)

### Normalization of counts in each snapshot: a word can even occur in all messages in the given timestamp

In [None]:
occurences_pd.head()

In [None]:
"win" in occurences_pd["3"].unique()

In [None]:
# Turn every co-occurrence count into a fraction of the row's own "count" value,
# so that the values are comparable across snapshots.
# Work on an explicit copy: the frame comes from a boolean filter, and assigning
# columns on such a slice triggers pandas' SettingWithCopyWarning.
# NOTE: not idempotent — re-running this cell divides the columns again.
occurences_pd = occurences_pd.copy()
# A single row-aligned division replaces the Python loop over all count columns.
occurences_pd[count_cols] = occurences_pd[count_cols].div(occurences_pd["count"], axis=0)

In [None]:
occurences_pd.head()

### Restructure data

In [None]:
# Reshape the wide table (one row per snapshot and key word, with up to
# len(word_cols) word/count column pairs) into a long table with one row per
# (time, hour, word_1, word_2, count) record.
pair_columns = ["time", "hour", "word_1", "word_2", "count"]
size = len(word_cols)
pair_frames = []
for idx, row in occurences_pd.iterrows():
    # Explicit positional access: the first column is used as the snapshot
    # timestamp and the third as the key word. Plain row[0] / row[2] on a
    # string-labelled Series relies on a deprecated positional fallback.
    snapshot_time = row.iloc[0]
    key_word = row.iloc[2]
    # Two characters following the "T" separator of the timestamp.
    hour = snapshot_time.split("T")[1][:2]
    some_occs = list(zip(
        [snapshot_time] * size,
        [hour] * size,
        [key_word] * size,
        row[word_cols],
        row[count_cols],
    ))
    tmp_df = pd.DataFrame(some_occs, columns=pair_columns)
    # Rows with fewer than `size` co-occurring words have empty trailing slots.
    pair_frames.append(tmp_df[tmp_df["word_2"].notnull()])

# Concatenate once: concatenating inside the loop re-copies the accumulated
# frame on every iteration (quadratic in the number of rows).
if pair_frames:
    pair_occs_df = pd.concat(pair_frames)
else:
    pair_occs_df = pd.DataFrame(columns=pair_columns)

In [None]:
# Replace the index inherited from the concatenated per-row frames with a
# fresh 0..n-1 index, discarding the old one.
pair_occs_df = pair_occs_df.reset_index(drop=True)

In [None]:
pair_occs_df.head()

In [None]:
# Number of records (with a non-null time) in which each word_2 appears.
word2_freqs = pair_occs_df.groupby("word_2")["time"].count()

word2_freqs.sort_values(ascending=False)

In [None]:
(word2_freqs > 2).value_counts()

In [None]:
# Words that appear as word_2 in more than two records.
filtered_word_2 = word2_freqs.loc[word2_freqs > 2].index.tolist()

word2_freqs.mean()

%matplotlib inline
import matplotlib.pyplot as plt

# Plot f(x) = (x / s^2) * exp(-x^1.3 / (2 * s^2)) for x = 0..999 with s = 50.
s = 50.0
var = s * s
occ = np.arange(1000)
exponents = -(occ ** 1.3) / (2 * var)
lin_part = occ / var
vals = lin_part * np.exp(exponents)
plt.plot(occ, vals)

In [None]:
# Keep only the records whose word_2 passed the frequency filter.
pair_occs_df = pair_occs_df.loc[pair_occs_df["word_2"].isin(filtered_word_2)]

### Excluding stop words

In [None]:
# Words that are never reported as word_2: the tournament account and common stop words.
excluded_words = [
    "@rolandgarros",
    "the", "and", "that", "this",
    "is", "are",
    "what", "who", "which",
    "was", "were",
]

In [None]:
# Drop the records whose word_2 is one of the excluded words.
pair_occs_df = pair_occs_df.loc[~pair_occs_df["word_2"].isin(excluded_words)]

### Dropping self-mentions

In [None]:
# Report the record count before and after removing the rows where a word is
# paired with itself.
print(pair_occs_df.shape[0])
pair_occs_df = pair_occs_df[pair_occs_df["word_1"].ne(pair_occs_df["word_2"])]
print(pair_occs_df.shape[0])

# Calculate co-occurrence statistics for snapshots

# TODO: The occ_scores should actually be computed temporally (taking into account only the past occurrences)

In [None]:
# The date is the part of the timestamp that precedes the "T" separator.
pair_occs_df["date"] = pair_occs_df["time"].map(lambda stamp: stamp.partition("T")[0])

In [None]:
# The statistics leave out the records dated 2017-06-11 through 2017-06-16.
pair_occs_for_stats_df = pair_occs_df.loc[
    ~pair_occs_df["date"].isin([
        '2017-06-11', '2017-06-12', '2017-06-13',
        '2017-06-14', '2017-06-15', '2017-06-16',
    ])
]

In [None]:
len(pair_occs_for_stats_df)

## a.) Calculate global mean

In [None]:
# Number of records per (word_1, word_2) pair, as a flat frame whose "count"
# column now holds that number.
global_count_occs = (
    pair_occs_for_stats_df
    .groupby(["word_1", "word_2"])["count"]
    .count()
    .reset_index()
)

### Drop word pairs that occur in only one snapshot

In [None]:
# Report how many word pairs exist and how many of them have more than one record.
print(global_count_occs.shape[0])
count_tmp_df = global_count_occs.loc[global_count_occs["count"].gt(1)]
print(count_tmp_df.shape[0])

In [None]:
# Set of the (word_1, word_2) pairs that survived the filter, for fast membership tests.
filtered_word_pairs = {
    (first, second)
    for first, second in zip(count_tmp_df["word_1"], count_tmp_df["word_2"])
}

In [None]:
pair_occs_df.head()

In [None]:
# Keep only the records whose (word_1, word_2) pair is in filtered_word_pairs.
# The mask is built from the two zipped columns instead of a row-wise
# DataFrame.apply(axis=1): apply constructs a Series for every row, and on an
# empty frame it can return a DataFrame rather than a boolean mask.
pair_is_kept = np.fromiter(
    (
        pair in filtered_word_pairs
        for pair in zip(pair_occs_for_stats_df["word_1"], pair_occs_for_stats_df["word_2"])
    ),
    dtype=bool,
    count=len(pair_occs_for_stats_df),
)
# A positional boolean array selects rows without any index alignment.
filtered_pair_occs_df = pair_occs_for_stats_df[pair_is_kept]

In [None]:
len(filtered_pair_occs_df)

### Calculating the mean of the fractions for all snapshots

In [None]:
# Mean of the normalized counts per (word_1, word_2) pair over all kept records.
global_mean_occs = (
    filtered_pair_occs_df
    .groupby(["word_1", "word_2"])["count"]
    .mean()
    .reset_index()
)

In [None]:
len(global_mean_occs) == len(count_tmp_df)

In [None]:
global_mean_occs["count"].hist()

In [None]:
# Store the (word_1, word_2) tuple as a "key" column, then build the lookup
# table that maps each pair to its global mean.
global_mean_occs["key"] = [
    (first, second)
    for first, second in zip(global_mean_occs["word_1"], global_mean_occs["word_2"])
]
GLOBAL_MEANS = {
    pair: mean
    for pair, mean in zip(global_mean_occs["key"], global_mean_occs["count"])
}

## b.) Calculate snapshot mean

#### Still using only (word_1, word_2) pairs that occurred in multiple snapshots

In [None]:
# Mean of the normalized counts per (word_1, word_2) pair and hour of day.
snapshot_mean_occs = (
    filtered_pair_occs_df
    .groupby(["word_1", "word_2", "hour"])["count"]
    .mean()
    .reset_index()
)

In [None]:
# Store the (word_1, word_2, hour) tuple as a "key" column, then build the
# lookup table that maps each triple to its per-hour mean.
snapshot_mean_occs["key"] = [
    (first, second, hour)
    for first, second, hour in zip(
        snapshot_mean_occs["word_1"],
        snapshot_mean_occs["word_2"],
        snapshot_mean_occs["hour"],
    )
]
SNAPSHOT_MEANS = {
    triple: mean
    for triple, mean in zip(snapshot_mean_occs["key"], snapshot_mean_occs["count"])
}

# Calculate occurrence score

In [None]:
"win" in pair_occs_df["word_2"].unique()

pair_occs_df.head()

In [None]:
def fill_with_global_score(row):
    """Return the global mean of the row's (word_1, word_2) pair.

    The value is looked up in GLOBAL_MEANS; pairs that are not in the table
    yield 0.0.
    """
    return GLOBAL_MEANS.get((row["word_1"], row["word_2"]), 0.0)

In [None]:
# Attach to every record the global mean of its word pair (0.0 when the pair has none).
pair_occs_df["global_val"] = pair_occs_df.apply(fill_with_global_score, axis=1)

In [None]:
def fill_with_snapshot_score(row):
    """Return the per-hour mean of the row's (word_1, word_2, hour) triple.

    The value is looked up in SNAPSHOT_MEANS; triples that are not in the
    table yield 0.0.
    """
    return SNAPSHOT_MEANS.get((row["word_1"], row["word_2"], row["hour"]), 0.0)

In [None]:
# Attach to every record the per-hour mean of its word pair (0.0 when there is none).
pair_occs_df["snapshot_val"] = pair_occs_df.apply(fill_with_snapshot_score, axis=1)

In [None]:
def calculate_occ_score(row):
    """Return the occurrence score of one record.

    The score is (1 + 2 * count) / (1 + global_val + snapshot_val), read from
    the row's "count", "global_val" and "snapshot_val" entries.
    """
    numerator = 1.0 + 2.0 * row["count"]
    denominator = 1.0 + row["global_val"] + row["snapshot_val"]
    return numerator / denominator

In [None]:
# Compute the occurrence score of every record from its count, global_val and snapshot_val.
pair_occs_df["occ_score"] = pair_occs_df.apply(calculate_occ_score, axis=1)

In [None]:
# The day is the part of the timestamp that precedes the "T" separator.
pair_occs_df["day"] = pair_occs_df["time"].map(lambda stamp: stamp.partition("T")[0])

In [None]:
pair_occs_df.head()

# Analyze results

In [None]:
# Read the pipe-separated schedule file and preview its first rows.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
schedule_df = pd.read_csv("/mnt/idms/fberes/network/roland_garros/data/schedule_new_df.csv", sep="|")
schedule_df.head()

In [None]:
def show_occ_score(w1, w2):
    """Draw an hour-by-day heatmap of the occurrence score of the pair (w1, w2).

    The records of pair_occs_df with word_1 == w1 and word_2 == w2 are pivoted
    so that rows are hours and columns are days; the cell values are the
    pivot table's default aggregate (the mean) of "occ_score".

    When the pair has no records, a message is printed and nothing is drawn.
    """
    pair_mask = (pair_occs_df["word_1"] == w1) & (pair_occs_df["word_2"] == w2)
    filtered_df = pair_occs_df[pair_mask]
    if filtered_df.empty:
        # An empty pivot table would make the heatmap call fail, so stop early
        # with a readable message instead.
        print("%s->%s: no records" % (w1, w2))
        return
    pivot_scores = pd.pivot_table(filtered_df, values="occ_score", index="hour", columns="day")
    fig, ax = plt.subplots(figsize=(30,5))
    # Set the title on the axes that is drawn on, not on pyplot's current axes.
    ax.set_title("%s->%s: %i record" % (w1, w2, len(filtered_df)))
    sns.heatmap(pivot_scores, ax=ax)
    plt.show()

## Rafael Nadal's matches

   * the first few matches of Nadal have high scores

In [None]:
# Schedule rows in which the player appears either as the active player or as the opponent.
name = "Rafael Nadal"
schedule_df[schedule_df[["playerName active", "playerName opponent"]].eq(name).any(axis=1)]

In [None]:
show_occ_score("@rafaelnadal","match")

In [None]:
show_occ_score("@rafaelnadal","win")

## How often are the two finalists mentioned together? (Men's singles final on 06-11)

In [None]:
show_occ_score("@rafaelnadal","@stanwawrinka")

## The winner is Nadal (Men's singles final on 06-11)

In [None]:
show_occ_score("@rafaelnadal","champion")

In [None]:
show_occ_score("@stanwawrinka","champion")

## The loser is Wawrinka (Men's singles final on 06-11)

In [None]:
show_occ_score("@stanwawrinka","beat")

In [None]:
show_occ_score("@rafaelnadal","beat")

## Nadal birthday: June 3

In [None]:
show_occ_score("@rafaelnadal","birthday")

## Novak Djokovic lost on 06-07 - the occurrence score diminishes after this day

In [None]:
# Schedule rows in which the player appears either as the active player or as the opponent.
name = "Novak Djokovic"
schedule_df[schedule_df[["playerName active", "playerName opponent"]].eq(name).any(axis=1)]

In [None]:
show_occ_score("@djokernole","match")

# Toplists

In [None]:
pair_occs_df.head()

In [None]:
def get_toplist(w1, snapshot_id):
    """Return the records of key word w1 in one snapshot, best score first.

    Selects the rows of pair_occs_df with word_1 == w1 and time == snapshot_id
    and sorts them by "occ_score" in descending order.
    """
    is_key_word = pair_occs_df["word_1"] == w1
    is_snapshot = pair_occs_df["time"] == snapshot_id
    selected = pair_occs_df.loc[is_key_word & is_snapshot]
    return selected.sort_values(by="occ_score", ascending=False)

#### "@stanwawrinka" and "final" are in the top 5

In [None]:
get_toplist("@rafaelnadal","2017-06-11T09:00").head(20)

get_toplist("@rafaelnadal","2017-06-10T18:00").head(20)

#### "@stanwawrinka" is ranked first

In [None]:
get_toplist("@rafaelnadal","2017-06-11T12:00").head(20)

#### "win", "title", "champion" and "congrat" are among the top words

In [None]:
get_toplist("@rafaelnadal","2017-06-11T15:00").head(20)

In [None]:
get_toplist("@rafaelnadal","2017-06-11T18:00").head(20)

#### Nadal's birthday

In [None]:
get_toplist("@rafaelnadal","2017-06-03T18:00").head(20)

In [None]:
len(pair_occs_df)

In [None]:
get_toplist("play","2017-06-04T18:00").head(20)

# Export data

In [None]:
# Write the scored records to a pipe-separated file, without the index column.
# NOTE(review): absolute, machine-specific output path — consider a configurable data directory.
pair_occs_df.to_csv("/mnt/idms/fberes/network/roland_garros/occs_with_scores.csv", index=False, sep="|")