In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
import sys
sys.path.insert(0,"../../python/")
from rg17 import evaluate_toplist as et

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font="DejaVu Sans")

In [None]:
from datawand.parametrization import ParamHelper
ph = ParamHelper("../../pipelines/TrendApproximation.json", sys.argv)

In [None]:
# Values read from the pipeline configuration through ParamHelper.
player_name_with_account_file_path = ph.get("player_name_with_accounts_file_path")  # parsed as JSON further down
schedule_file_path = ph.get("schedule_file_path")  # read further down as a "|"-separated CSV
w2v_model_dir = ph.get("w2v_root_folder")  # folder prefix used when loading the Word2Vec models
experiment_id = ph.get("experiment_id")  # inserted into the co-occurrence score file name
TIME_HOUR_VALS = ph.get("time_hour_vals")  # snapshot hours iterated when relevance records are built

# Load Player Accounts

In [None]:
with open(player_name_with_account_file_path) as f:
    player_account_map = json.load(f)

TODO: should we use nicknames???
    
   * rafa
   * djoko
   * penko
   * sveta

## !!! Select main account for players: later we should decide whether we will enable multiple accounts in the evaluation

In [None]:
# Pin exactly one main account per listed player, overriding whatever was loaded from file.
player_account_map.update({
    "Stan Wawrinka": ["stanwawrinka"],
    "Novak Djokovic": ["DjokerNole"],
    "Caroline Garcia": ["CaroGarcia"],
    "Caroline Wozniacki": ["CaroWozniacki"],
    "Marin Cilic": ["cilic_marin"],
    "Kristina Mladenovic": ["KikiMladenovic"],
    "Dominic Thiem": ["ThiemDomi"],
    "Rafael Nadal": ["RafaelNadal"],
    "Timea Bacsinszky": ["TimeaOfficial"],
    "Pablo Carreno Busta": ["pablocarreno91"],
    "Simona Halep": ["Simona_Halep"],
    "Andy Murray": ["andy_murray"],
})

# Load Schedule

In [None]:
schedule_df = pd.read_csv(schedule_file_path, sep="|")

In [None]:
excluded_categories = ["boy", "girl", "wheelchair", "legends over 45"]

## Convert start dates to UTC for the proper evaluation

In [None]:
schedule_df["startDate"].value_counts()

In [None]:
# Local start-time label -> UTC start hour (local hour minus two).
utc_hour_map = dict([
    ("11:00 AM", 9),
    ("10:00 AM", 8),
    ("12:00 PM", 10),
    ("2:00 PM", 12),
    ("11:30 AM", 10),  # hour was rounded up
    ("3:00 PM", 13),
    ("12:45 PM", 11),  # hour was rounded up
])

In [None]:
schedule_df["utc_start_hour"] = schedule_df["startDate"].apply(lambda x: utc_hour_map[x])

In [None]:
schedule_df["utc_start_hour"].value_counts()

# Filter Schedule

   * only Single matches are kept
   * only important categories are kept (Men's, Women's, Legends under 45)

In [None]:
def filter_categories(match_cat, excluded_cats=excluded_categories):
    """Tell whether a match header belongs to a singles final of a kept category.

    The check is case-insensitive and substring based: the header is rejected
    if it contains any entry of ``excluded_cats``, and otherwise kept only
    when it contains both "final" and "single".
    """
    header = match_cat.lower()
    if any(excluded in header for excluded in excluded_cats):
        return False
    return "final" in header and "single" in header

In [None]:
finals_df = schedule_df[schedule_df["matchHeader"].apply(filter_categories)]

In [None]:
len(schedule_df), len(finals_df)

## Single finals

   * **canceled** matches are not excluded because people may talk about these events as well

In [None]:
finals_df

# Player name parts

In [None]:
players = list(set(finals_df["playerName active"]).union(finals_df["playerName opponent"]))

In [None]:
len(players)

In [None]:
player_info_map = {}
for player in players:
    player_info_map[player] = {
        "name_parts": [p.lower() for p in player.split()],
        "accounts": ["@" + et.transform_account_name(a, remove_digits=False, remove_under_score=False, to_lower=False) for a in player_account_map[player]]
    }

## Show multi-account players

In [None]:
for player, info in player_info_map.items():
    if len(info["accounts"]) > 1:
        print(player, info["accounts"])

In [None]:
player_info_map

# Co-occurrences

In [None]:
pair_occs_df = pd.read_csv("/mnt/idms/fberes/network/combined_occ/occ_scores/%s_with_scores.csv" % experiment_id, sep="|")

In [None]:
pair_occs_df.head(2)

In [None]:
word_2_set = set(pair_occs_df["word_2"].unique())
word_1_set = set(pair_occs_df["word_1"].unique())
len(word_1_set), len(word_2_set)

In [None]:
def show_missing_words(info_key, word_set):
    """Print, per player, the ``info_key`` entries that do not occur in ``word_set``.

    Players for which every entry is found produce no output.
    """
    for name in player_info_map:
        absent = list(set(player_info_map[name][info_key]).difference(word_set))
        if absent:
            print("%s: %s missing!" % (name, absent))
            
def show_matching_words(info_key, word_set):
    """Print, for every player, the ``info_key`` entries that also occur in ``word_set``.

    A line is printed for each player, even when nothing matches.
    """
    for name in player_info_map:
        found = list(set(player_info_map[name][info_key]).intersection(word_set))
        print("%s: %s" % (name, found))

## a.) Checking names (All names are present)

In [None]:
show_missing_words("name_parts", word_2_set)

In [None]:
show_missing_words("name_parts", word_1_set)

show_matching_words("name_parts", word_1_set)

## b.) Checking account names (All main accounts are present)

In [None]:
show_missing_words("accounts", word_2_set)

In [None]:
show_missing_words("accounts", word_1_set)

show_matching_words("accounts", word_1_set)

# Relevant player words

In [None]:
finals_df.head(2)

In [None]:
finals_df[finals_df["matchScore"] == "Cancelled"]

In [None]:
def list2relevance(values, relevance):
    """Build a ``{value: relevance}`` dictionary giving every value the same score.

    The scores are produced as ``relevance * np.ones(len(values))``, so they
    are NumPy floats.
    """
    scores = relevance * np.ones(len(values))
    return {value: score for value, score in zip(values, scores)}

# TODO: add players who play at the same time but not with our player - with negative relevance!!!

# TODO: Other relevant events

   * birthday for nadal
   * injuries for other players?
   * weather? rain? etc???
   * cancelled match?

### Winner Synonyms

In [None]:
winner_synonyms = ["win","won","victori","triumph"]#,"winner"
#winner_synonyms += ["champ","champion","king"]
#winner_synonyms += ["congrat","congratul"]
#winner_synonyms += ["title","trophi"]

### Loser Synonyms

In [None]:
loser_synonyms = ["lose", "lost", "beaten"]#, "loser"]
#loser_synonyms += ["defeat"]

### Assign relevances

In [None]:
def get_relevance_record(date, time, utc_hour, winner, loser, score):
    """Build the relevance rows of one match for one snapshot hour.

    Two passes are made, one from each player's point of view. For every name
    part and account of that player one row is produced:
    ``[date, "HH:00", player, is_winner, key_word, player_relevant, opponent,
    opponent_relevant]``.

    Parameters
    ----------
    date : str
        Match date, compared against the hard-coded "2017-06-06" .. "2017-06-11" days.
    time : int
        Snapshot hour; result-related words only get relevance when
        ``time >= utc_hour``.
    utc_hour : int
        Start hour of the match.
    winner, loser : str
        Player names; both must be keys of ``player_info_map``.
    score : str
        Match score; the value "Cancelled" means no winner/loser words are assigned
        and ``is_winner`` is ``None`` in the produced rows.

    Returns
    -------
    list of list
        The rows described above.
    """
    if score == "Cancelled":
        to_iterate_on = [(winner, loser, None), (loser, winner, None)]
    else:
        to_iterate_on = [(winner, loser, 1), (loser, winner, 0)]
    res = []
    for p1, p2, is_winner in to_iterate_on:
        n1, n2 = player_info_map[p1]["name_parts"], player_info_map[p2]["name_parts"]
        acc1, acc2 = player_info_map[p1]["accounts"], player_info_map[p2]["accounts"]
        for w in n1 + acc1:
            # a fresh dictionary per row, so rows never share mutable state
            player_relevant = dict()
            # set relevance for result information (only after the match started)
            if time >= utc_hour:
                # in case of "Cancelled" is_winner is None: no result words are added
                if is_winner == 1:
                    player_relevant.update(list2relevance(winner_synonyms, 4.0))
                    player_relevant.update(list2relevance(loser_synonyms, -4.0))
                elif is_winner == 0:
                    # mirror image of the winner branch: for the loser the losing
                    # words are the relevant ones (previously both were -4.0)
                    player_relevant.update(list2relevance(winner_synonyms, -4.0))
                    player_relevant.update(list2relevance(loser_synonyms, 4.0))
            # set relevance for final categories
            if date in ["2017-06-06", "2017-06-07"]:
                player_relevant.update(list2relevance(["quarters", "quarter", "final"], 3.0))
            elif date in ["2017-06-08", "2017-06-09"]:
                player_relevant.update(list2relevance(["semi", "final"], 3.0))
            elif date in ["2017-06-10", "2017-06-11"]:
                player_relevant.update(list2relevance(["final"], 3.0))
            # set relevance for opponent information
            opponent_relevant = list2relevance(n2 + acc2, 5.0)
            res.append([date, "%.2i:00" % time, p1, is_winner, w, player_relevant, p2, opponent_relevant])
    return res

In [None]:
# Expand every kept final into relevance records, one batch per snapshot hour in TIME_HOUR_VALS.
relevant_infos = []
for idx, row in finals_df.iterrows():
    # "playerName active" is handed over as the winner, "playerName opponent" as the loser
    date, utc_hour, winner, loser, score = row["date"], row["utc_start_hour"], row["playerName active"], row["playerName opponent"], row["matchScore"]
    for time in TIME_HOUR_VALS:
        relevant_infos += get_relevance_record(date, time, utc_hour, winner, loser, score)
# One row per (match side, key word, snapshot hour); the column order matches the record layout.
relevant_df = pd.DataFrame(relevant_infos, columns=["date","time","player","is_winner","key_word", "key_relevant_words", "opponent","opp_relevant_words"])

In [None]:
relevant_df.head(5)

In [None]:
len(relevant_df)

## Saving daily keywords to file

In [None]:
aggr_keyword_by_day = relevant_df.groupby(by="date")["key_word"].aggregate(lambda x: set(x))
aggr_keyword_by_day.to_csv(ph.get("keywords_for_eval_path"), sep="|")

### Sanity check: wawrinka loser information has relevance only in snapshots after match start

In [None]:
relevant_df[relevant_df["date"]=="2017-06-11"]

# Experimental Scores

## Average fraction for word_2

In [None]:
word_2_counts = pair_occs_df.groupby(by=["word_2"])["count"].mean()

In [None]:
word_2_counts.mean()

In [None]:
word_2_counts["@simonahalep"], word_2_counts["simona"], word_2_counts["halep"]

In [None]:
word_2_counts["@rafaelnadal"], word_2_counts["rafael"], word_2_counts["nadal"], word_2_counts["rafa"]

In [None]:
word_2_counts["win"], word_2_counts["champion"], word_2_counts["lose"]

In [None]:
word_2_counts["tennis"], word_2_counts["rolandgarros"], word_2_counts["frenchopen"]

## Filter pair_occs_df for keywords in order to save time

In [None]:
# Keep only the pairs whose first word is one of the evaluated key words.
print(len(pair_occs_df))
key_words = list(relevant_df["key_word"].unique())
# .copy() detaches the filtered rows from the original frame, so the score columns
# added in later cells are written to a real DataFrame and not to a slice
# (avoids SettingWithCopyWarning and ambiguous writes).
pair_occs_df = pair_occs_df[pair_occs_df["word_1"].isin(key_words)].copy()
print(len(pair_occs_df))

In [None]:
pair_occs_df.head(2)

In [None]:
def calculate_occ_score(row, c=1.0, alpha=1.0):
    """Smoothed co-occurrence score of one record.

    Computes ``(c + (1 + alpha) * count) / (c + global_val + snapshot_val)``
    from the "count", "global_val" and "snapshot_val" entries of ``row``.
    ``c`` is the additive constant used in both numerator and denominator,
    ``alpha`` weights the pair count.
    """
    numerator = c + (1.0 + alpha) * row["count"]
    denominator = c + row["global_val"] + row["snapshot_val"]
    return numerator / denominator

## a.) occ_score with different coefficients

In [None]:
SCORE_CONST = ph.get("score_const")

In [None]:
# calculate_occ_score only indexes and does arithmetic, so it can be applied to whole
# columns at once; this gives the same values as the row-wise apply without the per-row overhead.
pair_occs_df["occ_score"] = calculate_occ_score(pair_occs_df, c=SCORE_CONST)

In [None]:
# One score column per alpha; the column name carries the integer part of alpha.
# The whole frame is passed so the formula is evaluated column-wise instead of row by row.
for a in [2.0, 3.0, 5.0, 10.0, 20.0, 50.0]:#, 100.0, 200.0, 500.0]:
    pair_occs_df["occ_score_alpha_%i" % a] = calculate_occ_score(pair_occs_df, c=SCORE_CONST, alpha=a)

## b.) occ_score with additional member

In [None]:
pair_occs_df["occ_score_mul_count"] = pair_occs_df["count"] * pair_occs_df["occ_score"]

In [None]:
pair_occs_df["occ_score_plus_count"] = pair_occs_df["count"] + pair_occs_df["occ_score"]

## c.) using Rayleigh for word frequency normalization

## Rayleigh is used with modified exponent!!!

In [None]:
def rayleigh(x, s=1.0, exp=1):
    """Rayleigh-shaped weighting with a configurable exponent.

    Evaluates ``x / s**2 * exp(-x**exp / (2 * s**2))``. Note that the default
    ``exp=1`` is not the textbook Rayleigh density (which uses ``x**2``); the
    exponent is a tunable parameter here. Works on scalars and NumPy arrays.
    """
    variance = s**2
    scaled = x / variance
    decay = np.exp(-1.0 / (2*variance) * np.power(x, exp))
    return scaled * decay

In [None]:
x = np.arange(0,1,0.01)
plt.plot(x,rayleigh(x, s=0.35, exp=1.0), label="s=0.35, exp=1.0")
plt.plot(x,rayleigh(x, s=0.15, exp=2.7), label="s=0.15, exp=2.7")
plt.legend()

### i.) calculate rayleigh decay factors

In [None]:
s_val_1 = 0.35
pair_occs_df["rayleigh_%.2f" % s_val_1] = pair_occs_df["word_2"].apply(lambda x: rayleigh(word_2_counts[x], s=s_val_1, exp=1.0))

In [None]:
s_val_2 = 0.15
pair_occs_df["rayleigh_%.2f" % s_val_2] = pair_occs_df["word_2"].apply(lambda x: rayleigh(word_2_counts[x], s=s_val_2, exp=2.7))

### ii.) Rayleigh based scores

In [None]:
for s in [0.35, 0.15]:
    pair_occs_df["rayleigh_%.2f_mul_score" % s] = pair_occs_df["count"] *  pair_occs_df["rayleigh_%.2f" % s]
    pair_occs_df["rayleigh_%.2f_plus_score" % s] = pair_occs_df["count"] +  pair_occs_df["rayleigh_%.2f" % s]

# Load Word2Vec models

In [None]:
w2v_models = et.load_w2v_models("%s/dim_%i/" % (w2v_model_dir, ph.get("w2v_model_dim")))

In [None]:
et.get_w2v_toplist(w2v_models, ["nadal"], ["2017-06-11T10:00"], top_k=10)

# Load Jaccard and Cosine distances

In [None]:
def load_distance_model(distance_model_path):
    """Read a snapshot distance table (Jaccard or Cosine) from a "|"-separated file."""
    distance_df = pd.read_csv(filepath_or_buffer=distance_model_path, sep="|")
    return distance_df

def distance_query(distance_df, key_word, snapshot_id, top_k=None):
    """Return the closest words to one key word in one snapshot.

    Rows of ``distance_df`` with ``word_1 == key_word`` and the given
    ``snapshot_id`` are sorted by increasing "distance"; when ``top_k`` is
    given only the first ``top_k`` rows are kept. The result has the columns
    "time" (a copy of "snapshot_id"), "word_1", "word_2" and "distance".
    ``distance_df`` itself is never modified.
    """
    selected = (distance_df["word_1"] == key_word) & (distance_df["snapshot_id"] == snapshot_id)
    out_df = distance_df[selected].sort_values("distance", ascending=True)
    if top_k is not None:
        out_df = out_df.head(top_k)
    # assign() returns a new frame, so no column is written into a slice
    out_df = out_df.assign(time=out_df["snapshot_id"])
    return out_df[["time", "word_1", "word_2", "distance"]]

def get_distance_toplist(distance_df, key_words, snapshot_ids, top_k):
    """Collect the closest words for every (key word, snapshot id) combination.

    Each combination is queried separately with ``distance_query`` and the
    partial results are concatenated and sorted by increasing "distance".
    With more than one snapshot id the same word pair can appear several times.
    """
    partial_results = []
    for kw in key_words:
        for sid in snapshot_ids:
            partial_results.append(distance_query(distance_df, kw, sid, top_k))
    combined = pd.concat(partial_results)
    return combined.sort_values("distance", ascending=True)

In [None]:
distance_root_folder = ph.get("distance_root_folder")
jaccard_distances = load_distance_model("%s/jaccard.dist" % distance_root_folder)
cosine_distances = load_distance_model("%s/cosine.dist" % distance_root_folder)

In [None]:
get_distance_toplist(jaccard_distances, ["nadal"], ["2017-06-11T10:00"], top_k=10)

In [None]:
get_distance_toplist(cosine_distances, ["nadal"], ["2017-06-11T10:00"], top_k=10)

# Results

## Random shuffle is not implemented yet for ties!!!

In [None]:
def get_relevance_order(relevant_map):
    """Return the words of ``relevant_map`` sorted from highest to lowest relevance.

    Words with equal relevance keep the iteration order of ``relevant_map``
    (``sorted`` is stable).
    """
    return sorted(relevant_map, key=relevant_map.get, reverse=True)

def dcg(relevant_map, pred_order, k=None):
    """Discounted cumulative gain of the first ``k`` entries of ``pred_order``.

    The word at 0-based rank ``i`` contributes ``relevance / log(i + 2)``;
    words missing from ``relevant_map`` contribute nothing. ``k=None`` (or a
    ``k`` larger than the list) means the whole list is scored. The natural
    logarithm is used; the base cancels out in the NDCG ratio.
    """
    if k is None:
        k = len(pred_order)
    k = min(k, len(pred_order))
    dcg_score = 0.0
    for i in range(k):
        word = pred_order[i]
        if word in relevant_map:
            dcg_score += relevant_map[word] / np.log(i+2)
    return dcg_score

def ndcg(relevant_map, pred_order, k=None):
    """Normalised DCG of ``pred_order`` with respect to ``relevant_map``.

    The normaliser is the DCG of the best achievable ranking. Because a
    prediction can always contain unknown (zero gain) words instead of
    negatively rated ones, the ideal ranking consists of the positively rated
    words only; including negative words would shrink the normaliser and let
    the score exceed 1.

    Returns 0.0 for an empty prediction and when no word has positive
    relevance (the normaliser would be zero).
    """
    if len(pred_order) == 0:
        return 0.0
    ideal_order = [word for word in get_relevance_order(relevant_map) if relevant_map[word] > 0]
    idcg_val = dcg(relevant_map, ideal_order, k=k)
    if idcg_val <= 0:
        return 0.0
    dcg_val = dcg(relevant_map, pred_order, k=k)
    return float(dcg_val) / idcg_val
    
# Quick manual check of the ranking helpers on a toy relevance map.
relevant_map = {"alma":1.0,"korte":2.0,"valami":-5.0}
print(get_relevance_order(relevant_map))  # words by decreasing relevance
print(ndcg(relevant_map, ["szilva","alma"], k=1))  # the top word is not in the map
print(ndcg(relevant_map, ["alma","korte"], k=1))  # the top word has half of the best relevance
print(ndcg(relevant_map, ["korte","alma"], k=1))  # the top word is the most relevant one

# TODO: remove player name related words from the predicted toplist...

# TODO: send warning if toplist is empty?

In [None]:
def get_ndcg_for_relevant_record(rel_rec, score_col, top_k=None, general_words=None):
    """Score the predicted toplist of one relevance record with NDCG.

    Parameters
    ----------
    rel_rec : mapping
        A row of ``relevant_df``; its "date", "time", "key_word",
        "key_relevant_words" and "opp_relevant_words" entries are read.
    score_col : str
        "w2v_score", "jaccard" or "cosine" select the corresponding model;
        any other value is passed to ``et.get_toplist`` as the score column of
        ``pair_occs_df``.
    top_k : int, optional
        Cut-off used for the NDCG and for the w2v / distance queries.
    general_words : dict, optional
        Must be a relevance dictionary; it is merged last, so it overrides
        player and opponent words with the same key.

    Returns
    -------
    tuple
        ``(snapshot_id, date, time, score_col, key_word, ndcg_score)``.
    """
    time_id, key_word = rel_rec["time"], rel_rec["key_word"]
    snapshot_id = "%sT%s" % (rel_rec["date"], time_id)
    # merge order matters: later updates win on duplicate words
    relevant_words = dict()
    relevant_words.update(rel_rec["key_relevant_words"])
    relevant_words.update(rel_rec["opp_relevant_words"])
    if general_words is not None:
        relevant_words.update(general_words)
    if score_col == "w2v_score":
        toplist = et.get_w2v_toplist(w2v_models, [key_word], [snapshot_id], top_k=top_k)
    elif score_col == "jaccard":
        toplist = get_distance_toplist(jaccard_distances, [key_word], [snapshot_id], top_k=top_k)
    elif score_col == "cosine":
        toplist = get_distance_toplist(cosine_distances, [key_word], [snapshot_id], top_k=top_k)
    else:
        # no cut-off is passed here; the NDCG computation truncates to top_k
        toplist = et.get_toplist(pair_occs_df, [key_word], [snapshot_id], score_col=score_col)
    pred_words = list(toplist["word_2"])
    ndcg_score = ndcg(relevant_words, pred_words, k=top_k)
    return (snapshot_id, rel_rec["date"], time_id, score_col, key_word, ndcg_score)

In [None]:
import multiprocessing, functools

def get_ndcg_single_thread(top_k, row, general_words, score_col):
    """Argument-reordering wrapper around ``get_ndcg_for_relevant_record``.

    ``score_col`` comes last so the first three arguments can be bound with
    ``functools.partial`` and the result mapped over a list of score columns.
    """
    ndcg_record = get_ndcg_for_relevant_record(
        row,
        score_col,
        top_k=top_k,
        general_words=general_words,
    )
    return ndcg_record

def get_ndcg_from_threads(top_k, time_ids, score_cols, general_words, n_threads=1):
    """Compute the NDCG of every score type for the records of the given snapshots.

    Parameters
    ----------
    top_k : int or None
        Toplist cut-off passed on to the NDCG computation.
    time_ids : list of str
        Values of the "time" column of ``relevant_df`` to evaluate.
    score_cols : list of str
        Score identifiers evaluated for every selected record.
    general_words : dict or None
        Extra relevance dictionary merged into every record.
    n_threads : int
        1 runs everything in this process; any other value distributes the
        score columns of each record over a ``multiprocessing.Pool`` with that
        many worker processes.

    Returns
    -------
    pandas.DataFrame
        Columns "snapshot_id", "date", "time", "score_id", "key_word", "ndcg";
        rows are ordered by record first and by ``score_cols`` second.

    The sizes of ``relevant_df`` before and after the time filter are printed.
    """
    print(len(relevant_df))
    filtered_relevant_df = relevant_df[relevant_df["time"].isin(time_ids)]
    print(len(filtered_relevant_df))
    ndcg_info_list = []
    if n_threads == 1:
        for idx, row in filtered_relevant_df.iterrows():
            for score_col in score_cols:
                ndcg_info_list.append(get_ndcg_single_thread(top_k, row, general_words, score_col))
    else:
        # one pool for the whole run: starting and joining worker processes for
        # every single record dominated the runtime
        pool = multiprocessing.Pool(processes=n_threads)
        try:
            for idx, row in filtered_relevant_df.iterrows():
                f_partial = functools.partial(get_ndcg_single_thread, top_k, row, general_words)
                ndcg_info_list += pool.map(f_partial, score_cols)
        finally:
            # release the worker processes even if a task raised
            pool.close()
            pool.join()
    ndcg_df = pd.DataFrame(ndcg_info_list, columns=["snapshot_id","date","time","score_id","key_word","ndcg"])
    return ndcg_df

In [None]:
score_cols = [
    "w2v_score",
    "jaccard",
    "cosine",
    "count",
    "occ_score",
    "occ_score_alpha_2",
    "occ_score_alpha_3",
    "occ_score_alpha_5",
    "occ_score_alpha_10",
    "occ_score_alpha_20",
    "occ_score_alpha_50",
    #"occ_score_alpha_100",
    #"occ_score_alpha_200",
    #"occ_score_alpha_500",
    "occ_score_mul_count",
    "occ_score_plus_count",
    "rayleigh_0.35_mul_score",
    "rayleigh_0.35_plus_score",
    "rayleigh_0.15_mul_score",
    "rayleigh_0.15_plus_score"
]

In [None]:
784*len(score_cols)

In [None]:
%%time
general_words = {"play":4.0, "match":4.0, "rolandgarros":-5.0, "frenchopen":-5.0, "tennis":-5.0}
#general_words = {"play":4.0, "match":4.0}
#time_ids = ["%.2i:00" % t for t in TIME_HOUR_VALS]
time_ids = ["07:00","10:00","13:00","16:00","19:00"]
ndcg_df = get_ndcg_from_threads(20, time_ids, score_cols, general_words, n_threads=5)

In [None]:
len(ndcg_df)

In [None]:
ndcg_df.head()

# Mean NDCG performance for score types

In [None]:
ndcg_df.groupby(by="score_id")["ndcg"].mean().sort_values(ascending=False)

## i.) Compare co-occurrence scores for date

   * the baseline score is the best (the fraction of co-occurrences)
   * occ_score_2 gives similar results...

In [None]:
sns.factorplot(data=ndcg_df, x="date", y="ndcg", hue="score_id", size=8)

#score_col = "occ_score"
score_col = "rayleigh_0.35_plus_score"
#score_col = "count"
et.get_toplist(pair_occs_df, ["nadal"], ["2017-06-11T16:00"], score_col=score_col)[["word_1","word_2",score_col]].head(10)

et.get_w2v_toplist(w2v_models, ["nadal"], ["2017-06-11T16:00"], top_k=10)[["word_1","word_2","w2v_score"]]

ndcg_df[(ndcg_df["snapshot_id"]=="2017-06-11T16:00") & (ndcg_df["key_word"]=="nadal")]

## ii.)  Compare co-occurrence scores for time of day

In [None]:
sns.factorplot(data=ndcg_df, x="time", y="ndcg", hue="score_id", size=8)

# difference between players

In [None]:
key_words = ["nadal","wawrinka","ostapenko","halep","murray","djokovic","cilic","pliskova","bacsinszky"]
score_filtered = ndcg_df[ndcg_df["key_word"].isin(key_words)]
sns.factorplot(data=score_filtered, x="date", y="ndcg", hue="key_word", size=8)

### Formerly accounts had low performance, but after dropping stop words it improved a lot!

**@OstapenkoFC** is probably a bad account... less popular

In [None]:
key_words = ["@RafaelNadal","@stanwawrinka","@OstapenkoFC","@Simona_Halep","@andy_murray","@DjokerNole", "@cilic_marin", "@KaPliskova", "@TimeaOfficial"]
score_filtered = ndcg_df[ndcg_df["key_word"].isin(key_words)]
sns.factorplot(data=score_filtered, x="date", y="ndcg", hue="key_word", size=8)