In [None]:
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
import sys
sys.path.insert(0,"../../python/")
from rg17 import evaluate_toplist as et

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from datawand.parametrization import ParamHelper
ph = ParamHelper("../../pipelines/TrendApproximation.json", sys.argv)

In [None]:
player_name_with_account_file_path = ph.get("player_name_with_accounts_file_path")
schedule_file_path = ph.get("schedule_file_path")
w2v_model_dir = ph.get("w2v_root_folder")
experiment_id = ph.get("experiment_id")
TIME_HOUR_VALS = ph.get("time_hour_vals")
RELEVANCE_TYPE = ph.get("relevance_type")
RELEVANCE_SUBSET = ph.get("relevance_subset")

# Load Player Accounts

In [None]:
with open(player_name_with_account_file_path) as f:
    player_account_map = json.load(f)

In [None]:
# Manual overrides: for the players below, replace (or add) the account list
# loaded from the JSON file with a single hand-picked account handle.
player_account_map.update({
    "Stan Wawrinka": ["stanwawrinka"],
    "Novak Djokovic": ["DjokerNole"],
    "Caroline Garcia": ["CaroGarcia"],
    "Caroline Wozniacki": ["CaroWozniacki"],
    "Marin Cilic": ["cilic_marin"],
    "Kristina Mladenovic": ["KikiMladenovic"],
    "Dominic Thiem": ["ThiemDomi"],
    "Rafael Nadal": ["RafaelNadal"],
    "Timea Bacsinszky": ["TimeaOfficial"],
    "Pablo Carreno Busta": ["pablocarreno91"],
    "Simona Halep": ["Simona_Halep"],
    "Andy Murray": ["andy_murray"],
})

# Load Schedule

In [None]:
schedule_df = pd.read_csv(schedule_file_path, sep="|")

In [None]:
excluded_categories = ["boy", "girl", "wheelchair", "legends over 45"]

## Convert start dates to UTC for the proper evaluation

In [None]:
schedule_df["startDate"].value_counts()

In [None]:
# Maps every local start-time label of schedule_df["startDate"] to the UTC hour
# used in the evaluation. Each value is the local hour minus two
# (NOTE(review): presumably the venue's local time is UTC+2 — confirm);
# start times that are not on the hour are rounded up to the next full hour.
utc_hour_map = {
    "11:00 AM" : 9,
    "10:00 AM" : 8,
    "12:00 PM" : 10,
    "2:00 PM" : 12,
    "11:30 AM" : 10, # hour was rounded up
    "3:00 PM" : 13,
    "12:45 PM" : 11 # hour was rounded up
}

In [None]:
schedule_df["utc_start_hour"] = schedule_df["startDate"].apply(lambda x: utc_hour_map[x])

In [None]:
schedule_df["utc_start_hour"].value_counts()

# Filter Schedule

   * only singles matches of the final rounds are kept (the match header must contain both "single" and "final")
   * only important categories are kept (Men's, Women's, Legends under 45)

In [None]:
def filter_categories(match_cat, excluded_cats=excluded_categories):
    """Decide whether a schedule row should be kept, based on its match header.

    The comparison is case-insensitive. A header is rejected as soon as it
    contains any of `excluded_cats`; otherwise it is kept only when it contains
    both "final" and "single".

    Returns True to keep the match, False to drop it.
    """
    header = match_cat.lower()
    if any(cat in header for cat in excluded_cats):
        return False
    return "final" in header and "single" in header

In [None]:
finals_df = schedule_df[schedule_df["matchHeader"].apply(filter_categories)]

In [None]:
len(schedule_df), len(finals_df)

## Single finals

   * **cancelled** matches are not excluded because people may talk about these events as well

In [None]:
finals_df

# Player name parts

In [None]:
players = list(set(finals_df["playerName active"]).union(finals_df["playerName opponent"]))

In [None]:
len(players)

In [None]:
# For every finalist collect the lower-cased parts of the name and the
# "@"-prefixed account handles (handles keep digits, underscores and case).
player_info_map = {}
for player in players:
    name_parts = [part.lower() for part in player.split()]
    handles = [
        "@" + et.transform_account_name(account, remove_digits=False, remove_under_score=False, to_lower=False)
        for account in player_account_map[player]
    ]
    player_info_map[player] = {"name_parts": name_parts, "accounts": handles}

## Show multi-account players

In [None]:
for player, info in player_info_map.items():
    if len(info["accounts"]) > 1:
        print(player, info["accounts"])

In [None]:
player_info_map

# Co-occurrences

In [None]:
pair_occs_df = pd.read_csv("/mnt/idms/fberes/network/combined_occ/occ_scores/%s_with_scores.csv" % experiment_id, sep="|")

### converting string to dict

In [None]:
# Turn the stringified "global_val" / "snapshot_val" cells back into Python objects
# (later cells index them like dicts and compare them against 0).
# NOTE(review): eval() executes whatever expression the CSV contains. If the cells
# are plain literals, ast.literal_eval would be a safer replacement — confirm the
# file format before switching.
# NOTE: this cell is not idempotent — running it twice calls eval() on already
# parsed objects, which raises a TypeError.
pair_occs_df["global_val"] = pair_occs_df["global_val"].apply(eval)
pair_occs_df["snapshot_val"] = pair_occs_df["snapshot_val"].apply(eval)

In [None]:
pair_occs_df.head(2)

In [None]:
word_2_set = set(pair_occs_df["word_2"].unique())
word_1_set = set(pair_occs_df["word_1"].unique())
len(word_1_set), len(word_2_set)

In [None]:
def show_missing_words(info_key, word_set):
    """Print, per player, the words stored under `info_key` that are not in `word_set`.

    `info_key` selects an entry of the player's record in `player_info_map`
    ("name_parts" or "accounts"). Players with no missing word print nothing.
    """
    for name, details in player_info_map.items():
        absent = list(set(details[info_key]).difference(word_set))
        if absent:
            print("%s: %s missing!" % (name, absent))
            
def show_matching_words(info_key, word_set):
    """Print, for every player, the words stored under `info_key` that occur in `word_set`.

    A line is printed for each player of `player_info_map`, even when the
    list of matching words is empty.
    """
    for name, details in player_info_map.items():
        present = set(details[info_key]).intersection(word_set)
        print("%s: %s" % (name, list(present)))

## a.) Checking names (All names are present)

In [None]:
show_missing_words("name_parts", word_2_set)

In [None]:
show_missing_words("name_parts", word_1_set)

show_matching_words("name_parts", word_1_set)

## b.) Checking account names (All main accounts are present)

In [None]:
show_missing_words("accounts", word_2_set)

In [None]:
show_missing_words("accounts", word_1_set)

show_matching_words("accounts", word_1_set)

# Relevant player words

In [None]:
finals_df.head(2)

In [None]:
finals_df[finals_df["matchScore"] == "Cancelled"]

In [None]:
def list2relevance(values, relevance):
    """Give every item of `values` the same `relevance` score.

    Returns a dict keyed by the items; the scores are numpy floats.
    Repeated items collapse into a single key.
    """
    scores = relevance * np.ones(len(values))
    return {word: score for word, score in zip(values, scores)}

# TODO: add players who play at the same time but not with our player - with negative relevance!!!

# TODO: Other relevant events

   * birthday for nadal
   * injuries for other players?
   * weather? rain? etc???
   * cancelled match?

### Score values

In [None]:
# Relevance scale as (opponent, winner info, match info, common word).
# "binary" scores every relevant word 1 and common words -1; any other
# RELEVANCE_TYPE uses the graded scale.
if RELEVANCE_TYPE == "binary":
    _relevance_scale = (1.0, 1.0, 1.0, -1.0)
else:
    _relevance_scale = (5.0, 4.0, 3.0, -5.0)
OPPONENT_RELEVANCE, WINNER_INFO_RELEVANCE, MATCH_INFO_RELEVANCE, COMMON_WORD_RELEVANCE = _relevance_scale

### Winner Synonyms

In [None]:
winner_synonyms = ["win","won","victori","triumph"]

### Loser Synonyms

In [None]:
loser_synonyms = ["lose", "lost", "beaten"]

### Assign relevances

In [None]:
def get_relevance_record(relevance_subset, date, time, utc_hour, winner, loser, score):
    """Build the relevance rows of one match for one snapshot hour.

    One row is produced for every name part and every account of both players.
    Each row is the list
    [date, "HH:00" snapshot time, player, is_winner, key word,
     the player's other words, relevance dict of outcome/round words,
     opponent, relevance dict of the opponent's words].

    `is_winner` is 1 for `winner` and 0 for `loser`; when `score` is
    "Cancelled" it is None for both and no outcome word gets a relevance.
    Outcome words are only scored when `time` is later than `utc_hour`; the
    words of the opposite outcome get a negative score unless
    `relevance_subset` is "positive".
    """
    # words describing the round, selected by the day the match was played
    round_words_by_dates = (
        (("2017-06-06","2017-06-07"), ["quarters","quarter","final"]),
        (("2017-06-08","2017-06-09"), ["semi","final"]),
        (("2017-06-10","2017-06-11"), ["final"]),
    )
    winner_flag, loser_flag = (None, None) if score == "Cancelled" else (1, 0)
    pairings = [(winner, loser, winner_flag), (loser, winner, loser_flag)]
    records = []
    for player, opponent, is_winner in pairings:
        player_info, opponent_info = player_info_map[player], player_info_map[opponent]
        own_words = player_info["name_parts"] + player_info["accounts"]
        opponent_words = opponent_info["name_parts"] + opponent_info["accounts"]
        for key_word in own_words:
            # the player's remaining words are to be excluded from this key word's toplist
            other_own_words = list(own_words)
            other_own_words.remove(key_word)
            key_relevance = dict()
            # outcome words only count once the match has started
            if time > utc_hour:
                # (synonyms, score, included for every relevance subset)
                outcome_words = []
                if is_winner == 1:
                    outcome_words = [(winner_synonyms, WINNER_INFO_RELEVANCE, True),
                                     (loser_synonyms, -WINNER_INFO_RELEVANCE, False)]
                elif is_winner == 0:
                    outcome_words = [(winner_synonyms, -WINNER_INFO_RELEVANCE, False),
                                     (loser_synonyms, WINNER_INFO_RELEVANCE, True)]
                for synonyms, value, always_included in outcome_words:
                    if always_included or relevance_subset != "positive":
                        key_relevance.update(list2relevance(synonyms, value))
            # round words of the matching day, if any
            for dates, round_words in round_words_by_dates:
                if date in dates:
                    key_relevance.update(list2relevance(round_words, MATCH_INFO_RELEVANCE))
                    break
            opponent_relevance = list2relevance(opponent_words, OPPONENT_RELEVANCE)
            records.append([date, "%.2i:00" % time, player, is_winner, key_word, other_own_words, key_relevance,  opponent, opponent_relevance])
    return records

In [None]:
# Expand every kept match into relevance rows, one batch per snapshot hour.
# NOTE(review): "playerName active" is passed as the winner and
# "playerName opponent" as the loser — confirm that the schedule file
# always lists the winner in the "active" column.
relevant_infos = []
for idx, row in finals_df.iterrows():
    date = row["date"]
    utc_hour = row["utc_start_hour"]
    winner = row["playerName active"]
    loser = row["playerName opponent"]
    score = row["matchScore"]
    for time in TIME_HOUR_VALS:
        relevant_infos.extend(get_relevance_record(RELEVANCE_SUBSET, date, time, utc_hour, winner, loser, score))
relevant_df = pd.DataFrame(
    relevant_infos,
    columns=["date","time","player","is_winner","key_word", "key_exclude_words", "key_relevant_words", "opponent","opp_relevant_words"],
)

In [None]:
relevant_df.head(5)

In [None]:
len(relevant_df)

## Saving daily keywords to file

In [None]:
aggr_keyword_by_day = relevant_df.groupby(by="date")["key_word"].aggregate(lambda x: set(x))
aggr_keyword_by_day.to_csv(ph.get("keywords_for_eval_path"), sep="|")

### Sanity check: wawrinka loser information has relevance only in snapshots after match start

In [None]:
relevant_df[relevant_df["date"]=="2017-06-11"]

# Experimental Scores

In [None]:
len(pair_occs_df)

In [None]:
score_vals = [0,1,2,5,10]

In [None]:
pair_occs_df.head()

In [None]:
# Pull the "rel_count_<c>" entry of the parsed global / snapshot cells into flat
# numeric columns; a cell equal to 0 yields 0.0.
for val in score_vals:
    count_key = "rel_count_%i" % val
    for source_col, target_col in (("global_val", "global_val_%i" % val), ("snapshot_val", "snapshot_val_%i" % val)):
        pair_occs_df[target_col] = pair_occs_df[source_col].apply(lambda x: x[count_key] if x != 0 else 0.0)

In [None]:

# Histogram grid: one row per score constant, one column per value type.
# Uses explicit axes (instead of the pyplot state machine) and titles every
# panel so the figure can be read on its own.
hist_col_patterns = ["rel_count_%i", "global_val_%i", "snapshot_val_%i"]
fig, axes = plt.subplots(len(score_vals), len(hist_col_patterns), figsize=(20,20), squeeze=False)
for i, val in enumerate(score_vals):
    for j, col_pattern in enumerate(hist_col_patterns):
        hist_col = col_pattern % val
        pair_occs_df[hist_col].hist(bins=50, ax=axes[i, j])
        axes[i, j].set_title(hist_col)

## Average values for word_2

In [None]:
# Columns averaged per word_2: all relative counts first, then the global and
# snapshot values interleaved per score constant. Derived from score_vals so the
# list cannot drift from the columns created for those constants.
stat_cols = ["rel_count_%i" % val for val in score_vals]
stat_cols += [pattern % val for val in score_vals for pattern in ("global_val_%i", "snapshot_val_%i")]

In [None]:
word_2_stats = pair_occs_df.groupby(by=["word_2"])[stat_cols].mean()
word_2_counts = pair_occs_df.groupby(by=["word_2"])["date"].count()

In [None]:
word_2_counts.sort_values().tail(100)

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go

def get_trace(name, words, const, color, size):
    """Build a plotly scatter trace (markers only) for a group of words.

    For every word the x value is its mean "global_val_<const>" taken from
    `word_2_stats`, and the y value is its row count taken from `word_2_counts`.

    name  -- legend label of the trace
    words -- words to plot; also used as the hover text
    const -- score constant selecting the "global_val_<const>" column
    color, size -- marker colour and marker size

    Raises KeyError when a word is missing from `word_2_stats` / `word_2_counts`.
    """
    # .loc replaces the .ix indexer, which was removed in pandas 1.0
    x_arr = [word_2_stats.loc[w, "global_val_%i" % const] for w in words]
    y_arr = [word_2_counts[w] for w in words]
    trace = go.Scatter(
        x = x_arr,
        y = y_arr,
        name = name,
        mode = 'markers',
        text = words,
        marker = dict(
            size = size,
            color = color,
            line = dict(width = 2)
        )
    )
    return trace

def get_layout(const):
    """Return the plotly layout shared by the word-statistics scatter plots.

    `const` is the score constant shown in the figure title and in the x-axis label.
    """
    def axis(title):
        # both axes use the same title font
        return dict(
            title=title,
            titlefont=dict(family='Courier New, monospace', size=18, color='#7f7f7f'),
        )

    return go.Layout(
        title='Word statitics (c=%i)' % const,
        xaxis=axis('global_val_%i' % const),
        yaxis=axis('#occurrences with other words (count)'),
    )

In [None]:
pos_words = ["game","play","match","quarters","quarter","final"] + winner_synonyms + loser_synonyms
neg_words = ["rolandgarros","frenchopen","clay","slam","set"]
player_words = list(relevant_df["key_word"])

# (legend label, words, marker colour, marker size) of every trace, in drawing order
eval_trace_specs = [
    ("players", player_words, "blue", 5),
    ("positive relevance", pos_words, "green", 10),
    ("negative relevance", neg_words, "red", 10),
]

# one figure per score constant
figs_eval = []
for const in score_vals:
    data = [get_trace(label, words, const, color, size) for label, words, color, size in eval_trace_specs]
    figs_eval.append(go.Figure(data=data, layout=get_layout(const)))

In [None]:
py.iplot(figs_eval[0], filename='eval_word_stats_0')

In [None]:
py.iplot(figs_eval[1], filename='eval_word_stats_1')

In [None]:
py.iplot(figs_eval[2], filename='eval_word_stats_2')

In [None]:
py.iplot(figs_eval[3], filename='eval_word_stats_5')

In [None]:
py.iplot(figs_eval[4], filename='eval_word_stats_10')

In [None]:
# Split the word_2 vocabulary into evaluation-relevant words and everything else.
pos_neg_words = list(np.union1d(pos_words, neg_words))
relevant_words = np.union1d(pos_neg_words, player_words)
irrelevant_words = list(set(word_2_counts.index) - set(relevant_words))

# (legend label, words, marker colour, marker size) of every trace, in drawing order
all_trace_specs = [
    ("irrelevant", irrelevant_words, "yellow", 5),
    ("relevant", relevant_words, "green", 10),
]

# one figure per score constant
figs_all = []
for const in score_vals:
    data = [get_trace(label, words, const, color, size) for label, words, color, size in all_trace_specs]
    figs_all.append(go.Figure(data=data, layout=get_layout(const)))

In [None]:
py.iplot(figs_all[0], filename='all_word_stats_0')

In [None]:
py.iplot(figs_all[1], filename='all_word_stats_1')

In [None]:
py.iplot(figs_all[2], filename='all_word_stats_2')

In [None]:
py.iplot(figs_all[3], filename='all_word_stats_5')

In [None]:
py.iplot(figs_all[4], filename='all_word_stats_10')

## Filter pair_occs_df for keywords in order to save time

print(len(pair_occs_df))
key_words = list(relevant_df["key_word"].unique())
pair_occs_df = pair_occs_df[pair_occs_df["word_1"].isin(key_words)]
print(len(pair_occs_df))

pair_occs_df.head(2)

In [None]:
def calculate_norm_score(row, c_val, alpha=0, eps=0.0):
    """Normalise the "rel_count_<c_val>" value of a row by its global and snapshot values.

    row   -- mapping with the keys "global_val", "snapshot_val" and "rel_count_<c_val>";
             the two "*_val" entries are either 0 or a mapping holding "rel_count_<c_val>"
    alpha -- extra weight added to the factor 2 of the numerator
    eps   -- smoothing term added to both numerator and denominator

    Returns (eps + (2 + alpha) * count) / (eps + global + snapshot), or 0.0
    when both normalisers are zero.
    """
    count_key = "rel_count_%i" % c_val

    def normaliser(cell):
        # a cell equal to 0 carries no per-constant values and counts as 0
        return cell[count_key] if cell != 0 else 0

    global_norm = normaliser(row["global_val"])
    snapshot_norm = normaliser(row["snapshot_val"])
    if global_norm == 0 and snapshot_norm == 0:
        return 0.0
    numerator = eps + (2.0 + alpha) * row[count_key]
    denominator = eps + global_norm + snapshot_norm
    return numerator / denominator

## a.) occ_score with different coefficients

In [None]:
SCORE_CONST = ph.get("score_const")
SCORE_EPS = ph.get("score_eps")
SCORE_ALPHAS = ph.get("score_alphas")

In [None]:
# One normalised score column per alpha; the keyword arguments are forwarded
# by DataFrame.apply to calculate_norm_score for every row.
for a in SCORE_ALPHAS:
    norm_col = "norm_c%i_a%i" % (SCORE_CONST,a)
    pair_occs_df[norm_col] = pair_occs_df.apply(calculate_norm_score, axis=1, c_val=SCORE_CONST, alpha=a, eps=SCORE_EPS)

## b.) occ_score with additional member (the former version is more sophisticated...)

pair_occs_df["occ_score_mul_count"] = pair_occs_df["count"] * pair_occs_df["occ_score"]

pair_occs_df["occ_score_plus_count"] = pair_occs_df["count"] + pair_occs_df["occ_score"]

## c.) using Rayleigh for word frequency normalization

def rayleigh(x, s=1.0, exp=1):
    """Rayleigh-shaped weight of `x` with scale `s`.

    Evaluates x / s**2 * e**(-x**exp / (2 * s**2)); `exp` is the power applied
    to `x` inside the exponential. Works on scalars and numpy arrays.
    """
    variance = s**2
    decay_rate = -1.0 / (2*variance)
    damping = np.exp(decay_rate * np.power(x,exp))
    return x / variance * damping

x = np.arange(0,1,0.01)
plt.plot(x,rayleigh(x, s=0.35, exp=1.0), label="s=0.35, exp=1.0")
plt.plot(x,rayleigh(x, s=0.15, exp=2.7), label="s=0.15, exp=2.7")
plt.legend()

### i.) calculate rayleigh decay factors

s_val_1 = 0.35
pair_occs_df["rayleigh_%.2f" % s_val_1] = pair_occs_df["word_2"].apply(lambda x: rayleigh(word_2_counts[x], s=s_val_1, exp=1.0))

s_val_2 = 0.15
pair_occs_df["rayleigh_%.2f" % s_val_2] = pair_occs_df["word_2"].apply(lambda x: rayleigh(word_2_counts[x], s=s_val_2, exp=2.7))

### ii.) Rayleigh based scores

for s in [0.35, 0.15]:
    #pair_occs_df["rayleigh_%.2f_mul_score" % s] = pair_occs_df["count"] *  pair_occs_df["rayleigh_%.2f" % s]
    pair_occs_df["rayleigh_%.2f_plus_score" % s] = pair_occs_df["count"] +  pair_occs_df["rayleigh_%.2f" % s]

### iii.) occ_score + rayleigh

for a in [1.0, 2.0, 5.0, 10.0, 20.0, 50.0]:
    pair_occs_df["occ_score_alpha_%i_plus_rayleigh_0.35" % a] = pair_occs_df["occ_score_alpha_%i" % a] + pair_occs_df["rayleigh_0.35"]

# Load Word2Vec models

In [None]:
w2v_models = et.load_w2v_models("%s/dim_%i/" % (w2v_model_dir, ph.get("w2v_model_dim")))

In [None]:
et.get_w2v_toplist(w2v_models, ["nadal"], ["2017-06-11T10:00"], top_k=10)

# Load Jaccard and Cosine distances

In [None]:
distance_root_folder = ph.get("distance_root_folder")
jaccard_distances = et.load_distance_model("%s/jaccard.dist" % distance_root_folder)
cosine_distances = et.load_distance_model("%s/cosine.dist" % distance_root_folder)

In [None]:
et.get_distance_toplist(jaccard_distances, ["nadal"], ["2017-06-11T10:00"], top_k=10)

In [None]:
et.get_distance_toplist(cosine_distances, ["nadal"], ["2017-06-11T10:00"], top_k=10)

# Results

In [None]:
def get_relevance_order(relevant_map):
    """Return the words of `relevant_map` sorted by decreasing relevance."""
    # plain sort: avoids building a DataFrame per call and no longer shadows
    # the notebook-level `relevant_df`
    return sorted(relevant_map, key=relevant_map.get, reverse=True)

def dcg(relevant_map, pred_order, k=None):
    """Discounted cumulative gain of the first `k` words of `pred_order`.

    relevant_map -- dict word -> relevance; words not in it contribute nothing
    pred_order   -- predicted words, best first
    k            -- cutoff; None (or a value beyond the list) uses the whole list

    The gain at 0-based rank i is relevance / ln(i + 2). The natural logarithm
    is used; the base cancels out in `ndcg`.
    """
    if k is None:
        k = len(pred_order)
    k = min(k, len(pred_order))
    dcg_score = 0.0
    for i in range(k):
        word = pred_order[i]
        if word in relevant_map:
            dcg_score += relevant_map[word] / np.log(i+2)
    return dcg_score

def ndcg(relevant_map, pred_order, k=None):
    """DCG of `pred_order` divided by the DCG of the relevance-sorted words.

    Returns 0.0 for an empty prediction list and when the ideal DCG is zero
    (empty or all-zero `relevant_map`); the latter used to raise
    ZeroDivisionError or produce NaN.

    NOTE(review): the ideal ordering contains every word of `relevant_map`,
    so negative relevances lower the ideal DCG once `k` exceeds the number of
    positive words, and the result can then exceed 1. With k=None the two
    DCG values are also cut at different lengths (prediction list vs.
    relevance list).
    """
    if len(pred_order) == 0:
        return 0.0
    relevance_order = get_relevance_order(relevant_map)
    dcg_val = dcg(relevant_map, pred_order, k=k)
    idcg_val = dcg(relevant_map, relevance_order, k=k)
    if idcg_val == 0:
        # nothing to normalise by
        return 0.0
    return float(dcg_val) / idcg_val
    
relevant_map = {"alma":1.0,"korte":2.0,"valami":-5.0}
print(get_relevance_order(relevant_map))
print(ndcg(relevant_map, ["szilva","alma"], k=1))
print(ndcg(relevant_map, ["alma","korte"], k=1))
print(ndcg(relevant_map, ["korte","alma"], k=1))

get_ndcg_for_relevant_record(relevant_df.ix[761], "norm_c%i_a5" % SCORE_CONST, exclude_player_words=True, top_k=None, general_words=None, verbose=True)

ndcg_df.tail(50)

relevant_df[relevant_df["key_word"]=="nadal"]

In [None]:
len(pair_occs_df)

In [None]:
def get_ndcg_for_relevant_record(rel_rec, score_col, exclude_player_words=True, top_k=None, general_words=None, verbose=False):
    """Compute the NDCG of one toplist for one row of the relevance table.

    rel_rec   -- row with "date", "time", "key_word", "key_relevant_words",
                 "opp_relevant_words" and "key_exclude_words"
    score_col -- "word_2_vec", "jaccard" or "cosine" select the corresponding
                 model toplist; any other value is used as a score column of
                 `pair_occs_df`
    exclude_player_words -- pass the row's "key_exclude_words" to the toplist
                 helper as words to exclude
    top_k     -- cutoff for the NDCG and for the model-based toplists
    general_words -- optional relevance dictionary merged over the row's own
                 relevance dictionaries
    verbose   -- print the predicted words and the relevance dictionary

    Returns (snapshot_id, date, time, score_col, key_word, ndcg_score).
    """
    key_word = rel_rec["key_word"]
    time_id = rel_rec["time"]
    snapshot_id = "%sT%s" % (rel_rec["date"], time_id)
    # merged relevance dictionary; later sources overwrite earlier ones
    relevant_words = dict(rel_rec["key_relevant_words"])
    relevant_words.update(rel_rec["opp_relevant_words"])
    if general_words is not None:
        relevant_words.update(general_words)
    to_be_excluded = rel_rec["key_exclude_words"] if exclude_player_words else None
    # fetch the toplist from the source selected by score_col
    if score_col == "word_2_vec":
        toplist = et.get_w2v_toplist(w2v_models, [key_word], [snapshot_id], top_k=top_k, excluded_words=to_be_excluded)
    elif score_col in ("jaccard", "cosine"):
        distances = jaccard_distances if score_col == "jaccard" else cosine_distances
        toplist = et.get_distance_toplist(distances, [key_word], [snapshot_id], top_k=top_k, excluded_words=to_be_excluded )
    else:
        toplist = et.get_toplist(pair_occs_df, [key_word], [snapshot_id], score_col=score_col, excluded_words=to_be_excluded)
    pred_words = list(toplist["word_2"])
    if verbose:
        print(pred_words)
        print(relevant_words)
    ndcg_score = ndcg(relevant_words, pred_words, k=top_k)
    return (snapshot_id, rel_rec["date"], time_id, score_col, key_word, ndcg_score)

In [None]:
import multiprocessing, functools

def get_ndcg_single_thread(top_k, row, general_words, score_col):
    """Adapter around get_ndcg_for_relevant_record with `score_col` as the last argument.

    The argument order lets functools.partial pre-bind everything except the
    score column.
    """
    return get_ndcg_for_relevant_record(rel_rec=row, score_col=score_col, top_k=top_k, general_words=general_words)

def get_ndcg_from_threads(top_k, time_ids, score_cols, general_words, n_threads=1):
    """Compute NDCG for every relevance row of the given snapshot times and every score type.

    top_k         -- toplist / NDCG cutoff
    time_ids      -- "HH:00" values; rows of `relevant_df` with another "time" are skipped
    score_cols    -- score identifiers understood by get_ndcg_for_relevant_record
    general_words -- relevance dictionary merged into every row's relevant words
    n_threads     -- 1 runs sequentially; any other value maps the score columns
                     of each row over a pool of that many worker processes

    Returns a DataFrame with the columns
    snapshot_id, date, time, score_id, key_word, ndcg (rows in row-major,
    then score-column order).
    """
    print(len(relevant_df))
    filtered_relevant_df = relevant_df[relevant_df["time"].isin(time_ids)]
    print(len(filtered_relevant_df))
    ndcg_info_list = []
    if n_threads > 1:
        print("Calculating NDCG on %i threads" % n_threads)
    pool = None
    try:
        # one pool for the whole run instead of a new pool per row
        if n_threads != 1 and len(filtered_relevant_df) > 0:
            pool = multiprocessing.Pool(processes=n_threads)
        for idx, row in filtered_relevant_df.iterrows():
            if pool is None:
                for score_col in score_cols:
                    ndcg_info_list.append(get_ndcg_single_thread(top_k, row, general_words, score_col))
            else:
                f_partial = functools.partial(get_ndcg_single_thread, top_k, row, general_words)
                ndcg_info_list.extend(pool.map(f_partial, score_cols))
    finally:
        # every map() call has returned (or failed) by now, so the workers
        # can be stopped; this also releases them when an error occurred
        if pool is not None:
            pool.terminate()
            pool.join()
    ndcg_df = pd.DataFrame(ndcg_info_list, columns=["snapshot_id","date","time","score_id","key_word","ndcg"])
    return ndcg_df

## Setting general words

In [None]:
# Words that are relevant for every player: generic match vocabulary, plus —
# for the "discriminative" subset only — tournament-wide words that are scored
# with the (negative) common-word relevance.
general_words = dict.fromkeys(["game", "play", "match"], MATCH_INFO_RELEVANCE)
if RELEVANCE_SUBSET == "discriminative":
    general_words.update(dict.fromkeys(
        ["rolandgarros", "frenchopen", "clay", "slam", "set"],
        COMMON_WORD_RELEVANCE,
    ))
print(general_words)

## Setting score types

In [None]:
# Score types to evaluate: the three model-based toplists, the raw relative
# count, and the normalised scores.
# NOTE(review): the alphas 0, 1, 2, 5 are hard-coded here, while the
# "norm_c*_a*" columns are created from SCORE_ALPHAS — confirm that the
# configured alphas include these four.
score_cols = ["word_2_vec","jaccard","cosine","rel_count_%i" % SCORE_CONST]
score_cols.extend("norm_c%i_a%i" % (SCORE_CONST,i) for i in [0,1,2,5])
print(score_cols)

## Setting time of days

In [None]:
#time_ids = ["%.2i:00" % t for t in TIME_HOUR_VALS]
time_ids = ["%.2i:00" % t for t in [4,7,10,13,16,19]]
time_ids

## Calculate NDCG in parallel

In [None]:
%%time
ndcg_df = get_ndcg_from_threads(20, time_ids, score_cols, general_words, n_threads=len(time_ids))

In [None]:
len(ndcg_df)

In [None]:
ndcg_df.head()

# Mean NDCG performance for score types

In [None]:
ndcg_df.groupby(by="score_id")["ndcg"].mean().sort_values(ascending=False)

In [None]:
ndcg_for_plots = ndcg_df[ndcg_df["score_id"].isin(["word_2_vec","cosine","jaccard","rel_count_%s" % SCORE_CONST,"norm_c%s_a0" % SCORE_CONST,"norm_c%s_a5" % SCORE_CONST])]

In [None]:
paper_rc = {'lines.linewidth': 5,'lines.markersize': 20}              
sns.set_context("paper", rc = paper_rc, font_scale = 4.25)
sns.set_style("whitegrid")
#sns.set(font="DejaVu Sans")

## i.) Compare co-occurrence scores for snapshots

In [None]:
g = sns.factorplot(data=ndcg_for_plots, x="snapshot_id", y="ndcg", hue="score_id", size=10, aspect=3)
g.set_xticklabels(rotation=90)

## ii.) Compare co-occurrence scores for date

In [None]:
g = sns.factorplot(data=ndcg_for_plots, x="date", y="ndcg", hue="score_id", size=10, aspect=3)

## iii.)  Compare co-occurrence scores for time of day

In [None]:
sns.factorplot(data=ndcg_for_plots, x="time", y="ndcg", hue="score_id", size=10, aspect=3)

## iv.) Difference between players' keywords

In [None]:
def show_player_perf(key_words):
    """Plot the per-snapshot NDCG of the "norm_c<SCORE_CONST>_a5" score for the given key words.

    Only rows of `ndcg_for_plots` dated 2017-06-08 to 2017-06-11 are shown;
    every key word gets its own hue.
    """
    # NOTE(review): newer seaborn releases renamed factorplot to catplot and
    # its `size` argument to `height` — adjust if the installed version complains.
    shown_dates = ["2017-06-08","2017-06-09","2017-06-10","2017-06-11"]
    keep = (
        ndcg_for_plots["key_word"].isin(key_words)
        & (ndcg_for_plots["score_id"] == "norm_c%i_a5" % SCORE_CONST)
        & ndcg_for_plots["date"].isin(shown_dates)
    )
    score_filtered = ndcg_for_plots[keep]
    g = sns.factorplot(data=score_filtered, x="snapshot_id", y="ndcg", hue="key_word", size=10, aspect=3)
    g.set_xticklabels(rotation=90)

In [None]:
last_names = ["nadal","wawrinka","ostapenko","halep","murray","djokovic","cilic","thiem","pliskova","bacsinszky"]
first_names = ["rafael","stan","jelena","simona","andy","novak","marin","dominic","karolina","timea"]
account_names = ["@RafaelNadal","@stanwawrinka","@OstapenkoFC","@Simona_Halep","@andy_murray","@DjokerNole", "@cilic_marin", "@ThiemDomi", "@KaPliskova", "@TimeaOfficial"]

In [None]:
show_player_perf(last_names)
show_player_perf(first_names)
show_player_perf(account_names)