In [62]:
import pycof as pc

from difflib import get_close_matches

import torch
from sklearn.decomposition import PCA
from scipy import spatial

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sportsAnalytics.handball import DataLoader, Predictor

In [6]:
gender = "F"
international = False

connection = "ssh"
cache = "1d"
device = "cpu"

In [26]:
hb_d = DataLoader(gender=gender, international=international, cache=cache, connection=connection)
hb_p = Predictor(hb_d, model_name='mlp', model_type='reg', device=device)
hb_p.load_model()

Reading cached data
Reading cached data


In [54]:
sql_plays = """SELECT P.id AS player_id, P.name, P.player_position AS position, C.name AS nationality
FROM HANDBALL_V2.D_PLAYERS AS P
LEFT JOIN HANDBALL_V2.D_COUNTRIES AS C on P.country_id = C.id
-- WHERE player_id IN (SELECT DISTINCT player_id
--	FROM HANDBALL.D_PLAYERS_HISTORY AS H
--		JOIN HANDBALL.D_TEAMS AS T ON H.club_id = T.id
--	WHERE T.gender = '{_g}'
--		AND T.national_team = {_int}
--	)""".format(_g=gender, _int=int(international))
players_df = pc.remote_execute_sql(sql_plays, connection=connection, cache=cache)
# Add a URL for player' name (for a search)
players_df["profile"] = '<a href=\"https://www.handball-base.com/women/players?name=' + players_df.name + '">' + players_df.name + '</a>'

Execute SQL query and cache the data


In [55]:
# Get player's token
players_df['player_token'] = players_df.player_id.map(hb_p.model.tokenizer.vocab).fillna(0).astype(int)
# And getting embedding from model
players_df['player_embed'] = players_df.player_token.apply(lambda x: hb_p.model.model.embedding_players.all_embeddings[0](torch.LongTensor([x])))
# Extract values (detach from torch)
players_df.player_embed = players_df.player_embed.apply(lambda x: x.detach().numpy()[0].tolist())
# And create one column per value
for i in range(len(players_df.player_embed[0])):
    players_df[f'out_{i}'] = players_df.player_embed.str[i]

In [56]:
pca = PCA()
Xt = pca.fit_transform(players_df[[c for c in players_df.columns if c.startswith('out_')]])

In [57]:
for i in range(Xt.shape[1]):
    players_df[f'PCA_{i}'] = Xt[:,i]

In [74]:
focus_players = ["kristina jorgensen", "Bruna De Paula", "Tyra Axnér", "Petra Vamos", "Louise Burgaard", "Laura Flippes"]
# focus_players = ["chloe-bouquet", "coralie-lassource"]
focus_players_id = [get_close_matches(_p, players_df.name.unique())[0] for _p in focus_players]

In [75]:
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_traces(
    px.scatter(players_df, x="PCA_1", y="PCA_2", color="position", hover_data=["name", "nationality", "position"]).data
)
# Iterate over the list of players to highlight
if focus_players != []:
    # Find the players' ID from proposed names
    fig.add_traces(
        px.scatter(players_df[players_df.name.isin(focus_players_id)], x="PCA_1", y="PCA_2", hover_data=["name", "nationality", "position"]).update_traces(marker_size=20, marker_color="black").data
    )
# Render the plot
fig

In [76]:
for p in focus_players_id:
    # Embedding for player of interest
    p_emb = players_df[players_df.name == p].player_embed.max()
    players_df["similarity_" + p] = players_df.player_embed.apply(lambda x: spatial.distance.cosine(x, p_emb))
    

In [78]:
for p in focus_players_id:
    # Get info for players of interest
    p_info = players_df[players_df.name == p].reset_index().iloc[0]
    # Find candidates with similar position as player of interest
    sim_candidates = players_df[(players_df.position == p_info.position) & (players_df.player_id != p)]
    # Sort by similarity
    sim_candidates = sim_candidates.sort_values(by="similarity_" + p, ascending=False).reset_index(drop=True)
    txt = []
    for i in range(3):
        top_sim = sim_candidates.iloc[i]
        txt += [top_sim['name'] + ' (' + top_sim['position'] + ', ' + top_sim['nationality'] + ')']
    print(p_info['name'] + ':', ' / '.join(txt))

Kristina Jørgensen: Bernadett Hornyak (CB, Hungary) / Alona Shupyk (CB, Ukraine) / Nuria Bucher (CB, Switzerland)
Bruna Aparecida Almeida De Paula: Annefleur Bruggeman (LB, Netherlands) / Bojana Milic (LB, Serbia) / Melissa Petren (LB, Sweden)
Tyra Axner: Sevgi Kalyoncuoglu (LB, Turkey) / Karolina Sparnauskaite (LB, Lithuania) / Lara Seidel (LB, Germany)
Petra Vamos: Helena Elver Hagesoe (CB, Denmark) / Valentina Landri (CB, Italy) / Nerea Pena Abaurrea (CB, Spain)
Louise Katharina Burgaard: Marlene Kalf (RB, Germany) / Katarina Dzaferovic (RB, Montenegro) / Dora Hornyak (RB, Hungary)
Laura Flippes: Isabelle Dos Santos Medeiros (RW, Brazil) / Lea Vukojevic (RW, Croatia) / Sev Albrecht (RW, Switzerland)
