# Final plots

In [None]:
import pandas as pd
import os
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.graph_objects as go
from ipywidgets import interact, IntSlider, Dropdown
from collections import defaultdict
from datetime import datetime
from ipywidgets import interact, widgets


# Elo Plots

In [None]:
# --- Fonctions ELO ---
def k_factor(matches_played):
    """Return the Elo K-factor for a player who has already played ``matches_played`` matches.

    The factor is 250 / (matches_played + 5) ** 0.4, so it shrinks as the
    number of matches grows and ratings of newcomers move faster.
    """
    experience = matches_played + 5
    return 250 / experience ** 0.4

def calc_exp_score(rA, rB):
    """Return the expected score of a player rated ``rA`` against one rated ``rB``.

    Uses the logistic Elo curve on a 400-point scale; equal ratings give 0.5.
    """
    rating_gap = rB - rA
    return 1 / (1 + 10 ** (rating_gap / 400))

def update_elo(old_elo, k, actual, expected):
    """Return the rating after one match.

    The old rating is shifted by ``k`` times the difference between the
    actual result and the expected score.
    """
    surprise = actual - expected
    return old_elo + k * surprise


In [None]:
# --- Load the match data ---
# NOTE(review): this cell reads "Smashdata/Dataset/..." while the "Plot 2" cells
# read "Dataset/..." — confirm which working directory the notebook expects.
df_all = pd.read_csv("Smashdata/Dataset/match_tennis_68_ajd.csv")
# Keep matches from 2000 onwards. The explicit copy makes the column assignment
# below write to an independent frame instead of a slice of the raw data
# (avoids pandas' SettingWithCopyWarning).
df_all = df_all[df_all["tourney_date"] >= 20000101].copy()
df_all["surface"] = df_all["surface"].fillna("Unknown")

# --- Sorted list of surfaces, preceded by the "All" option ---
# Missing surfaces were replaced by "Unknown" above, so no NaN is left to drop.
surfaces = ["All"] + sorted(df_all["surface"].unique().tolist())

In [None]:
# --- Fonction principale ---
def compute_elo_and_plot(top_n=10, surface="All"):
    """Compute Elo ratings from ``df_all`` and plot the top players' curves.

    Matches are replayed in chronological order (tournament date, then
    round). Every player starts at 1500 and both players of a match are
    updated with their own K-factor. The ``top_n`` players with the highest
    peak rating are drawn as Plotly lines; each curve is linearly
    interpolated between a player's matches and blanked after that player's
    last match.

    Parameters
    ----------
    top_n : int
        Number of players to display, ranked by peak rating.
    surface : str
        Value of the ``surface`` column to restrict the matches to, or
        ``"All"`` to use every match.
    """
    matches = df_all if surface == "All" else df_all[df_all['surface'] == surface]
    # Work on a copy: the 'round' column is overwritten below.
    df = matches.copy()

    # Chronological order. Rounds that are not listed become NaN and are
    # sorted after the listed rounds of the same date.
    round_order = ['RR', 'R128', 'R64', 'R32', 'R16', 'QF', 'SF', 'F']
    df['round'] = pd.Categorical(df['round'], categories=round_order, ordered=True)
    df = df.sort_values(by=['tourney_date', 'round'])

    player_elo = defaultdict(lambda: 1500)
    player_matches = defaultdict(int)
    player_peak = {}
    player_last_match = {}
    elo_history = defaultdict(list)

    # Elo replay
    for _, row in df.iterrows():
        w, l = row['winner_name'], row['loser_name']
        if pd.isna(w) or pd.isna(l):
            continue
        try:
            # Parse the date before touching any rating so that a row with a
            # missing or malformed date is skipped as a whole.
            dt = datetime.strptime(str(int(row['tourney_date'])), "%Y%m%d")
        except (TypeError, ValueError, OverflowError):
            continue

        r_w, r_l = player_elo[w], player_elo[l]
        exp_w = calc_exp_score(r_w, r_l)
        exp_l = 1 - exp_w

        # K-factors use the number of matches played *before* this one.
        new_w = update_elo(r_w, k_factor(player_matches[w]), 1, exp_w)
        new_l = update_elo(r_l, k_factor(player_matches[l]), 0, exp_l)

        for name, rating in ((w, new_w), (l, new_l)):
            player_elo[name] = rating
            player_matches[name] += 1
            player_last_match[name] = dt
            elo_history[name].append((dt, rating))
            if name not in player_peak or rating > player_peak[name]:
                player_peak[name] = rating

    if elo_history:
        # One column per player, indexed by match date. When a player has
        # several matches on the same date, the last rating of that date wins.
        elo_df = pd.DataFrame({
            name: pd.Series(dict(history))
            for name, history in elo_history.items()
        })
        elo_df.index = pd.to_datetime(elo_df.index)
        elo_df = elo_df.sort_index()

        # Linear interpolation between a player's matches
        elo_df = elo_df.interpolate(method='linear')

        # Interpolation carries the last rating forward; hide it after the
        # player's last match.
        for name, last_date in player_last_match.items():
            elo_df.loc[elo_df.index > last_date, name] = np.nan
    else:
        # No usable match (e.g. empty surface filter): plot an empty figure.
        elo_df = pd.DataFrame()

    # Top N players by peak rating
    top_players = sorted(player_peak.items(), key=lambda x: x[1], reverse=True)
    top_names = [name for name, _ in top_players[:top_n]]

    # --- Plotly ---
    fig = go.Figure()
    for name in top_names:
        if name in elo_df.columns:
            fig.add_trace(go.Scatter(
                x=elo_df.index,
                y=elo_df[name],
                mode="lines",
                name=name
            ))

    title = f"Top {top_n} ELO Players"
    if surface != "All":
        title += f" on {surface}"

    fig.update_layout(
        title=title,
        xaxis_title="Date",
        yaxis_title="ELO Rating",
        height=600
    )
    fig.show()

# --- Interactive widgets: a slider for the number of players, a dropdown for the surface ---
top_n_slider = IntSlider(min=1, max=30, step=1, value=10)
surface_dropdown = Dropdown(options=surfaces, value="All", description="Surface")
interact(compute_elo_and_plot, top_n=top_n_slider, surface=surface_dropdown)


In [None]:
# Initialise the Elo ratings and the rating history
elo = defaultdict(lambda: 1500)
matches_played = defaultdict(int)
elo_history = defaultdict(list)
# The ten players with the most wins in df_all
top_winners = df_all["winner_name"].value_counts().head(10).index.tolist()
top_winner_set = set(top_winners)  # constant-time membership test in the loop

# Replay the matches over time. Elo depends on the order of the matches, so
# they are sorted by tournament date first (as compute_elo_and_plot does);
# the stable sort keeps the file order for matches sharing a date.
for _, row in df_all.sort_values("tourney_date", kind="stable").iterrows():
    w, l = row["winner_name"], row["loser_name"]
    date = row["tourney_date"]

    # Only matches involving at least one top winner update the ratings.
    if w not in top_winner_set and l not in top_winner_set:
        continue

    r_w, r_l = elo[w], elo[l]
    exp_w = calc_exp_score(r_w, r_l)
    exp_l = 1 - exp_w

    k_w = k_factor(matches_played[w])
    k_l = k_factor(matches_played[l])

    elo[w] = update_elo(r_w, k_w, 1, exp_w)
    elo[l] = update_elo(r_l, k_l, 0, exp_l)

    matches_played[w] += 1
    matches_played[l] += 1

    # Dates are stored as the raw tourney_date values (YYYYMMDD integers).
    elo_history[w].append((date, elo[w]))
    elo_history[l].append((date, elo[l]))

In [None]:
# Prepare the data for Plotly: one record per (date, player, rating) of the top winners.
# A comprehension is used so that no loop variable overwrites the global `elo` ratings dict.
data = [
    {"Date": match_date, "Joueur": player, "ELO": rating}
    for player, history in elo_history.items()
    if player in top_winners
    for match_date, rating in history
]

df_plotly = pd.DataFrame(data, columns=["Date", "Joueur", "ELO"])
# The history stores tourney_date as YYYYMMDD integers; convert them to real
# timestamps, otherwise pd.date_range would read them as nanoseconds since 1970.
df_plotly["Date"] = pd.to_datetime(df_plotly["Date"], format="%Y%m%d")

# Sort by date
df_plotly = df_plotly.sort_values(by="Date")

# === Smoothing: per-date mean ratings interpolated linearly in time on a weekly grid ===
# (the variable keeps its historical name although the frequency is weekly, "W")
Monthly_range = pd.date_range(start=df_plotly["Date"].min(), end=df_plotly["Date"].max(), freq="W")
start_date = pd.Timestamp("2000-01-01")
smoothed = []

for player in top_winners:
    # Mean rating per date when a player has several entries on the same day
    ratings = df_plotly.loc[df_plotly["Joueur"] == player].groupby("Date")["ELO"].mean()

    # Anchor every curve at 1500 on 2000-01-01; an observed rating on that
    # exact date takes precedence over the anchor.
    anchor = pd.Series([1500.0], index=pd.DatetimeIndex([start_date]))
    ratings = pd.concat([anchor, ratings])
    ratings = ratings[~ratings.index.duplicated(keep="last")].sort_index()

    # Interpolate on the union of observed dates and grid dates, then keep only
    # the grid: reindexing straight onto the grid would discard every
    # observation that does not fall exactly on a grid date.
    on_grid = (
        ratings.reindex(ratings.index.union(Monthly_range))
        .interpolate(method="time")
        .reindex(Monthly_range)
    )

    df_p = on_grid.to_frame(name="ELO")
    df_p["Joueur"] = player

    smoothed.append(df_p)

# Merge all players into one long frame with an explicit "Date" column
df_smooth = pd.concat(smoothed).rename_axis("Date").reset_index()
df_smooth = df_smooth.dropna(subset=["ELO"])



In [None]:
# Prepare the data for Plotly: one record per (date, player, rating) of the top winners.
# A comprehension is used so that no loop variable overwrites the global `elo` ratings dict.
data = [
    {"Date": match_date, "Joueur": player, "ELO": rating}
    for player, history in elo_history.items()
    if player in top_winners
    for match_date, rating in history
]

df_plotly = pd.DataFrame(data)
df_plotly['Date'] = pd.to_datetime(df_plotly['Date'], format='%Y%m%d')

# Sort by date
df_plotly = df_plotly.sort_values(by="Date")

# Build the animation frames: the frame of a date shows every rating up to that date
frames = []
dates = df_plotly["Date"].unique()
for date in dates:
    frame_data = df_plotly[df_plotly["Date"] <= date]
    frame_traces = []
    for player in top_winners:
        # Filter once per player and reuse the slice for both axes
        player_rows = frame_data[frame_data["Joueur"] == player]
        frame_traces.append(go.Scatter(
            x=player_rows["Date"],
            y=player_rows["ELO"],
            mode="lines",
            name=player
        ))
    frames.append(go.Frame(data=frame_traces, name=str(date)))

In [None]:
# Build the figure: one line per top winner plus Play / Pause / Restart buttons
initial_traces = []
for player in top_winners:
    player_rows = df_plotly[df_plotly["Joueur"] == player]
    initial_traces.append(
        go.Scatter(x=player_rows["Date"], y=player_rows["ELO"], mode="lines", name=player)
    )

# Play: run the animation from the current frame
play_button = dict(
    label="Play",
    method="animate",
    args=[None, dict(
        frame=dict(duration=250, redraw=True),
        fromcurrent=True,
        transition=dict(duration=1000, easing="cubic-in-out"),  # optional
    )],
)
# Pause: animate to "no frame" immediately
pause_button = dict(
    label="Pause",
    method="animate",
    args=[[None], dict(frame=dict(duration=0, redraw=False), mode="immediate")],
)
# Restart: run the animation without resuming from the current frame
restart_button = dict(
    label="Restart",
    method="animate",
    args=[None, dict(frame=dict(duration=500), mode="immediate", fromcurrent=False)],
)

fig = go.Figure(
    data=initial_traces,
    layout=go.Layout(
        title="Évolution des cotes ELO des 10 meilleurs joueurs (2000-aujourd'hui)",
        xaxis=dict(title="Date"),
        yaxis=dict(title="ELO"),
        updatemenus=[dict(
            type="buttons",
            showactive=False,
            buttons=[play_button, pause_button, restart_button],
        )],
    ),
    frames=frames,
)

# Display the figure
fig.show()



## Plot 2

In [None]:
# --- Load both files ---
df_stats = pd.read_csv("Dataset/match_tennis_68_ajd.csv")
df_bookmakers = pd.read_csv("Dataset/atp_mens_tour_merged.csv")

# Keep matches from 2000 onwards. The explicit copy makes the column
# assignments below write to an independent frame instead of a slice of the
# raw data (avoids pandas' SettingWithCopyWarning).
df_stats = df_stats[df_stats["tourney_date"] >= 20000101].copy()

# --- Parse the dates; values that cannot be parsed become NaT ---
df_stats["tourney_date"] = pd.to_datetime(df_stats["tourney_date"], format="%Y%m%d", errors='coerce')
df_bookmakers["Date"] = pd.to_datetime(df_bookmakers["Date"], errors='coerce')

# --- Clean the bookmaker names: drop the dots (e.g. 'Federer R.' -> 'Federer R') ---
df_bookmakers["Winner_fmt"] = df_bookmakers["Winner"].str.replace(r"\.", "", regex=True).str.strip()
df_bookmakers["Loser_fmt"] = df_bookmakers["Loser"].str.replace(r"\.", "", regex=True).str.strip()

# --- Fonction pour formater 'Roger Federer' en 'Federer R'
def format_name_v2(full_name):
    """Reformat ``"First Last"`` as ``"Last F"`` (e.g. 'Roger Federer' -> 'Federer R').

    Everything after the first word is treated as the last name. Returns
    ``None`` when ``full_name`` is not a string (e.g. NaN) or contains fewer
    than two whitespace-separated words.
    """
    # Explicit type check instead of a bare `except`: missing names arrive as
    # NaN floats, which have no string methods.
    if not isinstance(full_name, str):
        return None

    parts = full_name.split()
    if len(parts) < 2:
        return None

    first, *rest = parts
    last = " ".join(rest)
    return f"{last} {first[0]}"

# --- Add the 'Last F' formatted names to df_stats
df_stats["winner_fmt"] = df_stats["winner_name"].map(format_name_v2)
df_stats["loser_fmt"] = df_stats["loser_name"].map(format_name_v2)

# --- Inner join on formatted winner name, formatted loser name and date
# NOTE(review): this requires tourney_date and the bookmaker Date to be equal;
# if one is a tournament start date and the other a match date, most rows will
# not match — confirm against the two datasets.
merged_merged_df = df_stats.merge(
    df_bookmakers,
    how="inner",
    left_on=["winner_fmt", "loser_fmt", "tourney_date"],
    right_on=["Winner_fmt", "Loser_fmt", "Date"],
)

# --- Report the number of merged rows
print("✅ Lignes fusionnées :", len(merged_merged_df))

# --- When nothing was merged, print a few rows of both sides to help debugging
if merged_merged_df.empty:
    print("❌ Aucune fusion trouvée. Voici quelques exemples pour comparer :\n")

    stats_preview = df_stats[["winner_name", "winner_fmt", "loser_fmt", "tourney_date"]]
    print("➡️ df_stats (noms formatés + date) :")
    print(stats_preview.dropna().head())

    bookmakers_preview = df_bookmakers[["Winner", "Winner_fmt", "Loser_fmt", "Date"]]
    print("\n➡️ df_bookmakers (noms formatés + date) :")
    print(bookmakers_preview.dropna().head())


In [None]:
def compare_players(player1, player2, player_avg_stats):
    """Draw a radar chart comparing the average statistics of two players.

    ``player_avg_stats`` is indexed by player name with one column per
    statistic. When either player is missing from the index a message is
    printed and nothing is drawn.
    """
    stats = player_avg_stats.copy()
    # Repeat the first category at the end so that the polygon is closed
    categories = [*stats.columns.tolist(), stats.columns[0]]

    if not (player1 in stats.index and player2 in stats.index):
        print("Un des deux joueurs n'a pas assez de données.")
        return

    per_player = [(name, stats.loc[name].tolist()) for name in (player1, player2)]

    fig = go.Figure()
    for name, values in per_player:
        fig.add_trace(go.Scatterpolar(
            r=values + [values[0]],  # close the polygon
            theta=categories,
            fill='toself',
            name=name
        ))

    fig.update_layout(
        title=f"Comparaison de performances : {player1} vs {player2}",
        polar=dict(radialaxis=dict(visible=True)),
        showlegend=True,
        height=650,
        template="plotly_white"
    )

    fig.show()

def safe_div(x, y):
    """Return ``x / y``, or NaN when ``y`` is zero or the operands cannot be divided."""
    try:
        return x / y if y != 0 else np.nan
    except (TypeError, ValueError, ArithmeticError):
        return np.nan


def _as_float(value):
    """Return ``value`` as a float, or NaN when it is missing or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return np.nan


def extract_player_stats(df, prefix, name_col):
    """Build one row of per-match statistics for each match of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Match table; serve statistics are read from the ``{prefix}_*``
        columns, set scores from ``W1..W5`` / ``L1..L5`` and average odds
        from ``AvgW`` / ``AvgL``. Missing columns are read as NaN.
    prefix : str
        ``"w"`` to take the winner's point of view, ``"l"`` for the loser's.
    name_col : str
        Column holding the player's name; rows without a string name are
        skipped.

    Returns
    -------
    pandas.DataFrame
        One row per kept match with the player name and the statistics.
    """
    is_winner = prefix == "w"
    stats = []

    for _, row in df.iterrows():
        # --- Basic info ---
        player_name = row.get(name_col, None)
        if not isinstance(player_name, str):
            continue

        try:
            svpt = row.get(f"{prefix}_svpt", np.nan)
            in1 = row.get(f"{prefix}_1stIn", np.nan)
            won1 = row.get(f"{prefix}_1stWon", np.nan)
            won2 = row.get(f"{prefix}_2ndWon", np.nan)
            bpFaced = row.get(f"{prefix}_bpFaced", np.nan)
            bpSaved = row.get(f"{prefix}_bpSaved", np.nan)
            sets = row.get("Wsets" if is_winner else "Lsets", np.nan)
            avgw = _as_float(row.get("AvgW", np.nan))
            avgl = _as_float(row.get("AvgL", np.nan))
            minutes = row.get("minutes", np.nan)

            # --- Tie-breaks ---
            # A set was decided by a tie-break when it ended 7-6. Scores are
            # compared as numbers so that float cells such as 7.0 are
            # recognised, and 7-5 sets are not counted.
            tb_win = 0
            tb_played = 0
            for i in range(1, 6):
                w_games = _as_float(row.get(f"W{i}", np.nan))
                l_games = _as_float(row.get(f"L{i}", np.nan))
                if {w_games, l_games} != {7.0, 6.0}:
                    continue
                tb_played += 1
                if prefix == "w" and w_games == 7.0:
                    tb_win += 1
                if prefix == "l" and l_games == 7.0:
                    tb_win += 1

            # --- Underdog win ---
            # Only the winner can have won as the underdog (higher odds than
            # the opponent). Unknown odds give NaN so they do not count as 0
            # when the column is averaged.
            if np.isnan(avgw) or np.isnan(avgl):
                underdog_win = np.nan
            elif is_winner and avgw > avgl:
                underdog_win = 100
            else:
                underdog_win = 0

            # --- Build the statistics ---
            stats.append({
                "Player": player_name,
                "1st Serve %": safe_div(in1, svpt) * 100,
                "1st Won %": safe_div(won1, in1) * 100,
                "2nd Won %": safe_div(won2, svpt - in1) * 100,
                "Break Points Saved %": safe_div(bpSaved, bpFaced) * 100,
                "Aces": row.get(f"{prefix}_ace", 0),
                "Double Faults": row.get(f"{prefix}_df", 0),
                "Serve Points": svpt,
                "Service Games": row.get(f"{prefix}_SvGms", np.nan),
                "Match Duration": minutes,
                "Rank": row.get(f"{prefix}_rank", np.nan),

                # --- Stress indicators ---
                "Sets Gagnés": sets,
                "Victoire en outsider %": underdog_win,
                "Tie-break Win Ratio": safe_div(tb_win, tb_played) * 100
            })
        except (TypeError, ValueError):
            # Best effort: a row whose serve statistics are not numeric
            # (e.g. `svpt - in1` on strings) is skipped rather than aborting.
            continue

    return pd.DataFrame(stats)


# Per-match statistics from the winner's side and from the loser's side
w_stats = extract_player_stats(df=merged_merged_df, prefix="w", name_col="winner_name")
l_stats = extract_player_stats(df=merged_merged_df, prefix="l", name_col="loser_name")

# Stack both sides, then average every statistic per player
all_stats = pd.concat([w_stats, l_stats])
player_avg_stats = all_stats.groupby("Player").mean()

In [None]:
# Load the long-format ELO time series, parse its dates and keep 2000 onwards
elo_df = pd.read_csv("Dataset/elo_timeseries_long.csv")
elo_df = elo_df.assign(date=pd.to_datetime(elo_df["date"]))
elo_df = elo_df.loc[elo_df["date"] >= "2000-01-01"]
# The comparison charts read the per-player averages through this alias
stats_df = player_avg_stats
# Normaliser les stats pour que les barres soient comparables
def normalize_player_stats(df):
    """Return a copy of ``df`` whose out-of-range numeric columns are rescaled to 0-100.

    A numeric column is min-max rescaled only when it is not constant and has
    a maximum above 100 or a minimum below 0. Every other column (including
    non-numeric and boolean ones) is copied unchanged. ``df`` is not modified.
    """
    df_norm = df.copy()
    for col in df.columns:
        column = df[col]
        # is_numeric_dtype covers every integer/float width on every platform,
        # unlike a comparison of the dtype with the builtin float/int.
        if not pd.api.types.is_numeric_dtype(column) or pd.api.types.is_bool_dtype(column):
            continue
        min_val, max_val = column.min(), column.max()
        if min_val != max_val and (max_val > 100 or min_val < 0):
            df_norm[col] = 100 * (column - min_val) / (max_val - min_val)
    return df_norm


# Fonction interactive de comparaison inspirée de HLTV
def compare_players(player1, player2):
    """Show the ELO curves and mirrored stat bars of two players (HLTV-inspired layout).

    Reads the ELO series from the global ``elo_df`` and the averages from
    the global ``stats_df``. When either player has no row in ``stats_df``
    a message is printed and nothing is displayed.
    """
    selected = (player1, player2)
    palette = ("royalblue", "red")

    # --- ELO curves ---
    fig_elo = go.Figure()
    for player, color in zip(selected, palette):
        player_df = elo_df[elo_df["joueur"] == player]
        fig_elo.add_trace(go.Scatter(
            x=player_df["date"],
            y=player_df["elo"],
            mode="lines",
            name=player,
            line=dict(color=color),
        ))
    fig_elo.update_layout(
        title="ELO Time Series Comparison",
        xaxis_title="Date",
        yaxis_title="ELO",
        template="plotly_white",
        height=400,
    )

    # --- Opposed horizontal bars ---
    if not (player1 in stats_df.index and player2 in stats_df.index):
        print("Statistiques manquantes pour l'un des joueurs sélectionnés.")
        return

    metrics = stats_df.columns.tolist()

    fig_stats = go.Figure()
    # Player 1 grows to the right; player 2 is negated so it grows to the left.
    # The labels always show the true (positive) value.
    for player, color, mirrored in zip(selected, palette, (False, True)):
        values = stats_df.loc[player].values
        fig_stats.add_trace(go.Bar(
            x=[-v for v in values] if mirrored else values,
            y=metrics,
            name=player,
            orientation='h',
            marker_color=color,
            text=[f"{v:.2f}" for v in values],
            textposition="outside",
        ))

    x_axis = dict(
        title="Raw Value",
        tickvals=[-100, -50, 0, 50, 100],
        ticktext=[100, 50, 0, 50, 100],
        zeroline=True,
        zerolinecolor="gray",
    )
    fig_stats.update_layout(
        title=f"{player1} vs {player2} – Stat Comparison",
        barmode='relative',
        xaxis=x_axis,
        yaxis=dict(title=""),
        template="plotly_white",
        height=600,
    )

    # --- Display both charts ---
    fig_elo.show()
    fig_stats.show()


# Interactive search boxes: one combobox per player, filled with the names found in elo_df
joueurs = sorted(elo_df["joueur"].unique())
player1_box = widgets.Combobox(placeholder='Choose player 1', options=joueurs, description='Player 1:')
player2_box = widgets.Combobox(placeholder='Choose player 2', options=joueurs, description='Player 2:')
interact(compare_players, player1=player1_box, player2=player2_box)


In [None]:
from plotly.subplots import make_subplots

In [None]:
# Load the long-format ELO time series, parse its dates and keep 2000 onwards
elo_df = pd.read_csv("Dataset/elo_timeseries_long.csv")
elo_df = elo_df.assign(date=pd.to_datetime(elo_df["date"]))
elo_df = elo_df.loc[elo_df["date"] >= "2000-01-01"]
# The comparison charts read the per-player averages through this alias
stats_df = player_avg_stats
# Normaliser les stats pour que les barres soient comparables
def normalize_player_stats(df):
    """Return a copy of ``df`` whose out-of-range numeric columns are rescaled to 0-100.

    A numeric column is min-max rescaled only when it is not constant and has
    a maximum above 100 or a minimum below 0. Every other column (including
    non-numeric and boolean ones) is copied unchanged. ``df`` is not modified.
    """
    df_norm = df.copy()
    for col in df.columns:
        column = df[col]
        # is_numeric_dtype covers every integer/float width on every platform,
        # unlike a comparison of the dtype with the builtin float/int.
        if not pd.api.types.is_numeric_dtype(column) or pd.api.types.is_bool_dtype(column):
            continue
        min_val, max_val = column.min(), column.max()
        if min_val != max_val and (max_val > 100 or min_val < 0):
            df_norm[col] = 100 * (column - min_val) / (max_val - min_val)
    return df_norm


# Fonction interactive de comparaison inspirée de HLTV
def compare_players(player1, player2):
    """Show ELO curves, mirrored stat bars and the head-to-head win share of two players.

    Reads the ELO series from the global ``elo_df``, the averages from the
    global ``stats_df`` and the head-to-head results from the global
    ``df_stats``. When either player has no row in ``stats_df`` a message is
    printed and nothing is displayed.
    """
    # --- ELO curves ---
    fig_elo = go.Figure()
    for player, color in zip([player1, player2], ["royalblue", "red"]):
        player_df = elo_df[elo_df["joueur"] == player]
        fig_elo.add_trace(go.Scatter(
            x=player_df["date"], y=player_df["elo"],
            mode="lines", name=player, line=dict(color=color)
        ))
    fig_elo.update_layout(
        title="ELO Time Series Comparison",
        xaxis_title="Date",
        yaxis_title="ELO",
        template="plotly_white",
        height=400
    )

    # --- Opposed horizontal bars (HLTV style) ---
    if player1 not in stats_df.index or player2 not in stats_df.index:
        print("Statistiques manquantes pour l'un des joueurs sélectionnés.")
        return

    metrics = stats_df.columns.tolist()
    player1_vals = stats_df.loc[player1].values
    player2_vals = stats_df.loc[player2].values

    # Bars on the left, pie chart on the right. This subplot figure must be the
    # one the traces are added to: add_trace(..., row=, col=) needs the grid
    # created by make_subplots.
    fig_stats = make_subplots(
        rows=1, cols=2,
        column_widths=[0.7, 0.3],
        specs=[[{"type": "bar"}, {"type": "domain"}]],
        subplot_titles=[f"{player1} vs {player2} – Stat Comparison", f"% Victoires ({player1} vs {player2})"]
    )

    # Bars - player 1 (positive side)
    fig_stats.add_trace(go.Bar(
        x=player1_vals,
        y=metrics,
        name=player1,
        orientation='h',
        marker_color='royalblue',
        text=[f"{v:.2f}" for v in player1_vals],
        textposition="outside"
    ), row=1, col=1)

    # Bars - player 2 (negated so the bars face player 1's)
    fig_stats.add_trace(go.Bar(
        x=[-v for v in player2_vals],
        y=metrics,
        name=player2,
        orientation='h',
        marker_color='red',
        text=[f"{v:.2f}" for v in player2_vals],
        textposition="outside"
    ), row=1, col=1)

    # Head-to-head wins.
    # NOTE(review): the previous code looked up 'Winner'/'Loser' in the global
    # `data`, which in this notebook is a list of ELO records and cannot be
    # indexed by column. The match table df_stats (full player names, as in
    # stats_df) is used instead — confirm this is the intended source.
    pair = [player1, player2]
    head_to_head = df_stats[
        df_stats["winner_name"].isin(pair) & df_stats["loser_name"].isin(pair)
    ]
    victories = head_to_head["winner_name"].value_counts()
    total_matches = victories.sum()
    percent_player1 = (victories.get(player1, 0) / total_matches) * 100 if total_matches else 0
    percent_player2 = (victories.get(player2, 0) / total_matches) * 100 if total_matches else 0

    fig_stats.add_trace(go.Pie(
        labels=[player1, player2],
        values=[percent_player1, percent_player2],
        hole=0.3,
        textinfo='label+percent'
    ), row=1, col=2)

    fig_stats.update_layout(
        barmode='relative',
        xaxis=dict(
            title="Raw Value",
            tickvals=[-100, -50, 0, 50, 100],
            ticktext=[100, 50, 0, 50, 100],
            zeroline=True,
            zerolinecolor="gray"
        ),
        yaxis=dict(title=""),
        template="plotly_white",
        height=600
    )

    # --- Display both charts ---
    fig_elo.show()
    fig_stats.show()


# Interactive search boxes: one combobox per player, filled with the names found in elo_df
joueurs = sorted(elo_df["joueur"].unique())
player1_box = widgets.Combobox(placeholder='Choose player 1', options=joueurs, description='Player 1:')
player2_box = widgets.Combobox(placeholder='Choose player 2', options=joueurs, description='Player 2:')
interact(compare_players, player1=player1_box, player2=player2_box)
