In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
import warnings
warnings.filterwarnings('ignore')

In [34]:
#  Chargement data
df = pd.read_csv('olympic_data_clean.csv')


#  Nombre de médailles par Pays/Sport/Année

df_target = df.groupby(['country_name', 'discipline_title', 'game_year', 'game_season', 'is_host'])['medal_value'].sum().reset_index()
df_target.rename(columns={'medal_value': 'medals_won'}, inplace=True)

# Nombre d'athletes
df_delegation = df.groupby(['country_name', 'discipline_title', 'game_year', 'game_season']).size().reset_index(name='delegation_size')

# Potentiel du Sport (Nombre TOTAL de medailles distribuees dans ce sport sur cette annee)
# Cela aide le modèle à comprendre que la Natation permet de gagner beaucoup plus de points que le Handball.
df_sport_scale = df.groupby(['discipline_title', 'game_year', 'game_season'])['medal_value'].sum().reset_index(name='total_medals_available_in_sport')

# Puissance Globale du Pays (Total medailles du pays tous sports confondus cette sur cette annee)
# On va l'utiliser pour créer un LAG (historique)
df_country_total = df.groupby(['country_name', 'game_year', 'game_season'])['medal_value'].sum().reset_index(name='country_total_medals_year')

# fusion
df_ml = pd.merge(df_target, df_delegation, on=['country_name', 'discipline_title', 'game_year', 'game_season'], how='left')
df_ml = pd.merge(df_ml, df_sport_scale, on=['discipline_title', 'game_year', 'game_season'], how='left')
df_ml = pd.merge(df_ml, df_country_total, on=['country_name', 'game_year', 'game_season'], how='left')

In [35]:
# historique sports


#   trie pour calculer les décalages
df_ml = df_ml.sort_values(by=['country_name', 'discipline_title', 'game_year'])

#  Historique dans ce sport precis (derniere fois quil a gagner, il a gagner combien)
df_ml['prev_medals_sport'] = df_ml.groupby(['country_name', 'discipline_title'])['medals_won'].shift(1).fillna(0)

#  Historique Puissance Pays (Combien le pays a gagné AU TOTAL aux derniers jeux ?)
# table temporaire pour faire le lag sur le total pays
df_country_lag = df_country_total.sort_values(['country_name', 'game_year'])
df_country_lag['prev_country_power'] = df_country_lag.groupby(['country_name'])['country_total_medals_year'].shift(1).fillna(0)

# # On fusionne cette info de puissance passée
df_ml = pd.merge(df_ml, df_country_lag[['country_name', 'game_year', 'prev_country_power']], on=['country_name', 'game_year'], how='left')

In [36]:
# MODELISATION ---

# Encodage
le_season = LabelEncoder()
df_ml['season_encoded'] = le_season.fit_transform(df_ml['game_season'])

# Pour les variables categorielles avec beaucoup de valeurs (Pays, Sport),
# HistGradientBoosting gere bien l'OrdinalEncoder
le_country = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df_ml['country_encoded'] = le_country.fit_transform(df_ml[['country_name']])

le_sport = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
df_ml['sport_encoded'] = le_sport.fit_transform(df_ml[['discipline_title']])

df_ml.head()

Unnamed: 0,country_name,discipline_title,game_year,game_season,is_host,medals_won,delegation_size,total_medals_available_in_sport,country_total_medals_year,prev_medals_sport,prev_country_power,season_encoded,country_encoded,sport_encoded
0,Afghanistan,Athletics,1960,Summer,0,0,4,205,0,0.0,0.0,0,0.0,5.0
1,Afghanistan,Athletics,1996,Summer,0,0,1,264,0,0.0,0.0,0,0.0,5.0
2,Afghanistan,Boxing,1980,Summer,0,0,3,77,0,0.0,0.0,0,0.0,14.0
3,Afghanistan,Boxing,2004,Summer,0,0,1,77,0,0.0,0.0,0,0.0,14.0
4,Afghanistan,Boxing,2012,Summer,0,0,1,91,1,0.0,1.0,0,0.0,14.0


In [37]:
# features finales
features = [
    'country_encoded',
    'sport_encoded',
    'game_year',
    'season_encoded',
    'is_host',
    'delegation_size',
    'total_medals_available_in_sport', # tres important : echelle du sport
    'prev_medals_sport',               # tres important : historique specifique
    'prev_country_power'               # tres important : puissance globale
]

X = df_ml[features]
y = df_ml['medals_won']


In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Entraînement avec {len(features)} features avancées...")
# HistGradientBoosting est souvent plus performant et rapide
model = HistGradientBoostingRegressor(random_state=42, max_iter=200)
model.fit(X_train, y_train)

print(f"Score R² (Test) : {model.score(X_test, y_test):.4f}")

Entraînement avec 9 features avancées...
Score R² (Test) : 0.7765


In [39]:
def predict_podium_smart(country_name, target_year, season='Summer'):
    """
    Génère le podium des sports rapporteurs de médailles.
    """

    # observation des dernieres donnee dispos pour ce pays pour estimer sa 'puissance' et ses 'prev_medals'
    country_history = df_ml[df_ml['country_name'] == country_name]

    if country_history.empty:
        return "Pas de données historiques pour ce pays."

    # estimation de la puissance globale
    last_power = country_history['country_total_medals_year'].values[-1] if len(country_history) > 0 else 0

    # Liste de tous les sports possibles sur la saison
    possible_sports = df_ml[df_ml['game_season'] == season]['discipline_title'].unique()

    prediction_rows = []

    for sport in possible_sports:
        # historique specifique à ce sport
        sport_history = country_history[country_history['discipline_title'] == sport]
        if not sport_history.empty:
            prev_sport_perf = sport_history.iloc[-1]['medals_won']
            # estimation de delegation future egale a la dernière
            deleg_size = sport_history.iloc[-1]['delegation_size']
        else:
            prev_sport_perf = 0
            deleg_size = 1
        avg_avail = df_sport_scale[df_sport_scale['discipline_title'] == sport]['total_medals_available_in_sport'].mean()
        if np.isnan(avg_avail): avg_avail = 10

        # Encodage
        c_code = le_country.transform([[country_name]])[0][0]
        s_code = le_sport.transform([[sport]])[0][0]
        seas_code = le_season.transform([season])[0]

        prediction_rows.append({
            'country_encoded': c_code,
            'sport_encoded': s_code,
            'game_year': target_year,
            'season_encoded': seas_code,
            'is_host': 0 if (country_name == 'France' and target_year == 2024) else 0, # Exemple
            'delegation_size': deleg_size,
            'total_medals_available_in_sport': avg_avail,
            'prev_medals_sport': prev_sport_perf,
            'prev_country_power': last_power
        })

    # creation DF prédiction
    df_pred = pd.DataFrame(prediction_rows)

    # prediction
    df_pred['Predicted_Points'] = model.predict(df_pred[features])
    df_pred['Sport'] = possible_sports

    # filtre scores négatifs
    df_pred['Predicted_Points'] = df_pred['Predicted_Points'].clip(lower=0)
    podium = df_pred[['Sport', 'Predicted_Points']].sort_values(by='Predicted_Points', ascending=False)

    return podium.head(3)

In [40]:
#  test  prédiction
pays_test = "France"
print(f"\n--- Prédiction du Podium pour {pays_test} en 2024 (Été) ---")
print(predict_podium_smart(pays_test, 2024, 'Summer'))


--- Prédiction du Podium pour France en 2024 (Été) ---
      Sport  Predicted_Points
4      Judo         14.689439
64  Croquet         11.606942
13  Fencing          9.917732


In [41]:
def aplatir_podium(df_podium, pays):
    """
    Transforme le podium vertical (3 lignes) en une seule ligne horizontale.
    Ex: France | Sport_1 | Points_1 | Sport_2 | Points_2 ...
    """

    data_flat = {'Country': pays}
    for i, (index, row) in enumerate(df_podium.iterrows()):
        rank = i + 1
        data_flat[f'Sport_{rank}'] = row['Sport']
        data_flat[f'Points_{rank}'] = round(row['Predicted_Points'], 2)
    return pd.DataFrame([data_flat])

# test
podium_vertical = predict_podium_smart("France", 2024, 'Summer')
podium_horizontal = aplatir_podium(podium_vertical, 'France')
print(podium_horizontal)

  Country Sport_1  Points_1  Sport_2  Points_2  Sport_3  Points_3
0  France    Judo     14.69  Croquet     11.61  Fencing      9.92


In [42]:
pays_test_2 = "Norway"
print(f"\n--- Prédiction du Podium pour {pays_test_2} en 2024 (Hiver) ---")
predict_podium_smart(pays_test_2, 2024, 'Summer')


--- Prédiction du Podium pour Norway en 2024 (Hiver) ---


Unnamed: 0,Sport,Predicted_Points
0,Athletics,3.502411
19,Sailing,1.741919
43,Triathlon,1.589618


In [43]:
# creation csv pour toutes les possibiliter
liste_pays = df["country_name"].unique()
liste_pays

resultats = []

for pays in liste_pays:
  print(f"--- Prédiction du Podium pour {pays} en 2024 (ETE)")
  podium_vertical = predict_podium_smart(pays, 2024, 'Summer')
  podium_horizontal = aplatir_podium(podium_vertical, pays)
  resultats.append(podium_horizontal)

df_finals_dashboard = pd.concat(resultats, ignore_index=True)
df_finals_dashboard

--- Prédiction du Podium pour Italy en 2024 (ETE)
--- Prédiction du Podium pour Norway en 2024 (ETE)
--- Prédiction du Podium pour Sweden en 2024 (ETE)
--- Prédiction du Podium pour Great Britain en 2024 (ETE)
--- Prédiction du Podium pour Canada en 2024 (ETE)
--- Prédiction du Podium pour Czech Republic en 2024 (ETE)
--- Prédiction du Podium pour Switzerland en 2024 (ETE)
--- Prédiction du Podium pour United States of America en 2024 (ETE)
--- Prédiction du Podium pour People's Republic of China en 2024 (ETE)
--- Prédiction du Podium pour Australia en 2024 (ETE)
--- Prédiction du Podium pour Japan en 2024 (ETE)
--- Prédiction du Podium pour Republic of Korea en 2024 (ETE)
--- Prédiction du Podium pour Denmark en 2024 (ETE)
--- Prédiction du Podium pour ROC en 2024 (ETE)
--- Prédiction du Podium pour Finland en 2024 (ETE)
--- Prédiction du Podium pour Kazakhstan en 2024 (ETE)
--- Prédiction du Podium pour France en 2024 (ETE)
--- Prédiction du Podium pour New Zealand en 2024 (ETE)
--- 

Unnamed: 0,Country,Sport_1,Points_1,Sport_2,Points_2,Sport_3,Points_3
0,Italy,Athletics,12.36,Fencing,9.84,Swimming,9.11
1,Norway,Athletics,3.50,Sailing,1.74,Triathlon,1.59
2,Sweden,Equestrian,5.40,Sailing,3.34,Athletics,2.67
3,Great Britain,Swimming,18.61,Athletics,14.49,Cycling Track,14.28
4,Canada,Athletics,11.50,Swimming,6.50,Cycling Track,4.87
...,...,...,...,...,...,...,...
233,Saar,Gymnastics Artistic,8.30,Artistic Gymnastics,1.13,Wrestling,0.55
234,MIX,Polo,3.83,Sailing,3.32,Water Polo,2.90
235,Australasia,Swimming,5.96,Rugby,1.21,Artistic Gymnastics,1.17
236,Bohemia,Athletics,3.39,Tennis,1.29,Artistic Gymnastics,1.08


In [44]:
df_finals_dashboard.to_csv("jo_final_predi.csv") # sauvegarde