In [2]:
import pandas as pd
import numpy as np

# Load one results CSV per race year, keyed by year
years = [2015, 2016, 2017]
data = {}
for year in years:
    csv_path = f"/Users/claradalon/Documents/GitHub/Sacoche/archive/marathon_results_{year}.csv"  # replace with the real file names
    df = pd.read_csv(csv_path)
    # Tag every row with its year in case the CSV does not already carry it
    df['Year'] = year
    data[year] = df

# Sanity check: number of rows loaded for each year
for year, df in data.items():
    runner_count = len(df)
    print(year, ":", runner_count, "coureurs")


2015 : 26598 coureurs
2016 : 26630 coureurs
2017 : 26410 coureurs


In [13]:
import numpy as np

def time_to_seconds(t):
    """Convert an "H:MM:SS" / "HH:MM:SS" string into elapsed seconds.

    Parameters
    ----------
    t : Any
        Raw cell value from the 'Official Time' column.

    Returns
    -------
    int or float
        Total number of seconds as an int, or NaN when the value is missing,
        not a string, or not made of three integer fields (e.g. "-").
    """
    # Missing values (None / NaN) and any other non-string cell have no time.
    if not isinstance(t, str):
        return np.nan
    parts = t.strip().split(':')
    if len(parts) != 3:
        return np.nan
    try:
        hours, minutes, seconds = (int(part) for part in parts)
    except ValueError:
        # Placeholder such as "-" or otherwise non-numeric field.
        return np.nan
    return hours * 3600 + minutes * 60 + seconds


# The helper is defined once, outside the loop, instead of being re-created
# for every year.
for year, df in data.items():
    # Official finish time expressed as a number of elapsed seconds
    df['OfficialSeconds'] = df['Official Time'].apply(time_to_seconds)


In [14]:
# Preview the first rows. `df` is the variable left bound by the last
# `for year, df in data.items()` loop, i.e. the last frame stored in `data`.
df.head()

Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division,Year,OfficialSeconds
0,0,11,"Kirui, Geoffrey",24,M,Keringet,,KEN,,,...,1:48:19,2:02:53,0:04:57,-,2:09:37,1,1,1,2017,7777
1,1,17,"Rupp, Galen",30,M,Portland,OR,USA,,,...,1:48:19,2:03:14,0:04:58,-,2:09:58,2,2,2,2017,7798
2,2,23,"Osako, Suguru",25,M,Machida-City,,JPN,,,...,1:48:31,2:03:38,0:04:59,-,2:10:28,3,3,3,2017,7828
3,3,21,"Biwott, Shadrack",32,M,Mammoth Lakes,CA,USA,,,...,1:48:58,2:04:35,0:05:03,-,2:12:08,4,4,4,2017,7928
4,4,9,"Chebet, Wilson",31,M,Marakwet,,KEN,,,...,1:48:41,2:05:00,0:05:04,-,2:12:35,5,5,5,2017,7955


In [15]:
from pprint import pprint


def sec_to_hms(sec):
    """Format a number of seconds as "H:MM:SS".

    Returns None when `sec` is missing (NaN/None). Fractional seconds are
    truncated by the int() conversion.
    """
    if pd.isna(sec):
        return None
    hours, remainder = divmod(int(sec), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:d}:{minutes:02d}:{seconds:02d}"


stats = {}  # per-year summary statistics

# sec_to_hms is defined once above rather than inside the loop, so it exists
# even when `data` is empty.
for year, df in data.items():
    official = df['OfficialSeconds']
    total = len(df)
    # Finishers are the rows with a non-missing official time
    finishers = official.count()
    # Cast to a plain float so the value prints and serialises as a Python number
    completion_rate = float(finishers / total) if total > 0 else 0

    # Series reductions skip NaN, so these are computed over finishers only
    stats[year] = {
        "total_runners": total,
        "completion_rate": completion_rate,
        "mean_time": sec_to_hms(official.mean()),
        "median_time": sec_to_hms(official.median()),
        "fastest_time": sec_to_hms(official.min()),
        "slowest_time": sec_to_hms(official.max()),
        "segments": []  # filled in by a later cell
    }

# Overview of the computed global statistics
pprint(stats)


{2015: {'completion_rate': 1.0,
        'fastest_time': '2:09:17',
        'mean_time': '3:46:25',
        'median_time': '3:39:40',
        'segments': [],
        'slowest_time': '8:06:01',
        'total_runners': 26598},
 2016: {'completion_rate': 1.0,
        'fastest_time': '2:12:45',
        'mean_time': '3:55:02',
        'median_time': '3:48:05',
        'segments': [],
        'slowest_time': '10:30:23',
        'total_runners': 26630},
 2017: {'completion_rate': 1.0,
        'fastest_time': '2:09:37',
        'mean_time': '3:58:03',
        'median_time': '3:51:39',
        'segments': [],
        'slowest_time': '7:58:14',
        'total_runners': 26410}}


In [16]:
def categorize_time(sec):
    """Map a finish time in seconds to a level label.

    Under 3 h -> "Elite", under 4 h -> "Avancé", under 5 h -> "Intermédiaire",
    anything slower -> "Débutant". A missing time (NaN/None) yields None.
    """
    if pd.isna(sec):
        return None  # no time => no category
    # Upper bounds are exclusive and checked from fastest to slowest.
    upper_bounds_in_hours = (
        (3, "Elite"),
        (4, "Avancé"),
        (5, "Intermédiaire"),
    )
    for limit_hours, label in upper_bounds_in_hours:
        if sec < limit_hours * 3600:
            return label
    return "Débutant"

# Tag every runner with the level derived from the finish time in seconds
for year, df in data.items():
    level_per_runner = df['OfficialSeconds'].apply(categorize_time)
    df['Level'] = level_per_runner


In [19]:
# Age brackets: left-closed intervals [18, 30), [30, 40), ... (right=False below)
bins = [18, 30, 40, 50, 60, 70, np.inf]  # np.inf covers every age >= 70
labels = ["18-29", "30-39", "40-49", "50-59", "60-69", "70+"]

for year, df in data.items():
    # Bucket each runner's age into the labelled brackets defined above
    age_bracket = pd.cut(df['Age'], bins, right=False, labels=labels)
    df['AgeRange'] = age_bracket


In [21]:
for year, df in data.items():
    # Keep finishers only (non-missing OfficialSeconds)
    finished = df.dropna(subset=['OfficialSeconds'])

    # Group on 'M/F' for the runner's sex. In the previews of these frames the
    # 'Gender' column mirrors 'Overall' (1, 2, 3, ...), so it looks like a
    # placement rank rather than a sex code; grouping on it would create one
    # group per rank.
    # observed=True keeps only the age brackets that actually occur, instead of
    # emitting empty groups for every category combination.
    group = finished.groupby(['AgeRange', 'M/F', 'Level'], observed=True)

    # count / mean / median / min / max of the finish time for each group
    agg_stats = (
        group['OfficialSeconds']
        .agg(['count', 'mean', 'median', 'min', 'max'])
        .reset_index()
        .rename(columns={
            'mean': 'mean_sec',
            'median': 'median_sec',
            'min': 'min_sec',
            'max': 'max_sec',
        })
    )

    # Rebuild the list from scratch so that re-running this cell does not
    # append the same segments a second time.
    segments = []
    for _, row in agg_stats.iterrows():
        segments.append({
            "age_range": str(row['AgeRange']),  # category value -> str
            "gender": row['M/F'],
            "level": row['Level'],
            "count": int(row['count']),
            "mean_time": sec_to_hms(row['mean_sec']),
            "median_time": sec_to_hms(row['median_sec']),
            "fastest_time": sec_to_hms(row['min_sec']),
            "slowest_time": sec_to_hms(row['max_sec'])
        })
    stats[year]["segments"] = segments


In [22]:
# Preview the first rows, now including the 'Level' and 'AgeRange' columns.
# `df` is the variable left bound by the last `for year, df in data.items()` loop.
df.head()

Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,Pace,Proj Time,Official Time,Overall,Gender,Division,Year,OfficialSeconds,Level,AgeRange
0,0,11,"Kirui, Geoffrey",24,M,Keringet,,KEN,,,...,0:04:57,-,2:09:37,1,1,1,2017,7777,Elite,18-29
1,1,17,"Rupp, Galen",30,M,Portland,OR,USA,,,...,0:04:58,-,2:09:58,2,2,2,2017,7798,Elite,30-39
2,2,23,"Osako, Suguru",25,M,Machida-City,,JPN,,,...,0:04:59,-,2:10:28,3,3,3,2017,7828,Elite,18-29
3,3,21,"Biwott, Shadrack",32,M,Mammoth Lakes,CA,USA,,,...,0:05:03,-,2:12:08,4,4,4,2017,7928,Elite,30-39
4,4,9,"Chebet, Wilson",31,M,Marakwet,,KEN,,,...,0:05:04,-,2:12:35,5,5,5,2017,7955,Elite,30-39


In [23]:
import json

# Serialise the stats dictionary to a UTF-8 JSON file (non-ASCII characters kept as-is)
with open("stats_marathon_2015_2017.json", "w", encoding="utf-8") as f:
    serialized_stats = json.dumps(stats, ensure_ascii=False, indent=4)
    f.write(serialized_stats)


In [49]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import logging
from typing import Dict, List, Any, Optional
import math

# Logging setup: INFO and above, each record prefixed with its timestamp and level
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

class MarathonDataProcessor:
    """Load yearly marathon result CSVs, clean them and export pace profiles.

    Intended call order: ``load_data()`` -> ``process_data()`` ->
    ``compute_segment_paces()``.
    """

    def __init__(self):
        # Loaded result tables, keyed by race year.
        self.data: Dict[int, pd.DataFrame] = {}
        # Container for statistics; none of the methods below writes to it.
        self.stats: Dict[str, Any] = {}
        # Left-closed age bracket edges and their labels (used by process_data).
        self.age_bins = [18, 30, 40, 50, 60, 70, np.inf]
        self.age_labels = ["18-29", "30-39", "40-49", "50-59", "60-69", "70+"]
        # Level names; not referenced by the methods below.
        self.levels = ["Elite", "Avancé", "Intermédiaire", "Débutant"]

    def load_data(self, data_directory: str = "", years: Optional[List[int]] = None) -> None:
        """Read ``marathon_results_<year>.csv`` for each year into ``self.data``.

        Args:
            data_directory: Folder containing the CSV files.
            years: Years to load; defaults to 2015, 2016 and 2017.

        Raises:
            FileNotFoundError: If ``data_directory`` does not exist.

        A missing file, a file lacking one of the required columns, or a file
        that fails to load is logged and skipped (best effort).
        """
        if years is None:
            years = [2015, 2016, 2017]
        data_dir = Path(data_directory)
        if not data_dir.exists():
            raise FileNotFoundError(f"Répertoire de données '{data_directory}' introuvable")

        required_columns = ['Official Time', 'Age', 'M/F']
        for year in years:
            file_path = data_dir / f"marathon_results_{year}.csv"
            if not file_path.exists():
                logger.warning(f"Fichier {file_path} introuvable, ignoré")
                continue
            try:
                logger.info(f"Chargement des données {year}...")
                df = pd.read_csv(file_path)
                df['Year'] = year
                missing_cols = [col for col in required_columns if col not in df.columns]
                if missing_cols:
                    logger.error(f"Colonnes manquantes dans {year}: {missing_cols}")
                    continue
                self.data[year] = df
                logger.info(f"✅ {year}: {len(df)} coureurs chargés")
            except Exception as e:
                # Deliberate best effort: one unreadable file must not stop the others.
                logger.error(f"Erreur lors du chargement de {year}: {e}")

    @staticmethod
    def time_to_seconds(time_str: Any) -> Optional[float]:
        """Convert an "H:MM:SS" string to seconds.

        Returns NaN for anything that is not a string with exactly three
        integer fields, or whose fields are negative or have minutes/seconds
        of 60 or more.
        """
        # None, NaN and every other non-string value have no usable time.
        if not isinstance(time_str, str):
            return np.nan
        parts = time_str.split(':')
        if len(parts) != 3:
            return np.nan
        try:
            hours, minutes, seconds = (int(part) for part in parts)
        except ValueError:
            return np.nan
        if min(hours, minutes, seconds) < 0 or minutes >= 60 or seconds >= 60:
            return np.nan
        return hours * 3600 + minutes * 60 + seconds

    def process_data(self) -> None:
        """Add derived columns to every loaded frame, in place.

        Adds 'OfficialSeconds' and 'AgeRange', and overwrites 'Gender' with a
        numeric code derived from 'M/F' (M -> 1, F -> 2, anything else -> 0).
        Finish times outside [3600, 43200] seconds are replaced by NaN.

        Raises:
            ValueError: If no data has been loaded.
        """
        if not self.data:
            raise ValueError("Aucune donnée chargée. Utilisez load_data() d'abord.")
        for year, df in self.data.items():
            logger.info(f"Traitement des données {year}...")
            df['OfficialSeconds'] = df['Official Time'].apply(self.time_to_seconds)
            df['AgeRange'] = pd.cut(df['Age'], bins=self.age_bins, labels=self.age_labels, right=False)
            df['Gender'] = df['M/F'].map({'M': 1, 'F': 2}).fillna(0)
            # Plausibility window: 1 h to 12 h inclusive.
            valid_times = df['OfficialSeconds'].between(3600, 43200)
            invalid_count = (~valid_times & df['OfficialSeconds'].notna()).sum()
            if invalid_count > 0:
                logger.warning(f"{year}: {invalid_count} temps aberrants supprimés")
                df.loc[~valid_times, 'OfficialSeconds'] = np.nan
            logger.info(f"✅ {year}: traitement terminé")

    @staticmethod
    def _clock_to_seconds(value: Any) -> Optional[int]:
        """Parse "H:MM:SS" or "MM:SS" into seconds; None when missing or malformed."""
        try:
            parts = [int(part) for part in str(value).strip().split(":")]
        except ValueError:
            # Covers "-", "", NaN/None (stringified) and any non-numeric field.
            return None
        if len(parts) == 3:
            return parts[0] * 3600 + parts[1] * 60 + parts[2]
        if len(parts) == 2:
            return parts[0] * 60 + parts[1]
        return None

    @staticmethod
    def _format_pace(seconds_per_km: float) -> Optional[str]:
        """Format a pace in seconds per km as "M:SS"; None when not finite."""
        if seconds_per_km is None or not math.isfinite(seconds_per_km):
            return None
        # Round the total first so the seconds field can never come out as 60.
        total_seconds = int(round(seconds_per_km))
        sign = "-" if total_seconds < 0 else ""
        minutes, seconds = divmod(abs(total_seconds), 60)
        return f"{sign}{minutes}:{seconds:02d}"

    def compute_segment_paces(self, output_file: str = "segment_pace_profiles.json") -> None:
        """Export the mean pace per course segment, by year, sex and finish-time bin.

        Runners are binned by official finish time in half-open 5-minute bins
        ``[start, start + 5 min)``, so each runner falls in exactly one bin. For
        each bin, the mean pace between consecutive checkpoints (5K, 10K, ...,
        40K, finish at 42.195 km) is written as "M:SS" per km, or null when no
        runner in the bin has both checkpoint times.

        Output JSON shape::

            {year: {"M" | "F": {"<bin start in minutes>:00": {"5-10km": "M:SS", ...}}}}

        Args:
            output_file: Path of the JSON file to write.
        """
        checkpoints = ["5K", "10K", "15K", "20K", "25K", "30K", "35K", "40K", "Official Time"]
        km_points = [5, 10, 15, 20, 25, 30, 35, 40, 42.195]
        interval_seconds = 5 * 60  # 5 minutes

        # (label, length in km, start checkpoint, end checkpoint) for every segment
        segments = [
            (f"{int(km_from)}-{int(km_to)}km", km_to - km_from, cp_from, cp_to)
            for km_from, km_to, cp_from, cp_to in zip(
                km_points, km_points[1:], checkpoints, checkpoints[1:]
            )
        ]

        result: Dict[Any, Dict[str, Dict[str, Dict[str, Optional[str]]]]] = {}

        for year, df in self.data.items():
            # Parse every checkpoint column once per year. to_numeric turns the
            # None placeholders into NaN so the arithmetic below stays numeric
            # even when a column has no parsable value at all.
            seconds = pd.DataFrame({
                cp: pd.to_numeric(df[cp].map(self._clock_to_seconds), errors="coerce").to_numpy()
                for cp in checkpoints
            })

            # Select on 'M/F' directly; this matches the 1/2 codes that
            # process_data() writes to 'Gender' without depending on that step.
            for gender_label in ("M", "F"):
                times = seconds[(df['M/F'] == gender_label).to_numpy()]
                # Drops runners without a parsable (non-negative) finish time.
                times = times[times["Official Time"] >= 0]
                if times.empty:
                    continue

                bin_numbers = (times["Official Time"] // interval_seconds).to_numpy()
                # groupby yields the non-empty bins in ascending order.
                for bin_number, subset in times.groupby(bin_numbers):
                    bin_start = int(bin_number) * interval_seconds
                    bin_label = f"{bin_start // 60:02d}:{bin_start % 60:02d}"
                    profile = {}
                    for label, length_km, cp_from, cp_to in segments:
                        pace = ((subset[cp_to] - subset[cp_from]) / length_km).dropna()
                        mean_pace = pace.mean() if not pace.empty else np.nan
                        profile[label] = self._format_pace(mean_pace)
                    result.setdefault(year, {}).setdefault(gender_label, {})[bin_label] = profile

        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=4)
        logger.info(f"✅ Segment pace profiles exportés vers {output_file}")


def main(
    data_directory: str = "/Users/claradalon/Documents/GitHub/Sacoche/archive",
    output_file: str = "segment_pace_profiles.json",
):
    """Run the full pipeline: load the CSVs, process them, export the pace profiles.

    Args:
        data_directory: Folder containing the ``marathon_results_<year>.csv`` files.
            The default keeps the location used so far; pass another folder to
            run the pipeline elsewhere.
        output_file: Path of the JSON file to write.

    Any exception is logged and then re-raised.
    """
    try:
        processor = MarathonDataProcessor()
        processor.load_data(data_directory)
        processor.process_data()
        processor.compute_segment_paces(output_file)
        logger.info("🎉 Traitement terminé avec succès!")
    except Exception as e:
        logger.error(f"❌ Erreur durante le traitement: {e}")
        raise


if __name__ == "__main__":
    main()


2025-05-29 14:00:46,742 - INFO - Chargement des données 2015...
2025-05-29 14:00:46,830 - INFO - ✅ 2015: 26598 coureurs chargés
2025-05-29 14:00:46,830 - INFO - Chargement des données 2016...
2025-05-29 14:00:46,906 - INFO - ✅ 2016: 26630 coureurs chargés
2025-05-29 14:00:46,907 - INFO - Chargement des données 2017...
2025-05-29 14:00:46,987 - INFO - ✅ 2017: 26410 coureurs chargés
2025-05-29 14:00:46,988 - INFO - Traitement des données 2015...
2025-05-29 14:00:47,018 - INFO - ✅ 2015: traitement terminé
2025-05-29 14:00:47,019 - INFO - Traitement des données 2016...
2025-05-29 14:00:47,049 - INFO - ✅ 2016: traitement terminé
2025-05-29 14:00:47,049 - INFO - Traitement des données 2017...
2025-05-29 14:00:47,079 - INFO - ✅ 2017: traitement terminé
2025-05-29 14:00:58,051 - INFO - ✅ Segment pace profiles exportés vers segment_pace_profiles.json
2025-05-29 14:00:58,052 - INFO - 🎉 Traitement terminé avec succès!
