In [1]:
%load_ext autoreload
%autoreload 2

# (optional) create the folder that will hold the local `marathon` package
import os
os.makedirs("marathon", exist_ok=True)

In [3]:
#Passo 1 - Criando constantes

In [5]:
%%writefile marathon/config.py
# -*- coding: utf-8 -*-
D_MARATHON = 42195.0  # marathon distance, in meters

# Cut-off times per gender, in seconds
LIMIT_MEN_S   = 3 * 3600          # 3:00 (h:mm)
LIMIT_WOMEN_S = int(3.75 * 3600)  # 3:45 (h:mm)

# Long-run rules used when estimating weeks
LONG_RUN_TARGET_KM = 30.0
LONG_RUN_INC_KM    = 2.0

# Multipliers per effort band (effort classification that uses heart rate)
EFFORT_MULT = {"Já atinge": 0.0, "Baixo": 1.0, "Moderado": 1.2, "Alto": 1.5}

# Bounds for the speed slope (m/s per week) and cap on the number of weeks
SLOPE_MIN, SLOPE_MAX = 0.01, 0.10
WEEKS_CAP = 24


Writing marathon/config.py


In [7]:
#Passo 2 - Processamento de dados

In [11]:
%%writefile marathon/data.py
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from .config import D_MARATHON, LIMIT_MEN_S, LIMIT_WOMEN_S

def load_raw(path: str) -> pd.DataFrame:
    """Read the raw runs CSV and return one cleaned row per run.

    The file may be comma-delimited or semicolon-delimited; numeric fields may
    use a decimal comma. Known source headers are renamed to snake_case names.

    Args:
        path: Location of the CSV file.

    Returns:
        A DataFrame restricted to rows with a known athlete, a parseable
        timestamp, a numeric distance and elapsed time, more than 0.5 km and a
        pace between 150 and 1200 s/km. Adds the derived columns ``km``,
        ``speed_mps`` and ``pace_s_per_km``.

    Raises:
        KeyError: If a required column (athlete, timestamp, distance, elapsed
            time) is not present in the file.
    """
    df = pd.read_csv(path)
    # A semicolon-delimited file parsed with the default "," separator ends up
    # as a single column whose header still contains the ";" characters.
    if df.shape[1] == 1 and ";" in df.columns[0]:
        cols = [c.strip() for c in df.columns[0].split(";")]
        df = df.iloc[:, 0].str.split(";", expand=True)
        df.columns = cols

    rename = {
        "athlete": "athlete_id",
        "gender": "gender",
        "timestamp": "timestamp",
        "distance (m)": "distance_m",
        "elapsed time (s)": "elapsed_s",
        "elevation gain (m)": "elev_gain_m",
        "average heart rate (bpm)": "avg_hr_bpm",
    }
    df = df.rename(columns={k: v for k, v in rename.items() if k in df.columns})

    required = ["athlete_id", "timestamp", "distance_m", "elapsed_s"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s) in {path!r}: {missing}")

    # Drop rows without an athlete BEFORE casting to str: astype(str) turns
    # NaN/None into the literal strings "nan"/"None", which dropna can no longer
    # detect and which would be aggregated later as a phantom athlete.
    df = df.dropna(subset=["athlete_id"]).copy()
    df["athlete_id"] = df["athlete_id"].astype(str)
    # Empty fields survive the ";" split as "" rather than NaN.
    df = df[df["athlete_id"].str.strip() != ""].copy()

    for c in ["distance_m", "elapsed_s", "elev_gain_m", "avg_hr_bpm"]:
        if c in df.columns:
            # Accept decimal commas ("12,5") as well as decimal points.
            as_text = df[c].astype(str).str.replace(",", ".", regex=False)
            df[c] = pd.to_numeric(as_text, errors="coerce")
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", dayfirst=True)

    df = df.dropna(subset=required).copy()
    df["km"] = df["distance_m"] / 1000.0
    df["speed_mps"] = df["distance_m"] / df["elapsed_s"]
    df["pace_s_per_km"] = df["elapsed_s"] / df["km"]
    # The pace window also removes zero-duration rows (pace 0, infinite speed).
    df = df[(df["km"] > 0.5) & (df["pace_s_per_km"].between(150, 1200))].copy()
    return df

def riegel_pred_seconds(distance_m, elapsed_s, exp: float = 1.06):
    """Extrapolate a run to the marathon distance with a power law.

    Computes ``elapsed_s * (D_MARATHON / distance_m) ** exp``. Works on scalars
    and element-wise on array-likes such as pandas Series.
    """
    distance_ratio = D_MARATHON / distance_m
    scale = distance_ratio ** exp
    return elapsed_s * scale

def aggregate_athlete(df_runs: pd.DataFrame) -> pd.DataFrame:
    """Collapse per-run rows into one feature row per athlete.

    Args:
        df_runs: Runs as produced by ``load_raw``. ``elev_gain_m`` and
            ``avg_hr_bpm`` are optional; when absent they are treated as
            all-missing.

    Returns:
        One row per ``athlete_id`` with volume/pace/HR aggregates, the best
        marathon prediction taken over runs of at least 10 km
        (``best_pred_marathon_s``, NaN when the athlete has none), the
        gender-specific target time, predicted and required speeds, the
        ``apto_genero`` flag (1 when the prediction meets the target) and the
        extra speed needed in percent (``esforco_extra_pct_genero``).
    """
    df = df_runs.copy()
    # load_raw only converts these columns "if present", so tolerate their
    # absence here as well instead of failing inside groupby.agg.
    for optional_col in ("elev_gain_m", "avg_hr_bpm"):
        if optional_col not in df.columns:
            df[optional_col] = np.nan
    df["pred_marathon_s_from_run"] = riegel_pred_seconds(df["distance_m"], df["elapsed_s"])

    # Only runs of 10 km or more feed the marathon prediction.
    mask10k = df["distance_m"] >= 10000
    best_pred = (df.loc[mask10k]
                   .groupby("athlete_id")["pred_marathon_s_from_run"]
                   .min()
                   .rename("best_pred_marathon_s"))

    def _most_common_gender(values: pd.Series):
        # mode() is empty when every value is missing; "U" marks unknown.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else "U"

    agg = (df.groupby("athlete_id").agg(
            gender=("gender", _most_common_gender),
            num_runs=("athlete_id", "size"),
            total_km=("km", "sum"),
            longest_run_km=("km", "max"),
            median_pace_s_per_km=("pace_s_per_km", "median"),
            best_pace_s_per_km=("pace_s_per_km", "min"),
            avg_hr_bpm=("avg_hr_bpm", "mean"),
            elev_gain_sum=("elev_gain_m", "sum"),
            share_runs_ge_15k=("km", lambda s: (s >= 15).mean()),
         )
         .assign(mean_elev_gain_per_km=lambda d: d["elev_gain_sum"] / d["total_km"])
         .drop(columns=["elev_gain_sum"])
         .reset_index()
         .merge(best_pred.reset_index(), on="athlete_id", how="left")
    )

    agg["gender"] = agg["gender"].astype(str).str.upper().str.strip()
    # Any gender value not starting with "M" (including unknown) gets the women's limit.
    agg["target_s_gender"] = np.where(agg["gender"].str.startswith("M"), LIMIT_MEN_S, LIMIT_WOMEN_S)
    agg["v_pred_mps"]   = D_MARATHON / agg["best_pred_marathon_s"]
    agg["v_needed_mps"] = D_MARATHON / agg["target_s_gender"]
    # A missing prediction compares as False, so such athletes are flagged 0.
    agg["apto_genero"] = (agg["best_pred_marathon_s"] <= agg["target_s_gender"]).astype(int)
    agg["esforco_extra_pct_genero"] = ((agg["v_needed_mps"] - agg["v_pred_mps"]) / agg["v_pred_mps"]) * 100
    return agg


Overwriting marathon/data.py


In [13]:
# Passo 3 — marathon/effort.py (HR e esforço)

In [15]:
%%writefile marathon/effort.py
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd

def fit_hr_speed(df_runs: pd.DataFrame) -> pd.DataFrame:
    """Fit a per-athlete linear model ``HR = a_int + b_slope * speed``.

    Only runs between 8 and 25 km with a recorded average heart rate are used.
    An athlete needs at least three such runs with differing speeds to get a
    fit; otherwise all of that athlete's values are filled with the median
    across the athletes that could be fitted.

    Args:
        df_runs: Runs with ``athlete_id``, ``timestamp``, ``km``,
            ``speed_mps`` and (optionally) ``avg_hr_bpm``.

    Returns:
        DataFrame with columns ``athlete_id``, ``a_int``, ``b_slope``,
        ``hr_p50``, ``hr_p80``, ``hr_p90``; one row per athlete that has at
        least one eligible run. Empty (same columns) when nothing is eligible.
    """
    fit_cols = ["a_int", "b_slope", "hr_p50", "hr_p80", "hr_p90"]
    out_cols = ["athlete_id"] + fit_cols

    # Without heart-rate data there is nothing to fit; return the expected
    # schema so downstream merges still work.
    if df_runs.empty or "avg_hr_bpm" not in df_runs.columns:
        return pd.DataFrame({c: pd.Series(dtype=object if c == "athlete_id" else float) for c in out_cols})

    eligible = df_runs["avg_hr_bpm"].notna() & df_runs["km"].between(8.0, 25.0)
    flt = df_runs[eligible].sort_values(["athlete_id", "timestamp"])

    def _fit(g: pd.DataFrame) -> dict:
        no_fit = {c: np.nan for c in fit_cols}
        x = g["speed_mps"].to_numpy(dtype=float)
        y = g["avg_hr_bpm"].to_numpy(dtype=float)
        # Need >= 3 points, some spread in speed, and finite values on both axes.
        if len(x) < 3 or np.allclose(x, x[0]) or not (np.isfinite(x).all() and np.isfinite(y).all()):
            return no_fit
        try:
            b, a = np.polyfit(x, y, 1)  # HR = a + b * speed
        except (np.linalg.LinAlgError, ValueError):
            # Best effort: an athlete whose fit fails falls back to the median below.
            return no_fit
        return {
            "a_int": float(a),
            "b_slope": float(b),
            "hr_p50": float(np.percentile(y, 50)),
            "hr_p80": float(np.percentile(y, 80)),
            "hr_p90": float(np.percentile(y, 90)),
        }

    # An explicit loop (instead of groupby.apply) keeps the output schema
    # stable when no group exists and avoids applying over the grouping column.
    records = [{"athlete_id": athlete_id, **_fit(g)} for athlete_id, g in flt.groupby("athlete_id")]
    fits = pd.DataFrame(records, columns=out_cols)
    for c in fit_cols:
        fits[c] = fits[c].astype(float)
        med = fits[c].median(skipna=True)
        fits[c] = fits[c].fillna(med)
    return fits

def classify_effort_with_hr(agg: pd.DataFrame, fits: pd.DataFrame) -> pd.DataFrame:
    """Attach the HR fit to each athlete and label the effort needed.

    Adds ``hr_needed_bpm`` (the fitted heart rate at the required speed) and
    ``esforco_genero_hr``, decided in this order:

    * "Já atinge"  - ``apto_genero`` equals 1;
    * "Baixo"      - speed gap below 5 % and needed HR at or below ``hr_p80``;
    * "Moderado"   - speed gap within [5, 15] % or needed HR in (p80, p90];
    * "Alto"       - everything else (comparisons on missing values are false).
    """
    merged = agg.merge(fits, on="athlete_id", how="left")
    merged["hr_needed_bpm"] = merged["a_int"] + merged["b_slope"] * merged["v_needed_mps"]

    speed_gap = merged["esforco_extra_pct_genero"]
    hr_needed = merged["hr_needed_bpm"]
    hr_p80 = merged["hr_p80"]
    hr_p90 = merged["hr_p90"]

    already_apt = (merged["apto_genero"] == 1).to_numpy()
    low_effort = ((speed_gap < 5) & (hr_needed <= hr_p80)).to_numpy()
    gap_is_moderate = (speed_gap >= 5) & (speed_gap <= 15)
    hr_is_moderate = (hr_p80 < hr_needed) & (hr_needed <= hr_p90)
    moderate_effort = (gap_is_moderate | hr_is_moderate).to_numpy()

    # np.select takes the first matching condition, mirroring an if/elif chain.
    merged["esforco_genero_hr"] = np.select(
        [already_apt, low_effort, moderate_effort],
        ["Já atinge", "Baixo", "Moderado"],
        default="Alto",
    )
    return merged


Writing marathon/effort.py


In [None]:
#Passo 4 — marathon/weeks.py (semanas e slope)

In [17]:
%%writefile marathon/weeks.py
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from .config import LONG_RUN_TARGET_KM, LONG_RUN_INC_KM, SLOPE_MIN, SLOPE_MAX, WEEKS_CAP

def estimate_speed_slope_weeks(df_runs: pd.DataFrame) -> pd.DataFrame:
    """Estimate each athlete's speed trend in m/s per week.

    Fits a straight line to speed versus weeks since the athlete's first run,
    using only runs between 8 and 25 km. Athletes with fewer than two such
    runs, or with all runs on the same day, get the median slope of the other
    athletes. Every slope is finally clipped to ``[SLOPE_MIN, SLOPE_MAX]``.

    Args:
        df_runs: Runs with ``athlete_id``, ``timestamp``, ``km``, ``speed_mps``.

    Returns:
        DataFrame with columns ``athlete_id`` and ``slope_speed_mps_per_week``;
        one row per athlete with at least one run in range (possibly empty).
    """
    slope_col = "slope_speed_mps_per_week"
    flt = df_runs[df_runs["km"].between(8.0, 25.0)].copy().sort_values(["athlete_id", "timestamp"])
    if flt.empty:
        # groupby.apply on an empty frame does not yield a Series, so return
        # the expected schema directly.
        return pd.DataFrame({"athlete_id": pd.Series(dtype=object), slope_col: pd.Series(dtype=float)})

    first = flt.groupby("athlete_id")["timestamp"].transform("min")
    flt["weeks_since_first"] = (flt["timestamp"] - first).dt.days / 7.0

    def _slope(g: pd.DataFrame) -> float:
        if g.shape[0] < 2:
            return np.nan
        x = g["weeks_since_first"].values
        y = g["speed_mps"].values
        if np.allclose(x, x[0]):
            return np.nan  # no spread in time -> slope is undefined
        k, _intercept = np.polyfit(x, y, 1)
        return k

    # Explicit iteration instead of groupby.apply: same per-group result,
    # without applying over the grouping column.
    athlete_ids, slope_values = [], []
    for athlete_id, g in flt.groupby("athlete_id"):
        athlete_ids.append(athlete_id)
        slope_values.append(_slope(g))
    slopes = pd.DataFrame({"athlete_id": athlete_ids, slope_col: pd.Series(slope_values, dtype=float)})

    med = slopes[slope_col].median(skipna=True)
    slopes[slope_col] = slopes[slope_col].fillna(med).clip(SLOPE_MIN, SLOPE_MAX)
    return slopes

def estimate_weeks_to_target(df_effort_hr: pd.DataFrame, slopes: pd.DataFrame, effort_mult_map: dict) -> pd.DataFrame:
    """Estimate how many weeks each athlete needs to reach the target.

    The base estimate is the larger of (a) weeks to close the speed gap at the
    athlete's slope and (b) weeks to extend the longest run to
    ``LONG_RUN_TARGET_KM`` in steps of ``LONG_RUN_INC_KM``. It is multiplied by
    the effort multiplier, clipped to ``[0, WEEKS_CAP]`` and rounded up.

    Args:
        df_effort_hr: Athlete rows including ``v_pred_mps``, ``v_needed_mps``,
            ``longest_run_km`` and ``esforco_genero_hr``.
        slopes: Per-athlete ``slope_speed_mps_per_week``.
        effort_mult_map: Effort label -> multiplier; must contain "Moderado",
            which is the fallback for unmapped labels.

    Returns:
        The merged frame plus ``delta_v_needed``, ``weeks_speed_goal``,
        ``weeks_long_run``, ``mult_effort_hr`` and
        ``weeks_to_target_genero_est_hr``.
    """
    slope_col = "slope_speed_mps_per_week"
    out = df_effort_hr.merge(slopes, on="athlete_id", how="left")

    # Athletes without a slope row inherit the median, then bounds are re-applied.
    median_slope = out[slope_col].median(skipna=True)
    out[slope_col] = out[slope_col].fillna(median_slope).clip(SLOPE_MIN, SLOPE_MAX)

    # A missing prediction counts as speed 0, i.e. the whole target speed is the gap.
    predicted_speed = out["v_pred_mps"].fillna(0.0)
    speed_gap = out["v_needed_mps"] - predicted_speed
    out["delta_v_needed"] = speed_gap.clip(lower=0)

    eps = 1e-6
    weekly_gain = out[slope_col] + eps
    out["weeks_speed_goal"] = out["delta_v_needed"] / weekly_gain

    longest_run = out["longest_run_km"].fillna(0.0)
    increments_needed = (LONG_RUN_TARGET_KM - longest_run) / LONG_RUN_INC_KM
    out["weeks_long_run"] = np.ceil(np.clip(increments_needed, a_min=0, a_max=None))

    fallback_mult = effort_mult_map["Moderado"]
    out["mult_effort_hr"] = out["esforco_genero_hr"].map(effort_mult_map).fillna(fallback_mult)

    weeks_before_mult = np.maximum(out["weeks_speed_goal"], out["weeks_long_run"])
    weeks_scaled = weeks_before_mult * out["mult_effort_hr"]
    out["weeks_to_target_genero_est_hr"] = np.ceil(np.clip(weeks_scaled, 0, WEEKS_CAP))
    return out


Writing marathon/weeks.py


In [19]:
#Passo 5 — marathon/modeling.py (treino + gráficos + métricas)

In [21]:
%%writefile marathon/modeling.py
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, precision_recall_curve)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import joblib

try:
    import lightgbm as lgb
    HAS_LGBM = True
except Exception:
    HAS_LGBM = False

def _ensure_dir(p: Path):
    """Create directory ``p`` (and any missing parents); no error if it already exists."""
    p.mkdir(parents=True, exist_ok=True)

def _plot_feature_importance(names, importances, out_path: Path, top_n: int = 20, title: str = "Importância de variáveis"):
    """Save a horizontal bar chart of the ``top_n`` largest importances.

    The most important feature is drawn at the top. The figure is written to
    ``out_path`` at 150 dpi and then closed.
    """
    top_desc = np.argsort(importances)[::-1][:top_n]
    # barh draws position 0 at the bottom, so plot in ascending order.
    draw_order = top_desc[::-1]
    labels = np.array(names)[draw_order]
    values = np.array(importances)[draw_order]
    positions = range(len(top_desc))

    fig, ax = plt.subplots(figsize=(8, max(4, len(top_desc) * 0.3)))
    ax.barh(positions, values)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)
    ax.set_xlabel("Importância")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

def _plot_confusion(y_true, y_pred, out_path: Path, title: str):
    """Save a 2x2 confusion-matrix heat map for binary labels 0/1.

    Rows are the true class, columns the predicted class; each cell is
    annotated with its count. The figure is written to ``out_path`` at 150 dpi
    and then closed.
    """
    # Pin the label set so the matrix is always 2x2 and matches the fixed
    # "0"/"1" tick labels, even when a split contains a single class.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    fig, ax = plt.subplots(figsize=(4,4))
    im = ax.imshow(cm, cmap="Blues")
    fig.colorbar(im, ax=ax)
    ax.set(xticks=[0,1], yticks=[0,1], xticklabels=["0","1"], yticklabels=["0","1"],
           ylabel="Verdadeiro", xlabel="Previsto", title=title)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, cm[i, j], ha="center", va="center", color="black")
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)

def _plot_auc_over_time(dates, y_true, y_score, out_path: Path, n_bins: int = 10, title: str = "AUC ao longo do tempo"):
    """Plot the cumulative ROC AUC as samples are added in date order.

    Samples are sorted by ``dates`` and split into ``n_bins`` growing prefixes
    (at least 2). For each prefix the AUC over all samples seen so far is
    plotted against the date of the last sample in the prefix. Prefixes where
    the AUC is undefined (only one class present) are skipped.

    Args:
        dates: One date per sample.
        y_true: Binary labels.
        y_score: Scores for the positive class.
        out_path: Image file to write (150 dpi).
        n_bins: Number of prefixes; values below 2 are raised to 2.
        title: Figure title.
    """
    # Use the same bin count for the loop and the prefix size; otherwise
    # n_bins < 2 indexes past the data and n_bins == 0 divides by zero.
    n_bins = max(2, int(n_bins))
    dates_arr = np.asarray(dates)
    # Sorting the plain array (not a pandas Series) gives positional indices
    # that are valid even if a date is missing.
    order = np.argsort(dates_arr)
    dates_ord = dates_arr[order]; y_ord = np.asarray(y_true)[order]; s_ord = np.asarray(y_score)[order]
    xs, aucs = [], []
    for k in range(n_bins):
        end = int(len(y_ord) * (k + 1) / n_bins)
        if end < 2:
            continue
        try:
            auc = roc_auc_score(y_ord[:end], s_ord[:end])
        except ValueError:
            # Raised while the prefix still contains a single class.
            continue
        if not np.isfinite(auc):
            continue
        aucs.append(auc); xs.append(dates_ord[end - 1])
    fig, ax = plt.subplots(figsize=(7, 3.5))
    ax.plot(xs, aucs, marker="o")
    ax.set_title(title); ax.set_ylabel("AUC"); ax.set_xlabel("Tempo (ordem dos atletas)")
    ax.grid(True, alpha=0.3)
    fig.tight_layout(); fig.savefig(out_path, dpi=150); plt.close(fig)

def _plot_precision_recall_over_time(dates, y_true, y_score, thr: float, out_path: Path, n_bins: int = 10, title: str = "Precision e Recall ao longo do tempo"):
    """Plot cumulative precision and recall at ``thr`` as samples are added in date order.

    Samples are sorted by ``dates`` and split into ``n_bins`` growing prefixes
    (at least 2). For each prefix, predictions are ``y_score >= thr`` and
    precision/recall are computed over all samples seen so far (0.0 when the
    denominator is zero).

    Args:
        dates: One date per sample.
        y_true: Binary labels (0/1).
        y_score: Scores for the positive class.
        thr: Decision threshold applied to ``y_score``.
        out_path: Image file to write (150 dpi).
        n_bins: Number of prefixes; values below 2 are raised to 2.
        title: Figure title.
    """
    # Use the same bin count for the loop and the prefix size; otherwise
    # n_bins < 2 indexes past the data and n_bins == 0 divides by zero.
    n_bins = max(2, int(n_bins))
    dates_arr = np.asarray(dates)
    order = np.argsort(dates_arr)
    dates_ord = dates_arr[order]; y_ord = np.asarray(y_true)[order]; s_ord = np.asarray(y_score)[order]
    precs, recs, xs = [], [], []
    for k in range(n_bins):
        end = int(len(y_ord) * (k + 1) / n_bins)
        if end < 2:
            continue
        predicted_pos = s_ord[:end] >= thr
        is_pos = y_ord[:end] == 1
        is_neg = y_ord[:end] == 0
        tp = int((predicted_pos & is_pos).sum())
        fp = int((predicted_pos & is_neg).sum())
        fn = int((~predicted_pos & is_pos).sum())
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        precs.append(prec); recs.append(rec); xs.append(dates_ord[end - 1])
    fig, ax = plt.subplots(figsize=(7, 3.5))
    ax.plot(xs, precs, marker="o", label="Precision")
    ax.plot(xs, recs, marker="o", label="Recall")
    ax.legend(); ax.set_title(title); ax.set_ylim(0, 1); ax.set_xlabel("Tempo")
    ax.grid(True, alpha=0.3)
    fig.tight_layout(); fig.savefig(out_path, dpi=150); plt.close(fig)

def _build_tree_classifier(lgbm_trees: int, rf_trees: int):
    """Return a LightGBM classifier when available, otherwise a random forest."""
    if HAS_LGBM:
        return lgb.LGBMClassifier(objective="binary", n_estimators=lgbm_trees, learning_rate=0.03,
                                  num_leaves=63, subsample=0.8, colsample_bytree=0.8,
                                  class_weight="balanced", random_state=42)
    return RandomForestClassifier(n_estimators=rf_trees, class_weight="balanced_subsample",
                                  random_state=42, n_jobs=-1)


def _positive_scores(clf, X):
    """Score each row of X for the positive class (probability when the model offers it)."""
    if hasattr(clf, "predict_proba"):
        return clf.predict_proba(X)[:, 1]
    return clf.decision_function(X)


def _split_metrics(y_true, scores, y_pred, thr) -> dict:
    """Headline classification metrics for one split at threshold ``thr``."""
    return {
        "roc_auc": float(roc_auc_score(y_true, scores)),
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "precision": float(precision_score(y_true, y_pred, zero_division=0)),
        "recall": float(recall_score(y_true, y_pred, zero_division=0)),
        "f1": float(f1_score(y_true, y_pred, zero_division=0)),
        "threshold": float(thr),
    }


def _pr_summary_row(y_true, scores, thr) -> dict:
    """Precision/recall/F1 plus confusion counts for one split at threshold ``thr``."""
    y_pred = (scores >= thr).astype(int)
    predicted_pos = y_pred == 1
    is_pos = y_true == 1
    is_neg = y_true == 0
    return {"threshold": float(thr),
            "precision": float(precision_score(y_true, y_pred, zero_division=0)),
            "recall": float(recall_score(y_true, y_pred, zero_division=0)),
            "f1": float(f1_score(y_true, y_pred, zero_division=0)),
            "tp": int((predicted_pos & is_pos).sum()),
            "fp": int((predicted_pos & is_neg).sum()),
            "fn": int((~predicted_pos & is_pos).sum()),
            "tn": int((~predicted_pos & is_neg).sum())}


def _pr_curve_to_df(y_true, scores) -> pd.DataFrame:
    """Precision-recall curve as a table; the final point is given threshold 1.0."""
    prec, rec, thr = precision_recall_curve(y_true, scores)
    # precision/recall have one more entry than thresholds; pad to align.
    thr_full = np.r_[thr, 1.0]
    return pd.DataFrame({"threshold": thr_full, "precision": prec, "recall": rec})


def train_apto_classifier(df_agg_hr: pd.DataFrame, out_dir: Path, runs_df: pd.DataFrame) -> dict:
    """Train, evaluate and persist the ``apto_genero`` classifier.

    Steps: 60/20/20 stratified train/validation/test split, model-based feature
    selection (importance >= median), final fit on the selected features,
    threshold choice by best F1 on validation, metrics and plots for
    validation and test, and a joblib bundle with everything needed to score.

    Args:
        df_agg_hr: One row per athlete with the target and feature columns.
        out_dir: Directory that receives all artifacts (created if missing).
        runs_df: Per-run rows; only used to find each athlete's first
            timestamp for the over-time plots.

    Returns:
        Dict with ``model_path``, ``val_metrics`` and ``test_metrics``.

    Files written to ``out_dir``: feature_importance.csv/.png,
    cm_validation.png, cm_test.png, auc_over_time_val.png,
    auc_over_time_test.png, precision_recall_curve_val.csv,
    precision_recall_curve_test.csv, precision_recall_summary.csv,
    model_apto_genero_bundle.pkl.
    """
    out_dir = Path(out_dir)
    _ensure_dir(out_dir)
    target = "apto_genero"
    # NOTE(review): "best_pred_marathon_s" is the value that is compared against
    # the gender limit to build "apto_genero", so together with "gender" it can
    # leak the label. Confirm that keeping it as a feature is intended.
    feat_all = [
        "gender","num_runs","total_km","longest_run_km",
        "median_pace_s_per_km","best_pace_s_per_km","avg_hr_bpm",
        "mean_elev_gain_per_km","share_runs_ge_15k","best_pred_marathon_s"
    ]

    # 1) Data preparation: attach each athlete's first run timestamp.
    df = df_agg_hr.dropna(subset=[target]).copy()
    runs = runs_df.copy()
    runs["timestamp"] = pd.to_datetime(runs["timestamp"], errors="coerce")
    first_ts = runs.groupby("athlete_id")["timestamp"].min().rename("first_timestamp")
    df = df.merge(first_ts.reset_index(), on="athlete_id", how="left")

    # 2) Train / validation / test split (0.25 of the remaining 80% -> 60/20/20).
    X = df[feat_all].copy(); y = df[target].astype(int).values
    pre = ColumnTransformer([("cat", OneHotEncoder(handle_unknown="ignore"), ["gender"])], remainder="passthrough")
    X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, stratify=y_train_full, random_state=42)

    # 3) Model-based feature selection on the training split only.
    base_pipe = Pipeline([("prep", pre), ("clf", _build_tree_classifier(lgbm_trees=600, rf_trees=400))])
    base_pipe.fit(X_train, y_train)
    prep = base_pipe.named_steps["prep"]
    feat_names = list(prep.get_feature_names_out())
    selector = SelectFromModel(base_pipe.named_steps["clf"], prefit=True, threshold="median")
    X_train_sel = selector.transform(prep.transform(X_train))
    X_val_sel   = selector.transform(prep.transform(X_val))
    X_test_sel  = selector.transform(prep.transform(X_test))

    # 4) Final training on the selected features + feature importance artifacts.
    final_clf = _build_tree_classifier(lgbm_trees=800, rf_trees=600)
    final_clf.fit(X_train_sel, y_train)
    imp = final_clf.feature_importances_
    sel_names = np.array(feat_names)[selector.get_support()]
    pd.DataFrame({"feature": sel_names, "importance": imp}).sort_values("importance", ascending=False)\
      .to_csv(out_dir / "feature_importance.csv", index=False)
    _plot_feature_importance(sel_names, imp, out_dir / "feature_importance.png")

    # Positive-class scores for the evaluation splits.
    s_val = _positive_scores(final_clf, X_val_sel)
    s_test = _positive_scores(final_clf, X_test_sel)

    # 5) Threshold tuning: best F1 on validation; ties keep the lowest threshold.
    thr_grid = np.linspace(0.1, 0.9, 81)
    best_thr, best_f1 = 0.5, -1
    for t in thr_grid:
        f1 = f1_score(y_val, (s_val >= t).astype(int), zero_division=0)
        if f1 > best_f1:
            best_f1, best_thr = f1, t

    # 5.1) Validation metrics + confusion matrix.
    y_val_pred = (s_val >= best_thr).astype(int)
    val_metrics = _split_metrics(y_val, s_val, y_val_pred, best_thr)
    _plot_confusion(y_val, y_val_pred, out_dir / "cm_validation.png", "Matriz de Confusão – Validação")

    # 5.2) Test metrics + confusion matrix (threshold fixed from validation).
    y_test_pred = (s_test >= best_thr).astype(int)
    test_metrics = _split_metrics(y_test, s_test, y_test_pred, best_thr)
    _plot_confusion(y_test, y_test_pred, out_dir / "cm_test.png", "Matriz de Confusão – Teste")

    # 5.3) AUC over time, ordered by each athlete's first timestamp.
    # X_val / X_test keep df's index LABELS, so select rows by label (.loc),
    # not by position.
    df_val = df.loc[X_val.index].copy()
    df_test = df.loc[X_test.index].copy()
    for d in [df_val, df_test]:
        no_date = d["first_timestamp"].isna()
        if no_date.any():
            # Deterministic fallback: consecutive placeholder days from 2000-01-01.
            d.loc[no_date, "first_timestamp"] = pd.date_range("2000-01-01", periods=int(no_date.sum()), freq="D")

    _plot_auc_over_time(df_val["first_timestamp"], y_val, s_val, out_dir / "auc_over_time_val.png", title="AUC ao longo do tempo – Validação")
    _plot_auc_over_time(df_test["first_timestamp"], y_test, s_test, out_dir / "auc_over_time_test.png", title="AUC ao longo do tempo – Teste")

    # 5.4) Full precision-recall curves.
    _pr_curve_to_df(y_val, s_val).to_csv(out_dir / "precision_recall_curve_val.csv", index=False)
    _pr_curve_to_df(y_test, s_test).to_csv(out_dir / "precision_recall_curve_test.csv", index=False)

    # Model bundle: preprocessing, selector, model and threshold travel together.
    bundle = {
        "prep": prep,
        "selector": selector,
        "model": final_clf,
        "threshold": best_thr,
        "selected_feature_names": sel_names.tolist(),
        "val_metrics": val_metrics,
        "test_metrics": test_metrics,
    }
    model_path = out_dir / "model_apto_genero_bundle.pkl"
    joblib.dump(bundle, model_path)

    # Precision/recall summary at the chosen threshold.
    pr_summary = pd.DataFrame([{"split":"validação", **_pr_summary_row(y_val, s_val, best_thr)},
                               {"split":"teste",     **_pr_summary_row(y_test, s_test, best_thr)}])
    pr_summary.to_csv(out_dir / "precision_recall_summary.csv", index=False)

    return {"model_path": str(model_path),
            "val_metrics": val_metrics, "test_metrics": test_metrics}


Writing marathon/modeling.py


In [23]:
#Passo 6 — marathon/pipeline.py (orquestra tudo)

In [25]:
%%writefile marathon/pipeline.py
# -*- coding: utf-8 -*-
import json
import pandas as pd
from pathlib import Path
from .config import EFFORT_MULT
from .data import load_raw, aggregate_athlete
from .effort import fit_hr_speed, classify_effort_with_hr
from .weeks import estimate_speed_slope_weeks, estimate_weeks_to_target
from .modeling import train_apto_classifier

def distributions(df_final: pd.DataFrame) -> dict:
    """Summarise the final athlete table as plain counts.

    Returns a dict with the number of athletes (``total``), counts of the
    ``apto_genero`` flag as "Sim"/"Não" (``apto``), counts per effort label
    (``esforco``) and counts per weeks-to-target bucket (``semanas``, every
    bucket always present, zero when empty).
    """
    week_ranges = (
        (1, 4, "1–4"),
        (5, 8, "5–8"),
        (9, 12, "9–12"),
        (13, 16, "13–16"),
        (17, 20, "17–20"),
    )
    bucket_order = ["0 (já atinge)"] + [label for _, _, label in week_ranges] + ["21–24", "Sem estimativa"]

    def _week_bucket(value):
        if pd.isna(value):
            return "Sem estimativa"
        whole_weeks = int(value)
        if whole_weeks == 0:
            return "0 (já atinge)"
        for low, high, label in week_ranges:
            if low <= whole_weeks <= high:
                return label
        # Anything outside the ranges above lands in the last bucket.
        return "21–24"

    apto_counts = df_final["apto_genero"].map({1: "Sim", 0: "Não"}).value_counts()
    effort_counts = df_final["esforco_genero_hr"].value_counts()
    week_counts = df_final["weeks_to_target_genero_est_hr"].apply(_week_bucket).value_counts()

    summary = {"total": len(df_final)}
    summary["apto"] = {label: int(count) for label, count in apto_counts.items()}
    summary["esforco"] = {label: int(count) for label, count in effort_counts.items()}
    summary["semanas"] = {label: int(week_counts.get(label, 0)) for label in bucket_order}
    return summary

def run_pipeline(raw_path: str, out_dir: str, train_model: bool = True):
    """Run the whole analysis and write every artifact to ``out_dir``.

    Stages: load runs -> per-athlete aggregation -> HR/speed fit and effort
    classification -> speed slope and weeks-to-target estimate -> summary ->
    (optionally) classifier training.

    Args:
        raw_path: Path to the raw runs CSV.
        out_dir: Output directory; created if it does not exist.
        train_model: When True, also train and persist the classifier.

    Returns:
        ``(summary, model_info)`` where ``summary`` is the dict produced by
        ``distributions`` and ``model_info`` is the dict returned by
        ``train_apto_classifier`` (``None`` when ``train_model`` is False).

    Files written: athlete_features_gender.csv, athlete_features_gender_hr.csv,
    athlete_final_weeks_gender_hr.csv, summary.json, plus the model artifacts
    when training is enabled.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    runs = load_raw(raw_path)

    agg = aggregate_athlete(runs)
    agg.to_csv(out_dir / "athlete_features_gender.csv", index=False)

    fits = fit_hr_speed(runs)
    agg_hr = classify_effort_with_hr(agg, fits)
    agg_hr.to_csv(out_dir / "athlete_features_gender_hr.csv", index=False)

    slopes = estimate_speed_slope_weeks(runs)
    final = estimate_weeks_to_target(agg_hr, slopes, EFFORT_MULT)

    keep = ["athlete_id","gender","apto_genero","esforco_genero_hr",
            "best_pred_marathon_s","target_s_gender",
            "v_pred_mps","v_needed_mps","esforco_extra_pct_genero",
            "hr_p80","hr_p90","hr_needed_bpm",
            "num_runs","total_km","longest_run_km","median_pace_s_per_km","best_pace_s_per_km","avg_hr_bpm",
            "mean_elev_gain_per_km","share_runs_ge_15k",
            "slope_speed_mps_per_week","weeks_speed_goal","weeks_long_run","weeks_to_target_genero_est_hr"]
    final[keep].to_csv(out_dir / "athlete_final_weeks_gender_hr.csv", index=False)

    summary = distributions(final)
    # ensure_ascii=False emits accented characters and en dashes verbatim, so
    # the file must be written as UTF-8 explicitly; the platform default
    # (e.g. a Windows code page) would not produce valid UTF-8 JSON.
    summary_json = json.dumps(summary, indent=2, ensure_ascii=False)
    (out_dir / "summary.json").write_text(summary_json, encoding="utf-8")

    model_info = None
    if train_model:
        model_info = train_apto_classifier(agg_hr, out_dir, runs)

    return summary, model_info


Writing marathon/pipeline.py


In [None]:
#Passo 7 — Usar no notebook (executar pipeline por partes)

In [27]:
from pathlib import Path

from marathon.pipeline import run_pipeline

# Resolve the input/output locations from the current user's home directory
# instead of a hard-coded, machine-specific absolute path.
DOCUMENTS_DIR = Path.home() / "Documents"
RAW_PATH = DOCUMENTS_DIR / "raw-data-kaggle.csv"
OUT_DIR = DOCUMENTS_DIR / "output"

summary, model_info = run_pipeline(
    raw_path=str(RAW_PATH),
    out_dir=str(OUT_DIR),
    train_model=True
)

summary, model_info

({'total': 116,
  'apto': {'Não': 59, 'Sim': 57},
  'esforco': {'Já atinge': 57, 'Moderado': 35, 'Alto': 20, 'Baixo': 4},
  'semanas': {'0 (já atinge)': 57,
   '1–4': 3,
   '5–8': 2,
   '9–12': 3,
   '13–16': 3,
   '17–20': 1,
   '21–24': 47,
   'Sem estimativa': 0}},
 {'model_path': 'C:\\Users\\gtvca\\Documents\\output\\model_apto_genero_bundle.pkl',
  'val_metrics': {'roc_auc': 0.8636363636363635,
   'accuracy': 0.782608695652174,
   'precision': 0.6875,
   'recall': 1.0,
   'f1': 0.8148148148148148,
   'threshold': 0.18},
  'test_metrics': {'roc_auc': 0.9027777777777778,
   'accuracy': 0.7916666666666666,
   'precision': 0.7692307692307693,
   'recall': 0.8333333333333334,
   'f1': 0.8,
   'threshold': 0.18}})