# Rookie Invest – Live Prototype Demo

Ablauf:
1. Eine Saison wird als CSV in den Input-Ordner gelegt
2. Das Modell erzeugt ein Ranking
3. Der Output wird als HTML gespeichert

Der Input enthält **keine Information**, ob ein Fahrer später in die F1 kam.


In [91]:
#Setup & Pfade
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
import re

# Projektstruktur
DEMO_ROOT = Path.cwd().parent / "demo"
INPUT_DIR = DEMO_ROOT / "input"
OUTPUT_DIR = DEMO_ROOT / "output"
ARTIFACT_DIR = DEMO_ROOT / "artifacts"

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Genau eine Input-CSV erwarten
input_files = list(INPUT_DIR.glob("*.csv"))
if len(input_files) != 1:
    raise ValueError(f"Bitte GENAU eine CSV in demo/input ablegen. Gefunden: {len(input_files)}")

INPUT_PATH = input_files[0]

MODEL_PATH = ARTIFACT_DIR / "logreg_model.joblib"
DROP_COLS_PATH = ARTIFACT_DIR / "drop_cols.txt"
VALIDATION_LOOKUP_PATH = ARTIFACT_DIR / "validation_lookup.csv"

print("Input:", INPUT_PATH.name)
print("Output Ordner:", OUTPUT_DIR.resolve())

# Jahr aus Dateiname extrahieren
m = re.search(r"(19|20)\d{2}", INPUT_PATH.name)
year_label = m.group(0) if m else "Unknown Year"


Input: drivers_2023.csv
Output Ordner: /Users/sheyla/Desktop/rookie_invest_ML/demo/output


In [92]:
from pathlib import Path
import pandas as pd

DEMO_ROOT = Path.cwd().parent / "demo"
ARTIFACT_DIR = DEMO_ROOT / "artifacts"
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

out_path = ARTIFACT_DIR / "validation_lookup.csv"

# Wir suchen im ganzen Projekt nach CSVs, die f1_entry enthalten
PROJECT_ROOT = Path.cwd().parent
csv_files = list(PROJECT_ROOT.rglob("*.csv"))

found = []
for fp in csv_files:
    try:
        cols = pd.read_csv(fp, nrows=1).columns.str.lower().tolist()
        if "f1_entry" in cols and "driver_code" in cols:
            found.append(fp)
    except Exception:
        pass

print("Gefundene CSVs mit driver_code + f1_entry:", len(found))
for p in found[:10]:
    print("  ", p)

if not found:
    raise FileNotFoundError(
        "Ich finde keine CSV im Projekt, die driver_code und f1_entry enthält. "
        "Dann kann ich die Validierung nicht automatisch bauen."
    )

# Nimm die erste gefundene Datei als Quelle
source = found[0]
df = pd.read_csv(source)

# Normalisieren
df["driver_code"] = df["driver_code"].astype(str).str.upper().str.strip()

val = df[["driver_code", "f1_entry"]].copy()

# Falls first_f1_year existiert, nehmen wir es optional mit
if "first_f1_year" in df.columns:
    val["first_f1_year"] = df["first_f1_year"]

val = val.dropna(subset=["driver_code"]).drop_duplicates(subset=["driver_code"], keep="last")

val.to_csv(out_path, index=False, encoding="utf-8")
print("validation_lookup.csv geschrieben nach:", out_path.resolve())
print("Rows:", len(val), "Cols:", val.columns.tolist())


Gefundene CSVs mit driver_code + f1_entry: 4
   /Users/sheyla/Desktop/rookie_invest_ML/demo/artifacts/validation_lookup.csv
   /Users/sheyla/Desktop/rookie_invest_ML/data/model_input/f2_f3_features_with_f1_label.csv
   /Users/sheyla/Desktop/rookie_invest_ML/data/model_input/splits/test_after_2021.csv
   /Users/sheyla/Desktop/rookie_invest_ML/data/model_input/splits/train_upto_2021.csv
validation_lookup.csv geschrieben nach: /Users/sheyla/Desktop/rookie_invest_ML/demo/artifacts/validation_lookup.csv
Rows: 181 Cols: ['driver_code', 'f1_entry', 'first_f1_year']


In [93]:
#Input laden
df_in = pd.read_csv(INPUT_PATH)
print("Geladene Fahrer:", len(df_in))
display(df_in.head(3))


Geladene Fahrer: 59


Unnamed: 0,series,year,driver_name,driver_code,team_name,n_races,total_points,avg_points,avg_finish,best_finish,...,points_rate,top10_finishes,top10_rate,total_laps,avg_kph,finish_std,points_std,dnf_count,dnf_rate,avg_best_lap_s
0,F2,2023,A Cordeel,ACO,Invicta Virtuosi Racing,13,8.0,0.615385,16.461538,8,...,0.153846,2,0.153846,387.0,166.443923,4.611858,1.502135,1.0,0.076923,98.393231
1,F2,2023,P Aron,ARO,Trident,1,0.0,0.0,18.0,18,...,0.0,0,0.0,32.0,178.363,,,0.0,0.0,99.55
2,F2,2023,O Bearman,BEA,PREMA Racing,13,96.0,7.384615,9.769231,1,...,0.538462,7,0.538462,392.0,167.555846,6.78422,10.484421,2.0,0.153846,97.591923


In [94]:
#Modell & Drop-Spalten laden
logreg_model = joblib.load(MODEL_PATH)

drop_cols = set(DROP_COLS_PATH.read_text(encoding="utf-8").splitlines())
drop_cols = {c.strip() for c in drop_cols if c.strip()}

print("Drop columns:", drop_cols)


Drop columns: {'first_f1_year', 'f1_entry', 'team_name', 'driver_code', 'driver_name', 'year', 'series'}


In [95]:
#Prediction & Ranking
X = df_in.drop(columns=list(drop_cols), errors="ignore")

proba = logreg_model.predict_proba(X)[:, 1]

df_rank = df_in.copy()
df_rank["predicted_probability"] = proba

df_rank = (
    df_rank
    .sort_values("predicted_probability", ascending=False)
    .reset_index(drop=True)
)

display(df_rank.head(10))


  ret = a @ b
  ret = a @ b
  ret = a @ b


Unnamed: 0,series,year,driver_name,driver_code,team_name,n_races,total_points,avg_points,avg_finish,best_finish,...,top10_finishes,top10_rate,total_laps,avg_kph,finish_std,points_std,dnf_count,dnf_rate,avg_best_lap_s,predicted_probability
0,F3,2023,G Bortoleto,BOR,Trident,9,,,7.777778,3,...,7,0.777778,200.0,158.821333,3.865805,,,,101.193111,0.701703
1,F3,2023,J Martí,MAR,Campos Racing,9,,,8.444444,2,...,6,0.666667,194.0,159.735556,4.034573,,,,101.476,0.648018
2,F2,2023,F Vesti,VES,PREMA Racing,13,130.0,10.0,9.846154,1,...,8,0.615385,298.0,164.282727,9.109026,9.433981,5.0,0.384615,93.9119,0.507588
3,F2,2023,J Doohan,DOO,Invicta Virtuosi Racing,13,137.0,10.538462,8.153846,1,...,9,0.692308,374.0,174.497083,7.459291,9.946653,2.0,0.153846,98.34225,0.49786
4,F2,2023,A Iwasa,IWA,DAMS,13,124.0,9.538462,7.0,1,...,10,0.769231,406.0,166.705385,5.802298,8.078937,1.0,0.076923,95.429083,0.490915
5,F2,2023,T Pourchaire,POU,ART Grand Prix,13,154.0,11.846154,5.769231,1,...,11,0.846154,401.0,169.502538,5.599908,7.548136,1.0,0.076923,97.391154,0.466901
6,F2,2023,Z Maloney,MAL,Carlin,13,88.0,6.769231,10.076923,2,...,6,0.461538,410.0,168.708923,6.873397,7.88621,2.0,0.153846,97.547692,0.361898
7,F3,2023,D Beganovic,BEG,PREMA Racing,9,,,10.555556,3,...,5,0.555556,200.0,158.927222,7.485171,,,,101.357444,0.357259
8,F3,2023,Z O'Sullivan,OSU,PREMA Racing,9,,,10.444444,2,...,5,0.555556,200.0,158.820444,6.207075,,,,101.428889,0.346186
9,F2,2023,E Fittipaldi,FIT,Carlin,13,95.0,7.307692,8.461538,2,...,10,0.769231,401.0,173.390923,6.239864,6.549613,2.0,0.153846,97.527154,0.338608


In [96]:
#Top-N auswählen + ex-post Validierung
top_n = 20
tbl = df_rank.head(top_n).copy()

# --- EX POST VALIDIERUNG (nicht Teil des Inputs fürs Modell) ---
tbl["hit"] = ""

if VALIDATION_LOOKUP_PATH.exists():
    val = pd.read_csv(VALIDATION_LOOKUP_PATH)

    # Normalisieren
    val["driver_code"] = val["driver_code"].astype(str).str.upper().str.strip()
    tbl_codes = tbl["driver_code"].astype(str).str.upper().str.strip()

    lookup = val.set_index("driver_code")["f1_entry"]
    hit_raw = tbl_codes.map(lookup)

    hit_mask = (
        hit_raw.fillna(False)
        .astype(str)
        .str.strip()
        .str.lower()
        .isin(["true", "1", "yes", "y", "t"])
    )

    tbl.loc[hit_mask, "hit"] = "✅"
else:
    print("Hinweis: validation_lookup.csv fehlt, daher keine ✅ möglich.")



In [97]:
print("VALIDATION_LOOKUP_PATH:", VALIDATION_LOOKUP_PATH)
print("Exists:", VALIDATION_LOOKUP_PATH.exists())

if VALIDATION_LOOKUP_PATH.exists():
    val = pd.read_csv(VALIDATION_LOOKUP_PATH)
    print("Validation columns:", val.columns.tolist())
    display(val.head(5))


VALIDATION_LOOKUP_PATH: /Users/sheyla/Desktop/rookie_invest_ML/demo/artifacts/validation_lookup.csv
Exists: True
Validation columns: ['driver_code', 'f1_entry', 'first_f1_year']


Unnamed: 0,driver_code,f1_entry,first_f1_year
0,BIN,False,
1,CAN,False,
2,CEC,False,
3,JEF,False,
4,KIN,False,


In [98]:
#Nur Demo-Spalten behalten
preferred = [
    "driver_name",
    "driver_code",
    "series",
    "year",
    "team_name",
    "predicted_probability",
    "hit",
]

tbl = tbl[[c for c in preferred if c in tbl.columns]]
display(tbl)


Unnamed: 0,driver_name,driver_code,series,year,team_name,predicted_probability,hit
0,G Bortoleto,BOR,F3,2023,Trident,0.701703,
1,J Martí,MAR,F3,2023,Campos Racing,0.648018,
2,F Vesti,VES,F2,2023,PREMA Racing,0.507588,
3,J Doohan,DOO,F2,2023,Invicta Virtuosi Racing,0.49786,
4,A Iwasa,IWA,F2,2023,DAMS,0.490915,
5,T Pourchaire,POU,F2,2023,ART Grand Prix,0.466901,
6,Z Maloney,MAL,F2,2023,Carlin,0.361898,
7,D Beganovic,BEG,F3,2023,PREMA Racing,0.357259,
8,Z O'Sullivan,OSU,F3,2023,PREMA Racing,0.346186,
9,E Fittipaldi,FIT,F2,2023,Carlin,0.338608,


In [99]:
#HTML erzeugen
title = f"Rookie Invest Prototype Demo – Top Kandidaten {year_label}"
out_path = OUTPUT_DIR / "top_candidates.html"

html = f"""
<html>
<head>
  <meta charset="utf-8"/>
  <title>{title}</title>
  <style>
    body {{ font-family: Arial, sans-serif; margin: 24px; }}
    h2 {{ margin-bottom: 8px; }}
    p {{ margin-top: 0; color: #444; }}
    table {{ border-collapse: collapse; width: 100%; }}
    th, td {{ padding: 10px; border-bottom: 1px solid #eee; text-align: left; }}
    th {{ background: #111; color: white; }}
    tr:hover {{ background: #f5f5f5; }}
  </style>
</head>
<body>
  <h2>{title}</h2>
  <p>Ranking basiert auf Modellwahrscheinlichkeit. Input enthält keine Information über F1-Eintritt.</p>
  <p><small>✅ markiert einen bestätigten F1-Einstieg (ex-post, nicht Teil des Inputs).</small></p>
  {tbl.to_html(index=False, escape=False)}
</body>
</html>
"""

out_path.write_text(html, encoding="utf-8")
print("HTML erzeugt:", out_path.resolve())


HTML erzeugt: /Users/sheyla/Desktop/rookie_invest_ML/demo/output/top_candidates.html
