# Rookie Invest – Live Prototype Demo

Ablauf:
1. Eine Saison wird als CSV in den Input-Ordner gelegt
2. Das Modell erzeugt ein Ranking
3. Der Output wird als HTML gespeichert

Der Input enthält **keine Information**, ob ein Fahrer später in die F1 kam.


In [1]:
# --- Notebook setup: src sicher finden und zum Import-Pfad hinzufügen ---
import sys
from pathlib import Path

HERE = Path.cwd()

# nach oben laufen, bis wir src finden
PROJECT_ROOT = None
for parent in [HERE] + list(HERE.parents):
    if (parent / "src").exists():
        PROJECT_ROOT = parent
        break

if PROJECT_ROOT is None:
    raise RuntimeError("Konnte 'src' Ordner nicht finden. Prüfe Projektstruktur.")

SRC_PATH = PROJECT_ROOT / "src"

if str(SRC_PATH) not in sys.path:
    sys.path.insert(0, str(SRC_PATH))

print("Projekt-Root:", PROJECT_ROOT)
print("src im sys.path:", str(SRC_PATH) in sys.path)


Projekt-Root: /Users/sheyla/Desktop/rookie_invest_ML
src im sys.path: True


In [2]:
from pathlib import Path

print("Working directory:", Path.cwd())
print("Demo folder exists:", (Path.cwd() / "demo").exists())
print("Artifacts exist:", (Path.cwd() / "demo" / "artifacts").exists())


Working directory: /Users/sheyla/Desktop/rookie_invest_ML/demo
Demo folder exists: False
Artifacts exist: False


In [3]:
#Setup & Pfade
from pathlib import Path
import pandas as pd
import numpy as np
import joblib
import re

# Projektstruktur
DEMO_ROOT = Path.cwd().parent / "demo"
INPUT_DIR = DEMO_ROOT / "input"
OUTPUT_DIR = DEMO_ROOT / "output"
ARTIFACT_DIR = DEMO_ROOT / "artifacts"

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Genau eine Input-CSV erwarten
input_files = list(INPUT_DIR.glob("*.csv"))
if len(input_files) != 1:
    raise ValueError(f"Bitte GENAU eine CSV in demo/input ablegen. Gefunden: {len(input_files)}")

INPUT_PATH = input_files[0]

MODEL_PATH = ARTIFACT_DIR / "logreg_model.joblib"
DROP_COLS_PATH = ARTIFACT_DIR / "drop_cols.txt"
VALIDATION_LOOKUP_PATH = ARTIFACT_DIR / "validation_lookup.csv"

print("Input:", INPUT_PATH.name)
print("Output Ordner:", OUTPUT_DIR.resolve())

# Jahr aus Dateiname extrahieren
m = re.search(r"(19|20)\d{2}", INPUT_PATH.name)
year_label = m.group(0) if m else "Unknown Year"


Input: drivers_2019.csv
Output Ordner: /Users/sheyla/Desktop/rookie_invest_ML/demo/output


In [4]:
from pathlib import Path
import pandas as pd

DEMO_ROOT = Path.cwd().parent / "demo"
ARTIFACT_DIR = DEMO_ROOT / "artifacts"
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

out_path = ARTIFACT_DIR / "validation_lookup.csv"

# Wir suchen im ganzen Projekt nach CSVs, die f1_entry enthalten
PROJECT_ROOT = Path.cwd().parent
csv_files = list(PROJECT_ROOT.rglob("*.csv"))

found = []
for fp in csv_files:
    try:
        cols = pd.read_csv(fp, nrows=1).columns.str.lower().tolist()
        if "f1_entry" in cols and "driver_code" in cols:
            found.append(fp)
    except Exception:
        pass

print("Gefundene CSVs mit driver_code + f1_entry:", len(found))
for p in found[:10]:
    print("  ", p)

if not found:
    raise FileNotFoundError(
        "Ich finde keine CSV im Projekt, die driver_code und f1_entry enthält. "
        "Dann kann ich die Validierung nicht automatisch bauen."
    )

# Nimm die erste gefundene Datei als Quelle
source = found[0]
df = pd.read_csv(source)

# Normalisieren
df["driver_code"] = df["driver_code"].astype(str).str.upper().str.strip()

val = df[["driver_code", "f1_entry"]].copy()

# Falls first_f1_year existiert, nehmen wir es optional mit
if "first_f1_year" in df.columns:
    val["first_f1_year"] = df["first_f1_year"]

val = val.dropna(subset=["driver_code"]).drop_duplicates(subset=["driver_code"], keep="last")

val.to_csv(out_path, index=False, encoding="utf-8")
print("validation_lookup.csv geschrieben nach:", out_path.resolve())
print("Rows:", len(val), "Cols:", val.columns.tolist())


Gefundene CSVs mit driver_code + f1_entry: 4
   /Users/sheyla/Desktop/rookie_invest_ML/demo/artifacts/validation_lookup.csv
   /Users/sheyla/Desktop/rookie_invest_ML/data/model_input/f2_f3_features_with_f1_label.csv
   /Users/sheyla/Desktop/rookie_invest_ML/data/model_input/splits/test_after_2021.csv
   /Users/sheyla/Desktop/rookie_invest_ML/data/model_input/splits/train_upto_2021.csv
validation_lookup.csv geschrieben nach: /Users/sheyla/Desktop/rookie_invest_ML/demo/artifacts/validation_lookup.csv
Rows: 181 Cols: ['driver_code', 'f1_entry', 'first_f1_year']


In [5]:
#Input laden
df_in = pd.read_csv(INPUT_PATH)
print("Geladene Fahrer:", len(df_in))
display(df_in.head(3))


Geladene Fahrer: 60


Unnamed: 0,series,year,driver_name,driver_code,team_name,n_races,total_points,avg_points,avg_finish,best_finish,...,points_rate,top10_finishes,top10_rate,total_laps,avg_kph,finish_std,points_std,dnf_count,dnf_rate,avg_best_lap_s
0,F2,2019,A Cordeel,ACO,Rodin Motorsport,12,4.0,0.333333,14.833333,8,...,0.083333,1,0.083333,293.0,165.68975,3.76185,1.154701,2.0,0.166667,102.038333
1,F2,2019,D Beganovic,BEG,Hitech TGR,14,81.0,5.785714,8.357143,3,...,0.714286,10,0.714286,417.0,168.400143,4.27168,5.726841,1.0,0.071429,101.171357
2,F2,2019,J Bennett,BEN,Van Amersfoort Racing,14,1.0,0.071429,16.0,10,...,0.071429,1,0.071429,398.0,166.480357,3.551814,0.267261,1.0,0.071429,101.650929


In [6]:
#Modell & Drop-Spalten laden
logreg_model = joblib.load(MODEL_PATH)

drop_cols = set(DROP_COLS_PATH.read_text(encoding="utf-8").splitlines())
drop_cols = {c.strip() for c in drop_cols if c.strip()}

print("Drop columns:", drop_cols)


Drop columns: {'team_name', 'first_f1_year', 'f1_entry', 'driver_code', 'series', 'driver_name', 'year'}


In [7]:
#Prediction & Ranking
X = df_in.drop(columns=list(drop_cols), errors="ignore")

proba = logreg_model.predict_proba(X)[:, 1]

df_rank = df_in.copy()
df_rank["predicted_probability"] = proba

df_rank = (
    df_rank
    .sort_values("predicted_probability", ascending=False)
    .reset_index(drop=True)
)

display(df_rank.head(10))


  ret = a @ b
  ret = a @ b
  ret = a @ b


Unnamed: 0,series,year,driver_name,driver_code,team_name,n_races,total_points,avg_points,avg_finish,best_finish,...,top10_finishes,top10_rate,total_laps,avg_kph,finish_std,points_std,dnf_count,dnf_rate,avg_best_lap_s,predicted_probability
0,F3,2019,R Shwartzman,SHW,Prema Racing,8,,,7.25,2,...,7,0.875,162.0,175.2225,2.866058,,,,105.754125,0.857013
1,F3,2019,J Hughes,HUG,HWA RACELAB,8,,,8.875,2,...,6,0.75,133.0,175.151857,8.838835,,,,107.652714,0.779402
2,F3,2019,J Daruvala,DAR,Prema Racing,8,,,9.875,3,...,5,0.625,166.0,174.977125,5.462535,,,,105.86625,0.64278
3,F2,2019,L Fornaroli,FOR,Invicta Racing,14,167.0,11.928571,5.428571,1,...,12,0.857143,416.0,169.220857,5.139804,6.944442,1.0,0.071429,101.378,0.586735
4,F2,2019,A Dunne,DUN,Rodin Motorsport,14,120.0,8.571429,9.571429,1,...,10,0.714286,344.0,179.684692,8.121333,9.685153,5.0,0.357143,102.834,0.576847
5,F2,2019,L Browning,BRO,Hitech TGR,14,144.0,10.285714,7.0,1,...,11,0.785714,419.0,168.228071,6.312381,7.858641,0.0,0.0,101.285214,0.572475
6,F3,2019,M Armstrong,ARM,Prema Racing,8,,,10.625,3,...,5,0.625,167.0,175.3275,6.435116,,,,105.585375,0.564683
7,F3,2019,Y Tsunoda,TSU,Jenzer Motorsport,8,,,9.875,2,...,3,0.375,166.0,174.701625,4.733996,,,,106.14,0.564042
8,F2,2019,R Verschoor,VER,MP Motorsport,14,119.0,8.5,7.642857,1,...,11,0.785714,404.0,178.005615,5.429367,8.3551,1.0,0.071429,102.622308,0.522129
9,F2,2019,J Crawford,CRA,DAMS Lucas Oil,14,141.0,10.071429,7.785714,1,...,9,0.642857,399.0,169.377071,6.784355,10.21661,1.0,0.071429,101.718643,0.519102


In [8]:
# =========================
# ONE CELL: Top-N + ex-post Hit + HTML (ganze Zeile grün, Prozent, ohne Index-Spalte)
# =========================

import pandas as pd

top_n = 20
out_path = OUTPUT_DIR / "top_candidates.html"
title = f"Rookie Invest Prototype Demo – Top Kandidaten {year_label}"

# Top-N aus Ranking (Index sauber neu setzen)
tbl = df_rank.head(top_n).copy().reset_index(drop=True)

# Probability als Prozent anzeigen (z.B. 0.905 -> 90.5%)
if "predicted_probability" in tbl.columns:
    tbl["predicted_probability"] = tbl["predicted_probability"].map(lambda x: f"{float(x)*100:.1f}%")

# Ex-post Flag lokal erzeugen
is_hit = pd.Series([False] * len(tbl), index=tbl.index)

if VALIDATION_LOOKUP_PATH.exists() and "driver_code" in tbl.columns:
    val = pd.read_csv(VALIDATION_LOOKUP_PATH)

    if {"driver_code", "f1_entry"}.issubset(val.columns):
        val_codes = val["driver_code"].astype(str).str.upper().str.strip()
        val_entries = (
            val["f1_entry"]
            .fillna(False)
            .astype(str).str.strip().str.lower()
            .isin(["true", "1", "yes", "y", "t"])
        )

        lookup = pd.Series(val_entries.values, index=val_codes.values)

        tbl_codes = tbl["driver_code"].astype(str).str.upper().str.strip()
        is_hit = tbl_codes.map(lookup).fillna(False).astype(bool)

preferred = ["driver_name", "driver_code", "series", "year", "team_name", "predicted_probability"]
tbl_out = tbl[[c for c in preferred if c in tbl.columns]].copy()

# Grün etwas dunkler
HIT_BG = "#d9f0e0"

def highlight_hits(row):
    if is_hit.loc[row.name]:
        return [f"background-color: {HIT_BG}"] * len(row)
    return [""] * len(row)

styled = (
    tbl_out.style
    .apply(highlight_hits, axis=1)
    .hide(axis="index")  # keine linke Index-Spalte mehr
    .set_table_styles([
        {"selector": "th", "props": [("background-color", "#111"), ("color", "white"), ("padding", "10px")]},
        {"selector": "td", "props": [("padding", "10px"), ("border-bottom", "1px solid #eee")]},
        {"selector": "table", "props": [("border-collapse", "collapse"), ("width", "100%")]},
    ])
)

html = f"""
<html>
<head>
  <meta charset="utf-8"/>
  <title>{title}</title>
</head>
<body style="font-family: Arial, sans-serif; margin: 24px;">
  <h2>{title}</h2>
  <p>Ranking basiert auf Modellwahrscheinlichkeit. Input enthält keine Information über F1-Eintritt.</p>
  <p><small>Grün markiert: bestätigter F1-Einstieg ex post, nicht Teil des Modell Inputs.</small></p>
  {styled.to_html()}
</body>
</html>
"""

out_path.write_text(html, encoding="utf-8")
print("HTML erzeugt:", out_path.resolve())


HTML erzeugt: /Users/sheyla/Desktop/rookie_invest_ML/demo/output/top_candidates.html


In [9]:
# --- DEMO OUTPUT: ML-only view ---
preferred_ml = [
    "driver_name",
    "driver_code",
    "series",
    "year",
    "team_name",
    "predicted_probability",
    "hit",
]

tbl_ml = tbl[[c for c in preferred_ml if c in tbl.columns]]
display(tbl_ml)


Unnamed: 0,driver_name,driver_code,series,year,team_name,predicted_probability
0,R Shwartzman,SHW,F3,2019,Prema Racing,85.7%
1,J Hughes,HUG,F3,2019,HWA RACELAB,77.9%
2,J Daruvala,DAR,F3,2019,Prema Racing,64.3%
3,L Fornaroli,FOR,F2,2019,Invicta Racing,58.7%
4,A Dunne,DUN,F2,2019,Rodin Motorsport,57.7%
5,L Browning,BRO,F2,2019,Hitech TGR,57.2%
6,M Armstrong,ARM,F3,2019,Prema Racing,56.5%
7,Y Tsunoda,TSU,F3,2019,Jenzer Motorsport,56.4%
8,R Verschoor,VER,F2,2019,MP Motorsport,52.2%
9,J Crawford,CRA,F2,2019,DAMS Lucas Oil,51.9%


In [11]:
# --- HYBRID OUTPUT (Anzeige): ML Prediction + Knowledge Base Context ---
# Erwartet, dass es bereits einen DataFrame mit ML-Prediction gibt:
# bevorzugt: tbl (dein ML-only Output DataFrame)
# fallback: df_in (Input DataFrame), falls tbl nicht existiert
# sowie: OUTPUT_DIR (Path) wie in deinem Notebook

import pandas as pd
from knowledge_base.racing_intelligence_engine import RacingIntelligenceEngine

# 1) Basis wählen (ML Output nicht verändern)
if "tbl" in globals():
    base = tbl.copy()
elif "df_in" in globals():
    base = df_in.copy()
else:
    raise NameError("Weder 'tbl' noch 'df_in' existieren. Führe zuerst die ML-Prediction-Zellen aus.")

# 2) Sicherstellen, dass predicted_probability existiert
if "predicted_probability" not in base.columns:
    # manchmal heisst es anders
    if "ml_probability" in base.columns:
        base["predicted_probability"] = base["ml_probability"]
    else:
        raise ValueError("Keine Spalte 'predicted_probability' gefunden. Prüfe, wie du die Prediction-Spalte nennst.")

# 3) KB Engine initialisieren
kb_engine = RacingIntelligenceEngine()

def _safe_get(row, key, default):
    v = row.get(key, default)
    if pd.isna(v):
        return default
    return v

def kb_context(row):
    """
    Wir nutzen hier Defaults für Felder, die in eurem Performance-Datensatz nicht vorhanden sind.
    Das Ziel ist Anzeige/Einordnung, nicht Training.
    """
    driver_input = {
        "age": int(_safe_get(row, "age", 20)),
        "nationality": str(_safe_get(row, "nationality", "unknown")),
        "superlicense_points": float(_safe_get(row, "superlicense_points", 0)),
        "junior_series_years": float(_safe_get(row, "junior_series_years", 0)),
        "years_in_f3": float(_safe_get(row, "years_in_f3", 0)),
        "previous_series": str(_safe_get(row, "previous_series", "")),
        "social_media_behavior": str(_safe_get(row, "social_media_behavior", "neutral")),
        "weight_kg": float(_safe_get(row, "weight_kg", 70)),
        "neck_cm": float(_safe_get(row, "neck_cm", 42)),
        "sponsor_capital_chf": float(_safe_get(row, "sponsor_capital_chf", 0)),
    }

    team_input = {"team_name": str(_safe_get(row, "team_name", ""))}
    vehicle_input = {"engine_status": "ok", "drs_active": False, "tire_status": "ok"}

    return kb_engine.generate_full_profile(driver_input, team_input, vehicle_input)

# 4) KB Features berechnen und anfügen
kb_df = base.apply(kb_context, axis=1, result_type="expand")
hybrid = pd.concat([base.reset_index(drop=True), kb_df.reset_index(drop=True)], axis=1)

# 5) Anzeige-Spalten wählen (nur wenn vorhanden)
preferred_cols = [
    "driver_name",
    "driver_code",
    "series",
    "year",
    "team_name",
    "predicted_probability",
    # KB Kontext (wenn vorhanden)
    "financial_viability",
    "team_political_power",
    "f1_marketing_boost",
    "phys_neck_strength",
    "f3_pathway_score",
    "f1_qualified",
]

show_cols = [c for c in preferred_cols if c in hybrid.columns]

# 6) Sortieren + Top N auswählen + optional Prozent-Spalte
hybrid = hybrid.sort_values("predicted_probability", ascending=False)
hybrid_view = hybrid[show_cols].head(25).copy()

# predicted_probability robust normalisieren
pp = hybrid_view["predicted_probability"].astype(str).str.replace("%", "", regex=False)
pp = pd.to_numeric(pp, errors="coerce")

# immer als Prozent anzeigen
hybrid_view = hybrid_view.drop(columns=["predicted_probability"])



# 7) HTML schreiben (zweiter Output)
out_path2 = OUTPUT_DIR / "top_candidates_with_context.html"
styled = (
    hybrid_view.style
    .format({"predicted_probability_pct": "{:.1f}%"})
    .set_table_styles([
        {"selector": "th", "props": [("background-color", "#111827"), ("color", "white"), ("padding", "8px"), ("text-align", "left")]},
        {"selector": "td", "props": [("padding", "8px"), ("border", "1px solid #e5e7eb")]},
        {"selector": "table", "props": [("border-collapse", "collapse"), ("font-family", "Arial"), ("font-size", "12px")]},
    ])
)

out_path2.write_text(styled.to_html(), encoding="utf-8")

print("Hybrid HTML erzeugt:", out_path2.resolve())
display(hybrid_view)


Hybrid HTML erzeugt: /Users/sheyla/Desktop/rookie_invest_ML/demo/output/top_candidates_with_context.html


Unnamed: 0,driver_name,driver_code,series,year,team_name,financial_viability,team_political_power,f1_marketing_boost,phys_neck_strength,f3_pathway_score,f1_qualified
0,R Shwartzman,SHW,F3,2019,Prema Racing,0.0,0.0,0.0,1.0,0.5,0.0
1,J Hughes,HUG,F3,2019,HWA RACELAB,0.0,0.0,0.0,1.0,0.5,0.0
2,J Daruvala,DAR,F3,2019,Prema Racing,0.0,0.0,0.0,1.0,0.5,0.0
3,L Fornaroli,FOR,F2,2019,Invicta Racing,0.0,0.0,0.0,1.0,0.5,0.0
4,A Dunne,DUN,F2,2019,Rodin Motorsport,0.0,0.0,0.0,1.0,0.5,0.0
5,L Browning,BRO,F2,2019,Hitech TGR,0.0,0.0,0.0,1.0,0.5,0.0
6,M Armstrong,ARM,F3,2019,Prema Racing,0.0,0.0,0.0,1.0,0.5,0.0
7,Y Tsunoda,TSU,F3,2019,Jenzer Motorsport,0.0,0.0,0.0,1.0,0.5,0.0
8,R Verschoor,VER,F2,2019,MP Motorsport,0.0,0.0,0.0,1.0,0.5,0.0
9,J Crawford,CRA,F2,2019,DAMS Lucas Oil,0.0,0.0,0.0,1.0,0.5,0.0
