# Notebook 02: Labeling & Train/Test Split

Erzeugt das Label `f1_entry` und schreibt die Model-Input-Dateien.


In [1]:
from pathlib import Path
import pandas as pd
from IPython.display import display

# Projekt-Root (eine Ebene über notebooks/)
PROJECT_ROOT = Path.cwd().parent

PATH_ALL = PROJECT_ROOT / "data/all_series/processed/all_series_master_features_core.csv"
OUT_DIR = PROJECT_ROOT / "data/model_input"
OUT_PATH = OUT_DIR / "f2_f3_features_with_f1_label.csv"

print("Using project root:", PROJECT_ROOT)
print("CSV exists:", PATH_ALL.exists())


Using project root: /Users/sheyla/Desktop/rookie_invest_ML
CSV exists: True


In [2]:
# Daten laden und Grundcheck

df = pd.read_csv(PATH_ALL)

print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
display(df.head(5))


Shape: (3713, 26)
Columns: ['series', 'year', 'driver_name', 'driver_code', 'team_name', 'n_races', 'total_points', 'avg_points', 'avg_finish', 'best_finish', 'worst_finish', 'wins', 'win_rate', 'podiums', 'podium_rate', 'points_finishes', 'points_rate', 'top10_finishes', 'top10_rate', 'total_laps', 'avg_kph', 'finish_std', 'points_std', 'dnf_count', 'dnf_rate', 'avg_best_lap_s']


Unnamed: 0,series,year,driver_name,driver_code,team_name,n_races,total_points,avg_points,avg_finish,best_finish,...,points_rate,top10_finishes,top10_rate,total_laps,avg_kph,finish_std,points_std,dnf_count,dnf_rate,avg_best_lap_s
0,F1,1950,Alberto Ascari,\N,Ferrari,4,11.0,2.2,8.6,2,...,0.75,3,0.75,238.0,,7.765307,2.48998,4.0,1.0,
1,F1,1950,Alfredo Pián,\N,Maserati,1,0.0,0.0,21.0,21,...,0.0,0,0.0,0.0,,,,1.0,1.0,
2,F1,1950,Bayliss Levrett,\N,Adams,1,0.0,0.0,27.0,27,...,0.0,0,0.0,108.0,,,,1.0,1.0,
3,F1,1950,Bill Cantrell,\N,Adams,1,0.0,0.0,27.0,27,...,0.0,0,0.0,108.0,,,,1.0,1.0,
4,F1,1950,Bill Holland,\N,Deidt,1,6.0,6.0,2.0,2,...,1.0,1,1.0,137.0,,,,1.0,1.0,


In [3]:
# Serien und Jahr Abdeckung
print(df["series"].value_counts())
print("Year range:", df["year"].min(), "-", df["year"].max())


series
F1    3211
F3     255
F2     247
Name: count, dtype: int64
Year range: 1950 - 2025


In [4]:
# Duplikate global und in F2/F3
KEY = ["series", "year", "driver_code"]

print("Global duplicates by (series, year, driver_code):", df.duplicated(KEY).sum())

non_f1 = df[df["series"].isin(["F2","F3"])].copy()

print("F2+F3 shape:", non_f1.shape)
print("driver_code equals \\N in F2+F3:", (non_f1["driver_code"] == r"\N").sum())
print("Duplicates F2/F3 by (series, year, driver_code):", non_f1.duplicated(KEY).sum())
print("Duplicates F2/F3 by (series, year, driver_name):", non_f1.duplicated(["series","year","driver_name"]).sum())


Global duplicates by (series, year, driver_code): 2558
F2+F3 shape: (502, 26)
driver_code equals \N in F2+F3: 0
Duplicates F2/F3 by (series, year, driver_code): 10
Duplicates F2/F3 by (series, year, driver_name): 5


In [5]:
# First F1 year erzeugen
f1 = df[df["series"] == "F1"].copy()
first_f1_year = f1.groupby("driver_code")["year"].min()

print("Drivers with F1 entry:", first_f1_year.shape[0])
display(first_f1_year.head(10))


Drivers with F1 entry: 98


driver_code
AIT    2020
ALB    2005
ALG    2009
ALO    2001
BAD    1993
BAR    1993
BEA    2024
BIA    1959
BOT    2013
BOU    2008
Name: year, dtype: int64

In [6]:
# Label bauen
tmp = non_f1.copy()
tmp["first_f1_year"] = tmp["driver_code"].map(first_f1_year)
tmp["f1_entry"] = tmp["first_f1_year"].notna() & (tmp["first_f1_year"] > tmp["year"])

print(tmp["f1_entry"].value_counts())
print("Positive share:", tmp["f1_entry"].mean())

same_year_conflicts = ((tmp["first_f1_year"] == tmp["year"]) & tmp["first_f1_year"].notna()).sum()
print("Same-year conflicts:", same_year_conflicts)


f1_entry
False    470
True      32
Name: count, dtype: int64
Positive share: 0.06374501992031872
Same-year conflicts: 4


In [7]:
# Deduplizieren, Label neu, same year raus
clean = non_f1.copy()

clean = (
    clean.sort_values(["series","year","driver_code","n_races","total_points"], ascending=[True,True,True,False,False])
         .drop_duplicates(["series","year","driver_code"], keep="first")
         .copy()
)

clean["first_f1_year"] = clean["driver_code"].map(first_f1_year)
clean["f1_entry"] = clean["first_f1_year"].notna() & (clean["first_f1_year"] > clean["year"])

# remove same-year conflicts
clean = clean[~((clean["first_f1_year"] == clean["year"]) & clean["first_f1_year"].notna())].copy()

print("After dedupe shape:", clean.shape)
print(clean["f1_entry"].value_counts())
print("Positive share:", clean["f1_entry"].mean())


After dedupe shape: (488, 28)
f1_entry
False    456
True      32
Name: count, dtype: int64
Positive share: 0.06557377049180328


In [8]:
# Feature Set definieren
DROP_COLS = ["driver_name", "driver_code", "team_name", "series"]

feature_cols = [c for c in clean.columns if c not in DROP_COLS + ["f1_entry", "first_f1_year"]]

print("Number of features:", len(feature_cols))
print(feature_cols)


Number of features: 22
['year', 'n_races', 'total_points', 'avg_points', 'avg_finish', 'best_finish', 'worst_finish', 'wins', 'win_rate', 'podiums', 'podium_rate', 'points_finishes', 'points_rate', 'top10_finishes', 'top10_rate', 'total_laps', 'avg_kph', 'finish_std', 'points_std', 'dnf_count', 'dnf_rate', 'avg_best_lap_s']


In [9]:
# Speichern
OUT_DIR.mkdir(parents=True, exist_ok=True)

clean.to_csv(OUT_PATH, index=False)

print("Saved:", OUT_PATH)
print("Saved shape:", clean.shape)


Saved: /Users/sheyla/Desktop/rookie_invest_ML/data/model_input/f2_f3_features_with_f1_label.csv
Saved shape: (488, 28)


## Train/Test Split (zeitbasiert)

In [10]:
from pathlib import Path
import pandas as pd

def find_project_root(start: Path) -> Path:
    for p in [start] + list(start.parents):
        if (p / "data").exists() and (p / "src").exists():
            return p
    return start

PROJECT_ROOT = find_project_root(Path.cwd())
DATA_PATH = PROJECT_ROOT / "data/model_input/f2_f3_features_with_f1_label.csv"

print("Project root:", PROJECT_ROOT)
print("Data path:", DATA_PATH)
print("Exists:", DATA_PATH.exists())


Project root: /Users/sheyla/Desktop/rookie_invest_ML
Data path: /Users/sheyla/Desktop/rookie_invest_ML/data/model_input/f2_f3_features_with_f1_label.csv
Exists: True


In [11]:
# Daten laden und Basis prüfen
df = pd.read_csv(DATA_PATH)

print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print(df["series"].value_counts() if "series" in df.columns else "No series column")
print(df["f1_entry"].value_counts())
df.head(5)


Shape: (488, 28)
Columns: ['series', 'year', 'driver_name', 'driver_code', 'team_name', 'n_races', 'total_points', 'avg_points', 'avg_finish', 'best_finish', 'worst_finish', 'wins', 'win_rate', 'podiums', 'podium_rate', 'points_finishes', 'points_rate', 'top10_finishes', 'top10_rate', 'total_laps', 'avg_kph', 'finish_std', 'points_std', 'dnf_count', 'dnf_rate', 'avg_best_lap_s', 'first_f1_year', 'f1_entry']
series
F2    244
F3    244
Name: count, dtype: int64
f1_entry
False    456
True      32
Name: count, dtype: int64


Unnamed: 0,series,year,driver_name,driver_code,team_name,n_races,total_points,avg_points,avg_finish,best_finish,...,top10_rate,total_laps,avg_kph,finish_std,points_std,dnf_count,dnf_rate,avg_best_lap_s,first_f1_year,f1_entry
0,F2,2017,A Albon,ALB,ART Grand Prix,10,67.0,6.7,7.7,2,...,0.9,245.0,177.766,4.498148,6.429965,0.0,0.0,97.6208,2005.0,False
1,F2,2017,R Binder,BIN,Rapax,1,0.0,0.0,17.0,17,...,0.0,28.0,169.58,,,0.0,0.0,89.032,,False
2,F2,2017,R Boschung,BOS,Campos Racing,10,2.0,0.2,16.2,9,...,0.1,195.0,174.4635,3.457681,0.632456,5.0,0.5,98.3791,,False
3,F2,2017,S Canamasas,CAN,Trident,7,14.0,2.0,12.428571,4,...,0.285714,170.0,172.147143,5.223573,4.472136,1.0,0.142857,96.403857,,False
4,F2,2017,J Cecotto,CEC,Rapax,4,21.0,5.25,9.0,2,...,0.75,100.0,166.8905,5.354126,8.539126,0.0,0.0,100.32425,,False


In [12]:
# Jahresverteilung und Positives pro Jahr
print("Year range:", df["year"].min(), "-", df["year"].max())

year_counts = df["year"].value_counts().sort_index()
year_pos = df.groupby("year")["f1_entry"].sum().sort_index()

summary = pd.DataFrame({
    "n_rows": year_counts,
    "n_positive": year_pos,
})
summary["positive_share"] = summary["n_positive"] / summary["n_rows"]

summary


Year range: 2017 - 2025


Unnamed: 0_level_0,n_rows,n_positive,positive_share
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017,28,6,0.214286
2018,24,5,0.208333
2019,60,3,0.05
2020,59,6,0.101695
2021,63,5,0.079365
2022,67,4,0.059701
2023,59,3,0.050847
2024,64,0,0.0
2025,64,0,0.0


In [13]:
CUTOFF = 2021

train_df = df[df["year"] <= CUTOFF].copy()
test_df = df[(df["year"] > CUTOFF) & (df["year"] <= 2023)].copy()

print("Train:", train_df.shape, "Positives:", int(train_df["f1_entry"].sum()))
print("Test :", test_df.shape,  "Positives:", int(test_df["f1_entry"].sum()))

print("Train years:", train_df["year"].min(), "-", train_df["year"].max())
print("Test years :", test_df["year"].min(), "-", test_df["year"].max())


Train: (234, 28) Positives: 25
Test : (126, 28) Positives: 7
Train years: 2017 - 2021
Test years : 2022 - 2023


In [14]:
# Split speichern
OUT_DIR = PROJECT_ROOT / "data/model_input/splits"
OUT_DIR.mkdir(parents=True, exist_ok=True)

train_path = OUT_DIR / f"train_upto_{CUTOFF}.csv"
test_path = OUT_DIR / f"test_after_{CUTOFF}.csv"

train_df.to_csv(train_path, index=False)
test_df.to_csv(test_path, index=False)

print("Saved train:", train_path)
print("Saved test :", test_path)


Saved train: /Users/sheyla/Desktop/rookie_invest_ML/data/model_input/splits/train_upto_2021.csv
Saved test : /Users/sheyla/Desktop/rookie_invest_ML/data/model_input/splits/test_after_2021.csv
