In [598]:
import numpy as np
import pandas as pd

In [599]:
# === Load Data ===
# Every CSV sits under the same relative data directory.
path = "../data/"

# Read the six source tables in one pass; the order of the file names below
# matches the order of the variables they are unpacked into.
races, results, drivers, constructors, circuits, status = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        "races.csv",
        "results.csv",
        "drivers.csv",
        "constructors.csv",
        "circuits.csv",
        "status.csv",
    )
)

**Combining Data Stage**  
The main dataset is `results`. Other relevant data will be merged into the `results` dataset.

In [601]:
# Merge drivers into results

# Keep only the relevant columns and combine forename and surname into a
# single driver_name column (the two source columns are then dropped).
drivers = (
    drivers[["driverId", "forename", "surname", "dob"]]
    .assign(driver_name=lambda d: d["forename"] + " " + d["surname"])
    .drop(columns=["forename", "surname"])
)

# Left-merge so every result row is kept. validate="many_to_one" makes pandas
# raise a MergeError if a driverId appears more than once in `drivers`,
# instead of silently duplicating result rows.
results = (
    results
    .merge(drivers, on="driverId", how="left", validate="many_to_one")
    # driverId is replaced by driver_name
    .drop(columns=["driverId"])
    # Rename 'time' to 'finish_time' to avoid confusion with the 'time'
    # column that the races table brings in when it is merged later.
    .rename(columns={"time": "finish_time"})
)

In [602]:
# Merge races into results

# Keep only the relevant columns
races = races[["raceId", "year", "round", "circuitId", "name", "date", "time"]].copy()

# Left-merge so every result row is kept. validate="many_to_one" makes pandas
# raise a MergeError if a raceId appears more than once in `races`, instead of
# silently duplicating result rows.
results = results.merge(races, on="raceId", how="left", validate="many_to_one")

# raceId is no longer needed once the race columns are attached
results = results.drop(columns=["raceId"])

# Compute the driver's age at the race (in years) from the race date and the
# date of birth, then drop dob.
results["date"] = pd.to_datetime(results["date"])
results["dob"] = pd.to_datetime(results["dob"])
results["age_at_race"] = (results["date"] - results["dob"]).dt.days / 365.25
results = results.drop(columns=["dob"])

# Give the generic races columns unambiguous names:
# 'name' -> 'race_name' and 'time' -> 'race_start_time'.
results = results.rename(columns={"name": "race_name", "time": "race_start_time"})

In [603]:
# Merge constructors into results

# Keep only the relevant columns
constructors = constructors[["constructorId", "name"]].copy()

# Left-merge so every result row is kept. validate="many_to_one" makes pandas
# raise a MergeError if a constructorId appears more than once in
# `constructors`, instead of silently duplicating result rows.
results = (
    results
    .merge(constructors, on="constructorId", how="left", validate="many_to_one")
    # constructorId is replaced by the constructor's name
    .drop(columns=["constructorId"])
    # Rename 'name' to 'constructor_name' to avoid confusion
    .rename(columns={"name": "constructor_name"})
)

In [604]:
# Merge circuits into results

# Keep only the relevant columns
circuits = circuits[["circuitId", "name"]].copy()

# Left-merge so every result row is kept. validate="many_to_one" makes pandas
# raise a MergeError if a circuitId appears more than once in `circuits`,
# instead of silently duplicating result rows.
results = (
    results
    .merge(circuits, on="circuitId", how="left", validate="many_to_one")
    # circuitId is replaced by the circuit's name
    .drop(columns=["circuitId"])
    # Rename 'name' to 'circuit' to avoid confusion
    .rename(columns={"name": "circuit"})
)

In [605]:
# Merge status into results

# Left-merge so every result row is kept. validate="many_to_one" makes pandas
# raise a MergeError if a statusId appears more than once in `status`,
# instead of silently duplicating result rows.
results = (
    results
    .merge(status, on="statusId", how="left", validate="many_to_one")
    # statusId is no longer needed once the status columns are attached
    .drop(columns=["statusId"])
)

# Last expression of the cell, so the notebook displays the merged column set
results.columns

Index(['resultId', 'number', 'grid', 'position', 'positionText',
       'positionOrder', 'points', 'laps', 'finish_time', 'milliseconds',
       'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed',
       'driver_name', 'year', 'round', 'race_name', 'date', 'race_start_time',
       'age_at_race', 'constructor_name', 'circuit', 'status'],
      dtype='object')

In [606]:
# Drop the columns that are not used downstream, then put the remaining
# columns into a more readable order.
results = (
    results
    .drop(
        columns=[
            "number", "position", "positionText", "finish_time", "rank",
            "fastestLapSpeed", "points", "laps", "milliseconds",
            "fastestLap", "fastestLapTime", "resultId",
        ]
    )
    .loc[
        :,
        [
            "year", "round", "race_name", "circuit", "driver_name",
            "constructor_name", "grid", "positionOrder",
            "date", "race_start_time", "age_at_race", "status",
        ],
    ]
)



**Clean data for machine learning**  
Ensure none of the columns are of type object 

In [608]:
# Keep only the rows that have a positionOrder value; work on an independent copy
has_position_order = results["positionOrder"].notna()
df = results.loc[has_position_order].copy()

In [619]:
# Ensure `date` is datetime64; values that cannot be parsed become NaT
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Derive numeric calendar features from the race date
race_date = df["date"].dt
df["race_month"] = race_date.month
df["race_dayofweek"] = race_date.dayofweek  # Monday=0, Sunday=6
# Turn a race_start_time value (HH:MM:SS) into its hour component
def extract_hour(t):
    """Return the hour of an ``HH:MM:SS`` time value.

    Values that do not match the format are coerced to NaT by pandas, whose
    ``hour`` attribute is NaN. If parsing or the attribute access raises,
    ``None`` is returned instead.
    """
    try:
        parsed = pd.to_datetime(t, format="%H:%M:%S", errors="coerce")
        hour = parsed.hour
    except Exception:
        hour = None
    return hour

# Compute the start hour row by row, attach it as a new column, and drop the
# raw date/time columns that the numeric features replace.
start_hours = df["race_start_time"].apply(extract_hour)
df = df.assign(race_start_hour=start_hours).drop(columns=["date", "race_start_time"])



**Split Data**
Seasons 1950–2020 will be used to train and seasons 2021–2025 will be used to test.

In [622]:
# Chronological hold-out split: seasons up to and including LAST_TRAIN_YEAR
# train the model, later seasons are held out for testing.
LAST_TRAIN_YEAR = 2020

train_df = df[df["year"] <= LAST_TRAIN_YEAR].copy()
test_df = df[df["year"] > LAST_TRAIN_YEAR].copy()

# Report the size and season range of each split
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Train seasons:", train_df["year"].min(), "-", train_df["year"].max())
print("Test seasons:", test_df["year"].min(), "-", test_df["year"].max())


Train shape: (24960, 13)
Test shape: (2158, 13)
Train seasons: 1950 - 2020
Test seasons: 2021 - 2025


**Encode categorical columns as numeric values**

In [625]:
# NOTE(review): `status` looks like a post-race outcome; if it is, using it as
# a feature leaks information about the target — confirm before relying on
# the scores.
categorical_cols = ["race_name", "circuit", "driver_name", "constructor_name", "status"]

from sklearn.preprocessing import LabelEncoder

train_df_encoded = train_df.copy()
test_df_encoded = test_df.copy()
encoders = {}  # column name -> fitted LabelEncoder (including the "Unknown" class)

for col in categorical_cols:
    le = LabelEncoder()

    # Fit on the training labels only, so the test set does not influence the encoding
    train_df_encoded[col] = le.fit_transform(train_df_encoded[col].astype(str))

    # Labels seen during training; a set gives constant-time membership tests
    known_labels = set(le.classes_)

    # Cast the test labels to str first so they are compared in the same
    # representation the encoder was fitted on, then replace every label the
    # encoder has not seen with the "Unknown" placeholder.
    test_labels = test_df_encoded[col].astype(str)
    test_labels = test_labels.where(test_labels.isin(known_labels), "Unknown")

    # Register "Unknown" as an extra class so transform() accepts it; skip
    # this if the training data already contained that label, to avoid a
    # duplicate class entry.
    if "Unknown" not in known_labels:
        le.classes_ = np.append(le.classes_, "Unknown")

    test_df_encoded[col] = le.transform(test_labels)
    encoders[col] = le

**Create Model**

In [632]:
#set X and y values from training data 
X_train = train_df_encoded.drop("positionOrder", axis=1)
y_train = train_df_encoded["positionOrder"]

#set X and y values from testing data 
X_test = test_df_encoded.drop("positionOrder", axis=1)
y_test = test_df_encoded["positionOrder"]

from sklearn.ensemble import RandomForestRegressor

#create model
model = RandomForestRegressor(
    n_estimators=100,         # number of trees in the forest
    max_depth=None,           # how deep each tree can go (None = unlimited until leaf size)
    min_samples_leaf=1,       # minimum number of samples required to be at a leaf node
    random_state=42,          # seed for reproducibility
    n_jobs=-1                 # use all CPU cores
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

from sklearn.metrics import mean_absolute_error, r2_score

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Absolute Error:", mae)
print("R^2 Score:", r2)


Mean Absolute Error: 2.2251668211306765
R^2 Score: 0.7488740274640603
