In [33]:
# Feature Engineering: this notebook creates derived features from cleaned Formula 1 race data to support analysis of qualifying importance, driver consistency, and constructor dominance across eras.

In [34]:
import pandas as pd
import numpy as np

# Load the cleaned race results table that every later cell works from.
race_data = pd.read_csv(filepath_or_buffer="../data/processed/race_data_clean.csv")

In [35]:
# Replace the merge-suffixed column names ('name_x' / 'name_y') with
# descriptive ones. The rename is applied to race_data in place.
column_aliases = {
    'name_x': 'race_name',
    'name_y': 'constructor_name',
}
race_data.rename(columns=column_aliases, inplace=True)

In [36]:
# Preview the first five rows to confirm the renamed columns are present.
race_data.head(n=5)

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,...,fastestLapSpeed,statusId,year,race_name,circuitId,forename,surname,nationality,constructor_name,grid_position
0,1,18,1,1,22,1,1,1,1,10.0,...,218.3,1,2008,Australian Grand Prix,1,Lewis,Hamilton,British,McLaren,1.0
1,2,18,2,2,3,5,2,2,2,8.0,...,217.586,1,2008,Australian Grand Prix,1,Nick,Heidfeld,German,BMW Sauber,5.0
2,3,18,3,3,7,7,3,3,3,6.0,...,216.719,1,2008,Australian Grand Prix,1,Nico,Rosberg,German,Williams,7.0
3,4,18,4,4,5,11,4,4,4,5.0,...,215.464,1,2008,Australian Grand Prix,1,Fernando,Alonso,Spanish,Renault,12.0
4,5,18,5,1,23,3,5,5,5,4.0,...,218.385,1,2008,Australian Grand Prix,1,Heikki,Kovalainen,Finnish,McLaren,3.0


In [37]:
# Positions gained (positive) or lost (negative) between the starting grid
# slot and the finishing position: grid_position minus position.
starting_slot = race_data['grid_position']
finishing_slot = race_data['position']
race_data['position_change'] = starting_slot.sub(finishing_slot)

In [38]:
# Spot-check the new column next to the two columns it is derived from.
race_data.loc[:, ['grid_position', 'position', 'position_change']].head(n=5)

Unnamed: 0,grid_position,position,position_change
0,1.0,1,0.0
1,5.0,2,3.0
2,7.0,3,4.0
3,12.0,4,8.0
4,3.0,5,-2.0


In [39]:
def era_label(year):
    """Return the era name for a season year.

    The year is compared against ascending, exclusive upper bounds and the
    label of the first bound it falls below is returned:

        year < 1970  -> "Early Era"
        year < 1990  -> "Turbo / Transition Era"
        year < 2010  -> "Refuelling Era"
        year < 2022  -> "Hybrid Era"

    Any value for which none of these comparisons is true (2022 and later,
    or a value such as NaN that compares false against every bound) gets
    "Ground Effect Era".
    """
    # (exclusive upper bound, label) pairs; order matters because the first
    # matching bound wins.
    era_upper_bounds = (
        (1970, "Early Era"),
        (1990, "Turbo / Transition Era"),
        (2010, "Refuelling Era"),
        (2022, "Hybrid Era"),
    )
    for upper_bound, label in era_upper_bounds:
        if year < upper_bound:
            return label
    return "Ground Effect Era"

# Label every result row with the era its season year belongs to.
race_data['era'] = race_data['year'].map(era_label)

In [40]:
# One row per driver: mean finishing position, its standard deviation, and
# the number of non-null 'position' values recorded for that driver.
driver_keys = ['driverId', 'forename', 'surname']
results_by_driver = race_data.groupby(driver_keys)

driver_consistency = results_by_driver.agg(
    avg_finish=pd.NamedAgg(column='position', aggfunc='mean'),
    finish_std=pd.NamedAgg(column='position', aggfunc='std'),
    races=pd.NamedAgg(column='position', aggfunc='count'),
).reset_index()

In [41]:
# Preview the per-driver aggregates.
driver_consistency.head(n=5)

Unnamed: 0,driverId,forename,surname,avg_finish,finish_std,races
0,1,Lewis,Hamilton,3.977143,3.593937,350
1,2,Nick,Heidfeld,8.355072,3.77429,138
2,3,Nico,Rosberg,6.355932,4.333128,177
3,4,Fernando,Alonso,6.826816,4.569802,358
4,5,Heikki,Kovalainen,11.835165,5.093054,91


In [42]:
# Minimum number of counted results a driver needs to be kept.
MIN_RACES = 30

# Boolean mask of drivers meeting the threshold; .copy() gives an independent
# frame so later column assignments do not touch driver_consistency.
meets_race_threshold = driver_consistency['races'].ge(MIN_RACES)
driver_consistency_filtered = driver_consistency.loc[meets_race_threshold].copy()

In [43]:
# Preview the drivers that passed the MIN_RACES filter.
driver_consistency_filtered.head(n=5)

Unnamed: 0,driverId,forename,surname,avg_finish,finish_std,races
0,1,Lewis,Hamilton,3.977143,3.593937,350
1,2,Nick,Heidfeld,8.355072,3.77429,138
2,3,Nico,Rosberg,6.355932,4.333128,177
3,4,Fernando,Alonso,6.826816,4.569802,358
4,5,Heikki,Kovalainen,11.835165,5.093054,91


In [44]:
# Flag race-winning result rows (position == 1) as a 0/1 column on race_data.
race_data['is_win'] = (race_data['position'] == 1).astype(int)

# Collapse to one row per (season, constructor, race) before counting wins.
# Summing is_win over raw result rows counts a race twice when two entries of
# the same constructor are both recorded with position 1, while 'races' counts
# each race once -- that mismatch can push win_rate above 1.
# NOTE(review): such rows can occur with shared drives in historical results
# data -- confirm against the source data.
constructor_race_results = (
    race_data
    .groupby(['year', 'constructor_name', 'raceId'], as_index=False)['is_win']
    .max()  # 1 if the constructor won this race with any entry, else 0
)

# Per season and constructor: number of distinct races won and entered.
constructor_dominance = (
    constructor_race_results
    .groupby(['year', 'constructor_name'])
    .agg(
        wins=('is_win', 'sum'),
        races=('raceId', 'nunique')
    )
    .reset_index()
)

# Share of entered races that the constructor won in that season.
constructor_dominance['win_rate'] = (
    constructor_dominance['wins'] / constructor_dominance['races']
)

In [45]:
# Preview the per-season constructor win rates.
constructor_dominance.head(n=5)

Unnamed: 0,year,constructor_name,wins,races,win_rate
0,1950,Alfa Romeo,6,6,1.0
1,1950,Alta,0,1,0.0
2,1950,Deidt,0,1,0.0
3,1950,ERA,0,2,0.0
4,1950,Ewing,0,1,0.0


In [46]:
# Row counts of the three working tables, one per line, in this order:
# race results, per-driver aggregates, per-season constructor aggregates.
for table in (race_data, driver_consistency, constructor_dominance):
    print(len(table))

16285
662
972


In [47]:
# Consistency score is the reciprocal of the finishing-position standard
# deviation, so a smaller spread gives a larger score.
finish_spread = driver_consistency_filtered['finish_std']
driver_consistency_filtered['consistency_score'] = finish_spread.rdiv(1)

In [48]:
# Write the filtered per-driver table (without the index column) to disk.
driver_consistency_path = "../data/processed/driver_consistency_final.csv"
driver_consistency_filtered.to_csv(driver_consistency_path, index=False)

print("driver_consistency_final.csv saved")

driver_consistency_final.csv saved


In [49]:
# Flag race-winning result rows (position == 1) as a 0/1 column on race_data.
race_data['is_win'] = (race_data['position'] == 1).astype(int)

# Collapse to one row per (season, constructor, race) before counting wins.
# Summing is_win over raw result rows counts a race twice when two entries of
# the same constructor are both recorded with position 1, while 'races' counts
# each race once -- that mismatch can push win_rate above 1.
# NOTE(review): such rows can occur with shared drives in historical results
# data -- confirm against the source data.
constructor_race_results = (
    race_data
    .groupby(['year', 'constructor_name', 'raceId'], as_index=False)['is_win']
    .max()  # 1 if the constructor won this race with any entry, else 0
)

# Per season and constructor: number of distinct races won and entered.
constructor_dominance = (
    constructor_race_results
    .groupby(['year', 'constructor_name'])
    .agg(
        wins=('is_win', 'sum'),
        races=('raceId', 'nunique')
    )
    .reset_index()
)

# Share of entered races that the constructor won in that season.
constructor_dominance['win_rate'] = (
    constructor_dominance['wins'] / constructor_dominance['races']
)

In [50]:
# Preview the per-season constructor win rates before they are saved.
constructor_dominance.head(n=5)

Unnamed: 0,year,constructor_name,wins,races,win_rate
0,1950,Alfa Romeo,6,6,1.0
1,1950,Alta,0,1,0.0
2,1950,Deidt,0,1,0.0
3,1950,ERA,0,2,0.0
4,1950,Ewing,0,1,0.0


In [51]:
# Write the per-season constructor table (without the index column) to disk.
constructor_dominance_path = "../data/processed/constructor_dominance.csv"
constructor_dominance.to_csv(constructor_dominance_path, index=False)

print("constructor_dominance.csv saved")

constructor_dominance.csv saved


In [52]:
import os

# List the processed-data directory to confirm the exports were written.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# displayed listing stable across runs and platforms.
sorted(os.listdir("../data/processed/"))

['.ipynb_checkpoints',
 'constructor_dominance.csv',
 'driver_consistency_final.csv',
 'race_data_clean.csv']