In [1]:
import sqlite3
import pandas as pd
import nbimporter

In [2]:
from final_scores import calculate_final_driver_places 
from lap_times_functions import get_lap_times_stats
from lap_times_functions import get_median_lap_position
from lap_times_functions import get_total_laps_count
from pit_stop_functions import get_pit_stop_stats_in_miliseconds
from pit_stop_functions import get_most_common_pit_stop_lap
from constructor_table import get_drivers_with_constructor_relationship
from constructor_table import get_constructor_points
from drivers_functions import get_age_at_first_race
from drivers_functions import get_age_at_each_race
from circuits_functions import get_best_performing_circuits
from results_functions import get_most_common_positions

In [3]:
conn = sqlite3.connect('formula1.db')

# Data Agreggation : races -> driver per year 

In [4]:
drivers_final_positions = calculate_final_driver_places(conn)

In [5]:
driver_starting_year = get_age_at_first_race(conn)
drivers_final_positions = pd.merge(drivers_final_positions, driver_starting_year, on=['driverId'], how='left')

In [6]:
driver_years = get_age_at_each_race(conn)
drivers_final_positions = pd.merge(drivers_final_positions, driver_years, on=['driverId','year'], how='left')

In [7]:
drivers_final_positions

Unnamed: 0,year,driverId,driver_nationality,driver_points,driver_end_position,age_at_first_race,age_at_race
0,1950,642,Italian,30.0,1,44,44
1,1950,642,Italian,30.0,1,44,44
2,1950,642,Italian,30.0,1,44,44
3,1950,642,Italian,30.0,1,44,44
4,1950,642,Italian,30.0,1,44,44
...,...,...,...,...,...,...,...
25783,2022,849,Canadian,2.0,20,25,27
25784,2022,849,Canadian,2.0,20,25,27
25785,2022,856,Dutch,2.0,20,27,27
25786,2022,807,German,0.0,22,23,35


In [8]:
lap_times_stats = get_lap_times_stats(conn)

In [9]:
lap_positions = get_median_lap_position(conn)
lap_times_stats = pd.merge(lap_times_stats, lap_positions, on=['driverId', 'year'], how='left')

In [10]:
lap_count = get_total_laps_count(conn)
lap_times_stats = pd.merge(lap_times_stats, lap_count, on=['driverId', 'year'], how='left')

In [11]:
lap_times_stats

Unnamed: 0,year,driverId,min_lap_time_diff,avg_lap_time_diff,max_lap_time_diff,median_lap_position,total_laps
0,2007,1,0.000000,7.852700,1465.791089,2,1037
1,2008,1,0.000000,7.666610,101.287948,3,1065
2,2009,1,0.074346,6.171578,79.374498,7,904
3,2010,1,0.000000,10.709896,2567.450593,4,1001
4,2011,1,0.000000,8.270089,1600.104940,4,1013
...,...,...,...,...,...,...,...
642,2021,853,2.817997,21.635194,2443.049302,18,1008
643,2021,854,2.234354,18.256438,2428.528386,18,1135
644,2022,854,0.704944,14.242761,3432.659375,14,1142
645,2022,855,0.000000,12.770114,1625.453523,14,1060


In [12]:
pit_stop_stats = get_pit_stop_stats_in_miliseconds(conn)

In [13]:
most_common_pit_stop_lap = get_most_common_pit_stop_lap(conn)
pit_stop_stats = pd.merge(pit_stop_stats, most_common_pit_stop_lap, on=['driverId', 'year'], how='left')

In [14]:
pit_stop_stats

Unnamed: 0,driverId,year,min_pit_stop_time,avg_pit_time,max_pit_stop_time,most_common_pitstop_lap
0,1,2011,13173,22666.446429,35688,16
1,1,2012,17598,22671.861111,31081,36
2,1,2013,17385,22399.111111,30085,31
3,1,2014,19710,53061.974359,1137295,26
4,1,2015,16579,23666.000000,30216,13
...,...,...,...,...,...,...
270,853,2021,15054,211286.047619,2076977,31
271,854,2021,15058,201851.200000,2075728,34
272,854,2022,14144,111302.675000,3065174,14
273,855,2022,14128,67903.444444,1174235,9


In [15]:
constructor_driver_connection = get_drivers_with_constructor_relationship(conn)

In [16]:
drivers_with_constructors = pd.merge(drivers_final_positions, constructor_driver_connection, on=['driverId', 'year'], how='left')

In [17]:
connstructor_points = get_constructor_points(conn)

In [18]:
drivers_with_constructors = pd.merge(drivers_with_constructors, connstructor_points, on=['constructorId', 'year'], how='left')

In [19]:
best_performing_circuits = get_best_performing_circuits(conn)
drivers_with_constructors = pd.merge(drivers_with_constructors, best_performing_circuits, on=['year','driverId' ], how='left')

In [20]:
drivers_with_constructors

Unnamed: 0,year,driverId,driver_nationality,driver_points,driver_end_position,age_at_first_race,age_at_race,constructorId,constructorName,constructor_nationality,constructor_points,constructor_place,best_performing_circuit_name
0,1950,642,Italian,30.0,1,44,44,51,Alfa Romeo,Swiss,,,Silverstone Circuit
1,1950,642,Italian,30.0,1,44,44,51,Alfa Romeo,Swiss,,,Silverstone Circuit
2,1950,642,Italian,30.0,1,44,44,51,Alfa Romeo,Swiss,,,Silverstone Circuit
3,1950,642,Italian,30.0,1,44,44,51,Alfa Romeo,Swiss,,,Silverstone Circuit
4,1950,642,Italian,30.0,1,44,44,51,Alfa Romeo,Swiss,,,Silverstone Circuit
...,...,...,...,...,...,...,...,...,...,...,...,...,...
28344,2022,849,Canadian,2.0,20,25,27,3,Williams,British,8.0,10.0,Suzuka Circuit
28345,2022,849,Canadian,2.0,20,25,27,3,Williams,British,8.0,10.0,Suzuka Circuit
28346,2022,856,Dutch,2.0,20,27,27,3,Williams,British,8.0,10.0,Autodromo Nazionale di Monza
28347,2022,807,German,0.0,22,23,35,117,Aston Martin,British,55.0,6.0,Bahrain International Circuit


In [21]:
lap_pit_df = pd.merge(lap_times_stats, pit_stop_stats, on=['driverId', 'year'], how='left', suffixes=('_lap', '_pit'))

In [22]:
lap_pit_df

Unnamed: 0,year,driverId,min_lap_time_diff,avg_lap_time_diff,max_lap_time_diff,median_lap_position,total_laps,min_pit_stop_time,avg_pit_time,max_pit_stop_time,most_common_pitstop_lap
0,2007,1,0.000000,7.852700,1465.791089,2,1037,,,,
1,2008,1,0.000000,7.666610,101.287948,3,1065,,,,
2,2009,1,0.074346,6.171578,79.374498,7,904,,,,
3,2010,1,0.000000,10.709896,2567.450593,4,1001,,,,
4,2011,1,0.000000,8.270089,1600.104940,4,1013,13173.0,22666.446429,35688.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...
642,2021,853,2.817997,21.635194,2443.049302,18,1008,15054.0,211286.047619,2076977.0,31.0
643,2021,854,2.234354,18.256438,2428.528386,18,1135,15058.0,201851.200000,2075728.0,34.0
644,2022,854,0.704944,14.242761,3432.659375,14,1142,14144.0,111302.675000,3065174.0,14.0
645,2022,855,0.000000,12.770114,1625.453523,14,1060,14128.0,67903.444444,1174235.0,9.0


In [23]:
final_df = pd.merge(drivers_with_constructors, lap_pit_df, on=['driverId', 'year'], how='left')

In [24]:
positions = get_most_common_positions(conn)

In [25]:
final_df = pd.merge(final_df, positions, on=['driverId', 'year'], how='left')

In [26]:
final_df = final_df.drop_duplicates(['driverId', 'year'])

In [27]:
final_df

Unnamed: 0,year,driverId,driver_nationality,driver_points,driver_end_position,age_at_first_race,age_at_race,constructorId,constructorName,constructor_nationality,...,avg_lap_time_diff,max_lap_time_diff,median_lap_position,total_laps,min_pit_stop_time,avg_pit_time,max_pit_stop_time,most_common_pitstop_lap,most_common_starting_position,most_common_ending_position
0,1950,642,Italian,30.0,1,44,44,51,Alfa Romeo,Swiss,...,,,,,,,,,1,1
6,1950,579,Argentine,27.0,2,39,39,51,Alfa Romeo,Swiss,...,,,,,,,,,3,\N
13,1950,786,Italian,24.0,3,52,52,51,Alfa Romeo,Swiss,...,,,,,,,,,2,2
19,1950,627,French,13.0,4,45,45,154,Talbot-Lago,French,...,,,,,,,,,9,5
26,1950,647,Italian,11.0,5,32,32,6,Ferrari,Italian,...,,,,,,,,,7,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28281,2022,855,Chinese,6.0,18,23,23,51,Alfa Romeo,Swiss,...,12.770114,1625.453523,14.0,1060.0,14128.0,67903.444444,1174235.0,9.0,15,10
28303,2022,848,Thai,4.0,19,23,26,3,Williams,British,...,12.351442,1636.886991,15.0,1100.0,14114.0,54369.842105,1185676.0,18.0,14,13
28324,2022,849,Canadian,2.0,20,25,27,3,Williams,British,...,16.016565,3433.583029,18.0,1155.0,14459.0,120419.354167,3051904.0,19.0,20,16
28346,2022,856,Dutch,2.0,20,27,27,3,Williams,British,...,8.555862,46.221588,9.0,53.0,24628.0,24628.000000,24628.0,19.0,8,9


# Feature Creation

In [28]:
final_df = final_df.sort_values(by=['year', 'driverId'])

for index, row in final_df.iterrows():
    temp_df = final_df[(final_df['driverId'] == row['driverId']) & (final_df['year'] <= row['year'])]
    
    first_place_count = temp_df[temp_df['driver_end_position'] == 1].shape[0]
    second_place_count = temp_df[temp_df['driver_end_position'] == 2].shape[0]
    third_place_count = temp_df[temp_df['driver_end_position'] == 3].shape[0]
    top_5_count = temp_df[temp_df['driver_end_position'] <= 5].shape[0]

    final_df.at[index, 'driver_first_place_count'] = first_place_count
    final_df.at[index, 'driver_second_place_count'] = second_place_count
    final_df.at[index, 'driver_third_place_count'] = third_place_count
    final_df.at[index, 'driver_top_5_place_count'] = top_5_count


In [29]:
final_df = final_df.sort_values(by=['year', 'constructorId'])

for index, row in final_df.iterrows():
    temp_df = final_df[(final_df['constructorId'] == row['constructorId']) & (final_df['year'] <= row['year'])]
    
    temp_first_place_count = temp_df[temp_df['constructor_place'] == 1].shape[0]
    temp_second_place_count = temp_df[temp_df['constructor_place'] == 2].shape[0]
    temp_third_place_count = temp_df[temp_df['constructor_place'] == 3].shape[0]
    temp_top_5_count = temp_df[temp_df['constructor_place'] <= 5].shape[0]
    
    final_df.at[index, 'constructor_first_place_count'] = temp_first_place_count
    final_df.at[index, 'constructor_second_place_count'] = temp_second_place_count
    final_df.at[index, 'constructor_third_place_count'] = temp_third_place_count
    final_df.at[index, 'constructor_top_5_place_count'] = temp_top_5_count

In [30]:
final_df

Unnamed: 0,year,driverId,driver_nationality,driver_points,driver_end_position,age_at_first_race,age_at_race,constructorId,constructorName,constructor_nationality,...,most_common_starting_position,most_common_ending_position,driver_first_place_count,driver_second_place_count,driver_third_place_count,driver_top_5_place_count,constructor_first_place_count,constructor_second_place_count,constructor_third_place_count,constructor_top_5_place_count
142,1950,633,Italian,0.0,23,41,41,6,Ferrari,Italian,...,6,\N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,1950,647,Italian,11.0,5,32,32,6,Ferrari,Italian,...,7,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
38,1950,687,British,4.0,9,36,36,6,Ferrari,Italian,...,21,\N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,1950,791,Italian,0.0,23,52,52,6,Ferrari,Italian,...,25,\N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55,1950,793,French,3.0,13,44,44,6,Ferrari,Italian,...,9,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28259,2022,854,German,12.0,16,22,23,210,Haas F1 Team,American,...,12,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
28193,2022,842,French,23.0,14,21,26,213,AlphaTauri,Italian,...,10,\N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28237,2022,852,Japanese,12.0,16,21,22,213,AlphaTauri,Italian,...,16,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28085,2022,4,Spanish,81.0,9,20,41,214,Alpine F1 Team,French,...,8,9,2.0,4.0,0.0,9.0,0.0,0.0,0.0,4.0


# Diskretization

In [31]:
columns_to_discretize = ['min_lap_time_diff', 'avg_lap_time_diff', 'max_lap_time_diff']
labels = ['0-25%', '25-50%', '50% and more']

for col in columns_to_discretize:
    if col == 'min_lap_time_diff':
        custom_bin_edges = [final_df[col].min(), 0.1, final_df[col].quantile(0.5), final_df[col].max()]
        final_df[col] = pd.cut(final_df[col], bins=custom_bin_edges, include_lowest=True, duplicates='drop', labels=labels)
    else:
        bin_edges = [final_df[col].min(), final_df[col].quantile(0.25), final_df[col].quantile(0.5), final_df[col].max()]
        final_df[col] = pd.cut(final_df[col], bins=bin_edges, include_lowest=True, duplicates='drop', labels=labels)

In [32]:
categorical_columns = ['min_lap_time_diff', 'avg_lap_time_diff', 'max_lap_time_diff']

for col in categorical_columns:
    final_df[col] = final_df[col].astype('category').cat.add_categories('Unknown')
    final_df[col].fillna('Unknown', inplace=True)

In [33]:
columns_to_discretize = ['min_pit_stop_time', 'avg_pit_time', 'max_pit_stop_time']
labels = ['0-25%', '25-50%', '50% and more']

for col in columns_to_discretize:
    if col in pit_stop_stats.columns:
        bin_edges = [final_df[col].min(), final_df[col].quantile(0.25), final_df[col].quantile(0.5), final_df[col].max()]
        final_df[col] = pd.cut(final_df[col], bins=bin_edges, include_lowest=True, duplicates='drop', labels=labels)

In [34]:
categorical_columns = ['min_pit_stop_time', 'avg_pit_time', 'max_pit_stop_time']

for col in categorical_columns:
    final_df[col] = final_df[col].astype('category').cat.add_categories('Unknown')
    final_df[col].fillna('Unknown', inplace=True)

In [35]:
def categorize_total_laps(laps):
    if laps <= 1000:
        return '0-1000 Laps'
    elif laps <= 3000:
        return '1001-3000 Laps'
    elif laps <= 6000:
        return '3001-6000 Laps'
    elif laps <= 9000:
        return '6001-9000 Laps'
    elif laps <= 12000:
        return '9001-12000 Laps'
    else:
        return 'More than 12000 Laps'

final_df['total_laps'] = final_df['total_laps'].apply(categorize_total_laps)


In [36]:
final_df

Unnamed: 0,year,driverId,driver_nationality,driver_points,driver_end_position,age_at_first_race,age_at_race,constructorId,constructorName,constructor_nationality,...,most_common_starting_position,most_common_ending_position,driver_first_place_count,driver_second_place_count,driver_third_place_count,driver_top_5_place_count,constructor_first_place_count,constructor_second_place_count,constructor_third_place_count,constructor_top_5_place_count
142,1950,633,Italian,0.0,23,41,41,6,Ferrari,Italian,...,6,\N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,1950,647,Italian,11.0,5,32,32,6,Ferrari,Italian,...,7,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
38,1950,687,British,4.0,9,36,36,6,Ferrari,Italian,...,21,\N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,1950,791,Italian,0.0,23,52,52,6,Ferrari,Italian,...,25,\N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55,1950,793,French,3.0,13,44,44,6,Ferrari,Italian,...,9,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28259,2022,854,German,12.0,16,22,23,210,Haas F1 Team,American,...,12,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
28193,2022,842,French,23.0,14,21,26,213,AlphaTauri,Italian,...,10,\N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28237,2022,852,Japanese,12.0,16,21,22,213,AlphaTauri,Italian,...,16,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28085,2022,4,Spanish,81.0,9,20,41,214,Alpine F1 Team,French,...,8,9,2.0,4.0,0.0,9.0,0.0,0.0,0.0,4.0


In [37]:
final_df['most_common_ending_position'] = final_df['most_common_ending_position'].replace({'\\N': 'Did Not Finish'})

In [38]:
final_df['most_common_starting_position'] = final_df['most_common_starting_position'].replace(0, 'Unknown')

In [39]:
def categorize_pitstop_lap(lap):
    if isinstance(lap, str):
        return 'Unknown'
    if lap <= 10:
        return 'Early Pitstop'
    elif lap <= 35:
        return 'Mid-race Pitstop'
    else:
        return 'Late Pitstop'

final_df['most_common_pitstop_lap'] = final_df['most_common_pitstop_lap'].apply(categorize_pitstop_lap)

final_df['most_common_starting_position'] = pd.to_numeric(positions['most_common_starting_position'], errors='coerce')
final_df['most_common_ending_position'] = pd.to_numeric(positions['most_common_ending_position'], errors='coerce')

def categorize_position(position):
    if isinstance(position, str):
        return 'Unknown'
    if position <= 5:
        return 'Top 5'
    elif position <= 10:
        return 'Top 10'
    else:
        return 'Beyond Top 10'

final_df['most_common_starting_position'] = final_df['most_common_starting_position'].apply(categorize_position)
final_df['most_common_ending_position'] = final_df['most_common_ending_position'].apply(categorize_position)


In [40]:
final_df

Unnamed: 0,year,driverId,driver_nationality,driver_points,driver_end_position,age_at_first_race,age_at_race,constructorId,constructorName,constructor_nationality,...,most_common_starting_position,most_common_ending_position,driver_first_place_count,driver_second_place_count,driver_third_place_count,driver_top_5_place_count,constructor_first_place_count,constructor_second_place_count,constructor_third_place_count,constructor_top_5_place_count
142,1950,633,Italian,0.0,23,41,41,6,Ferrari,Italian,...,Beyond Top 10,Beyond Top 10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,1950,647,Italian,11.0,5,32,32,6,Ferrari,Italian,...,Beyond Top 10,Beyond Top 10,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
38,1950,687,British,4.0,9,36,36,6,Ferrari,Italian,...,Top 5,Top 5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,1950,791,Italian,0.0,23,52,52,6,Ferrari,Italian,...,Beyond Top 10,Beyond Top 10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55,1950,793,French,3.0,13,44,44,6,Ferrari,Italian,...,Top 10,Top 5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28259,2022,854,German,12.0,16,22,23,210,Haas F1 Team,American,...,Beyond Top 10,Beyond Top 10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
28193,2022,842,French,23.0,14,21,26,213,AlphaTauri,Italian,...,Beyond Top 10,Beyond Top 10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28237,2022,852,Japanese,12.0,16,21,22,213,AlphaTauri,Italian,...,Beyond Top 10,Beyond Top 10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28085,2022,4,Spanish,81.0,9,20,41,214,Alpine F1 Team,French,...,Beyond Top 10,Beyond Top 10,2.0,4.0,0.0,9.0,0.0,0.0,0.0,4.0


# Dealing with NaN Values

In [41]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3146 entries, 142 to 28063
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   year                            3146 non-null   int64   
 1   driverId                        3146 non-null   int64   
 2   driver_nationality              3146 non-null   object  
 3   driver_points                   3146 non-null   float64 
 4   driver_end_position             3146 non-null   int32   
 5   age_at_first_race               3146 non-null   int64   
 6   age_at_race                     3146 non-null   int64   
 7   constructorId                   3146 non-null   int64   
 8   constructorName                 3146 non-null   object  
 9   constructor_nationality         3146 non-null   object  
 10  constructor_points              2337 non-null   float64 
 11  constructor_place               2337 non-null   float64 
 12  best_performing_circui

In [42]:
columns_to_fill_with_zero = ['constructor_points', 'constructor_place', 'median_lap_position']
final_df[columns_to_fill_with_zero] = final_df[columns_to_fill_with_zero].fillna(0)

In [43]:
final_df['driver_first_place_count'] = final_df['driver_second_place_count'].astype(int)
final_df['driver_second_place_count'] = final_df['driver_second_place_count'].astype(int)
final_df['driver_third_place_count'] = final_df['driver_top_5_place_count'].astype(int)
final_df['driver_top_5_place_count'] = final_df['driver_top_5_place_count'].astype(int)
final_df['driver_points'] = final_df['driver_points'].astype(int)
final_df['driver_end_position'] = final_df['driver_end_position'].astype(int)
final_df['age_at_first_race'] = final_df['age_at_first_race'].astype(int)
final_df['age_at_race'] = final_df['age_at_race'].astype(int)
final_df['median_lap_position'] = final_df['median_lap_position'].astype(int)
final_df['constructor_points'] = final_df['constructor_points'].astype(int)
final_df['constructor_place'] = final_df['constructor_place'].astype(int)
final_df['constructor_first_place_count'] = final_df['constructor_first_place_count'].astype(int)
final_df['constructor_second_place_count'] = final_df['constructor_second_place_count'].astype(int)
final_df['constructor_third_place_count'] = final_df['constructor_third_place_count'].astype(int)
final_df['constructor_top_5_place_count'] = final_df['constructor_top_5_place_count'].astype(int)
final_df['constructor_first_place_count'] = final_df['constructor_first_place_count'].astype(int)
final_df['constructor_second_place_count'] = final_df['constructor_second_place_count'].astype(int)
final_df['constructor_third_place_count'] = final_df['constructor_third_place_count'].astype(int)
final_df['constructor_top_5_place_count'] = final_df['constructor_top_5_place_count'].astype(int)
final_df['constructor_points'] = final_df['constructor_points'].astype(int)
final_df['constructor_place'] = final_df['constructor_place'].astype(int)

In [44]:
lap_times_stats['median_lap_position'] = lap_times_stats['median_lap_position'].replace(0, 'Unknown')

# Aggregation: driver per year -> driver

In [45]:
df = final_df.sort_values(by='year', ascending=False)

numeric_columns = ['year','driver_points', 'driver_end_position', 'constructor_points', 'constructor_place','median_lap_position', 'total_laps']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

aggregation_functions = {
    'year': 'max',
    'driver_nationality': 'first',
    'age_at_first_race': 'first',
    'driver_first_place_count': 'first',
    'driver_second_place_count': 'first',
    'driver_third_place_count': 'first',
    'driver_top_5_place_count': 'first',
    'constructorName': 'first',
    'constructor_nationality': 'first',
    'constructor_first_place_count': 'first',
    'constructor_second_place_count': 'first',
    'constructor_third_place_count': 'first',
    'constructor_top_5_place_count': 'first',
    'best_performing_circuit_name': lambda x: x.mode().iloc[0], 
    'min_lap_time_diff': lambda x: x.mode().iloc[0],
    'avg_lap_time_diff': lambda x: x.mode().iloc[0],
    'max_lap_time_diff': lambda x: x.mode().iloc[0],
    'median_lap_position':lambda x: round(x.mean()) if x.notna().any() else 'Unknown',
    'min_pit_stop_time': lambda x: x.mode().iloc[0],
    'avg_pit_time': lambda x: x.mode().iloc[0],
    'max_pit_stop_time': lambda x: x.mode().iloc[0],
    'most_common_pitstop_lap': lambda x: x.mode().iloc[0],
    'most_common_starting_position': lambda x: x.mode().iloc[0] if x.notna().any() else 'Unknown',
    'most_common_ending_position': lambda x: x.mode().iloc[0] 
}

aggregated_df = df.groupby('driverId').agg(aggregation_functions)

aggregated_df = aggregated_df.reset_index()

In [46]:
aggregated_df

Unnamed: 0,driverId,year,driver_nationality,age_at_first_race,driver_first_place_count,driver_second_place_count,driver_third_place_count,driver_top_5_place_count,constructorName,constructor_nationality,...,min_lap_time_diff,avg_lap_time_diff,max_lap_time_diff,median_lap_position,min_pit_stop_time,avg_pit_time,max_pit_stop_time,most_common_pitstop_lap,most_common_starting_position,most_common_ending_position
0,1,2022,British,22,3,3,15,15,Mercedes,German,...,0-25%,50% and more,50% and more,3,25-50%,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
1,2,2011,German,23,0,0,1,1,Renault,French,...,50% and more,25-50%,50% and more,10,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
2,3,2016,German,21,2,2,3,3,Mercedes,German,...,0-25%,50% and more,50% and more,6,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
3,4,2022,Spanish,20,4,4,9,9,Alpine F1 Team,French,...,0-25%,50% and more,50% and more,7,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
4,5,2013,Finnish,26,0,0,0,0,Lotus F1,British,...,50% and more,50% and more,50% and more,12,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
843,852,2022,Japanese,21,0,0,0,0,AlphaTauri,Italian,...,25-50%,50% and more,50% and more,13,0-25%,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
844,853,2021,Russian,22,0,0,0,0,Haas F1 Team,American,...,50% and more,50% and more,50% and more,18,0-25%,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
845,854,2022,German,22,0,0,0,0,Haas F1 Team,American,...,25-50%,50% and more,50% and more,16,0-25%,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
846,855,2022,Chinese,23,0,0,0,0,Alfa Romeo,Swiss,...,0-25%,50% and more,50% and more,14,0-25%,50% and more,50% and more,Early Pitstop,Beyond Top 10,Beyond Top 10


In [47]:
aggregated_df = aggregated_df.drop(['year', 'driverId'], axis=1)

In [48]:
drivers_lost = aggregated_df[aggregated_df['driver_top_5_place_count'] == 0]
drivers_won = aggregated_df[aggregated_df['driver_top_5_place_count'] != 0]

In [49]:
drivers_lost

Unnamed: 0,driver_nationality,age_at_first_race,driver_first_place_count,driver_second_place_count,driver_third_place_count,driver_top_5_place_count,constructorName,constructor_nationality,constructor_first_place_count,constructor_second_place_count,...,min_lap_time_diff,avg_lap_time_diff,max_lap_time_diff,median_lap_position,min_pit_stop_time,avg_pit_time,max_pit_stop_time,most_common_pitstop_lap,most_common_starting_position,most_common_ending_position
4,Finnish,26,0,0,0,0,Lotus F1,British,0,0,...,50% and more,50% and more,50% and more,12,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
5,Japanese,22,0,0,0,0,Williams,British,25,13,...,50% and more,0-25%,25-50%,12,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
6,French,29,0,0,0,0,Toro Rosso,Italian,0,0,...,50% and more,25-50%,0-25%,14,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
9,German,22,0,0,0,0,Marussia,Russian,0,0,...,50% and more,50% and more,50% and more,15,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
10,Japanese,25,0,0,0,0,Super Aguri,Japanese,0,0,...,50% and more,50% and more,50% and more,12,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
843,Japanese,21,0,0,0,0,AlphaTauri,Italian,0,0,...,25-50%,50% and more,50% and more,13,0-25%,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
844,Russian,22,0,0,0,0,Haas F1 Team,American,0,0,...,50% and more,50% and more,50% and more,18,0-25%,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
845,German,22,0,0,0,0,Haas F1 Team,American,0,0,...,25-50%,50% and more,50% and more,16,0-25%,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
846,Chinese,23,0,0,0,0,Alfa Romeo,Swiss,0,0,...,0-25%,50% and more,50% and more,14,0-25%,50% and more,50% and more,Early Pitstop,Beyond Top 10,Beyond Top 10


In [50]:
drivers_won

Unnamed: 0,driver_nationality,age_at_first_race,driver_first_place_count,driver_second_place_count,driver_third_place_count,driver_top_5_place_count,constructorName,constructor_nationality,constructor_first_place_count,constructor_second_place_count,...,min_lap_time_diff,avg_lap_time_diff,max_lap_time_diff,median_lap_position,min_pit_stop_time,avg_pit_time,max_pit_stop_time,most_common_pitstop_lap,most_common_starting_position,most_common_ending_position
0,British,22,3,3,15,15,Mercedes,German,16,2,...,0-25%,50% and more,50% and more,3,25-50%,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
1,German,23,0,0,1,1,Renault,French,4,2,...,50% and more,25-50%,50% and more,10,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
2,German,21,2,2,3,3,Mercedes,German,6,2,...,0-25%,50% and more,50% and more,6,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
3,Spanish,20,4,4,9,9,Alpine F1 Team,French,0,0,...,0-25%,50% and more,50% and more,7,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
7,Finnish,22,2,2,10,10,Alfa Romeo,Swiss,0,0,...,0-25%,25-50%,50% and more,6,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,Finnish,24,2,2,7,7,Alfa Romeo,Swiss,0,0,...,0-25%,50% and more,50% and more,6,50% and more,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
821,Dutch,18,0,0,6,6,Red Bull,Austrian,10,9,...,0-25%,50% and more,25-50%,4,50% and more,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
823,Spanish,21,0,0,2,2,Ferrari,Italian,40,56,...,25-50%,50% and more,50% and more,8,50% and more,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
835,Monegasque,21,1,1,2,2,Ferrari,Italian,40,56,...,0-25%,50% and more,50% and more,6,0-25%,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10


In [51]:
import inflect

def number_to_words(number):
    p = inflect.engine()
    return p.number_to_words(number)

columns_to_convert = ['driver_first_place_count','driver_second_place_count','driver_third_place_count','driver_top_5_place_count','age_at_first_race','constructor_first_place_count','constructor_second_place_count','constructor_third_place_count','constructor_top_5_place_count', 'median_lap_position']
drivers_won[columns_to_convert] = drivers_won[columns_to_convert].fillna(0).astype(int)

drivers_won[columns_to_convert] = drivers_won[columns_to_convert].apply(pd.to_numeric, errors='coerce')

drivers_won[columns_to_convert] = drivers_won[columns_to_convert].applymap(number_to_words)

print(drivers_won)



    driver_nationality age_at_first_race driver_first_place_count  \
0              British        twenty-two                    three   
1               German      twenty-three                     zero   
2               German        twenty-one                      two   
3              Spanish            twenty                     four   
7              Finnish        twenty-two                      two   
..                 ...               ...                      ...   
813            Finnish       twenty-four                      two   
821              Dutch          eighteen                     zero   
823            Spanish        twenty-one                     zero   
835         Monegasque        twenty-one                      one   
838            British        twenty-one                     zero   

    driver_second_place_count driver_third_place_count  \
0                       three                  fifteen   
1                        zero                      one 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drivers_won[columns_to_convert] = drivers_won[columns_to_convert].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drivers_won[columns_to_convert] = drivers_won[columns_to_convert].apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drivers_won[c

In [52]:
import inflect

def number_to_words(number):
    p = inflect.engine()
    return p.number_to_words(number)

columns_to_convert = ['driver_first_place_count','driver_second_place_count','driver_third_place_count','driver_top_5_place_count','age_at_first_race','constructor_first_place_count','constructor_second_place_count','constructor_third_place_count','constructor_top_5_place_count', 'median_lap_position']
drivers_lost[columns_to_convert] = drivers_lost[columns_to_convert].fillna(0).astype(int)

drivers_lost[columns_to_convert] = drivers_lost[columns_to_convert].apply(pd.to_numeric, errors='coerce')

drivers_lost[columns_to_convert] = drivers_lost[columns_to_convert].applymap(number_to_words)

print(drivers_lost)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drivers_lost[columns_to_convert] = drivers_lost[columns_to_convert].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drivers_lost[columns_to_convert] = drivers_lost[columns_to_convert].apply(pd.to_numeric, errors='coerce')


    driver_nationality age_at_first_race driver_first_place_count  \
4              Finnish        twenty-six                     zero   
5             Japanese        twenty-two                     zero   
6               French       twenty-nine                     zero   
9               German        twenty-two                     zero   
10            Japanese       twenty-five                     zero   
..                 ...               ...                      ...   
843           Japanese        twenty-one                     zero   
844            Russian        twenty-two                     zero   
845             German        twenty-two                     zero   
846            Chinese      twenty-three                     zero   
847              Dutch      twenty-seven                     zero   

    driver_second_place_count driver_third_place_count  \
4                        zero                     zero   
5                        zero                     zero 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drivers_lost[columns_to_convert] = drivers_lost[columns_to_convert].applymap(number_to_words)


In [53]:
import inflect

def number_to_words(number):
    p = inflect.engine()
    return p.number_to_words(number)

columns_to_convert = ['driver_first_place_count','driver_second_place_count','driver_third_place_count','driver_top_5_place_count','age_at_first_race','constructor_first_place_count','constructor_second_place_count','constructor_third_place_count','constructor_top_5_place_count', 'median_lap_position']
aggregated_df[columns_to_convert] = aggregated_df[columns_to_convert].fillna(0).astype(int)

aggregated_df[columns_to_convert] = aggregated_df[columns_to_convert].apply(pd.to_numeric, errors='coerce')

aggregated_df[columns_to_convert] = aggregated_df[columns_to_convert].applymap(number_to_words)

print(aggregated_df)


    driver_nationality age_at_first_race driver_first_place_count  \
0              British        twenty-two                    three   
1               German      twenty-three                     zero   
2               German        twenty-one                      two   
3              Spanish            twenty                     four   
4              Finnish        twenty-six                     zero   
..                 ...               ...                      ...   
843           Japanese        twenty-one                     zero   
844            Russian        twenty-two                     zero   
845             German        twenty-two                     zero   
846            Chinese      twenty-three                     zero   
847              Dutch      twenty-seven                     zero   

    driver_second_place_count driver_third_place_count  \
0                       three                  fifteen   
1                        zero                      one 

In [54]:
aggregated_df.to_csv("all_drivers.csv", index=False, encoding='utf-8')

In [55]:
aggregated_df = aggregated_df[aggregated_df.ne('Unknown').all(axis=1)]

In [56]:
aggregated_df.to_csv("all_drivers_withiut_unknown.csv", index=False, encoding='utf-8')

In [57]:
drivers_won.to_csv("winning_drivers.csv", index=False, encoding='utf-8')

In [58]:
drivers_won = drivers_won[drivers_won.ne('Unknown').all(axis=1)]

In [59]:
drivers_won.to_csv("winning_drivers_without_unknown.csv", index=False, encoding='utf-8')

In [60]:
drivers_lost.to_csv("losing_drivers.csv", index=False, encoding='utf-8')

In [61]:
drivers_lost = drivers_lost[drivers_lost.ne('Unknown').all(axis=1)]

In [62]:
drivers_lost.to_csv("losing_drivers_without_unknown.csv", index=False, encoding='utf-8')