In [1]:
import sqlite3
import pandas as pd
import nbimporter

In [2]:
from final_scores import calculate_final_driver_places 
from lap_times_functions import get_lap_times_stats
from lap_times_functions import get_median_lap_position
from lap_times_functions import get_total_laps_count
from pit_stop_functions import get_pit_stop_stats_in_miliseconds
from pit_stop_functions import get_most_common_pit_stop_lap
from constructor_table import get_drivers_with_constructor_relationship
from constructor_table import get_constructor_points
from drivers_functions import get_age_at_first_race
from circuits_functions import get_best_performing_circuits
from results_functions import get_most_common_positions

In [3]:
conn = sqlite3.connect('formula1.db')

In [4]:
drivers_final_positions = calculate_final_driver_places(conn)
print(drivers_final_positions)

      year  driverId date_of_birth driver_nationality  driver_points  \
73    1950       642    1906-10-30            Italian           30.0   
66    1950       579    1911-06-24          Argentine           27.0   
72    1950       786    1898-06-09            Italian           24.0   
70    1950       627    1905-11-05             French           13.0   
56    1950       647    1918-07-13            Italian           11.0   
...    ...       ...           ...                ...            ...   
3133  2022       855    1999-05-30            Chinese            6.0   
3136  2022       848    1996-03-23               Thai            4.0   
3139  2022       849    1995-06-29           Canadian            2.0   
3145  2022       856    1995-02-06              Dutch            2.0   
3140  2022       807    1987-08-19             German            0.0   

      driver_end_position  
73                      1  
66                      2  
72                      3  
70                     

In [5]:
drivers_final_positions = drivers_final_positions.sort_values(by=['year', 'driverId'])

for index, row in drivers_final_positions.iterrows():
    temp_df = drivers_final_positions[(drivers_final_positions['driverId'] == row['driverId']) & (drivers_final_positions['year'] <= row['year'])]
    
    first_place_count = temp_df[temp_df['driver_end_position'] == 1].shape[0]
    second_place_count = temp_df[temp_df['driver_end_position'] == 2].shape[0]
    third_place_count = temp_df[temp_df['driver_end_position'] == 3].shape[0]
    top_5_count = temp_df[temp_df['driver_end_position'] <= 5].shape[0]

    drivers_final_positions.at[index, 'first_place_count'] = first_place_count
    drivers_final_positions.at[index, 'second_place_count'] = second_place_count
    drivers_final_positions.at[index, 'third_place_count'] = third_place_count
    drivers_final_positions.at[index, 'top_5_count'] = top_5_count

print(drivers_final_positions)


      year  driverId date_of_birth driver_nationality  driver_points  \
51    1950       427    1917-10-30             French            0.0   
53    1950       498    1922-10-05          Argentine            0.0   
49    1950       501    1921-06-29           American            0.0   
25    1950       509    1928-07-16           American            0.0   
37    1950       518    1913-05-05           American            0.0   
...    ...       ...           ...                ...            ...   
3139  2022       849    1995-06-29           Canadian            2.0   
3131  2022       852    2000-05-11           Japanese           12.0   
3134  2022       854    1999-03-22             German           12.0   
3133  2022       855    1999-05-30            Chinese            6.0   
3145  2022       856    1995-02-06              Dutch            2.0   

      driver_end_position  first_place_count  second_place_count  \
51                     23                0.0                 0.0   

In [6]:
driver_years = get_age_at_first_race(conn)
drivers_final_positions = pd.merge(drivers_final_positions, driver_years, on=['driverId'], how='left')

In [7]:
lap_times_stats = get_lap_times_stats(conn)
print(lap_times_stats)

     year  driverId  min_lap_time_diff  avg_lap_time_diff  max_lap_time_diff
0    2007         1           0.000000           7.852700        1465.791089
1    2008         1           0.000000           7.666610         101.287948
2    2009         1           0.074346           6.171578          79.374498
3    2010         1           0.000000          10.709896        2567.450593
4    2011         1           0.000000           8.270089        1600.104940
..    ...       ...                ...                ...                ...
642  2021       853           2.817997          21.635194        2443.049302
643  2021       854           2.234354          18.256438        2428.528386
644  2022       854           0.704944          14.242761        3432.659375
645  2022       855           0.000000          12.770114        1625.453523
646  2022       856           3.086993           8.555862          46.221588

[647 rows x 5 columns]


In [8]:
columns_to_discretize = ['min_lap_time_diff', 'avg_lap_time_diff', 'max_lap_time_diff']
labels = ['0-25%', '25-50%', '50% and more']

for col in columns_to_discretize:
    if col == 'min_lap_time_diff':
        custom_bin_edges = [lap_times_stats[col].min(), 0.1, lap_times_stats[col].quantile(0.5), lap_times_stats[col].max()]
        lap_times_stats[col] = pd.cut(lap_times_stats[col], bins=custom_bin_edges, include_lowest=True, duplicates='drop', labels=labels)
    else:
        bin_edges = [lap_times_stats[col].min(), lap_times_stats[col].quantile(0.25), lap_times_stats[col].quantile(0.5), lap_times_stats[col].max()]
        lap_times_stats[col] = pd.cut(lap_times_stats[col], bins=bin_edges, include_lowest=True, duplicates='drop', labels=labels)

print(lap_times_stats)


     year  driverId min_lap_time_diff avg_lap_time_diff max_lap_time_diff
0    2007         1             0-25%            25-50%      50% and more
1    2008         1             0-25%            25-50%            25-50%
2    2009         1             0-25%             0-25%             0-25%
3    2010         1             0-25%      50% and more      50% and more
4    2011         1             0-25%            25-50%      50% and more
..    ...       ...               ...               ...               ...
642  2021       853      50% and more      50% and more      50% and more
643  2021       854      50% and more      50% and more      50% and more
644  2022       854            25-50%      50% and more      50% and more
645  2022       855             0-25%      50% and more      50% and more
646  2022       856      50% and more            25-50%             0-25%

[647 rows x 5 columns]


In [9]:
lap_positions = get_median_lap_position(conn)
lap_times_stats = pd.merge(lap_times_stats, lap_positions, on=['driverId', 'year'], how='left')
print(lap_times_stats)

     year  driverId min_lap_time_diff avg_lap_time_diff max_lap_time_diff  \
0    2007         1             0-25%            25-50%      50% and more   
1    2008         1             0-25%            25-50%            25-50%   
2    2009         1             0-25%             0-25%             0-25%   
3    2010         1             0-25%      50% and more      50% and more   
4    2011         1             0-25%            25-50%      50% and more   
..    ...       ...               ...               ...               ...   
642  2021       853      50% and more      50% and more      50% and more   
643  2021       854      50% and more      50% and more      50% and more   
644  2022       854            25-50%      50% and more      50% and more   
645  2022       855             0-25%      50% and more      50% and more   
646  2022       856      50% and more            25-50%             0-25%   

     median_lap_position  
0                      2  
1                    

In [10]:
lap_count = get_total_laps_count(conn)
lap_times_stats = pd.merge(lap_times_stats, lap_count, on=['driverId', 'year'], how='left')
print(lap_times_stats)

     year  driverId min_lap_time_diff avg_lap_time_diff max_lap_time_diff  \
0    2007         1             0-25%            25-50%      50% and more   
1    2008         1             0-25%            25-50%            25-50%   
2    2009         1             0-25%             0-25%             0-25%   
3    2010         1             0-25%      50% and more      50% and more   
4    2011         1             0-25%            25-50%      50% and more   
..    ...       ...               ...               ...               ...   
642  2021       853      50% and more      50% and more      50% and more   
643  2021       854      50% and more      50% and more      50% and more   
644  2022       854            25-50%      50% and more      50% and more   
645  2022       855             0-25%      50% and more      50% and more   
646  2022       856      50% and more            25-50%             0-25%   

     median_lap_position  total_laps  
0                      2        1037

In [11]:
lap_times_stats.describe()

Unnamed: 0,year,driverId,median_lap_position,total_laps
count,647.0,647.0,647.0,647.0
mean,2008.712519,254.25966,10.0,831.717156
std,7.656169,357.613647,4.903401,301.565783
min,1996.0,1.0,1.0,1.0
25%,2002.0,17.0,6.0,710.0
50%,2009.0,39.0,10.0,902.0
75%,2015.0,811.5,14.0,1043.5
max,2022.0,856.0,22.0,1294.0


In [12]:
pit_stop_stats = get_pit_stop_stats_in_miliseconds(conn)
print(pit_stop_stats)

     driverId  year  min_pit_stop_time   avg_pit_time  max_pit_stop_time
0           1  2011              13173   22666.446429              35688
1           1  2012              17598   22671.861111              31081
2           1  2013              17385   22399.111111              30085
3           1  2014              19710   53061.974359            1137295
4           1  2015              16579   23666.000000              30216
..        ...   ...                ...            ...                ...
270       853  2021              15054  211286.047619            2076977
271       854  2021              15058  201851.200000            2075728
272       854  2022              14144  111302.675000            3065174
273       855  2022              14128   67903.444444            1174235
274       856  2022              24628   24628.000000              24628

[275 rows x 5 columns]


In [13]:
columns_to_discretize = ['min_pit_stop_time', 'avg_pit_time', 'max_pit_stop_time']
labels = ['0-25%', '25-50%', '50% and more']

for col in columns_to_discretize:
    if col in pit_stop_stats.columns:
        bin_edges = [pit_stop_stats[col].min(), pit_stop_stats[col].quantile(0.25), pit_stop_stats[col].quantile(0.5), pit_stop_stats[col].max()]
        pit_stop_stats[col] = pd.cut(pit_stop_stats[col], bins=bin_edges, include_lowest=True, duplicates='drop', labels=labels)

print(pit_stop_stats)

     driverId  year min_pit_stop_time  avg_pit_time max_pit_stop_time
0           1  2011             0-25%         0-25%            25-50%
1           1  2012            25-50%         0-25%             0-25%
2           1  2013            25-50%         0-25%             0-25%
3           1  2014      50% and more  50% and more      50% and more
4           1  2015            25-50%         0-25%             0-25%
..        ...   ...               ...           ...               ...
270       853  2021             0-25%  50% and more      50% and more
271       854  2021             0-25%  50% and more      50% and more
272       854  2022             0-25%  50% and more      50% and more
273       855  2022             0-25%  50% and more      50% and more
274       856  2022      50% and more        25-50%             0-25%

[275 rows x 5 columns]


In [14]:
most_common_pit_stop_lap = get_most_common_pit_stop_lap(conn)
pit_stop_stats = pd.merge(pit_stop_stats, most_common_pit_stop_lap, on=['driverId', 'year'], how='left')
print(pit_stop_stats)

     driverId  year min_pit_stop_time  avg_pit_time max_pit_stop_time  \
0           1  2011             0-25%         0-25%            25-50%   
1           1  2012            25-50%         0-25%             0-25%   
2           1  2013            25-50%         0-25%             0-25%   
3           1  2014      50% and more  50% and more      50% and more   
4           1  2015            25-50%         0-25%             0-25%   
..        ...   ...               ...           ...               ...   
270       853  2021             0-25%  50% and more      50% and more   
271       854  2021             0-25%  50% and more      50% and more   
272       854  2022             0-25%  50% and more      50% and more   
273       855  2022             0-25%  50% and more      50% and more   
274       856  2022      50% and more        25-50%             0-25%   

     most_common_pitstop_lap  
0                         16  
1                         36  
2                         31  

In [15]:
constructor_driver_connection = get_drivers_with_constructor_relationship(conn)

In [16]:
drivers_with_constructors = pd.merge(drivers_final_positions, constructor_driver_connection, on=['driverId', 'year'], how='left')
print(drivers_with_constructors)

      year  driverId date_of_birth driver_nationality  driver_points  \
0     1950       427    1917-10-30             French            0.0   
1     1950       498    1922-10-05          Argentine            0.0   
2     1950       501    1921-06-29           American            0.0   
3     1950       501    1921-06-29           American            0.0   
4     1950       509    1928-07-16           American            0.0   
...    ...       ...           ...                ...            ...   
3482  2022       849    1995-06-29           Canadian            2.0   
3483  2022       852    2000-05-11           Japanese           12.0   
3484  2022       854    1999-03-22             German           12.0   
3485  2022       855    1999-05-30            Chinese            6.0   
3486  2022       856    1995-02-06              Dutch            2.0   

      driver_end_position  first_place_count  second_place_count  \
0                      23                0.0                 0.0   

In [17]:
connstructor_points = get_constructor_points(conn)
print(connstructor_points)

     year  constructorId  constructor_points  constructor_place
0    2022              9               759.0                  1
1    2022              6               554.0                  2
2    2022            131               515.0                  3
3    2022            214               173.0                  4
4    2022              1               159.0                  5
..    ...            ...                 ...                ...
895  1958            105                 6.0                  5
896  1958             32                 3.0                  6
897  1958            125                 0.0                  7
898  1958            127                 0.0                  7
899  1958             95                 0.0                  7

[900 rows x 4 columns]


In [18]:
connstructor_points = connstructor_points.sort_values(by=['year', 'constructorId'])

for index, row in connstructor_points.iterrows():
    temp_df = connstructor_points[(connstructor_points['constructorId'] == row['constructorId']) & (connstructor_points['year'] <= row['year'])]
    
    temp_first_place_count = temp_df[temp_df['constructor_place'] == 1].shape[0]
    temp_second_place_count = temp_df[temp_df['constructor_place'] == 2].shape[0]
    temp_third_place_count = temp_df[temp_df['constructor_place'] == 3].shape[0]
    temp_top_5_count = temp_df[temp_df['constructor_place'] <= 5].shape[0]
    
    connstructor_points.at[index, 'constructor_first_place_count'] = temp_first_place_count
    connstructor_points.at[index, 'constructor_second_place_count'] = temp_second_place_count
    connstructor_points.at[index, 'constructor_third_place_count'] = temp_third_place_count
    connstructor_points.at[index, 'constructor_top_5_count'] = temp_top_5_count

print(connstructor_points)


     year  constructorId  constructor_points  constructor_place  \
892  1958              6                40.0                  2   
896  1958             32                 3.0                  6   
894  1958             66                18.0                  4   
893  1958             87                31.0                  3   
899  1958             95                 0.0                  7   
..    ...            ...                 ...                ...   
6    2022            117                55.0                  6   
2    2022            131               515.0                  3   
7    2022            210                37.0                  8   
8    2022            213                35.0                  9   
3    2022            214               173.0                  4   

     constructor_first_place_count  constructor_second_place_count  \
892                            0.0                             1.0   
896                            0.0                     

In [19]:
drivers_with_constructors = pd.merge(drivers_with_constructors, connstructor_points, on=['constructorId', 'year'], how='left')
print(drivers_with_constructors)

      year  driverId date_of_birth driver_nationality  driver_points  \
0     1950       427    1917-10-30             French            0.0   
1     1950       498    1922-10-05          Argentine            0.0   
2     1950       501    1921-06-29           American            0.0   
3     1950       501    1921-06-29           American            0.0   
4     1950       509    1928-07-16           American            0.0   
...    ...       ...           ...                ...            ...   
3482  2022       849    1995-06-29           Canadian            2.0   
3483  2022       852    2000-05-11           Japanese           12.0   
3484  2022       854    1999-03-22             German           12.0   
3485  2022       855    1999-05-30            Chinese            6.0   
3486  2022       856    1995-02-06              Dutch            2.0   

      driver_end_position  first_place_count  second_place_count  \
0                      23                0.0                 0.0   

In [20]:
best_performing_circuits = get_best_performing_circuits(conn)
drivers_with_constructors = pd.merge(drivers_with_constructors, best_performing_circuits, on=['year','driverId' ], how='left')
print(drivers_with_constructors)

      year  driverId date_of_birth driver_nationality  driver_points  \
0     1950       427    1917-10-30             French            0.0   
1     1950       498    1922-10-05          Argentine            0.0   
2     1950       501    1921-06-29           American            0.0   
3     1950       501    1921-06-29           American            0.0   
4     1950       509    1928-07-16           American            0.0   
...    ...       ...           ...                ...            ...   
3482  2022       849    1995-06-29           Canadian            2.0   
3483  2022       852    2000-05-11           Japanese           12.0   
3484  2022       854    1999-03-22             German           12.0   
3485  2022       855    1999-05-30            Chinese            6.0   
3486  2022       856    1995-02-06              Dutch            2.0   

      driver_end_position  first_place_count  second_place_count  \
0                      23                0.0                 0.0   

In [21]:
lap_pit_df = pd.merge(lap_times_stats, pit_stop_stats, on=['driverId', 'year'], how='left', suffixes=('_lap', '_pit'))

print(lap_pit_df)

     year  driverId min_lap_time_diff avg_lap_time_diff max_lap_time_diff  \
0    2007         1             0-25%            25-50%      50% and more   
1    2008         1             0-25%            25-50%            25-50%   
2    2009         1             0-25%             0-25%             0-25%   
3    2010         1             0-25%      50% and more      50% and more   
4    2011         1             0-25%            25-50%      50% and more   
..    ...       ...               ...               ...               ...   
642  2021       853      50% and more      50% and more      50% and more   
643  2021       854      50% and more      50% and more      50% and more   
644  2022       854            25-50%      50% and more      50% and more   
645  2022       855             0-25%      50% and more      50% and more   
646  2022       856      50% and more            25-50%             0-25%   

     median_lap_position  total_laps min_pit_stop_time  avg_pit_time  \
0  

In [22]:
lap_pit_df.describe()

Unnamed: 0,year,driverId,median_lap_position,total_laps,most_common_pitstop_lap
count,647.0,647.0,647.0,647.0,275.0
mean,2008.712519,254.25966,10.0,831.717156,27.152727
std,7.656169,357.613647,4.903401,301.565783,13.007239
min,1996.0,1.0,1.0,1.0,1.0
25%,2002.0,17.0,6.0,710.0,17.0
50%,2009.0,39.0,10.0,902.0,28.0
75%,2015.0,811.5,14.0,1043.5,37.0
max,2022.0,856.0,22.0,1294.0,63.0


In [23]:

def categorize_total_laps(laps):
    if laps <= 1000:
        return '0-1000 Laps'
    elif laps <= 3000:
        return '1001-3000 Laps'
    elif laps <= 6000:
        return '3001-6000 Laps'
    elif laps <= 9000:
        return '6001-9000 Laps'
    elif laps <= 12000:
        return '9001-12000 Laps'
    else:
        return 'More than 12000 Laps'

lap_pit_df['total_laps'] = lap_pit_df['total_laps'].apply(categorize_total_laps)


In [24]:
final_df = pd.merge(drivers_with_constructors, lap_pit_df, on=['driverId', 'year'], how='left')

In [25]:
final_df

Unnamed: 0,year,driverId,date_of_birth,driver_nationality,driver_points,driver_end_position,first_place_count,second_place_count,third_place_count,top_5_count,...,best_performing_circuit_name,min_lap_time_diff,avg_lap_time_diff,max_lap_time_diff,median_lap_position,total_laps,min_pit_stop_time,avg_pit_time,max_pit_stop_time,most_common_pitstop_lap
0,1950,427,1917-10-30,French,0.0,23,0.0,0.0,0.0,0.0,...,Circuit de Monaco,,,,,,,,,
1,1950,498,1922-10-05,Argentine,0.0,23,0.0,0.0,0.0,0.0,...,Circuit de Monaco,,,,,,,,,
2,1950,501,1921-06-29,American,0.0,23,0.0,0.0,0.0,0.0,...,Circuit de Monaco,,,,,,,,,
3,1950,501,1921-06-29,American,0.0,23,0.0,0.0,0.0,0.0,...,Circuit de Monaco,,,,,,,,,
4,1950,509,1928-07-16,American,0.0,23,0.0,0.0,0.0,0.0,...,Indianapolis Motor Speedway,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3482,2022,849,1995-06-29,Canadian,2.0,20,0.0,0.0,0.0,0.0,...,Suzuka Circuit,50% and more,50% and more,50% and more,18.0,1001-3000 Laps,0-25%,50% and more,50% and more,19.0
3483,2022,852,2000-05-11,Japanese,12.0,16,0.0,0.0,0.0,0.0,...,Autodromo Enzo e Dino Ferrari,50% and more,50% and more,50% and more,13.0,1001-3000 Laps,50% and more,50% and more,50% and more,18.0
3484,2022,854,1999-03-22,German,12.0,16,0.0,0.0,0.0,0.0,...,Red Bull Ring,25-50%,50% and more,50% and more,14.0,1001-3000 Laps,0-25%,50% and more,50% and more,14.0
3485,2022,855,1999-05-30,Chinese,6.0,18,0.0,0.0,0.0,0.0,...,Circuit Gilles Villeneuve,0-25%,50% and more,50% and more,14.0,1001-3000 Laps,0-25%,50% and more,50% and more,9.0


In [26]:
final_df.describe()

Unnamed: 0,year,driverId,driver_points,driver_end_position,first_place_count,second_place_count,third_place_count,top_5_count,age_at_first_race,constructorId,constructor_points,constructor_place,constructor_first_place_count,constructor_second_place_count,constructor_third_place_count,constructor_top_5_count,median_lap_position,most_common_pitstop_lap
count,3487.0,3487.0,3487.0,3487.0,3487.0,3487.0,3487.0,3487.0,3487.0,3487.0,2614.0,2614.0,2614.0,2614.0,2614.0,2614.0,661.0,281.0
mean,1975.854029,356.659019,14.380531,16.701463,0.148839,0.164611,0.1345,0.740465,28.17895,74.654718,44.259946,6.486611,1.367253,1.410099,1.049732,5.753634,10.01059,27.153025
std,20.152251,242.422349,42.375905,7.325131,0.642212,0.558025,0.456332,1.85072,5.98445,62.43329,93.690194,3.786143,2.900535,3.067575,1.96053,10.113541,4.868097,13.036271
min,1950.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,18.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,1958.0,146.5,0.0,11.0,0.0,0.0,0.0,0.0,24.0,18.0,2.0,3.0,0.0,0.0,0.0,0.0,6.0,17.0
50%,1972.0,341.0,0.0,19.0,0.0,0.0,0.0,0.0,27.0,58.0,14.0,6.0,0.0,0.0,0.0,2.0,10.0,28.0
75%,1990.0,541.0,8.0,22.0,0.0,0.0,0.0,0.0,31.0,114.0,44.0,9.0,1.0,1.75,1.0,7.0,14.0,37.0
max,2022.0,856.0,454.0,30.0,7.0,4.0,3.0,15.0,54.0,214.0,765.0,17.0,15.0,21.0,12.0,62.0,22.0,63.0


In [27]:
positions = get_most_common_positions(conn)

In [28]:
positions['most_common_ending_position'] = positions['most_common_ending_position'].replace({'\\N': 'Did Not Finish'})

In [29]:
positions['most_common_starting_position'] = positions['most_common_starting_position'].replace(0, 'Unknown')

In [30]:
def categorize_pitstop_lap(lap):
    if lap <= 10:
        return 'Early Pitstop'
    elif lap <= 35:
        return 'Mid-race Pitstop'
    else:
        return 'Late Pitstop'

final_df['most_common_pitstop_lap'] = final_df['most_common_pitstop_lap'].apply(categorize_pitstop_lap)

positions['most_common_starting_position'] = pd.to_numeric(positions['most_common_starting_position'], errors='coerce')
positions['most_common_ending_position'] = pd.to_numeric(positions['most_common_ending_position'], errors='coerce')

def categorize_position(position):
    if position <= 5:
        return 'Top 5'
    elif position <= 10:
        return 'Top 10'
    else:
        return 'Beyond Top 10'

# Apply the function to create new columns
positions['most_common_starting_position'] = positions['most_common_starting_position'].apply(categorize_position)
positions['most_common_ending_position'] = positions['most_common_ending_position'].apply(categorize_position)




print(final_df)

print(positions)

      year  driverId date_of_birth driver_nationality  driver_points  \
0     1950       427    1917-10-30             French            0.0   
1     1950       498    1922-10-05          Argentine            0.0   
2     1950       501    1921-06-29           American            0.0   
3     1950       501    1921-06-29           American            0.0   
4     1950       509    1928-07-16           American            0.0   
...    ...       ...           ...                ...            ...   
3482  2022       849    1995-06-29           Canadian            2.0   
3483  2022       852    2000-05-11           Japanese           12.0   
3484  2022       854    1999-03-22             German           12.0   
3485  2022       855    1999-05-30            Chinese            6.0   
3486  2022       856    1995-02-06              Dutch            2.0   

      driver_end_position  first_place_count  second_place_count  \
0                      23                0.0                 0.0   

In [31]:
final_df = pd.merge(final_df, positions, on=['driverId', 'year'], how='left')

In [32]:
final_df.describe()

Unnamed: 0,year,driverId,driver_points,driver_end_position,first_place_count,second_place_count,third_place_count,top_5_count,age_at_first_race,constructorId,constructor_points,constructor_place,constructor_first_place_count,constructor_second_place_count,constructor_third_place_count,constructor_top_5_count,median_lap_position
count,3487.0,3487.0,3487.0,3487.0,3487.0,3487.0,3487.0,3487.0,3487.0,3487.0,2614.0,2614.0,2614.0,2614.0,2614.0,2614.0,661.0
mean,1975.854029,356.659019,14.380531,16.701463,0.148839,0.164611,0.1345,0.740465,28.17895,74.654718,44.259946,6.486611,1.367253,1.410099,1.049732,5.753634,10.01059
std,20.152251,242.422349,42.375905,7.325131,0.642212,0.558025,0.456332,1.85072,5.98445,62.43329,93.690194,3.786143,2.900535,3.067575,1.96053,10.113541,4.868097
min,1950.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,18.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,1958.0,146.5,0.0,11.0,0.0,0.0,0.0,0.0,24.0,18.0,2.0,3.0,0.0,0.0,0.0,0.0,6.0
50%,1972.0,341.0,0.0,19.0,0.0,0.0,0.0,0.0,27.0,58.0,14.0,6.0,0.0,0.0,0.0,2.0,10.0
75%,1990.0,541.0,8.0,22.0,0.0,0.0,0.0,0.0,31.0,114.0,44.0,9.0,1.0,1.75,1.0,7.0,14.0
max,2022.0,856.0,454.0,30.0,7.0,4.0,3.0,15.0,54.0,214.0,765.0,17.0,15.0,21.0,12.0,62.0,22.0


In [33]:
columns_to_fill_unknown = ['min_lap_time_diff', 'avg_lap_time_diff', 'max_lap_time_diff', 'min_pit_stop_time', 'avg_pit_time', 'max_pit_stop_time']

final_df[columns_to_fill_unknown] = final_df[columns_to_fill_unknown].astype(str)

final_df[columns_to_fill_unknown] = final_df[columns_to_fill_unknown].applymap(lambda x: 'Unknown' if str(x).lower() == 'nan' else x)

print(final_df)


      year  driverId date_of_birth driver_nationality  driver_points  \
0     1950       427    1917-10-30             French            0.0   
1     1950       498    1922-10-05          Argentine            0.0   
2     1950       501    1921-06-29           American            0.0   
3     1950       501    1921-06-29           American            0.0   
4     1950       509    1928-07-16           American            0.0   
...    ...       ...           ...                ...            ...   
3482  2022       849    1995-06-29           Canadian            2.0   
3483  2022       852    2000-05-11           Japanese           12.0   
3484  2022       854    1999-03-22             German           12.0   
3485  2022       855    1999-05-30            Chinese            6.0   
3486  2022       856    1995-02-06              Dutch            2.0   

      driver_end_position  first_place_count  second_place_count  \
0                      23                0.0                 0.0   

In [34]:
final_df.fillna('Unknown', inplace=True)

In [35]:
final_df.to_csv('final-with-pistop-miliseconds-with-unknown-discretized.csv', index=False)

In [36]:
final_df

Unnamed: 0,year,driverId,date_of_birth,driver_nationality,driver_points,driver_end_position,first_place_count,second_place_count,third_place_count,top_5_count,...,avg_lap_time_diff,max_lap_time_diff,median_lap_position,total_laps,min_pit_stop_time,avg_pit_time,max_pit_stop_time,most_common_pitstop_lap,most_common_starting_position,most_common_ending_position
0,1950,427,1917-10-30,French,0.0,23,0.0,0.0,0.0,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
1,1950,498,1922-10-05,Argentine,0.0,23,0.0,0.0,0.0,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Top 5,Beyond Top 10
2,1950,501,1921-06-29,American,0.0,23,0.0,0.0,0.0,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
3,1950,501,1921-06-29,American,0.0,23,0.0,0.0,0.0,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
4,1950,509,1928-07-16,American,0.0,23,0.0,0.0,0.0,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3482,2022,849,1995-06-29,Canadian,2.0,20,0.0,0.0,0.0,0.0,...,50% and more,50% and more,18.0,1001-3000 Laps,0-25%,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
3483,2022,852,2000-05-11,Japanese,12.0,16,0.0,0.0,0.0,0.0,...,50% and more,50% and more,13.0,1001-3000 Laps,50% and more,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Top 10
3484,2022,854,1999-03-22,German,12.0,16,0.0,0.0,0.0,0.0,...,50% and more,50% and more,14.0,1001-3000 Laps,0-25%,50% and more,50% and more,Mid-race Pitstop,Beyond Top 10,Beyond Top 10
3485,2022,855,1999-05-30,Chinese,6.0,18,0.0,0.0,0.0,0.0,...,50% and more,50% and more,14.0,1001-3000 Laps,0-25%,50% and more,50% and more,Early Pitstop,Beyond Top 10,Top 10


In [37]:
df = final_df.sort_values(by='year', ascending=False)
df = df.astype(str)

numeric_columns = ['year','driver_points', 'driver_end_position', 'constructor_points', 'constructor_place','median_lap_position', 'total_laps']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')

aggregation_functions = {
    'year': 'max',
    #'date_of_birth': 'first',
    'driver_nationality': 'first',
    'age_at_first_race': 'first',
    #'driver_end_position': lambda x: round(x.mean()) if x.notna().any() else 'Unknown',
    'first_place_count': 'first',
    'second_place_count': 'first',
    'third_place_count': 'first',
    'top_5_count': 'first',
    'constructorName': 'first',
    'constructor_nationality': 'first',
    #'constructor_place': lambda x: round(x.mean()) if x.notna().any() else 'Unknown',
    'constructor_first_place_count': 'first',
    'constructor_second_place_count': 'first',
    'constructor_third_place_count': 'first',
    'constructor_top_5_count': 'first',
    'best_performing_circuit_name': lambda x: x.mode().iloc[0], 
    'min_lap_time_diff': lambda x: x.mode().iloc[0],
    'avg_lap_time_diff': lambda x: x.mode().iloc[0],
    'max_lap_time_diff': lambda x: x.mode().iloc[0],
    'median_lap_position':lambda x: round(x.mean()) if x.notna().any() else 'Unknown',
    'total_laps': lambda x: round(x.sum()) if x.notna().any() else 'Unknown',
    'min_pit_stop_time': lambda x: x.mode().iloc[0],
    'avg_pit_time': lambda x: x.mode().iloc[0],
    'max_pit_stop_time': lambda x: x.mode().iloc[0],
    'most_common_pitstop_lap': lambda x: x.mode().iloc[0],
    'most_common_starting_position': lambda x: x.mode().iloc[0] if x.notna().any() else 'Unknown',
    'most_common_ending_position': lambda x: x.mode().iloc[0] 
}

aggregated_df = df.groupby('driverId').agg(aggregation_functions)

aggregated_df = aggregated_df.reset_index()

In [38]:
aggregated_df.fillna('Unknown', inplace=True)

In [39]:
aggregated_df = aggregated_df.drop('year', axis=1)

In [40]:
aggregated_df

Unnamed: 0,driverId,driver_nationality,age_at_first_race,first_place_count,second_place_count,third_place_count,top_5_count,constructorName,constructor_nationality,constructor_first_place_count,...,avg_lap_time_diff,max_lap_time_diff,median_lap_position,total_laps,min_pit_stop_time,avg_pit_time,max_pit_stop_time,most_common_pitstop_lap,most_common_starting_position,most_common_ending_position
0,1,British,22,7.0,3.0,0.0,15.0,Mercedes,German,8.0,...,50% and more,50% and more,3,Unknown,25-50%,50% and more,50% and more,Mid-race Pitstop,Top 5,Top 5
1,10,German,22,0.0,0.0,0.0,0.0,Marussia,Russian,0.0,...,50% and more,50% and more,15,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
2,100,French,28,0.0,0.0,0.0,0.0,Larrousse,French,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
3,101,Australian,25,0.0,0.0,0.0,0.0,Simtek,British,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
4,102,Brazilian,24,3.0,2.0,1.0,9.0,Williams,British,7.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Top 5,Beyond Top 10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
843,95,British,27,1.0,3.0,0.0,6.0,McLaren,British,7.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Top 5,Beyond Top 10
844,96,French,26,0.0,0.0,0.0,0.0,Sauber,Swiss,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Top 10
845,97,Italian,26,0.0,0.0,0.0,0.0,Footwork,British,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
846,98,Swiss,31,0.0,0.0,0.0,0.0,Pacific,British,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10


In [41]:
#aggregated_df = aggregated_df[aggregated_df.ne('Unknown').any(axis=1)]

In [42]:
#aggregated_df.replace('Unknown', pd.NA, inplace=True)
#aggregated_df.dropna(inplace=True)

In [43]:
aggregated_df

Unnamed: 0,driverId,driver_nationality,age_at_first_race,first_place_count,second_place_count,third_place_count,top_5_count,constructorName,constructor_nationality,constructor_first_place_count,...,avg_lap_time_diff,max_lap_time_diff,median_lap_position,total_laps,min_pit_stop_time,avg_pit_time,max_pit_stop_time,most_common_pitstop_lap,most_common_starting_position,most_common_ending_position
0,1,British,22,7.0,3.0,0.0,15.0,Mercedes,German,8.0,...,50% and more,50% and more,3,Unknown,25-50%,50% and more,50% and more,Mid-race Pitstop,Top 5,Top 5
1,10,German,22,0.0,0.0,0.0,0.0,Marussia,Russian,0.0,...,50% and more,50% and more,15,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
2,100,French,28,0.0,0.0,0.0,0.0,Larrousse,French,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
3,101,Australian,25,0.0,0.0,0.0,0.0,Simtek,British,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
4,102,Brazilian,24,3.0,2.0,1.0,9.0,Williams,British,7.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Top 5,Beyond Top 10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
843,95,British,27,1.0,3.0,0.0,6.0,McLaren,British,7.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Top 5,Beyond Top 10
844,96,French,26,0.0,0.0,0.0,0.0,Sauber,Swiss,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Top 10
845,97,Italian,26,0.0,0.0,0.0,0.0,Footwork,British,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10
846,98,Swiss,31,0.0,0.0,0.0,0.0,Pacific,British,0.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Late Pitstop,Beyond Top 10,Beyond Top 10


In [44]:
aggregated_df.to_csv("aggregated.csv", index=False, encoding='utf-8')