In [1]:
#!pip install nbimporter

In [2]:
import sqlite3
import pandas as pd
import nbimporter
from final_scores import calculate_final_driver_places 

conn = sqlite3.connect('formula1.db')

drivers_final_positions = calculate_final_driver_places(conn)

print(drivers_final_positions)

drivers_final_positions.to_csv("drivers_standing.csv", index=False)

      year  driverId      driver_name  points  place
73    1950       642      Nino Farina    30.0      1
66    1950       579      Juan Fangio    27.0      2
72    1950       786    Luigi Fagioli    24.0      3
70    1950       627     Louis Rosier    13.0      4
56    1950       647   Alberto Ascari    11.0      5
...    ...       ...              ...     ...    ...
3133  2022       855      Guanyu Zhou     6.0     18
3136  2022       848  Alexander Albon     4.0     19
3139  2022       849  Nicholas Latifi     2.0     20
3145  2022       856    Nyck de Vries     2.0     20
3140  2022       807  Nico Hülkenberg     0.0     22

[3146 rows x 5 columns]


In [3]:
#lap_times calculations

sql_query = """
SELECT
    lt.raceId,
    lt.driverId,
    lt.milliseconds AS lap_time,
    fl.fastest_lap,
    (lt.milliseconds - fl.fastest_lap) AS difference,
    ((CAST(lt.milliseconds AS FLOAT) - fl.fastest_lap) / fl.fastest_lap) * 100.0 AS percentage_difference
FROM
    lap_times lt
    JOIN (
        SELECT
            lt.raceId,
            MIN(lt.milliseconds) AS fastest_lap
        FROM
            lap_times lt
        GROUP BY
            lt.raceId
    ) fl ON lt.raceId = fl.raceId
JOIN races r ON lt.raceId = r.raceId
WHERE
    r.year != 2023;

"""
lap_times = pd.read_sql_query(sql_query, conn)
print(lap_times)


        raceId  driverId  lap_time  fastest_lap  difference  \
0            1         1    109088        87706       21382   
1            1         1     93740        87706        6034   
2            1         1     91600        87706        3894   
3            1         1     91067        87706        3361   
4            1         1     92129        87706        4423   
...        ...       ...       ...          ...         ...   
538116    1096       822     92998        88391        4607   
538117    1096       822     92995        88391        4604   
538118    1096       822     91236        88391        2845   
538119    1096       822     90566        88391        2175   
538120    1096       822     90743        88391        2352   

        percentage_difference  
0                   24.379176  
1                    6.879803  
2                    4.439833  
3                    3.832121  
4                    5.042985  
...                       ...  
538116             

In [4]:
#lap times
sql_query = """
SELECT
    d.driverId,
    r.year,
    MIN((CAST(lt.milliseconds AS FLOAT) - fl.fastest_lap) / fl.fastest_lap * 100.0) AS min_percentage_difference,
    AVG((CAST(lt.milliseconds AS FLOAT) - fl.fastest_lap) / fl.fastest_lap * 100.0) AS avg_percentage_difference,
    MAX((CAST(lt.milliseconds AS FLOAT) - fl.fastest_lap) / fl.fastest_lap * 100.0) AS max_percentage_difference
FROM
    lap_times lt
    JOIN drivers d ON lt.driverId = d.driverId
    JOIN races r ON lt.raceId = r.raceId
    JOIN (
        SELECT
            lt.raceId,
            MIN(lt.milliseconds) AS fastest_lap
        FROM
            lap_times lt
        GROUP BY
            lt.raceId
    ) fl ON lt.raceId = fl.raceId
WHERE
    r.year != 2023
GROUP BY
    d.driverId,
    r.year;
"""
lap_times_stats = pd.read_sql_query(sql_query, conn)
print(lap_times_stats)

print("lap_times_stats columns:", lap_times_stats.columns)

     driverId  year  min_percentage_difference  avg_percentage_difference  \
0           1  2007                   0.000000                   7.852700   
1           1  2008                   0.000000                   7.666610   
2           1  2009                   0.074346                   6.171578   
3           1  2010                   0.000000                  10.709896   
4           1  2011                   0.000000                   8.270089   
..        ...   ...                        ...                        ...   
642       853  2021                   2.817997                  21.635194   
643       854  2021                   2.234354                  18.256438   
644       854  2022                   0.704944                  14.242761   
645       855  2022                   0.000000                  12.770114   
646       856  2022                   3.086993                   8.555862   

     max_percentage_difference  
0                  1465.791089  
1        

In [5]:
total_laps_query = '''
WITH RankedLaps AS (
    SELECT
        lt.driverId,
        r.year,
        lt.position,
        ROW_NUMBER() OVER (PARTITION BY lt.driverId, r.year ORDER BY lt.position) AS row_num,
        COUNT(*) OVER (PARTITION BY lt.driverId, r.year) AS total_count
    FROM
        lap_times lt
    JOIN
        races r ON lt.raceId = r.raceId
    WHERE
        strftime('%Y', r.date) <> '2023'
),
MedianCandidates AS (
    SELECT
        driverId,
        year,
        AVG(position) AS median_candidate
    FROM
        RankedLaps
    WHERE
        row_num = (total_count + 1) / 2 OR row_num = (total_count + 2) / 2
    GROUP BY
        driverId, year
)
SELECT
    driverId,
    year,
    AVG(median_candidate) AS median_lap_position
FROM
    MedianCandidates
GROUP BY
    driverId, year;

'''

total_laps_df = pd.read_sql_query(total_laps_query, conn)

lap_times_stats = pd.merge(lap_times_stats, total_laps_df, on=['driverId', 'year'], how='left')

print(lap_times_stats)


     driverId  year  min_percentage_difference  avg_percentage_difference  \
0           1  2007                   0.000000                   7.852700   
1           1  2008                   0.000000                   7.666610   
2           1  2009                   0.074346                   6.171578   
3           1  2010                   0.000000                  10.709896   
4           1  2011                   0.000000                   8.270089   
..        ...   ...                        ...                        ...   
642       853  2021                   2.817997                  21.635194   
643       854  2021                   2.234354                  18.256438   
644       854  2022                   0.704944                  14.242761   
645       855  2022                   0.000000                  12.770114   
646       856  2022                   3.086993                   8.555862   

     max_percentage_difference  median_lap_position  
0                  14

In [6]:
laps_ridden_query = f'''
        SELECT
            lt.driverId,
            r.year,
            COUNT(*) AS laps_ridden
        FROM
            lap_times lt
        JOIN
            races r ON lt.raceId = r.raceId
        WHERE
            strftime('%Y', r.date) <> '2023'
        GROUP BY
            lt.driverId, r.year;
    '''
laps_ridden_df = pd.read_sql_query(laps_ridden_query, conn)

lap_times_stats = pd.merge(lap_times_stats, laps_ridden_df, on=['driverId', 'year'], how='left')

print(lap_times_stats)



     driverId  year  min_percentage_difference  avg_percentage_difference  \
0           1  2007                   0.000000                   7.852700   
1           1  2008                   0.000000                   7.666610   
2           1  2009                   0.074346                   6.171578   
3           1  2010                   0.000000                  10.709896   
4           1  2011                   0.000000                   8.270089   
..        ...   ...                        ...                        ...   
642       853  2021                   2.817997                  21.635194   
643       854  2021                   2.234354                  18.256438   
644       854  2022                   0.704944                  14.242761   
645       855  2022                   0.000000                  12.770114   
646       856  2022                   3.086993                   8.555862   

     max_percentage_difference  median_lap_position  laps_ridden  
0       

In [7]:
sql_query = """
WITH FastestPitStops AS (
    SELECT
        ps.raceId,
        MIN(ps.milliseconds) AS fastest_pit_stop
    FROM
        pit_stops ps
    GROUP BY
        ps.raceId
)

SELECT
    ps.raceId,
    ps.driverId,
    r.year,
    ps.milliseconds AS pit_stop_time,
    fps.fastest_pit_stop,
    (ps.milliseconds - fps.fastest_pit_stop) AS difference,
    (CAST(ps.milliseconds AS FLOAT) - fps.fastest_pit_stop) / fps.fastest_pit_stop * 100.0 AS percentage_difference
FROM
    pit_stops ps
    JOIN FastestPitStops fps ON ps.raceId = fps.raceId
    JOIN races r ON ps.raceId = r.raceId;

"""
pit_stops = pd.read_sql_query(sql_query, conn)
print(pit_stops)

       raceId  driverId  year  pit_stop_time  fastest_pit_stop  difference  \
0         841       153  2011          26898             16867       10031   
1         841        30  2011          25021             16867        8154   
2         841        17  2011          23426             16867        6559   
3         841         4  2011          23251             16867        6384   
4         841        13  2011          23842             16867        6975   
...       ...       ...   ...            ...               ...         ...   
10084    1110         4  2023          23798             22610        1188   
10085    1110       830  2023          23012             22610         402   
10086    1110       848  2023          23529             22610         919   
10087    1110       858  2023          23109             22610         499   
10088    1110         1  2023          23156             22610         546   

       percentage_difference  
0                  59.471157  
1

In [8]:
def calculate_pit_stop_stats(connection):
    # Query to calculate fastest pit stops and differences
    pit_stop_query = """
    WITH FastestPitStops AS (
        SELECT
            ps.raceId,
            MIN(ps.milliseconds) AS fastest_pit_stop
        FROM
            pit_stops ps
        GROUP BY
            ps.raceId
    )

    SELECT
        ps.raceId,
        ps.driverId,
        r.year,
        ps.milliseconds AS pit_stop_time,
        fps.fastest_pit_stop,
        (ps.milliseconds - fps.fastest_pit_stop) AS difference,
        (CAST(ps.milliseconds AS FLOAT) - fps.fastest_pit_stop) / fps.fastest_pit_stop * 100.0 AS percentage_difference
    FROM
        pit_stops ps
        JOIN FastestPitStops fps ON ps.raceId = fps.raceId
        JOIN races r ON ps.raceId = r.raceId
    """

    # Read the query results into a DataFrame
    pit_stop_df = pd.read_sql_query(pit_stop_query, connection)

    # Calculate min, avg, and max for each driver and year
    pit_stop_stats_df = pit_stop_df.groupby(['driverId', 'year']).agg(
        min_percentage_difference=pd.NamedAgg(column='percentage_difference', aggfunc='min'),
        avg_percentage_difference=pd.NamedAgg(column='percentage_difference', aggfunc='mean'),
        max_percentage_difference=pd.NamedAgg(column='percentage_difference', aggfunc='max')
    ).reset_index()

    return pit_stop_stats_df


pit_stop_stats = calculate_pit_stop_stats(conn)
print(pit_stop_stats)

     driverId  year  min_percentage_difference  avg_percentage_difference  \
0           1  2011                   0.000000                  30.862876   
1           1  2012                   0.000000                  39.127907   
2           1  2013                   0.265298                  20.120236   
3           1  2014                   0.987032                 131.604073   
4           1  2015                   0.000000                  13.673431   
..        ...   ...                        ...                        ...   
291       855  2023                   2.312901                 738.323357   
292       856  2022                   5.704107                   5.704107   
293       856  2023                   0.648610                 536.227602   
294       857  2023                   0.938927                1091.200926   
295       858  2023                   1.983098                 381.043815   

     max_percentage_difference  
0                   156.287253  
1        

In [9]:
query = '''
    WITH MostCommonPitStopLap AS (
        SELECT
            drivers.driverId,
            races.year,
            pit_stops.lap AS most_common_pitstop_lap,
            ROW_NUMBER() OVER (PARTITION BY drivers.driverId, races.year ORDER BY COUNT(pit_stops.lap) DESC) AS lap_rank
        FROM
            drivers
        JOIN
            pit_stops ON drivers.driverId = pit_stops.driverId
        JOIN
            races ON pit_stops.raceId = races.raceId
        WHERE
            races.year != '2023'
        GROUP BY
            drivers.driverId, races.year, pit_stops.lap
    )
    SELECT
        driverId,
        year,
        most_common_pitstop_lap
    FROM
        MostCommonPitStopLap
    WHERE
        lap_rank = 1;
'''

df = pd.read_sql_query(query, conn)

print(df)

# Specify suffixes to distinguish the columns
pit_stop_stats = pd.merge(pit_stop_stats, df, on=['driverId', 'year'], how='left')


print(pit_stop_stats)


     driverId  year  most_common_pitstop_lap
0           1  2011                       16
1           1  2012                       36
2           1  2013                       31
3           1  2014                       26
4           1  2015                       13
..        ...   ...                      ...
270       853  2021                       31
271       854  2021                       34
272       854  2022                       14
273       855  2022                        9
274       856  2022                       19

[275 rows x 3 columns]
     driverId  year  min_percentage_difference  avg_percentage_difference  \
0           1  2011                   0.000000                  30.862876   
1           1  2012                   0.000000                  39.127907   
2           1  2013                   0.265298                  20.120236   
3           1  2014                   0.987032                 131.604073   
4           1  2015                   0.000000     

In [10]:
from final_scores import calculate_final_constructor_places 
constructors_final_positions = calculate_final_constructor_places(conn)

In [11]:
print(constructors_final_positions)

     year  constructorId constructor_name  points  place
0    2022              9         Red Bull   759.0      1
1    2022              6          Ferrari   554.0      2
2    2022            131         Mercedes   515.0      3
3    2022            214   Alpine F1 Team   173.0      4
4    2022              1          McLaren   159.0      5
..    ...            ...              ...     ...    ...
895  1958            105         Maserati     6.0      5
896  1958             32       Team Lotus     3.0      6
897  1958            125        Connaught     0.0      7
898  1958            127             OSCA     0.0      7
899  1958             95          Porsche     0.0      7

[900 rows x 5 columns]


In [17]:
constructor_points_df = calculate_final_constructor_places(conn)

constructor_points_df.sort_values(by=['year', 'points'], ascending=[True, False], inplace=True)

constructor_points_df['times_first'] = constructor_points_df.groupby('constructorId').cumcount() + 1

print(constructor_points_df)

pit_stop_stats = pd.merge(pit_stop_stats, constructor_points_df[['year', 'constructorId', 'place', 'times_first']], on=['year', 'constructorId'], how='left')

pit_stop_stats['place'].fillna(0, inplace=True)
pit_stop_stats['times_first'].fillna(0, inplace=True)

pit_stop_stats['times_first'] = pit_stop_stats['times_first'].astype(int)

print(df)


     year  constructorId constructor_name  points  place  times_first
891  1958            118          Vanwall    48.0      1            1
892  1958              6          Ferrari    40.0      2            1
893  1958             87           Cooper    31.0      3            1
894  1958             66              BRM    18.0      4            1
895  1958            105         Maserati     6.0      5            1
..    ...            ...              ...     ...    ...          ...
5    2022             51       Alfa Romeo    55.0      6           13
6    2022            117     Aston Martin    55.0      6            4
7    2022            210     Haas F1 Team    37.0      8            7
8    2022            213       AlphaTauri    35.0      9            3
9    2022              3         Williams     8.0     10           47

[900 rows x 6 columns]


KeyError: 'constructorId'

In [14]:
merged_df = pd.merge(lap_times_stats, pit_stop_stats, on=['driverId', 'year'], how='outer', suffixes=('_lap', '_pit'))

merged_df = merged_df.fillna('unknown')

print(merged_df)

     driverId  year min_percentage_difference_lap  \
0           1  2007                           0.0   
1           1  2008                           0.0   
2           1  2009                      0.074346   
3           1  2010                           0.0   
4           1  2011                           0.0   
..        ...   ...                           ...   
663       852  2023                       unknown   
664       855  2023                       unknown   
665       856  2023                       unknown   
666       857  2023                       unknown   
667       858  2023                       unknown   

    avg_percentage_difference_lap max_percentage_difference_lap  \
0                          7.8527                   1465.791089   
1                         7.66661                    101.287948   
2                        6.171578                     79.374498   
3                       10.709896                   2567.450593   
4                        8.2