In [1]:
import json
import pandas as pd
import sqlite3

In [2]:
def get_avg_pit_stops(conn, existing_df):
    """
    Add each driver's average number of pit stops per race, per season.

    Pit stops are first counted per driver and race, and those counts are
    then averaged over the races of each season. Only races for which the
    driver has at least one ``pit_stops`` row contribute to the average.
    The 2023 season is excluded.

    :param conn: Database connection accepted by ``pd.read_sql_query``.
    :param existing_df: DataFrame with a ``driverId`` column to extend.
    :return: ``existing_df`` left-merged with an ``avg_num_of_pit_stops``
        column holding a JSON object string ``{year: average}`` per driver.
        Drivers without pit stop data get a missing value.
    """
    # AVG(pit_stops.stop) would average the ordinal stop numbers (1, 2, 3...)
    # rather than the number of stops, so count the stops per race first.
    query = '''
    SELECT
        per_race.driverId,
        per_race.year,
        AVG(per_race.num_stops) AS avg_pit_stops
    FROM (
        SELECT
            drivers.driverId AS driverId,
            races.year AS year,
            pit_stops.raceId AS raceId,
            COUNT(*) AS num_stops
        FROM
            drivers
        JOIN
            pit_stops ON drivers.driverId = pit_stops.driverId
        JOIN
            races ON pit_stops.raceId = races.raceId
        WHERE
            races.year != 2023
        GROUP BY
            drivers.driverId, races.year, pit_stops.raceId
    ) AS per_race
    GROUP BY
        per_race.driverId, per_race.year
    ORDER BY
        per_race.driverId, per_race.year;
    '''

    df = pd.read_sql_query(query, conn)

    # Collect {year: value} per driver from explicitly named columns. This
    # does not depend on whether groupby.apply passes the grouping column to
    # the callback, and it also works when the query returns no rows.
    per_driver = {}
    for driver_id, year, avg_pit_stops in zip(
        df['driverId'].tolist(),
        df['year'].tolist(),
        df['avg_pit_stops'].tolist(),
    ):
        per_driver.setdefault(driver_id, {})[year] = avg_pit_stops

    avg_pit_stops_json_df = pd.DataFrame({
        'driverId': list(per_driver),
        'avg_num_of_pit_stops': [
            json.dumps(by_year) for by_year in per_driver.values()
        ],
    })

    return pd.merge(existing_df, avg_pit_stops_json_df, on='driverId', how='left')


In [3]:
def get_avg_pit_time(conn, existing_df):
    """
    Add each driver's average pit stop time per season as a JSON column.

    Averages ``pit_stops.milliseconds`` per driver and year, excluding the
    2023 season.

    :param conn: Database connection accepted by ``pd.read_sql_query``.
    :param existing_df: DataFrame with a ``driverId`` column to extend.
    :return: ``existing_df`` left-merged with an ``avg_pit_stop_time`` column
        holding a JSON object string ``{year: average}`` per driver. Drivers
        without pit stop data get a missing value.
    """
    query = '''
    SELECT
        drivers.driverId,
        races.year,
        AVG(pit_stops.milliseconds) AS avg_pit_time
    FROM
        drivers
    JOIN
        pit_stops ON drivers.driverId = pit_stops.driverId
    JOIN
        races ON pit_stops.raceId = races.raceId
    WHERE
        races.year != 2023
    GROUP BY
        drivers.driverId, races.year
    ORDER BY
        drivers.driverId, races.year;
    '''

    df = pd.read_sql_query(query, conn)

    # Collect {year: value} per driver from explicitly named columns. This
    # does not depend on whether groupby.apply passes the grouping column to
    # the callback, and it also works when the query returns no rows.
    per_driver = {}
    for driver_id, year, avg_pit_time in zip(
        df['driverId'].tolist(),
        df['year'].tolist(),
        df['avg_pit_time'].tolist(),
    ):
        per_driver.setdefault(driver_id, {})[year] = avg_pit_time

    avg_pit_time_json_df = pd.DataFrame({
        'driverId': list(per_driver),
        'avg_pit_stop_time': [
            json.dumps(by_year) for by_year in per_driver.values()
        ],
    })

    return pd.merge(existing_df, avg_pit_time_json_df, on='driverId', how='left')


In [4]:
def get_min_pit_stop_time(conn, existing_df):
    """
    Add each driver's fastest pit stop time per season as a JSON column.

    Takes the minimum of ``pit_stops.milliseconds`` per driver and year,
    excluding the 2023 season.

    :param conn: Database connection accepted by ``pd.read_sql_query``.
    :param existing_df: DataFrame with a ``driverId`` column to extend.
    :return: ``existing_df`` left-merged with a ``min_pit_stop_time`` column
        holding a JSON object string ``{year: minimum}`` per driver. Drivers
        without pit stop data get a missing value.
    """
    query = '''
    SELECT
        drivers.driverId,
        races.year,
        MIN(pit_stops.milliseconds) AS min_pit_stop_time
    FROM
        drivers
    JOIN
        pit_stops ON drivers.driverId = pit_stops.driverId
    JOIN
        races ON pit_stops.raceId = races.raceId
    WHERE
        races.year != 2023
    GROUP BY
        drivers.driverId, races.year
    ORDER BY
        drivers.driverId, races.year;
    '''

    df = pd.read_sql_query(query, conn)

    # Collect {year: value} per driver from explicitly named columns. This
    # does not depend on whether groupby.apply passes the grouping column to
    # the callback, and it also works when the query returns no rows.
    per_driver = {}
    for driver_id, year, min_pit_stop_time in zip(
        df['driverId'].tolist(),
        df['year'].tolist(),
        df['min_pit_stop_time'].tolist(),
    ):
        per_driver.setdefault(driver_id, {})[year] = min_pit_stop_time

    min_pit_stop_time_json_df = pd.DataFrame({
        'driverId': list(per_driver),
        'min_pit_stop_time': [
            json.dumps(by_year) for by_year in per_driver.values()
        ],
    })

    return pd.merge(existing_df, min_pit_stop_time_json_df, on='driverId', how='left')


In [5]:
def get_max_pit_stop_time(conn, existing_df):
    """
    Add each driver's slowest pit stop time per season as a JSON column.

    Takes the maximum of ``pit_stops.milliseconds`` per driver and year,
    excluding the 2023 season.

    :param conn: Database connection accepted by ``pd.read_sql_query``.
    :param existing_df: DataFrame with a ``driverId`` column to extend.
    :return: ``existing_df`` left-merged with a ``max_pit_stop_time`` column
        holding a JSON object string ``{year: maximum}`` per driver. Drivers
        without pit stop data get a missing value.
    """
    query = '''
    SELECT
        drivers.driverId,
        races.year,
        MAX(pit_stops.milliseconds) AS max_pit_stop_time
    FROM
        drivers
    JOIN
        pit_stops ON drivers.driverId = pit_stops.driverId
    JOIN
        races ON pit_stops.raceId = races.raceId
    WHERE
        races.year != 2023
    GROUP BY
        drivers.driverId, races.year
    ORDER BY
        drivers.driverId, races.year;
    '''

    df = pd.read_sql_query(query, conn)

    # Collect {year: value} per driver from explicitly named columns. This
    # does not depend on whether groupby.apply passes the grouping column to
    # the callback, and it also works when the query returns no rows.
    per_driver = {}
    for driver_id, year, max_pit_stop_time in zip(
        df['driverId'].tolist(),
        df['year'].tolist(),
        df['max_pit_stop_time'].tolist(),
    ):
        per_driver.setdefault(driver_id, {})[year] = max_pit_stop_time

    max_pit_stop_time_json_df = pd.DataFrame({
        'driverId': list(per_driver),
        'max_pit_stop_time': [
            json.dumps(by_year) for by_year in per_driver.values()
        ],
    })

    return pd.merge(existing_df, max_pit_stop_time_json_df, on='driverId', how='left')

In [6]:
def get_pit_stop_stats(conn: sqlite3.Connection) -> pd.DataFrame:
    """
    Calculate pit stop statistics for each driver and year.

    Every pit stop is compared with the fastest pit stop of the same race
    (across all drivers). The resulting gaps, in the units of
    ``pit_stops.milliseconds``, are aggregated per driver and season.
    The 2023 season is excluded.

    :param conn: The database connection.
    :return: DataFrame containing driverId, year, min_pit_stop_time_diff,
        avg_pit_stop_time_diff, and max_pit_stop_time_diff columns.
    """
    # Only the columns needed for the aggregation are selected; the gap to
    # the race's fastest stop is computed in SQL.
    pit_stop_query = """
    WITH FastestPitStops AS (
        SELECT
            ps.raceId,
            MIN(ps.milliseconds) AS fastest_pit_stop
        FROM
            pit_stops ps
        GROUP BY
            ps.raceId
    )

    SELECT
        ps.driverId,
        r.year,
        (ps.milliseconds - fps.fastest_pit_stop) AS difference
    FROM
        pit_stops ps
        JOIN FastestPitStops fps ON ps.raceId = fps.raceId
        JOIN races r ON ps.raceId = r.raceId
    WHERE
        r.year != 2023
    """

    pit_stop_df = pd.read_sql_query(pit_stop_query, conn)

    grouped = pit_stop_df.groupby(['driverId', 'year'])
    pit_stop_stats_df = grouped.agg(
        min_pit_stop_time_diff=('difference', 'min'),
        avg_pit_stop_time_diff=('difference', 'mean'),
        max_pit_stop_time_diff=('difference', 'max'),
    ).reset_index()

    return pit_stop_stats_df


In [7]:
def get_most_common_pit_stop_lap(conn):
    """
    Find the lap on which each driver pitted most often in each season.

    Pit stops are counted per driver, year and lap; the lap with the highest
    count is kept. When several laps share the highest count, the lowest lap
    number is chosen so the result is deterministic. The 2023 season is
    excluded.

    :param conn: Database connection accepted by ``pd.read_sql_query``.
    :return: DataFrame containing driverId, year, and most_common_pitstop_lap
        columns, ordered by driverId and year.
    """
    query = '''
    WITH MostCommonPitStopLap AS (
        SELECT
            races.year,
            drivers.driverId,
            pit_stops.lap AS most_common_pitstop_lap,
            ROW_NUMBER() OVER (
                PARTITION BY drivers.driverId, races.year
                ORDER BY COUNT(pit_stops.lap) DESC, pit_stops.lap ASC
            ) AS lap_rank
        FROM
            drivers
        JOIN
            pit_stops ON drivers.driverId = pit_stops.driverId
        JOIN
            races ON pit_stops.raceId = races.raceId
        WHERE
            races.year != 2023
        GROUP BY
            drivers.driverId, races.year, pit_stops.lap
    )
    SELECT
        driverId,
        year,
        most_common_pitstop_lap
    FROM
        MostCommonPitStopLap
    WHERE
        lap_rank = 1
    ORDER BY
        driverId, year;
    '''

    return pd.read_sql_query(query, conn)


In [None]:
def get_pit_stop_stats_in_miliseconds(conn):
    """
    Retrieve pit stop statistics (minimum, average, and maximum pit stop times) for each driver and year.

    The values are aggregated from ``pit_stops.milliseconds``; the 2023
    season is excluded.

    :param conn: The database connection.
    :return: DataFrame containing driverId, year, min_pit_stop_time, avg_pit_time, and max_pit_stop_time,
        ordered by driverId and year.
    """
    # The year is compared with an integer literal: against a column without
    # numeric affinity, the string '2023' never equals a stored integer, so
    # the filter would silently exclude nothing.
    query = '''
    SELECT
        drivers.driverId,
        races.year,
        MIN(pit_stops.milliseconds) AS min_pit_stop_time,
        AVG(pit_stops.milliseconds) AS avg_pit_time,
        MAX(pit_stops.milliseconds) AS max_pit_stop_time
    FROM
        drivers
    JOIN
        pit_stops ON drivers.driverId = pit_stops.driverId
    JOIN
        races ON pit_stops.raceId = races.raceId
    WHERE
        races.year != 2023
    GROUP BY
        drivers.driverId, races.year
    ORDER BY
        drivers.driverId, races.year;
    '''

    return pd.read_sql_query(query, conn)
