In [1]:
import sqlite3
import pandas as pd

In [2]:
def get_total_laps_count(conn: sqlite3.Connection) -> pd.DataFrame:
    """
    Get the total number of laps completed by each driver in each year.

    Every row of ``lap_times`` counts as one lap. Rows are joined to
    ``races`` to obtain the year, and laps from races whose year is 2023
    are left out.

    :param conn: The database connection; it is handed straight to
        ``pd.read_sql_query``.
    :return: DataFrame with the columns year, driverId and total_laps
        (in that order), one row per driver and year. The query has no
        ORDER BY, so the row order is whatever the database returns.
    """
    # The year is compared against an integer literal (not the string
    # '2023') so the filter matches numeric year values directly and agrees
    # with the other lap-time queries in this file.
    total_laps_query = '''
    SELECT
        r.year,
        lt.driverId,
        COUNT(*) AS total_laps
    FROM
        lap_times lt
    JOIN
        races r ON lt.raceId = r.raceId
    WHERE
        r.year != 2023
    GROUP BY
        lt.driverId, r.year;
    '''

    total_laps_df = pd.read_sql_query(total_laps_query, conn)

    return total_laps_df

In [3]:
def get_lap_times_stats(conn: sqlite3.Connection) -> pd.DataFrame:
    """
    Retrieve lap time statistics for each driver and year.

    Each lap is expressed as a percentage above the fastest lap of the same
    race (the minimum ``milliseconds`` recorded for that ``raceId`` across
    all drivers). Those percentages are then reduced to their minimum,
    average and maximum per driver and year. Laps from races whose year is
    2023 are excluded, and because ``drivers`` is inner-joined, laps whose
    driverId has no row in ``drivers`` are dropped as well.

    :param conn: The database connection; it is handed straight to
        ``pd.read_sql_query``.

    :return: A DataFrame with the columns year, driverId, min_lap_time_diff,
        avg_lap_time_diff and max_lap_time_diff (in that order), one row per
        driver and year.
    """
    # milliseconds is cast to FLOAT before dividing so the percentage is
    # computed with floating-point rather than integer arithmetic.
    lap_times_query = """
        SELECT
            r.year,
            d.driverId,
            MIN((CAST(lt.milliseconds AS FLOAT) - fl.fastest_lap) / fl.fastest_lap * 100.0) AS min_lap_time_diff,
            AVG((CAST(lt.milliseconds AS FLOAT) - fl.fastest_lap) / fl.fastest_lap * 100.0) AS avg_lap_time_diff,
            MAX((CAST(lt.milliseconds AS FLOAT) - fl.fastest_lap) / fl.fastest_lap * 100.0) AS max_lap_time_diff
        FROM
            lap_times lt
            JOIN drivers d ON lt.driverId = d.driverId
            JOIN races r ON lt.raceId = r.raceId
            JOIN (
                SELECT
                    lt.raceId,
                    MIN(lt.milliseconds) AS fastest_lap
                FROM
                    lap_times lt
                GROUP BY
                    lt.raceId
            ) fl ON lt.raceId = fl.raceId
        WHERE
            r.year != 2023
        GROUP BY
            d.driverId,
            r.year;
    """
    lap_times_stats = pd.read_sql_query(lap_times_query, conn)

    return lap_times_stats

In [4]:
def get_median_lap_position(conn: sqlite3.Connection) -> pd.DataFrame:
    """
    Calculate the median lap position for each driver and year.

    The positions recorded in ``lap_times`` are ranked per driver and year.
    The middle row (odd count) or the two middle rows (even count) are then
    averaged. The result is cast to INTEGER, so a median that falls halfway
    between two positions is truncated to the lower one. Laps from races
    whose year is 2023 are excluded.

    :param conn: The database connection; it is handed straight to
        ``pd.read_sql_query``.
    :return: A DataFrame with the columns driverId, year and
        median_lap_position (in that order), one row per driver and year.
    """
    # The exclusion filter must test r.year: a race's date column never
    # equals the bare string '2023', so filtering on it excluded nothing.
    #
    # Middle-row selection relies on integer division: for an odd count n,
    # both (n + 1) / 2 and (n + 2) / 2 give the single middle row; for an
    # even count they give the two rows either side of the midpoint.
    median_lap_position_query = '''
        WITH RankedLaps AS (
            SELECT
                r.year,
                lt.driverId,
                lt.position,
                ROW_NUMBER() OVER (PARTITION BY lt.driverId, r.year ORDER BY lt.position) AS row_num,
                COUNT(*) OVER (PARTITION BY lt.driverId, r.year) AS total_count
            FROM
                lap_times lt
            JOIN
                races r ON lt.raceId = r.raceId
            WHERE
                r.year != 2023
        ),
        MedianCandidates AS (
            SELECT
                driverId,
                year,
                AVG(position) AS median_candidate
            FROM
                RankedLaps
            WHERE
                row_num = (total_count + 1) / 2 OR row_num = (total_count + 2) / 2
            GROUP BY
                driverId, year
        )
        SELECT
            driverId,
            year,
            CAST(AVG(median_candidate) AS INTEGER) AS median_lap_position
        FROM
            MedianCandidates
        GROUP BY
            driverId, year;
    '''
    median_lap_position_df = pd.read_sql_query(median_lap_position_query, conn)

    return median_lap_position_df

In [5]:
def categorize_total_laps(laps):
    """
    Map a lap count to the label of the bucket it falls into.

    Buckets are checked from the smallest upper bound to the largest, and
    each upper bound is inclusive. A value that is not ``<=`` any bound
    gets the open-ended top label.

    :param laps: The lap count to classify.
    :return: The bucket label as a string.
    """
    # (inclusive upper bound, label), in ascending order of the bound.
    bucket_table = (
        (1000, '0-1000 Laps'),
        (3000, '1001-3000 Laps'),
        (6000, '3001-6000 Laps'),
        (9000, '6001-9000 Laps'),
        (12000, '9001-12000 Laps'),
    )
    for upper_bound, label in bucket_table:
        if laps <= upper_bound:
            return label
    return 'More than 12000 Laps'