In [3]:
import json
import sqlite3
import pandas as pd

In [4]:
conn = sqlite3.connect('formula1-originalDB.db')

In [5]:
query = '''
    SELECT
        driverId,
        forename,
        surname,
        dob,
        nationality
    FROM
        drivers
'''

drivers = pd.read_sql_query(query, conn)

print(drivers)

     driverId  forename     surname         dob nationality
0           1     Lewis    Hamilton  1985-01-07     British
1           2      Nick    Heidfeld  1977-05-10      German
2           3      Nico     Rosberg  1985-06-27      German
3           4  Fernando      Alonso  1981-07-29     Spanish
4           5    Heikki  Kovalainen  1981-10-19     Finnish
..        ...       ...         ...         ...         ...
852       854      Mick  Schumacher  1999-03-22      German
853       855    Guanyu        Zhou  1999-05-30     Chinese
854       856      Nyck    de Vries  1995-02-06       Dutch
855       857     Oscar     Piastri  2001-04-06  Australian
856       858     Logan    Sargeant  2000-12-31    American

[857 rows x 5 columns]


In [8]:
max_query = '''
    SELECT
        drivers.driverId,
        strftime('%Y', races.date) AS season,
        MAX(lap_times.milliseconds) AS max_lap_time
    FROM
        lap_times
    JOIN
        races ON lap_times.raceId = races.raceId
    JOIN
        drivers ON lap_times.driverId = drivers.driverId
    GROUP BY
        drivers.driverId, season
'''

avg_query = '''
    SELECT
        drivers.driverId,
        strftime('%Y', races.date) AS season,
        AVG(lap_times.milliseconds) AS avg_lap_time
    FROM
        lap_times
    JOIN
        races ON lap_times.raceId = races.raceId
    JOIN
        drivers ON lap_times.driverId = drivers.driverId
    GROUP BY
        drivers.driverId, season
'''

min_query = '''
    SELECT
        drivers.driverId,
        strftime('%Y', races.date) AS season,
        MIN(lap_times.milliseconds) AS min_lap_time
    FROM
        lap_times
    JOIN
        races ON lap_times.raceId = races.raceId
    JOIN
        drivers ON lap_times.driverId = drivers.driverId
    GROUP BY
        drivers.driverId, season
'''

max_lap_times_df = pd.read_sql_query(max_query, conn)
avg_lap_times_df = pd.read_sql_query(avg_query, conn)
min_lap_times_df = pd.read_sql_query(min_query, conn)

max_lap_times_df['max_lap_times_json'] = max_lap_times_df.apply(
    lambda row: json.dumps({row['season']: row['max_lap_time']}), axis=1
)

avg_lap_times_df['avg_lap_times_json'] = avg_lap_times_df.apply(
    lambda row: json.dumps({row['season']: row['avg_lap_time']}), axis=1
)

min_lap_times_df['min_lap_times_json'] = min_lap_times_df.apply(
    lambda row: json.dumps({row['season']: row['min_lap_time']}), axis=1
)

max_lap_times_json_df = max_lap_times_df.groupby('driverId')['max_lap_times_json'].apply(list).reset_index()
avg_lap_times_json_df = avg_lap_times_df.groupby('driverId')['avg_lap_times_json'].apply(list).reset_index()
min_lap_times_json_df = min_lap_times_df.groupby('driverId')['min_lap_times_json'].apply(list).reset_index()

drivers = pd.merge(drivers, max_lap_times_json_df, on='driverId', how='left')
drivers = pd.merge(drivers, avg_lap_times_json_df, on='driverId', how='left')
drivers = pd.merge(drivers, min_lap_times_json_df, on='driverId', how='left')

drivers.rename(columns={'max_lap_times_json': 'max_lap_times', 'avg_lap_times_json': 'avg_lap_times', 'min_lap_times_json': 'min_lap_times'}, inplace=True)
drivers['max_lap_times'].fillna(value='[]', inplace=True)
drivers['avg_lap_times'].fillna(value='[]', inplace=True)
drivers['min_lap_times'].fillna(value='[]', inplace=True)

del min_lap_times_json_df, max_lap_times_json_df, avg_lap_times_json_df

print(drivers)


     driverId  forename     surname         dob nationality  \
0           1     Lewis    Hamilton  1985-01-07     British   
1           2      Nick    Heidfeld  1977-05-10      German   
2           3      Nico     Rosberg  1985-06-27      German   
3           4  Fernando      Alonso  1981-07-29     Spanish   
4           5    Heikki  Kovalainen  1981-10-19     Finnish   
..        ...       ...         ...         ...         ...   
852       854      Mick  Schumacher  1999-03-22      German   
853       855    Guanyu        Zhou  1999-05-30     Chinese   
854       856      Nyck    de Vries  1995-02-06       Dutch   
855       857     Oscar     Piastri  2001-04-06  Australian   
856       858     Logan    Sargeant  2000-12-31    American   

                                         max_lap_times  \
0    [{"2007": 1453884}, {"2008": 195953}, {"2009":...   
1    [{"2000": 218816}, {"2001": 232209}, {"2002": ...   
2    [{"2006": 157809}, {"2007": 191891}, {"2008": ...   
3    [{"200

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drivers['max_lap_times'].fillna(value='[]', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drivers['avg_lap_times'].fillna(value='[]', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  drivers['min_lap_times'].fillna(value='[]', inplace=True)


In [5]:
import json
import sqlite3
import pandas as pd

conn = sqlite3.connect('formula1-originalDB.db')

query = '''
WITH RankedPositions AS (
    SELECT
        d.driverId,
        lt.position AS position,
        COUNT(lt.position) AS position_count,
        ROW_NUMBER() OVER (PARTITION BY d.driverId ORDER BY COUNT(lt.position) DESC) AS position_rank
    FROM
        drivers d
    JOIN
        lap_times lt ON d.driverId = lt.driverId
    GROUP BY
        d.driverId, lt.position
)
SELECT
    driverId,
    position,
    position_count
FROM
    RankedPositions
WHERE
    position_rank <= 3;
'''

df = pd.read_sql_query(query, conn)

print(df)

     driverId  position  position_count
0           1         1            5448
1           1         2            4079
2           1         3            2199
3           2         9             882
4           2         7             842
..        ...       ...             ...
410       857        13              81
411       857        19              74
412       858        13             115
413       858        20             104
414       858        15              90

[415 rows x 3 columns]


In [6]:
import json
import sqlite3
import pandas as pd

conn = sqlite3.connect('formula1-originalDB.db')

# Your SQL query
query = '''
WITH RankedPositions AS (
    SELECT
        d.driverId,
        r.year,
        lt.position AS position,
        COUNT(lt.position) AS position_count,
        ROW_NUMBER() OVER (PARTITION BY d.driverId, r.year ORDER BY COUNT(lt.position) DESC) AS position_rank,
        COUNT(lt.position) OVER (PARTITION BY d.driverId, r.year) AS total_races
    FROM
        drivers d
    JOIN
        lap_times lt ON d.driverId = lt.driverId
    JOIN
        races r ON lt.raceId = r.raceId
    GROUP BY
        d.driverId, r.year, lt.position
)
SELECT
    driverId,
    year,
    position,
    position_count
FROM
    RankedPositions
WHERE
    position_rank <= 3
'''

# Execute the SQL query and read the result into a DataFrame
df = pd.read_sql_query(query, conn)

# Group by driverId and aggregate the lap positions into a list for each driver
lap_positions_json_df = df.groupby('driverId').apply(
    lambda group: json.dumps({year: group[['position', 'position_count']].to_dict(orient='records') for year, group in group.groupby('year')})
).reset_index(name='lap_positions')

# Merge the lap positions DataFrame with the 'drivers' DataFrame
drivers = pd.merge(drivers, lap_positions_json_df, on='driverId', how='left')

# Fill NaN values with an empty list
drivers['lap_positions'].fillna(value='[]', inplace=True)

# Drop the lap_positions_json_df DataFrame to free up memory
del lap_positions_json_df

print(drivers)


     driverId  forename     surname         dob nationality  \
0           1     Lewis    Hamilton  1985-01-07     British   
1           2      Nick    Heidfeld  1977-05-10      German   
2           3      Nico     Rosberg  1985-06-27      German   
3           4  Fernando      Alonso  1981-07-29     Spanish   
4           5    Heikki  Kovalainen  1981-10-19     Finnish   
..        ...       ...         ...         ...         ...   
852       854      Mick  Schumacher  1999-03-22      German   
853       855    Guanyu        Zhou  1999-05-30     Chinese   
854       856      Nyck    de Vries  1995-02-06       Dutch   
855       857     Oscar     Piastri  2001-04-06  Australian   
856       858     Logan    Sargeant  2000-12-31    American   

                                         max_lap_times  \
0    [{"2007": 1453884}, {"2008": 195953}, {"2009":...   
1    [{"2000": 218816}, {"2001": 232209}, {"2002": ...   
2    [{"2006": 157809}, {"2007": 191891}, {"2008": ...   
3    [{"200

In [7]:
import json
import sqlite3
import pandas as pd

conn = sqlite3.connect('formula1-originalDB.db')

query = '''
WITH RankedPositions AS (
    SELECT
        d.driverId,
        r.year,
        lt.position AS position,
        COUNT(lt.position) AS position_count,
        ROW_NUMBER() OVER (PARTITION BY d.driverId, r.year ORDER BY COUNT(lt.position) DESC) AS position_rank,
        COUNT(lt.position) OVER (PARTITION BY d.driverId, r.year) AS total_races
    FROM
        drivers d
    JOIN
        lap_times lt ON d.driverId = lt.driverId
    JOIN
        races r ON lt.raceId = r.raceId
    GROUP BY
        d.driverId, r.year, lt.position
)
SELECT
    driverId,
    year,
    position,
    position_count
FROM
    RankedPositions
WHERE
    position_rank <= 3
'''

df = pd.read_sql_query(query, conn)

print(df)

      driverId  year  position  position_count
0            1  2007         1             321
1            1  2007         2             311
2            1  2007         3             184
3            1  2008         1             293
4            1  2008         2             210
...        ...   ...       ...             ...
1994       857  2023        13              81
1995       857  2023        19              74
1996       858  2023        13             115
1997       858  2023        20             104
1998       858  2023        15              90

[1999 rows x 4 columns]


In [15]:
total_laps_query = '''
SELECT
    driverId,
    COUNT(*) AS total_laps
FROM
    lap_times 
GROUP BY
    driverId;
'''


total_laps_df = pd.read_sql_query(total_laps_query, conn)

print(total_laps_d

      # Merge the total laps information into the 'drivers' DataFrame
drivers = pd.merge(drivers, total_laps_df, on='driverId', how='left')

# Fill NaN values with 0 if necessary
drivers['total_laps'].fillna(0, inplace=True)

# Drop the total_laps_df DataFrame to free up memory
del total_laps_df

print(drivers)f)


     driverId  total_laps
0           1       18407
1           2        9701
2           3       11161
3           4       19879
4           5        5971
..        ...         ...
134       854        2277
135       855        1764
136       856         614
137       857         632
138       858         649

[139 rows x 2 columns]


In [17]:
drivers.to_csv('enadrivers.csv', index=False)