# Update circuits

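This script searches for new circuits from the 'circuits.csv' file in the Minio object store. It compares them against circuits already listed in the database and inserts any new ones. Note: the cells below currently load 'drivers.csv', 'constructors.csv', 'status.csv' and 'results.csv' from the same bucket and insert drivers, constructors and statuses without first comparing them against existing database rows.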

#### ToDo:
- Improve the matching logic; it is susceptible to false positives.
- Quarantine records that may be incorrect.
- Add a results reporting function.
- Add additional metadata.
- Make logic performance improvements.

In [122]:
import pandas as pd
from io import BytesIO
from minio import Minio
from sqlalchemy import create_engine, text
from fuzzywuzzy import fuzz

In [123]:
# Create the Minio client used by the download cells below.
# NOTE(review): the endpoint and credentials are hard-coded and TLS is off;
# presumably this targets a local development stack — confirm before reuse.
minio_options = dict(
    access_key="minioadmin",
    secret_key="minioadmin",
    secure=False,
)
minio_client = Minio("minio:9000", **minio_options)

In [124]:
# SQLAlchemy engine for the PostgreSQL database; every query and insert in
# this notebook goes through it.
pg_url = 'postgresql://admin:admin@pgdb/postgres'
engine = create_engine(pg_url)

In [112]:
# Download the drivers CSV file from the Minio bucket into a pandas DataFrame
# and insert its rows into race_data.drivers.
driver_response = minio_client.get_object("track.data-raw", "drivers.csv")
try:
    driverdata = BytesIO(driver_response.read())
finally:
    # get_object returns a streaming HTTP response; close it and hand the
    # connection back to the pool once the payload has been buffered.
    driver_response.close()
    driver_response.release_conn()
df_drivercsv = pd.read_csv(driverdata)

# Align the CSV headers with the database column names, then keep only the
# columns the INSERT below writes.
df_drivercsv = df_drivercsv.rename(columns={"driverId": "driver_id", "driverRef": "driver_ref"})
df_drivercsv = df_drivercsv[['driver_id', 'driver_ref', 'number', 'code', 'forename', 'surname', 'dob', 'nationality']]

# NOTE(review): values such as "\N" in the CSV are passed through as literal
# strings; presumably they are the source's NULL marker — confirm and map
# them to NULL if the target columns require it.

if not df_drivercsv.empty:
    # Bound parameters let the driver do the quoting, so values containing
    # apostrophes (e.g. surnames) are stored unmodified instead of having the
    # apostrophe stripped to keep a string-built statement valid.
    driversquery = text("""
    INSERT INTO race_data.drivers(driver_id,driver_ref,number,code,forename,surname,dob,nationality)
    VALUES (:driver_id,:driver_ref,:number,:code,:forename,:surname,:dob,:nationality)
    """)

    # One transaction for the whole batch: a list of dicts makes SQLAlchemy
    # run the statement once per row, and any failure rolls everything back.
    with engine.begin() as connection:
        connection.execute(driversquery, df_drivercsv.to_dict(orient="records"))


Unnamed: 0,driver_id,driver_ref,number,code,forename,surname,dob,nationality
0,1,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British
1,2,heidfeld,\N,HEI,Nick,Heidfeld,1977-05-10,German
2,3,rosberg,6,ROS,Nico,Rosberg,1985-06-27,German
3,4,alonso,14,ALO,Fernando,Alonso,1981-07-29,Spanish
4,5,kovalainen,\N,KOV,Heikki,Kovalainen,1981-10-19,Finnish
...,...,...,...,...,...,...,...,...
852,854,mick_schumacher,47,MSC,Mick,Schumacher,1999-03-22,German
853,855,zhou,24,ZHO,Guanyu,Zhou,1999-05-30,Chinese
854,856,de_vries,21,DEV,Nyck,de Vries,1995-02-06,Dutch
855,857,piastri,81,PIA,Oscar,Piastri,2001-04-06,Australian


In [111]:
# Download the constructor CSV file from the Minio bucket into a pandas
# DataFrame and insert its rows into race_data.constructors.
constructor_response = minio_client.get_object("track.data-raw", "constructors.csv")
try:
    constructordata = BytesIO(constructor_response.read())
finally:
    # get_object returns a streaming HTTP response; close it and hand the
    # connection back to the pool once the payload has been buffered.
    constructor_response.close()
    constructor_response.release_conn()
df_constructorcsv = pd.read_csv(constructordata)

# The INSERT targets exactly (constructor_id, name), so rename the id column
# and drop every other CSV column; sending the full CSV row would supply more
# values than target columns.
df_constructorcsv = df_constructorcsv.rename(columns={"constructorId": "constructor_id"})
df_constructorcsv = df_constructorcsv[['constructor_id', 'name']]

if not df_constructorcsv.empty:
    # Bound parameters instead of string-built VALUES: the driver handles
    # quoting, so names containing quotes cannot break the statement.
    constructorquery = text("""
    INSERT INTO race_data.constructors(constructor_id,name)
    VALUES (:constructor_id,:name)
    """)

    # One transaction for the whole batch; a list of dicts makes SQLAlchemy
    # run the statement once per row.
    with engine.begin() as connection:
        connection.execute(constructorquery, df_constructorcsv.to_dict(orient="records"))

Unnamed: 0,constructorId,constructorRef,name,nationality,url
0,1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren
1,2,bmw_sauber,BMW Sauber,German,http://en.wikipedia.org/wiki/BMW_Sauber
2,3,williams,Williams,British,http://en.wikipedia.org/wiki/Williams_Grand_Pr...
3,4,renault,Renault,French,http://en.wikipedia.org/wiki/Renault_in_Formul...
4,5,toro_rosso,Toro Rosso,Italian,http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso
...,...,...,...,...,...
206,209,manor,Manor Marussia,British,http://en.wikipedia.org/wiki/Manor_Motorsport
207,210,haas,Haas F1 Team,American,http://en.wikipedia.org/wiki/Haas_F1_Team
208,211,racing_point,Racing Point,British,http://en.wikipedia.org/wiki/Racing_Point_F1_Team
209,213,alphatauri,AlphaTauri,Italian,http://en.wikipedia.org/wiki/Scuderia_AlphaTauri


In [115]:
# Download the status CSV file from the Minio bucket into a pandas DataFrame
# and insert its rows into race_data.status.
status_response = minio_client.get_object("track.data-raw", "status.csv")
try:
    statusdata = BytesIO(status_response.read())
finally:
    # get_object returns a streaming HTTP response; close it and hand the
    # connection back to the pool once the payload has been buffered.
    status_response.close()
    status_response.release_conn()
df_statuscsv = pd.read_csv(statusdata)

# Align the CSV header with the database column name and keep only the two
# columns the INSERT below writes.
df_statuscsv = df_statuscsv.rename(columns={"statusId": "status_id"})
df_statuscsv = df_statuscsv[['status_id', 'status']]

if not df_statuscsv.empty:
    # Bound parameters instead of string-built VALUES: the driver handles
    # quoting, so status texts containing quotes cannot break the statement.
    statusquery = text("""
    INSERT INTO race_data.status(status_id,status)
    VALUES (:status_id,:status)
    """)

    # One transaction for the whole batch; a list of dicts makes SQLAlchemy
    # run the statement once per row.
    with engine.begin() as connection:
        connection.execute(statusquery, df_statuscsv.to_dict(orient="records"))

In [121]:
# Download the results CSV file from the Minio bucket into a pandas DataFrame
result_response = minio_client.get_object("track.data-raw", "results.csv")
try:
    resultdata = BytesIO(result_response.read())
finally:
    # get_object returns a streaming HTTP response; close it and hand the
    # connection back to the pool once the payload has been buffered.
    result_response.close()
    result_response.release_conn()
df_resultcsv = pd.read_csv(resultdata)
display(df_resultcsv)

# Align the CSV headers with the snake_case column names used downstream.
df_resultcsv = df_resultcsv.rename(columns={"resultId":"result_id","driverId":"driver_id","constructorId":"constructor_id","fastestLapTime":"fastest_lap_time","fastestLapSpeed":"fastest_lap_speed","statusId":"status_id"})

# Keep only the result columns of interest.
# NOTE(review): neither raceId nor an event_id column survives this selection,
# so the frame carries no key linking a result to an event — confirm how
# results are meant to be joined to race_data.events.
df_resultcsv = df_resultcsv[['result_id','driver_id','constructor_id','number','grid','position','points','laps','time','fastest_lap_time','rank','fastest_lap_speed','status_id']]


Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.300,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26035,26041,1108,852,213,22,16,16,16,16,0.0,52,+31.225,5148163,48,17,1:32.084,230.307,1
26036,26042,1108,856,213,21,18,17,17,17,0.0,52,+33.128,5150066,48,18,1:32.353,229.636,1
26037,26043,1108,842,214,10,10,18,18,18,0.0,46,\N,\N,42,12,1:31.539,231.678,130
26038,26044,1108,825,210,20,19,\N,R,19,0.0,31,\N,\N,29,19,1:33.356,227.169,5


In [120]:
# Load every row of race_data.events from the database into a DataFrame
events_sql = "SELECT * FROM race_data.events"
df_events = pd.read_sql(sql=events_sql, con=engine)
display(df_events)

Unnamed: 0,event_id,season_id,race_round,circuit_id,official_name,date
0,1,71,1,3.0,FORMULA 1 ROLEX GROSSER PREIS VON ÖSTERREICH 2020,
1,2,71,2,3.0,FORMULA 1 PIRELLI GROSSER PREIS DER STEIERMARK...,
2,3,71,3,,FORMULA 1 ARAMCO MAGYAR NAGYDÍJ 2020,
3,4,71,4,4.0,FORMULA 1 PIRELLI BRITISH GRAND PRIX 2020,
4,5,71,5,4.0,EMIRATES FORMULA 1 70TH ANNIVERSARY GRAND PRIX...,
...,...,...,...,...,...,...
68,69,74,7,14.0,FORMULA 1 GRAND PRIX DE MONACO 2023,
69,70,74,8,5.0,FORMULA 1 AWS GRAN PREMIO DE ESPAÑA 2023,
70,71,74,9,,FORMULA 1 PIRELLI GRAND PRIX DU CANADA 2023,
71,72,74,10,3.0,FORMULA 1 ROLEX GROSSER PREIS VON ÖSTERREICH 2023,


In [9]:
# Load every row of race_data.season from the database into a DataFrame
season_sql = "SELECT * FROM race_data.season"
df_season = pd.read_sql(sql=season_sql, con=engine)
display(df_season)

Unnamed: 0,season_id,championship_id,year
0,1,1,1950
1,2,1,1951
2,3,1,1952
3,4,1,1953
4,5,1,1954
...,...,...,...
69,70,1,2019
70,71,1,2020
71,72,1,2021
72,73,1,2022


In [15]:
# Get the events dataset from 2021 onwards: attach each event's season row
# (which carries the year) and keep only the rows with year >= 2021.
events_with_season = pd.merge(df_events, df_season, how='inner', on='season_id')
df_2021events = events_with_season.loc[events_with_season['year'] >= 2021]
display(df_2021events)

Unnamed: 0,event_id,season_id,race_round,circuit_id,official_name,date,championship_id,year
17,18,72,1,12.0,FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2021,,1,2021
18,19,72,2,13.0,FORMULA 1 PIRELLI GRAN PREMIO DEL MADE IN ITAL...,,1,2021
19,20,72,3,10.0,FORMULA 1 HEINEKEN GRANDE PRÉMIO DE PORTUGAL 2021,,1,2021
20,21,72,4,5.0,FORMULA 1 ARAMCO GRAN PREMIO DE ESPAÑA 2021,,1,2021
21,22,72,5,14.0,FORMULA 1 GRAND PRIX DE MONACO 2021,,1,2021
22,23,72,6,15.0,FORMULA 1 AZERBAIJAN GRAND PRIX 2021,,1,2021
23,24,72,7,16.0,FORMULA 1 EMIRATES GRAND PRIX DE FRANCE 2021,,1,2021
24,25,72,8,3.0,FORMULA 1 BWT GROSSER PREIS DER STEIERMARK 2021,,1,2021
25,26,72,9,3.0,FORMULA 1 BWT GROSSER PREIS VON ÖSTERREICH 2021,,1,2021
26,27,72,10,4.0,FORMULA 1 PIRELLI BRITISH GRAND PRIX 2021,,1,2021


In [None]:
# Restrict the results to events from 2021 onwards by inner-joining them with
# the filtered events frame.
# NOTE(review): df_resultcsv, as built in the results cell, has no event_id
# column, so this join key looks unavailable — confirm before running.
df_results = pd.merge(df_resultcsv, df_2021events, how='inner', on='event_id')

In [11]:
# Left-join the CSV frame against the 'name' column of the database frame;
# indicator=True adds a `_merge` column telling, per row, whether the name was
# found on both sides or only on the left.
# NOTE(review): neither df_csv nor df_db is defined in any cell shown here (the
# recorded run of this cell failed with NameError on df_csv); presumably this
# is left over from the circuits comparison — confirm which frames are meant.
df_result = pd.merge(df_csv, df_db['name'],
                       how='left', on=['name'],indicator=True)

len(df_result)
# display(df_result)

NameError: name 'df_csv' is not defined