### Data generator from Ergast

Retrieve the data from the Ergast API. Currently used for lap times only. The results are saved as CSV files, which should completely replace the files with the same name currently in the database. New data coming from Ergast is merged with the previous data.

In [50]:
import pandas as pd
from pandas import DataFrame
from zipfile import ZipFile
import os
from urllib import request
from unidecode import unidecode


In [51]:
# Path of the current data (CSV files already stored in the database)
INPUT_PATH = "./../data/csv/"

# Path to generate the new db files
OUTPUT_PATH = "generated/Data to replace to the DB"

# exist_ok=True creates the folder (and any missing parents) only when needed,
# without the check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [52]:
# Archive published by Ergast that contains the whole database as CSV files
ZIP_URL = 'http://ergast.com/downloads/f1db_csv.zip'
# Temporary location of the downloaded archive (deleted once extracted)
TEMP_FILEZIP = "tempfile.zip"
# Folder where the CSV files of the archive are extracted
TEMP_FOLDER = "temp/"

try:
    # Context managers close the HTTP response and the local file even when the
    # download fails half-way.
    with request.urlopen(ZIP_URL) as zipresp, open(TEMP_FILEZIP, "wb") as tempzip:
        tempzip.write(zipresp.read())

    with ZipFile(TEMP_FILEZIP) as zf:
        zf.extractall(path=TEMP_FOLDER)
finally:
    # Never leave the temporary archive behind, whether extraction succeeded or not.
    # The file may not exist if the request itself failed before it was created.
    if os.path.exists(TEMP_FILEZIP):
        os.remove(TEMP_FILEZIP)

In [53]:
from math import nan


def get_new_df(
    old_json_filename: str,
    new_df: DataFrame,
    on_columns: list[str],
    *,
    value_columns: tuple[str, ...] = ("pos", "time"),
) -> DataFrame:
    """Merge freshly downloaded rows with the rows of the previous CSV file.

    The previous data is read from ``{INPUT_PATH}/{old_json_filename}.csv``
    (despite the parameter name, the file is a CSV). Rows are matched on
    ``on_columns``. The result contains every row present in either source;
    when a row exists in both, the values of ``new_df`` win.

    Args:
        old_json_filename: Name (without extension) of the previous CSV file.
        new_df: The new data.
        on_columns: Columns that uniquely identify a row.
        value_columns: Columns present in both sources whose values must be
            reconciled. They are converted to the nullable ``Int64`` dtype.

    Returns:
        A dataframe with one row per ``on_columns`` combination and a fresh
        0..n-1 index. If no previous file exists, ``new_df`` deduplicated.

    Raises:
        Any error from ``pandas.read_csv`` other than ``FileNotFoundError``
        (for example a malformed file) is propagated instead of silently
        discarding the previous data.
    """
    try:
        old_df = pd.read_csv(f"{INPUT_PATH}/{old_json_filename}.csv")
    except FileNotFoundError:
        # Only a missing file means "no previous data"; anything else is a real error.
        print(
            "WARN -> No previous file found. All the values will be appended to the new file"
        )
        return new_df.drop_duplicates(subset=on_columns).reset_index(drop=True)

    # After the outer merge the shared value columns are suffixed:
    # "<col>_x" holds the new value, "<col>_y" the previous one.
    df_all = new_df.merge(old_df, on=on_columns, how="outer")

    for column in value_columns:
        fresh = df_all[f"{column}_x"].astype("Int64")
        previous = df_all[f"{column}_y"].astype("Int64")
        # Keep the new value and fall back to the previous one where it is missing.
        df_all[column] = fresh.fillna(previous)

    suffixed_columns = [
        f"{column}{suffix}" for suffix in ("_x", "_y") for column in value_columns
    ]

    return (
        df_all.drop(columns=suffixed_columns)
        .drop_duplicates(subset=on_columns)
        .reset_index(drop=True)
    )


In [54]:
""" ---------------------------------------------
------------------ DRIVERS ----------------------
--------------------------------------------- """

# Drivers as published by Ergast (extracted into TEMP_FOLDER)
drivers_from_ergast = pd.read_csv(f"{TEMP_FOLDER}drivers.csv")

# Drivers as currently stored in the database
drivers_in_db = pd.read_csv(f"{INPUT_PATH}drivers.csv")


def find_driverId(driver_name: str):
    """Return the database id (as a string) of the driver called ``driver_name``.

    Names are compared after transliteration with ``unidecode``, replacing
    hyphens with spaces and lower-casing. ``drivers_in_db`` is searched in
    three passes, and the first hit wins:

    1. exact match on the "name" column;
    2. exact match on the "fullName" column;
    3. the last word of ``driver_name`` is contained in "fullName".

    Returns an empty string when no pass finds a driver.
    """

    def normalise(text):
        return unidecode(text).replace("-", " ").lower()

    wanted = normalise(driver_name)
    db_ids = drivers_in_db["id"]

    # Passes 1 and 2: exact match, first on "name" and then on "fullName"
    for column in ("name", "fullName"):
        for db_id, candidate in zip(db_ids, drivers_in_db[column]):
            if normalise(candidate) == wanted:
                return str(db_id)

    # Pass 3: fall back to the last word of the name being part of the full name
    last_word = wanted.split(" ")[-1]
    for db_id, full_name in zip(db_ids, drivers_in_db["fullName"]):
        if last_word in normalise(full_name):
            return str(db_id)

    return ""


# Resolve the database id of every Ergast driver from "<forename> <surname>"
drivers_from_ergast["dbId"] = [
    find_driverId(forename + " " + surname)
    for forename, surname in zip(
        drivers_from_ergast["forename"], drivers_from_ergast["surname"]
    )
]


In [55]:
""" ---------------------------------------------
----------------- LAP TIMES ---------------------
--------------------------------------------- """

lap_times_from_ergast = pd.read_csv(TEMP_FOLDER + "lap_times.csv")
races_from_ergast = pd.read_csv(TEMP_FOLDER + "races.csv")

# Attach to every lap the database id of its driver and the round/year of its race
driver_lookup = drivers_from_ergast[["driverId", "dbId"]]
race_lookup = races_from_ergast[["raceId", "round", "year"]]

lap_times_from_ergast = lap_times_from_ergast.merge(driver_lookup, on="driverId")
lap_times_from_ergast = lap_times_from_ergast.merge(race_lookup, on="raceId")

# Stop if any lap belongs to a driver that could not be matched with the database
ROWS_WITHOUT_DRIVER_ID = lap_times_from_ergast[lap_times_from_ergast["dbId"] == ""]
if not ROWS_WITHOUT_DRIVER_ID.empty:
    raise Exception(
        "Can not find any id for this drivers: ",
        ROWS_WITHOUT_DRIVER_ID["driverId"].unique(),
    )

lap_times_from_ergast.head()

Unnamed: 0,raceId,driverId,lap,position,time,milliseconds,dbId,round,year
0,841,20,1,1,1:38.109,98109,seb-vettel,1,2011
1,841,20,2,1,1:33.006,93006,seb-vettel,1,2011
2,841,20,3,1,1:32.713,92713,seb-vettel,1,2011
3,841,20,4,1,1:32.803,92803,seb-vettel,1,2011
4,841,20,5,1,1:32.342,92342,seb-vettel,1,2011


In [56]:
# The event id used by the database is "<year>-<round>"
year_text = lap_times_from_ergast["year"].map(str)
round_text = lap_times_from_ergast["round"].map(str)
lap_times_from_ergast["eventId"] = year_text + "-" + round_text

# Drop the Ergast driver id and the formatted lap time, then switch to the
# database column names (the lap time is stored in milliseconds).
lap_times_from_ergast = lap_times_from_ergast.drop(
    columns=["driverId", "time"]
).rename(
    columns={"position": "pos", "milliseconds": "time", "dbId": "driverId"}
)

lap_times_from_ergast.head()


Unnamed: 0,raceId,lap,pos,time,driverId,round,year,eventId
0,841,1,1,98109,seb-vettel,1,2011,2011-1
1,841,2,1,93006,seb-vettel,1,2011,2011-1
2,841,3,1,92713,seb-vettel,1,2011,2011-1
3,841,4,1,92803,seb-vettel,1,2011,2011-1
4,841,5,1,92342,seb-vettel,1,2011,2011-1


In [57]:
to_save = get_new_df("lapTimes", lap_times_from_ergast, ["driverId", "lap", "eventId"])

# "year" and "round" only exist in the Ergast data, so rows that come solely from
# the previous file have them empty and would all be pushed to the end of the
# sort. Rebuild both sort keys from eventId ("<year>-<round>") for every row.
event_parts = to_save["eventId"].str.extract(r"^(?P<year>\d+)-(?P<round>\d+)$")
to_save["year"] = pd.to_numeric(event_parts["year"], errors="coerce")
to_save["round"] = pd.to_numeric(event_parts["round"], errors="coerce")

to_save = to_save.sort_values(by=["year", "round", "lap", "pos"])

# Helper columns are not part of the database file
to_save = to_save.drop(columns=["raceId", "year", "round"])

# Column order of the database file: eventId, driverId, then the remaining columns
to_save.insert(0, 'driverId', to_save.pop('driverId'))
to_save.insert(0, 'eventId', to_save.pop('eventId'))

to_save.head()

Unnamed: 0,eventId,driverId,lap,pos,time
515862,1996-1,jacques-villeneuve,1,1,103702
515804,1996-1,damon-hill,1,2,104243
515679,1996-1,eddie-irvine,1,3,104981
515647,1996-1,michael-schumacher,1,4,105188
515737,1996-1,jean-alesi,1,5,106506


In [58]:
# Write the merged lap times without the dataframe index
lap_times_output_file = f"{OUTPUT_PATH}/lapTimes.csv"
to_save.to_csv(lap_times_output_file, index=False)