# FastF1 Functionality Exploration

## Package Import and Configs

In [25]:
import os
import logging
import fastf1
import pandas as pd
import pyarrow.csv as csv
import pyarrow.parquet as pq
import pyarrow as pa
import duckdb as db
import yaml


from time import sleep
from pathlib import Path
from dataclasses import dataclass

from typing import Union

# Configuring Logging
logging.basicConfig(
    encoding='utf-8',
    level=logging.ERROR,
    datefmt='%m/%d/%Y %I:%M:%S %p'
)

CACHE_PATH = Path('/workspaces/formula-one-analytics/data/.cache/')
TMP_PATH = Path('/workspaces/formula-one-analytics/data/_tmp/')

# fastF1 data cache config
if not CACHE_PATH.exists():
    logging.info('CACHE_PATH does not exist creating...')
    CACHE_PATH.mkdir(parents=True, exist_ok=True)

if not TMP_PATH.exists():
    logging.info('TMP_PATH does not exist creating...')
    TMP_PATH.mkdir(parents=True, exist_ok=True)

fastf1.Cache.enable_cache('/workspaces/formula-one-analytics/data/.cache/')


## Data Formatting and Saving Funcs

[{'pandas_dtype': 'int64', 'duckdb_dtype': 'BIGINIT'},
 {'pandas_dtype': 'int32', 'duckdb_dtype': 'INTEGER'},
 {'pandas_dtype': 'float64', 'duckdb_dtype': 'DOUBLE'},
 {'pandas_dtype': 'float32', 'duckdb_dtype': 'FLOAT'},
 {'pandas_dtype': 'bool', 'duckdb_dtype': 'BOOLEAN'},
 {'pandas_dtype': 'datetime64[ns]', 'duckdb_dtype': 'TIMESTAMP'},
 {'pandas_dtype': 'timedelta64[ns]', 'duckdb_dtype': 'INTERVAL'}]

In [10]:
# YAML File Importers
def yaml_loaders(file_path: str, header: str) -> list:
    with open(file_path, 'r') as yaml_file:
        data = yaml.safe_load(yaml_file)

        return data[header]
    

# Defining function to event calendar and save to parquet for later
def get_event_calendar(start_year: int, end_year: int, iteration_sleep: int=10) -> list:
    '''
    Sources a list of dataframes with Formula 1 Event Calendar schedule information fro
    the fastF1 API package and API.

    :param start_year: (int) Beginning year of range to pull event calendar schedules for
    :param end_year: (int) End year of range to pull event calendar schedules for
    :iteration_sleep: (int, default=10) Adjustable sleep interval to keep fastF1 api from
    blocking requests for data. fastF1 has protection built in, this allows for further 
    request safety.
    '''
    totalEventList = []
    for idx, year in enumerate(list(range(start_year, end_year+1))):
        totalEventList.append(fastf1.get_event_schedule(year))
        logging.info(f'Requesting the {year=} event schedule')
        sleep(iteration_sleep) # to throttle requests beyond built in fastF1 throttling

    return totalEventList


def concat_event_calendar(event_list: list) -> pd.DataFrame:
    '''
    Concats data from list of dataframes into a single dataframe

    :param event_list: (list) list of dataframes to concat into single dataframe
    :return: pandas Dataframe

    '''

    df = pd.concat(event_list)
    return df


def create_date_part_col(df: pd.DataFrame, date_column: str, date_part:str) -> pd.DataFrame:
    '''
    Creates a new column in supplied pd.DataFrame that is the specified DatetimeIndex part 
    of the given date_column.

    :param df: (pd.DataFrame) pandas DataFrame containing a date or datetime column
    :param date_column: (str) The name of the column with dates to extract the date_part from
    :param date_part: (str) The part of the date to extract ('year', 'month','day')

    :return: pandas DataFrame with the extracted date part in a new column.

    ::Example::
    -----------

    data = [{'eventDate': '2000-03-12', 'event': 'Woodstock'}, 
            {'eventDate': '2003-08-12', 'event': 'EdgeFest'}, 
            {'eventDate': '2009-03-12', 'event': 'Warped Tour'}, 
            {'eventDate': '2020-01-26', 'event': 'Electric Forest'}
       ]
    
    df = pd.DataFrame(data)
    df['eventDate'] = pd.to_datetime(df['eventDate'])

    |  eventDate  |    event           |
    | 2000-03-12  | 'Woodstock'        |
    | 2003-08-12  | 'EdgeFest'         |
    | 2009-03-12  | 'Warped Tour'      |
    | 2020-01-26  | 'Electric Forest'  |

    df = create_date_part_col(df=df, date_column='eventDate', date_part='year')

    |  eventDate  |    event           | eventDate_year |
    | 2000-03-12  | 'Woodstock'        |      2000      |
    | 2003-08-12  | 'EdgeFest'         |      2003      |
    | 2009-03-12  | 'Warped Tour'      |      2009      |
    | 2020-01-26  | 'Electric Forest'  |      2020      |


    '''

    valid_date_parts = {'day', 'month', 'year'}
    if date_part not in valid_date_parts:
        raise ValueError(f'Given date_part must be a valid date part of {valid_date_parts}')

    df['_'.join([date_column, date_part.lower()])] = (
        getattr(pd.DatetimeIndex(df[date_column]), date_part)
        )
    
    return df


def temp_csver(df: pd.DataFrame, path: str|Path, temp_file_name: str, **kwargs) -> None:
    '''
    Takes pd.DataFrame, path and filename string and saves a csv file to that location.
    Used instead of pd.to_csv() to handle using pathlib.Path objects with filenames.
    By default pd.to_csv() does not allow for the path_or_buf argument to be overloaded.

    :param df: pandas DataFrame
    :param path: (str|Path) path to location where csv will be saved
    :param temp_file_name: (str) the filename of temp .csv file to be written
    :param header: (bool) csv to have header True|False

    :return: NoneType
    '''

    if isinstance(path, Path):
        path_and_file = path/temp_file_name
    elif isinstance(path, str):
        path_and_file = '/'.join([path, temp_file_name])
        
    return df.to_csv(path_and_file, **kwargs)


## Reading in Event Schedules by Year

In [None]:
# Pulling down Formula Event Calendars from 2000 to 2023 seasons.
my_event_list = get_event_calendar(2000, 2024)
eventDF = concat_event_calendar(my_event_list)

# Adding new date part columns for parquet partitioning optimization
eventDF = create_date_part_col(eventDF, 'EventDate', 'year')
eventDF = create_date_part_col(eventDF, 'EventDate', 'month')

In [6]:
# Saving data to parquet file for easier access later.
eventDF.to_parquet('/workspaces/formula-one-analytics/data/eventCalendar.parquet',
                   partition_cols=['EventDate_year', 'EventDate_month'])

## Raw Race Data Temp File Saving

In [3]:
my_race_session = fastf1.get_session(year=2023, gp=1, identifier=1)
# my_race_session.load(telemetry=False, laps=False, weather=False, messages=False)
my_race_session.load()

core           INFO 	Loading data for Bahrain Grand Prix - Practice 1 [v3.1.6]
INFO:fastf1.fastf1.core:Loading data for Bahrain Grand Prix - Practice 1 [v3.1.6]
req            INFO 	Using cached data for session_info
INFO:fastf1.fastf1.req:Using cached data for session_info
req            INFO 	Using cached data for driver_info
INFO:fastf1.fastf1.req:Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
INFO:fastf1.fastf1.req:Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
INFO:fastf1.fastf1.req:Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
INFO:fastf1.fastf1.req:Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
INFO:fastf1.fastf1.req:Using cached data for timing_app_data
core           INFO 	Processing timing data...
INFO:fastf1.fastf1.core:Processing timing data...
req         

In [4]:
# Getting session lap data, save to temp file
sessionLabData = pd.DataFrame(my_race_session.laps)
# temp_csver(df=y, path=TMP_PATH, temp_file_name='temp_laps.csv', header=True)

In [35]:
# creating mapping dict to convert pandas to duckdb
dtypeConversions = yaml_loaders(file_path= '../src/schemas/DtypeMappings.yaml', header='mappings')

In [24]:
sessionLabData.dtypes

Time                  timedelta64[ns]
Driver                         object
DriverNumber                   object
LapTime               timedelta64[ns]
LapNumber                     float64
Stint                         float64
PitOutTime            timedelta64[ns]
PitInTime             timedelta64[ns]
Sector1Time           timedelta64[ns]
Sector2Time           timedelta64[ns]
Sector3Time           timedelta64[ns]
Sector1SessionTime    timedelta64[ns]
Sector2SessionTime    timedelta64[ns]
Sector3SessionTime    timedelta64[ns]
SpeedI1                       float64
SpeedI2                       float64
SpeedFL                       float64
SpeedST                       float64
IsPersonalBest                   bool
Compound                       object
TyreLife                      float64
FreshTyre                        bool
Team                           object
LapStartTime          timedelta64[ns]
LapStartDate           datetime64[ns]
TrackStatus                    object
Position    

In [36]:
class SchemaCreator:

    @dataclass
    class DtypeMapping:
        pandas_dtype: str
        duckdb_dtype: str

    def __init__(self, dtype_mappings: list, schema_title: str) -> None:
        self.dtype_mappings = [self.DtypeMapping(**m) for m in dtype_mappings]
        self.schema_title = schema_title

    def convert_to_duckdb_dtypes(self, pandas_dtypes):
        for mapping in self.dtype_mappings:
            if pandas_dtypes.name == mapping.pandas_dtype:
                return mapping.duckdb_dtype
        return 'VARCHAR'


In [41]:
schema_dtype_mappings = yaml_loaders(file_path= '../src/schemas/DtypeMappings.yaml', header='mappings')
converter = SchemaCreator(schema_dtype_mappings, 'sessionLapData')


In [42]:
xdf = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [1.1, 2.2, 3.3],
    'C': ['a', 'b', 'c']
})


schema_parts = []
for col, series in xdf.items():
    col_dtype = series.dtype

    duckdb_dtype = converter.convert_to_duckdb_dtypes(col_dtype)
    schema_parts.append(f'{col} {duckdb_dtype}')

In [39]:
xdf.dtypes

A      int64
B    float64
C     object
dtype: object

['A BIGINIT', 'B DOUBLE', 'C VARCHAR']

In [11]:
# Getting car meta-data, combining list[dataframe] into single dataframe and saving to temp .csv file/
c = my_race_session.car_data
car_data_df = pd.concat([df.assign(Key=key) for key, df in c.items()],
                        ignore_index=True)
temp_csver(df=car_data_df, path=TMP_PATH, temp_file_name='temp_car_data.csv', header=True, index=False)

In [20]:
trackStsTmp = pd.DataFrame(my_race_session.track_status)
temp_csver(df=trackStsTmp, path=TMP_PATH, temp_file_name='temp_track_status.csv', header=True, index=False)

In [57]:
raceResults = my_race_session.results
temp_csver(df=raceResults, path=TMP_PATH, temp_file_name='temp_race_results.csv', header=True, index=False)

In [59]:
raceControlMsg = my_race_session.race_control_messages
temp_csver(df=raceControlMsg, path=TMP_PATH, temp_file_name='temp_race_control_msg.csv', header=True, index=False)

In [61]:
weatherData = my_race_session.weather_data
temp_csver(df=weatherData, path=TMP_PATH, temp_file_name='temp_weather_data.csv', header=True, index=False)

In [71]:
positionData = my_race_session.pos_data
pos_data_df = pd.concat([df.assign(Key=key) for key, df in positionData.items()],
                        ignore_index=True)
temp_csver(df=pos_data_df, path=TMP_PATH, temp_file_name='temp_pos_data.csv', header=True, index=False)


In [52]:
# Fetching Circuit Info
circuitInfo = my_race_session.get_circuit_info()

# Getting session event series, convert to DF and save to temp PDF
evntDF = pd.DataFrame(my_race_session.event).T
evntDF.columns = my_race_session.event.index
evntDF['t0_date'] = my_race_session.t0_date
evntDF['map_rotation'] = circuitInfo.rotation

temp_csver(df=evntDF, path=TMP_PATH, temp_file_name='temp_events.csv', header=True)

In [44]:
# Circuit info coordinate markers
temp_csver(df=circuitInfo.corners, path=TMP_PATH, temp_file_name='temp_track_corners.csv', header=True, index=False)
temp_csver(df=circuitInfo.marshal_sectors, path=TMP_PATH, temp_file_name='temp_marshal_sectors.csv', header=True, index=False)
temp_csver(df=circuitInfo.marshal_lights, path=TMP_PATH, temp_file_name='temp_marshal_lights.csv', header=True, index=False)

## Temp Data Convert to Parquet


In [3]:
carDataTable = csv.read_csv('/workspaces/formula-one-analytics/data/_tmp/temp_car_data.csv')

In [13]:
carDataTable.column_names

['Date',
 'RPM',
 'Speed',
 'nGear',
 'Throttle',
 'Brake',
 'DRS',
 'Source',
 'Time',
 'SessionTime',
 'Key']

In [17]:
new_column_names = [
    'session_date',
    'engine_rpm',
    'car_speed',
    'car_ngear',
    'car_throttle',
    'car_braking',
    'drs',
    'source',
    'time',
    'session_time',
    'key'
]

In [21]:
schema = pa.schema([
    ('session_date', pa.timestamp('ns')),
    ('engine_rpm', pa.int64()),
    ('car_speed', pa.int64()),
    ('car_ngear', pa.int64()),
    ('car_throttle', pa.int64()),
    ('car_braking', pa.int8()),
    ('drs', pa.int64()),
    ('source', pa.string()),
    ('time', pa.string()),
    ('session_time', pa.string()),
    ('key', pa.int64())
])

In [22]:
carDataTable = carDataTable.rename_columns(new_column_names)
carDataTable = carDataTable.cast(schema)
pq.write_table(carDataTable, '/fomula_one_project/data/2023/race_01/car_data.parquet')

FileNotFoundError: [Errno 2] Failed to open local file '/fomula_one_project/data/2023/race_01/car_data.parquet'. Detail: [errno 2] No such file or directory