# FastF1 Functionality Exploration

## Package Import and Configs

In [88]:
import os
import logging
import fastf1
import pandas as pd
import pyarrow.csv as csv
import pyarrow.parquet as pq
import pyarrow as pa
import duckdb as db
import yaml
import datetime


from time import sleep
from pathlib import Path
from dataclasses import dataclass

from typing import Union, Optional

# Configuring Logging
logging.basicConfig(
    encoding='utf-8',
    level=logging.ERROR,
    datefmt='%m/%d/%Y %I:%M:%S %p'
)

pd.set_option('display.max_columns', None)

CACHE_PATH = Path('/workspaces/formula-one-analytics/data/.cache/')
TMP_PATH = Path('/workspaces/formula-one-analytics/data/_tmp/')
DB_PATH = Path('/workspaces/formula-one-analytics/src/duckdb/formula_one.db')

# fastF1 data cache config
if not CACHE_PATH.exists():
    logging.info('CACHE_PATH does not exist creating...')
    CACHE_PATH.mkdir(parents=True, exist_ok=True)

if not TMP_PATH.exists():
    logging.info('TMP_PATH does not exist creating...')
    TMP_PATH.mkdir(parents=True, exist_ok=True)

fastf1.Cache.enable_cache('/workspaces/formula-one-analytics/data/.cache/')


## Data Formatting and Saving Funcs

In [44]:
# YAML File Importers
def yaml_loaders(file_path: str, header: str) -> list:
    with open(file_path, 'r') as yaml_file:
        data = yaml.safe_load(yaml_file)

        return data[header]
    

# Defining function to event calendar and save to parquet for later
def get_event_calendar(start_year: int, end_year: int, iteration_sleep: int=10) -> list:
    '''
    Sources a list of dataframes with Formula 1 Event Calendar schedule information fro
    the fastF1 API package and API.

    :param start_year: (int) Beginning year of range to pull event calendar schedules for
    :param end_year: (int) End year of range to pull event calendar schedules for
    :iteration_sleep: (int, default=10) Adjustable sleep interval to keep fastF1 api from
    blocking requests for data. fastF1 has protection built in, this allows for further 
    request safety.
    '''
    totalEventList = []
    for idx, year in enumerate(list(range(start_year, end_year+1))):
        totalEventList.append(fastf1.get_event_schedule(year))
        logging.info(f'Requesting the {year=} event schedule')
        sleep(iteration_sleep) # to throttle requests beyond built in fastF1 throttling

    return totalEventList


def concat_event_calendar(event_list: list) -> pd.DataFrame:
    '''
    Concats data from list of dataframes into a single dataframe

    :param event_list: (list) list of dataframes to concat into single dataframe
    :return: pandas Dataframe

    '''

    df = pd.concat(event_list)
    return df


def create_date_part_col(df: pd.DataFrame, date_column: str, date_part:str) -> pd.DataFrame:
    '''
    Creates a new column in supplied pd.DataFrame that is the specified DatetimeIndex part 
    of the given date_column.

    :param df: (pd.DataFrame) pandas DataFrame containing a date or datetime column
    :param date_column: (str) The name of the column with dates to extract the date_part from
    :param date_part: (str) The part of the date to extract ('year', 'month','day')

    :return: pandas DataFrame with the extracted date part in a new column.

    ::Example::
    -----------

    data = [{'eventDate': '2000-03-12', 'event': 'Woodstock'}, 
            {'eventDate': '2003-08-12', 'event': 'EdgeFest'}, 
            {'eventDate': '2009-03-12', 'event': 'Warped Tour'}, 
            {'eventDate': '2020-01-26', 'event': 'Electric Forest'}
       ]
    
    df = pd.DataFrame(data)
    df['eventDate'] = pd.to_datetime(df['eventDate'])

    |  eventDate  |    event           |
    | 2000-03-12  | 'Woodstock'        |
    | 2003-08-12  | 'EdgeFest'         |
    | 2009-03-12  | 'Warped Tour'      |
    | 2020-01-26  | 'Electric Forest'  |

    df = create_date_part_col(df=df, date_column='eventDate', date_part='year')

    |  eventDate  |    event           | eventDate_year |
    | 2000-03-12  | 'Woodstock'        |      2000      |
    | 2003-08-12  | 'EdgeFest'         |      2003      |
    | 2009-03-12  | 'Warped Tour'      |      2009      |
    | 2020-01-26  | 'Electric Forest'  |      2020      |


    '''

    valid_date_parts = {'day', 'month', 'year'}
    if date_part not in valid_date_parts:
        raise ValueError(f'Given date_part must be a valid date part of {valid_date_parts}')

    df['_'.join([date_column, date_part.lower()])] = (
        getattr(pd.DatetimeIndex(df[date_column]), date_part)
        )
    
    return df


def temp_csver(df: pd.DataFrame, path: str|Path, temp_file_name: str, **kwargs) -> None:
    '''
    Takes pd.DataFrame, path and filename string and saves a csv file to that location.
    Used instead of pd.to_csv() to handle using pathlib.Path objects with filenames.
    By default pd.to_csv() does not allow for the path_or_buf argument to be overloaded.

    :param df: pandas DataFrame
    :param path: (str|Path) path to location where csv will be saved
    :param temp_file_name: (str) the filename of temp .csv file to be written
    :param header: (bool) csv to have header True|False

    :return: NoneType
    '''

    if isinstance(path, Path):
        path_and_file = path/temp_file_name
    elif isinstance(path, str):
        path_and_file = '/'.join([path, temp_file_name])
        
    return df.to_csv(path_and_file, **kwargs)


## duckDB functions and DataClass

In [48]:
class SchemaCreator:

    @dataclass
    class DtypeMapping:
        pandas_dtype: str
        duckdb_dtype: str

    def __init__(self, dtype_mappings: list) -> None:
        self.dtype_mappings = [self.DtypeMapping(**m) for m in dtype_mappings]


    def convert_to_duckdb_dtypes(self, pandas_dtypes):
        for mapping in self.dtype_mappings:
            if pandas_dtypes.name == mapping.pandas_dtype:
                return mapping.duckdb_dtype
        return 'VARCHAR'
    

    def column_dtype_converter(self, to_convert: pd.DataFrame) -> list:

        schema_parts = []

        for column, series in to_convert.items():
            column_dtype = series.dtype
            duckdb_dtype = self.convert_to_duckdb_dtypes(column_dtype)
            schema_parts.append(f'{column} {duckdb_dtype}')

        return schema_parts


    def create_duckdb_stg_schema(self, table_name: str,
                                table_dataframe: pd.DataFrame,
                                primary_key_field: Optional[str] = None,
                                index_field: Optional[str] = None) -> str:
        
        schema_parts = self.column_dtype_converter(to_convert=table_dataframe)
        schema_sql = ',\n'.join(schema_parts)

        create_stg_table_sql = f'CREATE TABLE {table_name.upper()}_STG ({schema_sql})'

        return create_stg_table_sql


    def create_duckdb_schema(self, table_name: str,
                            table_dataframe: pd.DataFrame,
                            primary_key_field: Optional[str] = None,
                            index_field: Optional[str] = None) -> str:

        schema_parts = self.column_dtype_converter(to_convert=table_dataframe)
        schema_sql = ',\n'.join(schema_parts)

        create_table_sql = f'CREATE TABLE {table_name.upper()} ({schema_sql})'

        return create_table_sql
    

class FormulaOneDBManager(SchemaCreator):

    def __init__(self, db_module, dtype_mappings: list[str], db_path: Optional[Path|str] = None) -> None:

        super().__init__(dtype_mappings=dtype_mappings)
        self.db_module=db_module
        self.db_path=db_path
        self.con = self.db_module.connect(self.db_path)


    def create_table(self, table_type: str = 'prod', *args, **kwargs) -> None:
        
        if table_type == 'prod':
            # self.create_duckdb_schema(table_name=table_name, table_dataframe=table_dataframe,
            #                           primary_key_field=primary_key_field, index_field=index_field)
            self.con.sql(self.create_duckdb_schema(*args, **kwargs))
        elif table_type == 'stg':
            self.con.sql(self.create_duckdb_stg_schema(*args, **kwargs))
        else:
            raise ValueError('''The supplied table_type value is not an acceptable option or 'stg' or 'prod'. ''')


    def insert_dataframe(self, dataframe: pd.DataFrame, table_name: str) -> None:
        df = dataframe
        tn = table_name.upper()
        self.con.sql(f'INSERT INTO {tn} SELECT * FROM df')


    def close(self) -> None:
        self.con.close()


### Creating DB Manager

- Creating `manager` object of class `FormulaOneDBManager` to manage db operations.
- Importing Schema dtype mappings from pandas datatype to duckdb dtypes


In [69]:
# creating mapping dict to convert pandas to duckdb
schema_dtype_mappings = yaml_loaders(file_path= '../src/schemas/DtypeMappings.yaml', header='mappings')

# Creating database manager object
manager = FormulaOneDBManager(db_module=db, dtype_mappings=schema_dtype_mappings, 
                              db_path=str(DB_PATH))


## Reading in Event Schedules by Year

### Creating Event Calendar DB Table

In [70]:
# Sourcing Event Calendar
event_calendar_list = get_event_calendar(2000, datetime.datetime.now().year)
event_calendar_df = concat_event_calendar(event_calendar_list)

# Adding DAtePart Columns
event_calendar_df = create_date_part_col(event_calendar_df, 'EventDate', 'year')
event_calendar_df = create_date_part_col(event_calendar_df, 'EventDate', 'month')

# Creating DuckDB Table for the event_calendar
manager.create_table(table_type='prod', table_name='fone_event_calendar', table_dataframe=event_calendar_df)
manager.insert_dataframe(dataframe=event_calendar_df, table_name='fone_event_calendar')

DEBUG:fastf1.fastf1.utils:Failed to parse datetime string ''
Traceback (most recent call last):
  File "/venv/lib/python3.11/site-packages/fastf1/utils.py", line 209, in to_datetime
    date, time = x.strip('Z').split('T')
    ^^^^^^^^^^
ValueError: not enough values to unpack (expected 2, got 1)
DEBUG:fastf1.fastf1.utils:Failed to parse datetime string ''
Traceback (most recent call last):
  File "/venv/lib/python3.11/site-packages/fastf1/utils.py", line 209, in to_datetime
    date, time = x.strip('Z').split('T')
    ^^^^^^^^^^
ValueError: not enough values to unpack (expected 2, got 1)


### Creating Free Practice Lap Data DB Table

In [72]:
# Sourcing in circuit session data. 
circuit_session = fastf1.get_session(year=2024, gp=1, identifier=1)

# Loading data into DataFrame type object
circuit_session.load()

circuit_session_lap_data = pd.DataFrame(circuit_session.laps)

manager.create_table(table_type='prod', table_name='fone_fp_laps', 
                     table_dataframe=circuit_session_lap_data)

manager.insert_dataframe(dataframe=circuit_session_lap_data, table_name='fone_fp_laps')

core           INFO 	Loading data for Bahrain Grand Prix - Practice 1 [v3.3.0]
INFO:fastf1.fastf1.core:Loading data for Bahrain Grand Prix - Practice 1 [v3.3.0]
req            INFO 	No cached data found for session_info. Loading data...
INFO:fastf1.fastf1.req:No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
INFO:fastf1.api:Fetching session info data...
req            INFO 	Data has been written to cache!
INFO:fastf1.fastf1.req:Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
INFO:fastf1.fastf1.req:No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
INFO:fastf1.api:Fetching driver list...
req            INFO 	Data has been written to cache!
INFO:fastf1.fastf1.req:Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
INFO:fastf1.fastf1.req:No cached data found for sess

In [86]:
circuit_session = fastf1.get_session(year=2024, gp=1, identifier=2)
circuit_session.load()
circuit_session_lap_data = pd.DataFrame(circuit_session.laps)

manager.insert_dataframe(dataframe=circuit_session_lap_data, table_name='fone_fp_laps')

core           INFO 	Loading data for Bahrain Grand Prix - Practice 2 [v3.3.0]
INFO:fastf1.fastf1.core:Loading data for Bahrain Grand Prix - Practice 2 [v3.3.0]
req            INFO 	No cached data found for session_info. Loading data...
INFO:fastf1.fastf1.req:No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
INFO:fastf1.api:Fetching session info data...
req            INFO 	Data has been written to cache!
INFO:fastf1.fastf1.req:Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
INFO:fastf1.fastf1.req:No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
INFO:fastf1.api:Fetching driver list...
req            INFO 	Data has been written to cache!
INFO:fastf1.fastf1.req:Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
INFO:fastf1.fastf1.req:No cached data found for sess

In [83]:
manager.con.sql('SHOW ALL TABLES')

┌─────────────┬─────────┬─────────────────────┬──────────────────────┬─────────────────────────────────────┬───────────┐
│  database   │ schema  │        name         │     column_names     │            column_types             │ temporary │
│   varchar   │ varchar │       varchar       │      varchar[]       │              varchar[]              │  boolean  │
├─────────────┼─────────┼─────────────────────┼──────────────────────┼─────────────────────────────────────┼───────────┤
│ formula_one │ main    │ FONE_EVENT_CALENDAR │ [RoundNumber, Coun…  │ [BIGINT, VARCHAR, VARCHAR, VARCHA…  │ false     │
│ formula_one │ main    │ FONE_FP_LAPS        │ [Time, Driver, Dri…  │ [INTERVAL, VARCHAR, VARCHAR, INTE…  │ false     │
└─────────────┴─────────┴─────────────────────┴──────────────────────┴─────────────────────────────────────┴───────────┘

In [80]:
manager.con.sql('SELECT * FROM FONE_EVENT_CALENDAR LIMIT 10')

┌─────────────┬───────────┬─────────────┬───┬─────────────────────┬──────────────┬────────────────┬─────────────────┐
│ RoundNumber │  Country  │  Location   │ … │   Session5DateUtc   │ F1ApiSupport │ EventDate_year │ EventDate_month │
│    int64    │  varchar  │   varchar   │   │      timestamp      │   boolean    │     int32      │      int32      │
├─────────────┼───────────┼─────────────┼───┼─────────────────────┼──────────────┼────────────────┼─────────────────┤
│           1 │ Australia │ Melbourne   │ … │ 2000-03-12 00:00:00 │ false        │           2000 │               3 │
│           2 │ Brazil    │ São Paulo   │ … │ 2000-03-26 00:00:00 │ false        │           2000 │               3 │
│           3 │ Italy     │ Imola       │ … │ 2000-04-09 00:00:00 │ false        │           2000 │               4 │
│           4 │ UK        │ Silverstone │ … │ 2000-04-23 00:00:00 │ false        │           2000 │               4 │
│           5 │ Spain     │ Montmeló    │ … │ 2000-05-07

In [94]:
manager.con.execute('SELECT * FROM FONE_FP_LAPS LIMIT 3').df()

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,Sector2SessionTime,Sector3SessionTime,SpeedI1,SpeedI2,SpeedFL,SpeedST,IsPersonalBest,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate
0,0 days 00:23:29.480000,VER,1,NaT,1.0,1.0,0 days 00:21:31.781000,NaT,NaT,0 days 00:00:53.688000,0 days 00:00:26.442000,NaT,0 days 00:23:03.124000,0 days 00:23:29.480000,157.0,225.0,277.0,189.0,False,MEDIUM,1.0,True,Red Bull Racing,0 days 00:21:31.781000,2024-02-29 11:36:31.847,1,,False,,False,False
1,0 days 00:25:03.380000,VER,1,0 days 00:01:33.900000,2.0,1.0,NaT,NaT,0 days 00:00:30.147000,0 days 00:00:40.365000,0 days 00:00:23.388000,0 days 00:23:59.627000,0 days 00:24:39.992000,0 days 00:25:03.380000,234.0,268.0,280.0,312.0,True,MEDIUM,2.0,True,Red Bull Racing,0 days 00:23:29.480000,2024-02-29 11:38:29.546,1,,False,,False,True
2,0 days 00:27:38.741000,VER,1,NaT,3.0,1.0,NaT,0 days 00:27:36.999000,0 days 00:00:51.006000,0 days 00:01:05.853000,0 days 00:00:38.534000,0 days 00:25:54.524000,0 days 00:27:00.540000,0 days 00:27:38.741000,146.0,133.0,,128.0,False,MEDIUM,3.0,True,Red Bull Racing,0 days 00:25:03.380000,2024-02-29 11:40:03.446,1,,False,,False,False


In [78]:

.con.close()

## Raw Race Data Temp File Saving

In [4]:
# Getting session lap data, save to temp file
# sessionLapData = pd.DataFrame(my_race_session.laps)
sessionLapData = pd.read_csv(f'{TMP_PATH}/temp_laps.csv')
# temp_csver(df=y, path=TMP_PATH, temp_file_name='temp_laps.csv', header=True)

In [6]:
# creating mapping dict to convert pandas to duckdb
dtypeConversions = yaml_loaders(file_path= '../src/schemas/DtypeMappings.yaml', header='mappings')

In [8]:
sessionLapData.dtypes

Unnamed: 0              int64
Time                   object
Driver                 object
DriverNumber            int64
LapTime                object
LapNumber             float64
Stint                 float64
PitOutTime             object
PitInTime              object
Sector1Time            object
Sector2Time            object
Sector3Time            object
Sector1SessionTime     object
Sector2SessionTime     object
Sector3SessionTime     object
SpeedI1               float64
SpeedI2               float64
SpeedFL               float64
SpeedST               float64
IsPersonalBest           bool
Compound               object
TyreLife              float64
FreshTyre                bool
Team                   object
LapStartTime           object
LapStartDate           object
TrackStatus             int64
Position              float64
Deleted                  bool
DeletedReason         float64
FastF1Generated          bool
IsAccurate               bool
dtype: object

In [54]:
schema_dtype_mappings = yaml_loaders(file_path= '../src/schemas/DtypeMappings.yaml', header='mappings')
converter = SchemaCreator(schema_dtype_mappings)

lap_data_table_create = converter.create_duckdb_stg_schema(table_name='fone_session_lap_data',
                                                          table_dataframe=sessionLapData)

con = db.connect(f'../src/duckdb/formula_one.db')
con.sql(lap_data_table_create)
con.sql('INSERT INTO fone_session_lap_data SELECT * from sessionLapData')

In [10]:
con = db.connect(f'../src/duckdb/formula_one.db')

In [16]:
con.sql('show all tables'),
con.sql('describe fone_session_lap_data')

┌─────────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│   column_name   │ column_type │  null   │   key   │ default │ extra │
│     varchar     │   varchar   │ varchar │ varchar │ varchar │ int32 │
├─────────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ Time            │ INTERVAL    │ YES     │ NULL    │ NULL    │  NULL │
│ Driver          │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ DriverNumber    │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ LapTime         │ INTERVAL    │ YES     │ NULL    │ NULL    │  NULL │
│ LapNumber       │ DOUBLE      │ YES     │ NULL    │ NULL    │  NULL │
│ Stint           │ DOUBLE      │ YES     │ NULL    │ NULL    │  NULL │
│ PitOutTime      │ INTERVAL    │ YES     │ NULL    │ NULL    │  NULL │
│ PitInTime       │ INTERVAL    │ YES     │ NULL    │ NULL    │  NULL │
│ Sector1Time     │ INTERVAL    │ YES     │ NULL    │ NULL    │  NULL │
│ Sector2Time     │ INTERVAL    │ YES     │ NULL    │ NULL    │ 

In [20]:
sqltst = con.sql('describe fone_session_lap_data').df()

In [22]:
sqltst

Unnamed: 0,column_name,column_type,null,key,default,extra
0,Time,INTERVAL,YES,,,
1,Driver,VARCHAR,YES,,,
2,DriverNumber,VARCHAR,YES,,,
3,LapTime,INTERVAL,YES,,,
4,LapNumber,DOUBLE,YES,,,
5,Stint,DOUBLE,YES,,,
6,PitOutTime,INTERVAL,YES,,,
7,PitInTime,INTERVAL,YES,,,
8,Sector1Time,INTERVAL,YES,,,
9,Sector2Time,INTERVAL,YES,,,


In [11]:
con.sql('select * from fone_session_lap_data LIMIT 10')

┌──────────────┬─────────┬──────────────┬──────────────┬───┬─────────┬───────────────┬─────────────────┬────────────┐
│     Time     │ Driver  │ DriverNumber │   LapTime    │ … │ Deleted │ DeletedReason │ FastF1Generated │ IsAccurate │
│   interval   │ varchar │   varchar    │   interval   │   │ boolean │    varchar    │     boolean     │  boolean   │
├──────────────┼─────────┼──────────────┼──────────────┼───┼─────────┼───────────────┼─────────────────┼────────────┤
│ 00:21:14.563 │ VER     │ 1            │ NULL         │ … │ false   │               │ false           │ false      │
│ 00:22:49.992 │ VER     │ 1            │ 00:01:35.429 │ … │ false   │               │ false           │ true       │
│ 00:25:33.677 │ VER     │ 1            │ NULL         │ … │ false   │               │ false           │ false      │
│ 00:28:19.697 │ VER     │ 1            │ NULL         │ … │ false   │               │ false           │ false      │
│ 00:30:13.359 │ VER     │ 1            │ 00:01:53.662 │

In [11]:
# Getting car meta-data, combining list[dataframe] into single dataframe and saving to temp .csv file/
c = my_race_session.car_data
car_data_df = pd.concat([df.assign(Key=key) for key, df in c.items()],
                        ignore_index=True)
temp_csver(df=car_data_df, path=TMP_PATH, temp_file_name='temp_car_data.csv', header=True, index=False)

In [20]:
trackStsTmp = pd.DataFrame(my_race_session.track_status)
temp_csver(df=trackStsTmp, path=TMP_PATH, temp_file_name='temp_track_status.csv', header=True, index=False)

In [57]:
raceResults = my_race_session.results
temp_csver(df=raceResults, path=TMP_PATH, temp_file_name='temp_race_results.csv', header=True, index=False)

In [59]:
raceControlMsg = my_race_session.race_control_messages
temp_csver(df=raceControlMsg, path=TMP_PATH, temp_file_name='temp_race_control_msg.csv', header=True, index=False)

In [61]:
weatherData = my_race_session.weather_data
temp_csver(df=weatherData, path=TMP_PATH, temp_file_name='temp_weather_data.csv', header=True, index=False)

In [71]:
positionData = my_race_session.pos_data
pos_data_df = pd.concat([df.assign(Key=key) for key, df in positionData.items()],
                        ignore_index=True)
temp_csver(df=pos_data_df, path=TMP_PATH, temp_file_name='temp_pos_data.csv', header=True, index=False)


In [52]:
# Fetching Circuit Info
circuitInfo = my_race_session.get_circuit_info()

# Getting session event series, convert to DF and save to temp PDF
evntDF = pd.DataFrame(my_race_session.event).T
evntDF.columns = my_race_session.event.index
evntDF['t0_date'] = my_race_session.t0_date
evntDF['map_rotation'] = circuitInfo.rotation

temp_csver(df=evntDF, path=TMP_PATH, temp_file_name='temp_events.csv', header=True)

In [44]:
# Circuit info coordinate markers
temp_csver(df=circuitInfo.corners, path=TMP_PATH, temp_file_name='temp_track_corners.csv', header=True, index=False)
temp_csver(df=circuitInfo.marshal_sectors, path=TMP_PATH, temp_file_name='temp_marshal_sectors.csv', header=True, index=False)
temp_csver(df=circuitInfo.marshal_lights, path=TMP_PATH, temp_file_name='temp_marshal_lights.csv', header=True, index=False)

## Temp Data Convert to Parquet


In [3]:
carDataTable = csv.read_csv('/workspaces/formula-one-analytics/data/_tmp/temp_car_data.csv')

In [13]:
carDataTable.column_names

['Date',
 'RPM',
 'Speed',
 'nGear',
 'Throttle',
 'Brake',
 'DRS',
 'Source',
 'Time',
 'SessionTime',
 'Key']

In [17]:
new_column_names = [
    'session_date',
    'engine_rpm',
    'car_speed',
    'car_ngear',
    'car_throttle',
    'car_braking',
    'drs',
    'source',
    'time',
    'session_time',
    'key'
]

In [21]:
schema = pa.schema([
    ('session_date', pa.timestamp('ns')),
    ('engine_rpm', pa.int64()),
    ('car_speed', pa.int64()),
    ('car_ngear', pa.int64()),
    ('car_throttle', pa.int64()),
    ('car_braking', pa.int8()),
    ('drs', pa.int64()),
    ('source', pa.string()),
    ('time', pa.string()),
    ('session_time', pa.string()),
    ('key', pa.int64())
])

In [22]:
carDataTable = carDataTable.rename_columns(new_column_names)
carDataTable = carDataTable.cast(schema)
pq.write_table(carDataTable, '/fomula_one_project/data/2023/race_01/car_data.parquet')

FileNotFoundError: [Errno 2] Failed to open local file '/fomula_one_project/data/2023/race_01/car_data.parquet'. Detail: [errno 2] No such file or directory