# FastF1 Functionality Exploration

## Package Import and Configs

In [5]:
import os
import logging
import fastf1
import pandas as pd
import pyarrow as pa


from time import sleep
from pathlib import Path


# Configuring Logging
logging.basicConfig(
    encoding='utf-8',
    level=logging.ERROR,
    datefmt='%m/%d/%Y %I:%M:%S %p'
)

CACHE_PATH = Path('/workspaces/formula-one-analytics/data/.cache/')
TMP_PATH = Path('/workspaces/formula-one-analytics/data/_tmp/')

# fastF1 data cache config
if not CACHE_PATH.exists():
    logging.info('CACHE_PATH does not exist creating...')
    CACHE_PATH.mkdir(parents=True, exist_ok=True)

if not TMP_PATH.exists():
    logging.info('TMP_PATH does not exist creating...')
    TMP_PATH.mkdir(parents=True, exist_ok=True)

fastf1.Cache.enable_cache('/workspaces/formula-one-analytics/data/.cache/')


## Data Formatting and Saving Funcs

In [3]:
   
# Defining function to event calendar and save to parquet for later
def get_event_calendar(start_year: int, end_year: int, iteration_sleep: int=10) -> list:
    '''
    Sources a list of dataframes with Formula 1 Event Calendar schedule information fro
    the fastF1 API package and API.

    :param start_year: (int) Beginning year of range to pull event calendar schedules for
    :param end_year: (int) End year of range to pull event calendar schedules for
    :iteration_sleep: (int, default=10) Adjustable sleep interval to keep fastF1 api from
    blocking requests for data. fastF1 has protection built in, this allows for further 
    request safety.
    '''
    totalEventList = []
    for idx, year in enumerate(list(range(start_year, end_year+1))):
        totalEventList.append(fastf1.get_event_schedule(year))
        logging.info(f'Requesting the {year=} event schedule')
        sleep(iteration_sleep) # to throttle requests beyond built in fastF1 throttling

    return totalEventList


def concat_event_calendar(event_list: list) -> pd.DataFrame:
    '''
    Concats data from list of dataframes into a single dataframe

    :param event_list: (list) list of dataframes to concat into single dataframe
    :return: pandas Dataframe

    '''

    df = pd.concat(event_list)
    return df


def create_date_part_col(df: pd.DataFrame, date_column: str, date_part:str) -> pd.DataFrame:
    '''
    Creates a new column in supplied pd.DataFrame that is the specified DatetimeIndex part 
    of the given date_column.

    :param df: (pd.DataFrame) pandas DataFrame containing a date or datetime column
    :param date_column: (str) The name of the column with dates to extract the date_part from
    :param date_part: (str) The part of the date to extract ('year', 'month','day')

    :return: pandas DataFrame with the extracted date part in a new column.

    ::Example::
    -----------

    data = [{'eventDate': '2000-03-12', 'event': 'Woodstock'}, 
            {'eventDate': '2003-08-12', 'event': 'EdgeFest'}, 
            {'eventDate': '2009-03-12', 'event': 'Warped Tour'}, 
            {'eventDate': '2020-01-26', 'event': 'Electric Forest'}
       ]
    
    df = pd.DataFrame(data)
    df['eventDate'] = pd.to_datetime(df['eventDate'])

    |  eventDate  |    event           |
    | 2000-03-12  | 'Woodstock'        |
    | 2003-08-12  | 'EdgeFest'         |
    | 2009-03-12  | 'Warped Tour'      |
    | 2020-01-26  | 'Electric Forest'  |

    df = create_date_part_col(df=df, date_column='eventDate', date_part='year')

    |  eventDate  |    event           | eventDate_year |
    | 2000-03-12  | 'Woodstock'        |      2000      |
    | 2003-08-12  | 'EdgeFest'         |      2003      |
    | 2009-03-12  | 'Warped Tour'      |      2009      |
    | 2020-01-26  | 'Electric Forest'  |      2020      |


    '''

    valid_date_parts = {'day', 'month', 'year'}
    if date_part not in valid_date_parts:
        raise ValueError(f'Given date_part must be a valid date part of {valid_date_parts}')

    df['_'.join([date_column, date_part.lower()])] = (
        getattr(pd.DatetimeIndex(df[date_column]), date_part)
        )
    
    return df


def temp_csver(df: pd.DataFrame, path: str|Path, temp_file_name: str, **kwargs) -> None:
    '''
    Takes pd.DataFrame, path and filename string and saves a csv file to that location.
    Used instead of pd.to_csv() to handle using pathlib.Path objects with filenames.
    By default pd.to_csv() does not allow for the path_or_buf argument to be overloaded.

    :param df: pandas DataFrame
    :param path: (str|Path) path to location where csv will be saved
    :param temp_file_name: (str) the filename of temp .csv file to be written
    :param header: (bool) csv to have header True|False

    :return: NoneType
    '''

    if isinstance(path, Path):
        path_and_file = path/temp_file_name
    elif isinstance(path, str):
        path_and_file = '/'.join([path, temp_file_name])
        
    return df.to_csv(path_and_file, **kwargs)


## Reading in Event Schedules by Year

In [4]:
# Pulling down Formula Event Calendars from 2000 to 2023 seasons.
my_event_list = get_event_calendar(2000, 2024)
eventDF = concat_event_calendar(my_event_list)

# Adding new date part columns for parquet partitioning optimization
eventDF = create_date_part_col(eventDF, 'EventDate', 'year')
eventDF = create_date_part_col(eventDF, 'EventDate', 'month')

DEBUG:requests_cache.policy.actions:Cache directives from request headers: CacheDirectives()
DEBUG:requests_cache.policy.actions:Pre-read cache checks: Passed
DEBUG:requests_cache.policy.actions:Post-read cache actions: CacheActions(expire_after=datetime.timedelta(seconds=43200), resend_request=True)
DEBUG:requests_cache.session:Stale response; attempting to re-send request
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): ergast.com:443
DEBUG:urllib3.connectionpool:https://ergast.com:443 "GET /api/f1/2000.json HTTP/1.1" 200 6608
DEBUG:requests_cache.policy.actions:Cache directives from response headers: CacheDirectives(max_age=300)
DEBUG:requests_cache.policy.actions:Pre-write cache checks: Passed
INFO:root:Requesting the year=2000 event schedule
DEBUG:requests_cache.policy.actions:Cache directives from request headers: CacheDirectives()
DEBUG:requests_cache.policy.actions:Pre-read cache checks: Passed
DEBUG:requests_cache.policy.actions:Post-read cache actions: CacheAct

In [6]:
# Saving data to parquet file for easier access later.
eventDF.to_parquet('/workspaces/formula-one-analytics/data/eventCalendar.parquet',
                   partition_cols=['EventDate_year', 'EventDate_month'])

## Raw Race Data Temp Saving

In [7]:
my_race_session = fastf1.get_session(year=2023, gp=1, identifier=1)
# my_race_session.load(telemetry=False, laps=False, weather=False, messages=False)
my_race_session.load()

DEBUG:requests_cache.policy.actions:Cache directives from request headers: CacheDirectives()
DEBUG:requests_cache.policy.actions:Pre-read cache checks: Passed
DEBUG:requests_cache.policy.actions:Post-read cache actions: CacheActions(expire_after=datetime.timedelta(seconds=43200), send_request=True)
DEBUG:urllib3.connectionpool:Resetting dropped connection: raw.githubusercontent.com
DEBUG:urllib3.connectionpool:https://raw.githubusercontent.com:443 "GET /theOehrly/f1schedule/master/schedule_2023.json HTTP/1.1" 304 0
DEBUG:requests_cache.policy.actions:Cache directives from response headers: CacheDirectives(expires='Mon, 22 Jan 2024 00:09:12 GMT', max_age=300, etag='W/"2c123e151d63536e30cca380a63aabfa7b8ff9d8507e1967b959eb74e24a3b02"')
DEBUG:requests_cache.policy.actions:Pre-write cache checks: disabled status
DEBUG:requests_cache.policy.actions:Response for URL https://raw.githubusercontent.com/theOehrly/f1schedule/master/schedule_2023.json has not been modified
core           INFO 	Loa

In [10]:
# Getting session lap data, save to temp file
y = pd.DataFrame(my_race_session.laps)
temp_csver(df=y, path=TMP_PATH, temp_file_name='temp_laps.csv', header=True)

In [11]:
# Getting car meta-data, combining list[dataframe] into single dataframe and saving to temp .csv file/
c = my_race_session.car_data
car_data_df = pd.concat([df.assign(Key=key) for key, df in c.items()],
                        ignore_index=True)
temp_csver(df=car_data_df, path=TMP_PATH, temp_file_name='temp_car_data.csv', header=True, index=False)

In [20]:
trackStsTmp = pd.DataFrame(my_race_session.track_status)
temp_csver(df=trackStsTmp, path=TMP_PATH, temp_file_name='temp_track_status.csv', header=True, index=False)

In [57]:
raceResults = my_race_session.results
temp_csver(df=raceResults, path=TMP_PATH, temp_file_name='temp_race_results.csv', header=True, index=False)

In [59]:
raceControlMsg = my_race_session.race_control_messages
temp_csver(df=raceControlMsg, path=TMP_PATH, temp_file_name='temp_race_control_msg.csv', header=True, index=False)

In [61]:
weatherData = my_race_session.weather_data
temp_csver(df=weatherData, path=TMP_PATH, temp_file_name='temp_weather_data.csv', header=True, index=False)

In [71]:
positionData = my_race_session.pos_data
pos_data_df = pd.concat([df.assign(Key=key) for key, df in positionData.items()],
                        ignore_index=True)
temp_csver(df=pos_data_df, path=TMP_PATH, temp_file_name='temp_pos_data.csv', header=True, index=False)


In [52]:
# Fetching Circuit Info
circuitInfo = my_race_session.get_circuit_info()

# Getting session event series, convert to DF and save to temp PDF
evntDF = pd.DataFrame(my_race_session.event).T
evntDF.columns = my_race_session.event.index
evntDF['t0_date'] = my_race_session.t0_date
evntDF['map_rotation'] = circuitInfo.rotation

temp_csver(df=evntDF, path=TMP_PATH, temp_file_name='temp_events.csv', header=True)

In [44]:
# Circuit info coordinate markers
temp_csver(df=circuitInfo.corners, path=TMP_PATH, temp_file_name='temp_track_corners.csv', header=True, index=False)
temp_csver(df=circuitInfo.marshal_sectors, path=TMP_PATH, temp_file_name='temp_marshal_sectors.csv', header=True, index=False)
temp_csver(df=circuitInfo.marshal_lights, path=TMP_PATH, temp_file_name='temp_marshal_lights.csv', header=True, index=False)

## Temp Data Convert to Parquet


In [1]:
import pyarrow.csv as csv
import pyarrow.parquet as pq
import pyarrow as pa

In [3]:
carDataTable = csv.read_csv('/workspaces/formula-one-analytics/data/_tmp/temp_car_data.csv')

In [4]:
carDataSchema = pa.schema([
    'engine_rpm': pyar
])

pyarrow.Table
Date: timestamp[ns]
RPM: int64
Speed: int64
nGear: int64
Throttle: int64
Brake: bool
DRS: int64
Source: string
Time: string
SessionTime: string
Key: int64
----
Date: [[2023-03-03 11:15:04.633000000,2023-03-03 11:15:04.833000000,2023-03-03 11:15:05.073000000,2023-03-03 11:15:05.313000000,2023-03-03 11:15:05.593000000,...,2023-03-03 12:03:46.721000000,2023-03-03 12:03:46.921000000,2023-03-03 12:03:47.322000000,2023-03-03 12:03:47.522000000,2023-03-03 12:03:47.841000000],[2023-03-03 12:03:48.121000000,2023-03-03 12:03:48.281000000,2023-03-03 12:03:48.561000000,2023-03-03 12:03:48.761000000,2023-03-03 12:03:49.001000000,...,2023-03-03 11:28:51.835000000,2023-03-03 11:28:52.235000000,2023-03-03 11:28:52.435000000,2023-03-03 11:28:52.635000000,2023-03-03 11:28:52.835000000],...,[2023-03-03 11:19:31.434000000,2023-03-03 11:19:31.634000000,2023-03-03 11:19:31.834000000,2023-03-03 11:19:32.074000000,2023-03-03 11:19:32.274000000,...,2023-03-03 12:07:34.322000000,2023-03-03 12:07:3