In [109]:
import os
import logging
import fastf1
import pyarrow
import pandas as pd


from time import sleep
from pathlib import Path


# Configuring Logging
# NOTE(review): DEBUG on the root logger also surfaces third-party debug
# records (the requests_cache lines in this cell's output come from that).
logging.basicConfig(
    encoding='utf-8',
    level=logging.DEBUG,
    datefmt='%m/%d/%Y %I:%M:%S %p'
)

# Single source of truth for the working directories used by this notebook.
CACHE_PATH = Path('/workspaces/formula-one-analytics/data/.cache/')
TMP_PATH = Path('/workspaces/formula-one-analytics/data/_tmp/')

# fastF1 data cache config
# Create the directories on first run; the log line records that it happened.
if not CACHE_PATH.exists():
    logging.info('CACHE_PATH does not exist creating...')
    CACHE_PATH.mkdir(parents=True, exist_ok=True)

if not TMP_PATH.exists():
    logging.info('TMP_PATH does not exist creating...')
    TMP_PATH.mkdir(parents=True, exist_ok=True)

# Reuse CACHE_PATH rather than repeating the literal, so the directory that is
# created above is always the one fastF1 is pointed at.
fastf1.Cache.enable_cache(str(CACHE_PATH))


INFO:root:TMP_PATH does not exist creating...
DEBUG:requests_cache.backends:Initializing backend: sqlite /workspaces/formula-one-analytics/data/.cache/fastf1_http_cache
DEBUG:requests_cache.backends.base:Initialized SQLiteDict with serializer: SerializerPipeline(name=pickle, n_stages=2)
DEBUG:requests_cache.backends.sqlite:Opening connection to /workspaces/formula-one-analytics/data/.cache/fastf1_http_cache.sqlite:responses
DEBUG:requests_cache.backends.base:Initialized SQLiteDict with serializer: None
DEBUG:requests_cache.backends.sqlite:Opening connection to /workspaces/formula-one-analytics/data/.cache/fastf1_http_cache.sqlite:redirects


In [25]:
   
# Defining helper functions to fetch the event calendar and prepare/save it for later
def get_event_calendar(start_year: int, end_year: int, iteration_sleep: int=10) -> list:
    '''
    Sources a list of dataframes with Formula 1 Event Calendar schedule information from
    the fastF1 package and API.

    One schedule is requested per year, in ascending order. The function pauses
    between consecutive requests only, so no time is wasted after the final year.

    :param start_year: (int) Beginning year of range to pull event calendar schedules for
    :param end_year: (int) End year (inclusive) of range to pull event calendar schedules for
    :param iteration_sleep: (int, default=10) Adjustable sleep interval, in seconds, between
    requests to keep the fastF1 api from blocking requests for data. fastF1 has protection
    built in, this allows for further request safety.

    :return: (list) one event schedule per requested year; empty when
    start_year is greater than end_year.
    '''
    schedules = []
    for year in range(start_year, end_year + 1):
        if schedules:
            # to throttle requests beyond built in fastF1 throttling;
            # skipped before the first request, so never runs after the last one
            sleep(iteration_sleep)

        # Log before the call so the message is accurate if the request fails.
        logging.info(f'Requesting the {year=} event schedule')
        schedules.append(fastf1.get_event_schedule(year))

    return schedules


def concat_event_calendar(event_list: list) -> pd.DataFrame:
    '''
    Stacks every dataframe in event_list row-wise into one dataframe.

    The original row labels of each input frame are kept as-is, so the
    combined index may contain repeated values.

    :param event_list: (list) dataframes to combine, in the order they should appear
    :return: pandas DataFrame holding the rows of all inputs
    '''
    return pd.concat(event_list)


def create_date_part_col(df: pd.DataFrame, date_column: str, date_part:str) -> pd.DataFrame:
    '''
    Creates a new column in supplied pd.DataFrame that is the specified DatetimeIndex part 
    of the given date_column.

    The new column is named '<date_column>_<date_part>' with date_part lowercased.
    The column is added to the supplied dataframe in place and that same
    dataframe is returned.

    :param df: (pd.DataFrame) pandas DataFrame containing a date or datetime column
    :param date_column: (str) The name of the column with dates to extract the date_part from
    :param date_part: (str) The part of the date to extract ('year', 'month','day');
    matched case-insensitively, so 'Year' and 'YEAR' are accepted as well.

    :return: pandas DataFrame with the extracted date part in a new column.
    :raises ValueError: if date_part is not one of the supported date parts.

    ::Example::
    -----------

    data = [{'eventDate': '2000-03-12', 'event': 'Woodstock'}, 
            {'eventDate': '2003-08-12', 'event': 'EdgeFest'}, 
            {'eventDate': '2009-03-12', 'event': 'Warped Tour'}, 
            {'eventDate': '2020-01-26', 'event': 'Electric Forest'}
       ]
    
    df = pd.DataFrame(data)
    df['eventDate'] = pd.to_datetime(df['eventDate'])

    |  eventDate  |    event           |
    | 2000-03-12  | 'Woodstock'        |
    | 2003-08-12  | 'EdgeFest'         |
    | 2009-03-12  | 'Warped Tour'      |
    | 2020-01-26  | 'Electric Forest'  |

    df = create_date_part_col(df=df, date_column='eventDate', date_part='year')

    |  eventDate  |    event           | eventDate_year |
    | 2000-03-12  | 'Woodstock'        |      2000      |
    | 2003-08-12  | 'EdgeFest'         |      2003      |
    | 2009-03-12  | 'Warped Tour'      |      2009      |
    | 2020-01-26  | 'Electric Forest'  |      2020      |


    '''

    valid_date_parts = {'day', 'month', 'year'}

    # Normalise once so validation, attribute lookup and the column name all
    # agree; previously only the column name was lowercased.
    normalized_part = date_part.lower() if isinstance(date_part, str) else date_part
    if normalized_part not in valid_date_parts:
        raise ValueError(f'Given date_part must be a valid date part of {valid_date_parts}')

    # DatetimeIndex exposes .day/.month/.year; the resulting values are
    # assigned positionally to the new column.
    df['_'.join([date_column, normalized_part])] = (
        getattr(pd.DatetimeIndex(df[date_column]), normalized_part)
        )
    
    return df


def temp_csver(df: pd.DataFrame, path: str|Path, temp_file_name: str, header: bool=True) -> None:
    '''
    Takes pd.DataFrame, path and filename string and saves a csv file to that location.
    Used instead of pd.to_csv() to handle using pathlib.Path objects with filenames.
    By default pd.to_csv() does not allow for the path_or_buf argument to be overloaded.

    :param df: pandas DataFrame
    :param path: (str|Path) path to location where csv will be saved
    :param temp_file_name: (str) the filename of temp .csv file to be written
    :param header: (bool) csv to have header True|False

    :return: NoneType
    '''

    if isinstance(path, Path):
        path_and_file = path/temp_file_name
    elif isinstance(path, str):
        path_and_file = '/'.join([path, temp_file_name])
        
    return df.to_csv(path_and_file, header=header)


In [None]:
# Pulling down Formula 1 Event Calendars for the 2000 to 2023 seasons.
# get_event_calendar pauses between requests (10 seconds per season by
# default), so this cell takes a few minutes to run.
my_event_list = get_event_calendar(2000, 2023)
eventDF = concat_event_calendar(my_event_list)

# Adding new date part columns for parquet partitioning optimization
# (creates 'EventDate_year' and 'EventDate_month' on eventDF).
eventDF = create_date_part_col(eventDF, 'EventDate', 'year')
eventDF = create_date_part_col(eventDF, 'EventDate', 'month')

In [None]:
# Quick check of the derived year column alongside each event's RoundNumber.
eventDF[['EventDate_year', 'RoundNumber']]

In [39]:
# Saving data to parquet file for easier access later.
# The output is partitioned by the year and month columns derived from EventDate.
# NOTE(review): re-running this cell against an existing output location may
# add to what is already there rather than replace it — confirm before re-running.
eventDF.to_parquet('/workspaces/formula-one-analytics/data/eventCalendar.parquet',
                   partition_cols=['EventDate_year', 'EventDate_month'])

In [None]:
# Fetch one session from the 2018 season: gp=1 with session identifier 5
# (presumably the first round's race session — TODO confirm against fastF1 docs).
my_race_session = fastf1.get_session(year=2018, gp=1, identifier=5)
# Lighter alternative kept for reference: load without telemetry, laps, weather and messages.
# my_race_session.load(telemetry=False, laps=False, weather=False, messages=False)
# Load the session data with fastF1's default options.
my_race_session.load()

In [114]:
# Getting session event series, convert to a one-row DF and save to a temp CSV

# Transpose so the series becomes a single row, then label the columns with
# the series' own index.
x = pd.DataFrame(my_race_session.event).T
x.columns = my_race_session.event.index
temp_csver(df=x, path=TMP_PATH, temp_file_name='temp_events.csv', header=True)

In [118]:
# Getting session lap data, save to temp CSV file (temp_laps.csv under TMP_PATH)
y = pd.DataFrame(my_race_session.laps)
temp_csver(df=y, path=TMP_PATH, temp_file_name='temp_laps.csv', header=True)

In [125]:
# Display the car telemetry stored under key '5'
# (presumably a driver number — TODO confirm).
my_race_session.car_data['5']


Unnamed: 0,Date,RPM,Speed,nGear,Throttle,Brake,DRS,Source,Time,SessionTime
0,2018-03-25 05:06:03.659,0,0,0,0,False,1,car,-1 days +23:59:52.478000,-1 days +23:59:52.478000
1,2018-03-25 05:06:03.898,0,0,0,0,False,1,car,-1 days +23:59:52.717000,-1 days +23:59:52.717000
2,2018-03-25 05:06:04.138,0,0,0,0,False,1,car,-1 days +23:59:52.957000,-1 days +23:59:52.957000
3,2018-03-25 05:06:04.378,0,0,0,0,False,1,car,-1 days +23:59:53.197000,-1 days +23:59:53.197000
4,2018-03-25 05:06:04.618,0,0,0,0,False,1,car,-1 days +23:59:53.437000,-1 days +23:59:53.437000
...,...,...,...,...,...,...,...,...,...,...
26082,2018-03-25 06:49:14.121,0,0,0,0,False,0,car,0 days 01:43:02.940000,0 days 01:43:02.940000
26083,2018-03-25 06:49:14.361,0,0,0,0,False,0,car,0 days 01:43:03.180000,0 days 01:43:03.180000
26084,2018-03-25 06:49:14.722,0,0,0,0,False,0,car,0 days 01:43:03.541000,0 days 01:43:03.541000
26085,2018-03-25 06:49:14.961,0,0,0,0,False,0,car,0 days 01:43:03.780000,0 days 01:43:03.780000
