In [3]:
import os
import pandas as pd
import numpy as np
import fastf1
from keras.preprocessing.sequence import TimeseriesGenerator
pd.set_option('display.max_columns', None)
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer

# Fast F1 - collect the data

In [10]:
years = [2018, 2019, 2020, 2021, 2022, 2023]

rounds = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25]

cols_laps = ['Time', 'Driver', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint', 'PitOutTime', 'PitInTime', 'Sector1Time', 
        'Sector2Time', 'Sector3Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime', 'SpeedI1', 
        'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest', 'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime', 
        'LapStartDate', 'TrackStatus', 'Position', 'Deleted', 'DeletedReason', 'FastF1Generated', 'IsAccurate', 'uid']


In [None]:
years = [2018]

rounds = [1]

cols_laps = ['Time', 'Driver', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint', 'PitOutTime', 'PitInTime', 'Sector1Time', 
        'Sector2Time', 'Sector3Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime', 'SpeedI1', 
        'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest', 'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime', 
        'LapStartDate', 'TrackStatus', 'Position', 'Deleted', 'DeletedReason', 'FastF1Generated', 'IsAccurate', 'uid']

In [3]:
# create empty dataframes for the laps and for the weather data

df_complete = pd.DataFrame(columns=cols_laps)

# loop to collect the data from Fastf1 API
for y in years:
    for r in rounds:
        try:
            # Load and get access to session, creayte unique identifier for this session
            ses = fastf1.get_session(y, r, "R")
            uid = str(y) + "_" + str(r)
            ses.load(laps=True, weather=True)
            
            df_ses_laps = ses.laps
            df_ses_laps["uid"] = uid

            df_ses_w = ses.laps.get_weather_data()
            
            # Prepare lap and weather data for joining
            df_ses_laps = df_ses_laps.reset_index(drop=True)
            df_ses_w = df_ses_w.reset_index(drop=True)

            # Joining the laps and weather data: exclude the 'Time' column from weather data when joining
            joined = pd.concat([df_ses_laps, df_ses_w.loc[:, ~(df_ses_w.columns == 'Time')]], axis=1)

            # Concat to the dataframes
            df_complete = pd.concat([df_complete, joined], axis=0)
        except:
            continue

core           INFO 	Loading data for Australian Grand Prix - Race [v3.3.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	No cached data found for position_data. Loading data...
_api           INFO 	Fetching position data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['5', '44', '7', '3', '14', '33', '27', '77', '2', '55', '11', '31', '16', '18', '28', '8', '20', '10', '9', 

In [4]:
print(df_complete.shape)

(136122, 39)


In [5]:
df_complete.head()

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,Sector2SessionTime,Sector3SessionTime,SpeedI1,SpeedI2,SpeedFL,SpeedST,IsPersonalBest,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,uid,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed
0,0 days 00:08:53.241000,GAS,10,0 days 00:01:45.060000,1.0,,NaT,NaT,NaT,0 days 00:00:25.495000,0 days 00:00:37.955000,NaT,0 days 00:08:15.609000,0 days 00:08:53.829000,257.0,283.0,286.0,237.0,False,,,True,Toro Rosso,0 days 00:07:07.988000,2018-03-25 05:13:19.169,1,17.0,False,,False,False,2018_1,24.2,36.3,997.0,False,38.9,307,4.1
1,0 days 00:10:26.613000,GAS,10,0 days 00:01:33.372000,2.0,1.0,NaT,NaT,0 days 00:00:31.357000,0 days 00:00:24.825000,0 days 00:00:37.190000,0 days 00:09:24.632000,0 days 00:09:49.457000,0 days 00:10:26.647000,266.0,286.0,277.0,270.0,True,ULTRASOFT,1.0,True,Toro Rosso,0 days 00:08:53.241000,2018-03-25 05:15:04.422,1,17.0,False,,False,True,2018_1,24.2,36.3,996.9,False,38.2,296,3.8
2,0 days 00:11:59.474000,GAS,10,0 days 00:01:32.861000,3.0,1.0,NaT,NaT,0 days 00:00:31.160000,0 days 00:00:24.725000,0 days 00:00:36.976000,0 days 00:10:57.807000,0 days 00:11:22.532000,0 days 00:11:59.508000,260.0,283.0,294.0,277.0,True,ULTRASOFT,2.0,True,Toro Rosso,0 days 00:10:26.613000,2018-03-25 05:16:37.794,1,17.0,False,,False,True,2018_1,23.9,36.5,997.1,False,36.7,289,4.3
3,0 days 00:13:31.658000,GAS,10,0 days 00:01:32.184000,4.0,1.0,NaT,NaT,0 days 00:00:30.835000,0 days 00:00:24.730000,0 days 00:00:36.619000,0 days 00:12:30.343000,0 days 00:12:55.073000,0 days 00:13:31.692000,264.0,283.0,294.0,296.0,True,ULTRASOFT,3.0,True,Toro Rosso,0 days 00:11:59.474000,2018-03-25 05:18:10.655,1,17.0,False,,False,True,2018_1,23.8,35.7,997.1,False,36.6,210,2.7
4,0 days 00:15:03.990000,GAS,10,0 days 00:01:32.332000,5.0,1.0,NaT,NaT,0 days 00:00:30.716000,0 days 00:00:24.821000,0 days 00:00:36.795000,0 days 00:14:02.408000,0 days 00:14:27.229000,0 days 00:15:04.024000,266.0,283.0,283.0,295.0,False,ULTRASOFT,4.0,True,Toro Rosso,0 days 00:13:31.658000,2018-03-25 05:19:42.839,2,17.0,False,,False,True,2018_1,23.5,36.3,997.2,False,36.4,267,2.5


In [7]:
df_complete.to_csv("complete_data.csv", index=False)

# EDA

In [3]:
df_complete = pd.read_csv("complete_data.csv", low_memory=False)
df_complete.shape

(136122, 39)

In [4]:
df_complete.head()

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,Sector2SessionTime,Sector3SessionTime,SpeedI1,SpeedI2,SpeedFL,SpeedST,IsPersonalBest,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,uid,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed
0,0 days 00:08:53.241000,GAS,10,0 days 00:01:45.060000,1.0,,,,,0 days 00:00:25.495000,0 days 00:00:37.955000,,0 days 00:08:15.609000,0 days 00:08:53.829000,257.0,283.0,286.0,237.0,False,,,True,Toro Rosso,0 days 00:07:07.988000,2018-03-25 05:13:19.169,1.0,17.0,False,,False,False,2018_1,24.2,36.3,997.0,False,38.9,307,4.1
1,0 days 00:10:26.613000,GAS,10,0 days 00:01:33.372000,2.0,1.0,,,0 days 00:00:31.357000,0 days 00:00:24.825000,0 days 00:00:37.190000,0 days 00:09:24.632000,0 days 00:09:49.457000,0 days 00:10:26.647000,266.0,286.0,277.0,270.0,True,ULTRASOFT,1.0,True,Toro Rosso,0 days 00:08:53.241000,2018-03-25 05:15:04.422,1.0,17.0,False,,False,True,2018_1,24.2,36.3,996.9,False,38.2,296,3.8
2,0 days 00:11:59.474000,GAS,10,0 days 00:01:32.861000,3.0,1.0,,,0 days 00:00:31.160000,0 days 00:00:24.725000,0 days 00:00:36.976000,0 days 00:10:57.807000,0 days 00:11:22.532000,0 days 00:11:59.508000,260.0,283.0,294.0,277.0,True,ULTRASOFT,2.0,True,Toro Rosso,0 days 00:10:26.613000,2018-03-25 05:16:37.794,1.0,17.0,False,,False,True,2018_1,23.9,36.5,997.1,False,36.7,289,4.3
3,0 days 00:13:31.658000,GAS,10,0 days 00:01:32.184000,4.0,1.0,,,0 days 00:00:30.835000,0 days 00:00:24.730000,0 days 00:00:36.619000,0 days 00:12:30.343000,0 days 00:12:55.073000,0 days 00:13:31.692000,264.0,283.0,294.0,296.0,True,ULTRASOFT,3.0,True,Toro Rosso,0 days 00:11:59.474000,2018-03-25 05:18:10.655,1.0,17.0,False,,False,True,2018_1,23.8,35.7,997.1,False,36.6,210,2.7
4,0 days 00:15:03.990000,GAS,10,0 days 00:01:32.332000,5.0,1.0,,,0 days 00:00:30.716000,0 days 00:00:24.821000,0 days 00:00:36.795000,0 days 00:14:02.408000,0 days 00:14:27.229000,0 days 00:15:04.024000,266.0,283.0,283.0,295.0,False,ULTRASOFT,4.0,True,Toro Rosso,0 days 00:13:31.658000,2018-03-25 05:19:42.839,2.0,17.0,False,,False,True,2018_1,23.5,36.3,997.2,False,36.4,267,2.5


Calculate average stint duration

In [3]:
grouped_stint_length = df_complete.groupby(["uid", "Driver", "Stint"]).describe()

In [21]:
stint_lenghths = grouped_stint_length["LapNumber"]["count"]

In [23]:
print(stint_lenghths)
print(stint_lenghths.shape)

uid     Driver  Stint
2018_1  ALO     1.0      25.0
                2.0      32.0
        BOT     1.0      24.0
                2.0      33.0
        ERI     1.0       5.0
                         ... 
2023_9  VER     5.0       2.0
        ZHO     1.0       2.0
                2.0       6.0
                3.0      30.0
                4.0      32.0
Name: count, Length: 6706, dtype: float64
(6706,)


In [25]:
sum_lenghths = 0
stints = 0

for l in stint_lenghths:
    sum_lenghths += l
    stints += 1

In [28]:
average_stint_length = sum_lenghths/stints
print(average_stint_length)

20.247390396659707


# Pre-processing

Delete all first laps

In [5]:
# Delete all first laps 
df_complete = df_complete[df_complete["LapNumber"]!= 1.0]

Feature selection

In [107]:
df_complete.head(1)

Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,Sector3Time,Sector1SessionTime,Sector2SessionTime,Sector3SessionTime,SpeedI1,SpeedI2,SpeedFL,SpeedST,IsPersonalBest,Compound,TyreLife,FreshTyre,Team,LapStartTime,LapStartDate,TrackStatus,Position,Deleted,DeletedReason,FastF1Generated,IsAccurate,uid,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed
0,0 days 00:08:53.241000,GAS,10,0 days 00:01:45.060000,1.0,,,,,0 days 00:00:25.495000,0 days 00:00:37.955000,,0 days 00:08:15.609000,0 days 00:08:53.829000,257.0,283.0,286.0,237.0,False,,,True,Toro Rosso,0 days 00:07:07.988000,2018-03-25 05:13:19.169,1.0,17.0,False,,False,False,2018_1,24.2,36.3,997.0,False,38.9,307,4.1


In [6]:
# Select the variables that we will be using 
df_complete = df_complete[["uid", "Driver", "LapTime", "Stint", "PitOutTime", "PitInTime", "Compound", "TyreLife", "Team", "TrackStatus", "Position",
                           "AirTemp", "Humidity", "Pressure", "Rainfall", "TrackTemp", "WindDirection", "WindSpeed"]]

print(df_complete.shape)

(133627, 18)


Feature creation - Laptype

In [7]:
def get_lap_type(row):
    if pd.notnull(row['PitInTime']):
        return 'InLap'
    elif pd.notnull(row['PitOutTime']):
        return 'Outlap'
    else:
        return 'Lap'

In [8]:
# Apply the function to create new column "LapType"
df_complete['LapType'] = df_complete.apply(get_lap_type, axis=1)
df_complete.drop(["PitOutTime", "PitInTime"], axis=1, inplace=True)

Lap Time pre-processing

In [9]:
# Make LapTime of type timedelta
df_complete["LapTime"] = pd.to_timedelta(df_complete["LapTime"])

In [10]:
# Custom function to convert timedelta to milliseconds
def convert_timedelta_to_milliseconds(td):
    return td.total_seconds() * 1000 + td.microseconds / 1000

# Apply the custom function to the 'lap_time' columm
df_complete['LapTime'] = df_complete['LapTime'].apply(convert_timedelta_to_milliseconds)

Create the windows - sliding windows univariate

In [None]:
# Group by "uid" and "Driver"
grouped_df = df_complete.groupby(["uid", "Driver"])

# Initialize an empty array to append the resulting arrays

total_input_windows_u = np.empty((0, 10))
total_output_windows_u = np.empty((0, 10))

# Iterate over each group and convert to sliding windows
for (uid, driver), groupu in grouped_df:

   input_seq_u = groupu["LapTime"].to_list()

   # Make sure that sequences that have less than 20 data points are not included
   if len(input_seq_u) < 20:
      continue

   output_multi_u = pd.concat([groupu["LapTime"], groupu["LapTime"].shift(-1), groupu["LapTime"].shift(-2), 
                          groupu["LapTime"].shift(-3), groupu["LapTime"].shift(-4), groupu["LapTime"].shift(-5), groupu["LapTime"].shift(-6), 
                          groupu["LapTime"].shift(-7), groupu["LapTime"].shift(-8), groupu["LapTime"].shift(-9)], axis=1).to_numpy().tolist()
   
   ts_generator_u = TimeseriesGenerator(input_seq_u, output_multi_u, length = 10)

   input_windows_u = ts_generator_u[0][0][:-9] # --> [:-9] because the last n widows (n = how many shifts are executed) are not complete (non existent)
   output_windows_u = ts_generator_u[0][1][:-9]

   total_input_windows_u = np.vstack((total_input_windows_u, input_windows_u))
   total_output_windows_u = np.vstack((total_output_windows_u, output_windows_u))

In [32]:
print(total_output_windows_u.shape)
print(total_input_windows_u.shape)

(88757, 10)
(88757, 10)


In [116]:
np.save("input_windows_univariate", total_input_windows_u)
np.save("output_windows_univariate", total_output_windows_u)

Create the windows - sliding windows multivariate

In [88]:
# Group by "uid" and "Driver"
grouped_df = df_complete.groupby(["uid", "Driver"])

# Initialize an empty array to append the resulting arrays

total_input_windows_m = np.empty((0, 10, 16))
total_output_windows_m = np.empty((0, 10))

# Iterate over each group and convert to sliding windows
for (uid, driver), groupm in grouped_df:

   # THIS HAS TO BE ADAPTED TO THE ENCODINGS
   input_seq_m = groupm[["LapTime", "Driver", "Stint", "Compound", "TyreLife", "Team", "TrackStatus", "Position",
                         "AirTemp", "Humidity", "Pressure", "Rainfall", "TrackTemp", "WindDirection", "WindSpeed", "LapType"]].to_numpy().tolist()

   # Make sure that sequences that have less than 20 data points are not included
   if len(input_seq_m) < 20:
      continue

   output_multi_m = pd.concat([groupm["LapTime"], groupm["LapTime"].shift(-1), groupm["LapTime"].shift(-2), 
                          groupm["LapTime"].shift(-3), groupm["LapTime"].shift(-4), groupm["LapTime"].shift(-5), groupm["LapTime"].shift(-6), 
                          groupm["LapTime"].shift(-7), groupm["LapTime"].shift(-8), groupm["LapTime"].shift(-9)], axis=1).to_numpy().tolist()
   
   ts_generator_m = TimeseriesGenerator(input_seq_m, output_multi_m, length = 10)

   input_windows_m = ts_generator_m[0][0][:-9] # --> [:-9] because the last n widows (n = how many shifts are executed) are not complete (non existent)
   output_windows_m = ts_generator_m[0][1][:-9]

   total_input_windows_m = np.vstack((total_input_windows_m, input_windows_m))
   total_output_windows_m = np.vstack((total_output_windows_m, output_windows_m))

In [89]:
print(total_output_windows_m.shape)
print(total_input_windows_m.shape)

(88757, 10)
(88757, 10, 16)


In [115]:
np.save("input_windows_multivariate", total_input_windows_m)
np.save("output_windows_multivariate", total_output_windows_m )

Dealing with missing values for laptimes - consecutive missing

In [129]:
total_input_windows_u = np.load("unprocessed_windows/unprocessed_windows/input_windows_univariate.npy")
total_output_windows_u = np.load("unprocessed_windows/unprocessed_windows/output_windows_univariate.npy")
total_input_windows_m = np.load("unprocessed_windows/unprocessed_windows/input_windows_multivariate.npy")
total_output_windows_m = np.load("unprocessed_windows/unprocessed_windows/output_windows_multivariate.npy")

In [130]:
def find_consecutive_null_rows(arr):
    null_rows = []
    for i, row in enumerate(arr):
        consecutive_null_count = 0
        for value in row:
            if np.isnan(value):  
                consecutive_null_count += 1
                if consecutive_null_count >= 2:
                    null_rows.append(i)
                    break
            else:
                consecutive_null_count = 0
    return null_rows

In [131]:
# Delete rows that have consecutive null values in the input windows
null_indexes_univariate_input = find_consecutive_null_rows(total_input_windows_u)

input_windows_u_c = np.delete(total_input_windows_u, null_indexes_univariate_input, axis=0)
output_windows_u_c = np.delete(total_output_windows_u, null_indexes_univariate_input, axis=0)

# Delete rows that have consecutive null values in the output windows
null_indexes_univariate_output = find_consecutive_null_rows(output_windows_u_c)

output_windows_u_c = np.delete(output_windows_u_c, null_indexes_univariate_output, axis=0)
input_windows_u_c = np.delete(input_windows_u_c, null_indexes_univariate_output, axis=0)

In [132]:
# Delete rows multivariate
input_windows_m_c = np.delete(total_input_windows_m, null_indexes_univariate_input, axis=0)
output_windows_m_c = np.delete(total_output_windows_m, null_indexes_univariate_input, axis=0)

output_windows_m_c = np.delete(output_windows_m_c, null_indexes_univariate_output, axis=0)
input_windows_m_c = np.delete(input_windows_m_c, null_indexes_univariate_output, axis=0)

In [133]:
print(input_windows_u_c.shape)
print(output_windows_u_c.shape)

(83516, 10)
(83516, 10)


In [134]:
print(input_windows_m_c.shape)
print(output_windows_m_c.shape)

(83516, 10, 16)
(83516, 10)


Dealing with missing values for laptimes - individual missing

In [135]:
# Univariate windows
df_uni_input = pd.DataFrame(input_windows_u_c)
df_uni_input.fillna(axis=1, method='backfill', inplace=True)
df_uni_input.fillna(axis=1, method='ffill', inplace=True)
input_windows_uni = df_uni_input.to_numpy()

df_uni_output = pd.DataFrame(output_windows_u_c)
df_uni_output.fillna(axis=1, method='backfill', inplace=True)
df_uni_output.fillna(axis=1, method='ffill', inplace=True)
output_windows_uni = df_uni_output.to_numpy()

In [35]:
# Multivariate windows
# Create an empty array to store the filled values
input_windows_multi = np.empty_like(input_windows_m_c)

# Iterate over each 2-dimensional array
for i in range(input_windows_m_c.shape[0]):
    df_multi_input = pd.DataFrame(input_windows_m_c[i])
    df_multi_input.fillna(axis=0, method='backfill', inplace=True)
    df_multi_input.fillna(axis=0, method='ffill', inplace=True)
    input_windows_multi[i] = df_multi_input.to_numpy()


In [36]:
df_multi_output = pd.DataFrame(output_windows_m_c)
df_multi_output.fillna(axis=1, method='backfill', inplace=True)
df_multi_output.fillna(axis=1, method='ffill', inplace=True)
output_windows_multi = df_multi_output.to_numpy()

In [11]:
np.save("data_preprocessed/input_windows_uni", input_windows_uni)
np.save("data_preprocessed/input_windows_multi", input_windows_multi)
np.save("data_preprocessed/output_windows_uni", output_windows_uni)
np.save("data_preprocessed/output_windows_multi", output_windows_multi)

Encoding

In [11]:
df_complete.head()

Unnamed: 0,uid,Driver,LapTime,Stint,Compound,TyreLife,Team,TrackStatus,Position,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed,LapType
1,2018_1,GAS,93744.0,1.0,ULTRASOFT,1.0,Toro Rosso,1.0,17.0,24.2,36.3,996.9,False,38.2,296,3.8,Lap
2,2018_1,GAS,93722.0,1.0,ULTRASOFT,2.0,Toro Rosso,1.0,17.0,23.9,36.5,997.1,False,36.7,289,4.3,Lap
3,2018_1,GAS,92368.0,1.0,ULTRASOFT,3.0,Toro Rosso,1.0,17.0,23.8,35.7,997.1,False,36.6,210,2.7,Lap
4,2018_1,GAS,92664.0,1.0,ULTRASOFT,4.0,Toro Rosso,2.0,17.0,23.5,36.3,997.2,False,36.4,267,2.5,Lap
5,2018_1,GAS,98360.0,1.0,ULTRASOFT,5.0,Toro Rosso,2.0,16.0,23.5,36.6,997.0,False,36.2,295,3.5,Lap


In [48]:
# Define column indices for numerical and categorical columns
numerical_columns = [0, 2, 4, 7, 8, 9, 10, 12, 13, 14]
categorical_columns = [1, 3, 5, 11, 15]
track_status = [6]

# Separate numerical and categorical data using column indices
numerical_data = input_windows_multi[:, :, numerical_columns].reshape(-1, len(numerical_columns))
categorical_data = input_windows_multi[:, :, categorical_columns].reshape(-1, len(categorical_columns))
track_status = input_windows_multi[:, :, track_status].reshape(-1, len(track_status))

In [49]:
print(numerical_data.shape)
print(categorical_data.shape)
print(track_status.shape)

(835160, 10)
(835160, 5)
(835160, 1)


In [50]:
# Initialize OneHotEncoder
encoder = OneHotEncoder(sparse=True)

# Fit and transform the categorical data
encoded_categorical_data = encoder.fit_transform(categorical_data).toarray()

In [51]:
# Encoding trackstatus

# Get the track status code in the right formnat
track_status = track_status.astype(float)
track_status = track_status.astype(int)
track_status = track_status.astype(str)

# Transform each element into a tuple of characters
track_status_r = np.array([tuple(val[0]) for val in track_status], dtype=object)

mlb = MultiLabelBinarizer()
track_status_encoded = mlb.fit_transform(track_status_r)
mlb.classes_

array(['1', '2', '4', '5', '6', '7'], dtype=object)

In [52]:
print(numerical_data.shape)
print(encoded_categorical_data.shape)
print(track_status_encoded.shape)

(835160, 10)
(835160, 67)
(835160, 6)


In [53]:
numerical_data = numerical_data.astype(float)
encoded_categorical_data = encoded_categorical_data.astype(float)
track_status_encoded = track_status_encoded.astype(float)

In [55]:
# Concatenate numerical and encoded categorical data along axis=1
encoded_data = np.concatenate((numerical_data, encoded_categorical_data, track_status_encoded), axis=1)

In [56]:
print(encoded_data.shape)

(835160, 83)


In [59]:
input_windows_multi_encoded = encoded_data.reshape(83516, 10, encoded_data.shape[1])
print(input_windows_multi_encoded.shape)

(83516, 10, 83)


In [60]:
print(input_windows_multi_encoded[0])

[['92130.0' '1.0' '1.0' '10.0' '24.2' '36.3' '996.9' '38.2' '296' '3.8'
  '0.0' '0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
  '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
  '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
  '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '1.0' '0.0' '0.0' '0.0' '0.0'
  '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.0'
  '0.0' '0.0' '1.0' '0.0' '0.0' '1.0' '0.0' '1' '0' '0' '0' '0' '0']
 ['91608.0' '1.0' '2.0' '10.0' '23.9' '36.5' '997.1' '36.7' '289' '4.3'
  '0.0' '0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
  '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
  '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0'
  '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '1.0' '0.0' '0.0' '0.0' '0.0'
  '0.0' '0.0' '0.0' '0.0' '0.0' '0.0' '1.0' '0.0' '0.0' '0.0' '0.0' '0.0'
  '0.0' '0.0' '1.0' '0.0' '0.0' '1.0' '0.0' '1'

In [61]:
encoder.get_feature_names_out()

array(['x0_AIT', 'x0_ALB', 'x0_ALO', 'x0_BOT', 'x0_DEV', 'x0_ERI',
       'x0_FIT', 'x0_GAS', 'x0_GIO', 'x0_GRO', 'x0_HAM', 'x0_HAR',
       'x0_HUL', 'x0_KUB', 'x0_KVY', 'x0_LAT', 'x0_LAW', 'x0_LEC',
       'x0_MAG', 'x0_MAZ', 'x0_MSC', 'x0_NOR', 'x0_OCO', 'x0_PER',
       'x0_PIA', 'x0_RAI', 'x0_RIC', 'x0_RUS', 'x0_SAI', 'x0_SAR',
       'x0_SIR', 'x0_STR', 'x0_TSU', 'x0_VAN', 'x0_VER', 'x0_VET',
       'x0_ZHO', 'x1_HARD', 'x1_HYPERSOFT', 'x1_INTERMEDIATE',
       'x1_MEDIUM', 'x1_SOFT', 'x1_SUPERSOFT', 'x1_ULTRASOFT', 'x1_WET',
       'x1_nan', 'x2_Alfa Romeo', 'x2_Alfa Romeo Racing', 'x2_AlphaTauri',
       'x2_Alpine', 'x2_Aston Martin', 'x2_Ferrari', 'x2_Force India',
       'x2_Haas F1 Team', 'x2_McLaren', 'x2_Mercedes', 'x2_Racing Point',
       'x2_Red Bull Racing', 'x2_Renault', 'x2_Sauber', 'x2_Toro Rosso',
       'x2_Williams', 'x3_False', 'x3_True', 'x4_InLap', 'x4_Lap',
       'x4_Outlap'], dtype=object)

In [65]:
np.save("input_windows_multi_encoded", input_windows_multi_encoded)

Normalising

In [70]:
input_windows_multi = np.load("encoded_multi_input/input_windows_multi_encoded.npy")

In [71]:
print(input_windows_multi.shape)
print(output_windows_multi.shape)
print()
print(input_windows_uni.shape)
print(output_windows_uni.shape)

(83516, 10, 83)
(83516, 10)

(83516, 10)
(83516, 10)


In [68]:
# Normalise all values
scaler = StandardScaler()

output_windows_multi = scaler.fit_transform(output_windows_multi)
input_windows_uni = scaler.fit_transform(input_windows_uni)
output_windows_uni = scaler.fit_transform(output_windows_uni)

In [73]:
# Standardize only the numerial values in multi
numericals = input_windows_multi[:,:,:10]
categoricals = input_windows_multi[:,:,10:]
print(numericals.shape)
print(categoricals.shape)


scaler = StandardScaler()


# Reshape the multi input nto a 2D array
num_samples, num_timesteps, num_features = numericals.shape
numericals_2d = numericals.reshape(num_samples * num_timesteps, num_features)


numericals_std_2d = scaler.fit_transform(numericals_2d)

# Reshape multi input back to 3D array
numericals_std = numericals_std_2d.reshape(num_samples, num_timesteps, num_features)

(83516, 10, 10)
(83516, 10, 73)


In [75]:
print(numericals_std.shape)
print(categoricals.shape)

(83516, 10, 10)
(83516, 10, 73)


In [76]:
# concatenate categoricals and numericals_std
input_windows_multi =  np.concatenate((numericals_std, categoricals), axis=2)

In [47]:
np.save("data_ready/input_windows_uni_C", input_windows_uni)
np.save("data_ready/output_windows_uni_C", output_windows_uni)
np.save("data_ready/input_windows_multi_C", input_windows_multi)
np.save("data_ready/output_windows_multi_C", output_windows_multi)