# Pre-processing and Feature Selection

## 1. Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the dataset from CSV into all_races_df.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs
# on a machine with this exact directory layout; consider a repo-relative path.
file_path = '/home/mgasilva/code/diegonbotelho/f1-tire-prediction/raw_data/df_all_races.csv'
all_races_df = pd.read_csv(file_path)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
all_races_df.describe()

In [None]:
# Show every column and up to 500 rows whenever a frame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = 500
all_races_df.head()

In [None]:
# Report the (rows, columns) size of the raw dataset.
print(f"Dataset shape: {all_races_df.shape}")

In [None]:
# Missing-data analysis: number of null entries in each column of the raw dataset.
print("Missing data per column:", all_races_df.isnull().sum(), sep="\n")

In [None]:
# Features that are not needed for the rest of the notebook.
columns_to_remove = [
    'Time', 'DriverNumber', 'PitOutTime', 'PitInTime',
    'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
    'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST',
    'IsPersonalBest', 'FreshTyre', 'Team',
    'LapStartTime', 'LapStartDate',
    'Deleted', 'DeletedReason', 'FastF1Generated', 'IsAccurate',
    'WindDirection', 'WindSpeed', 'Delta_Lap',
]

# drop() returns a new frame; all_races_df itself is left untouched.
new_df = all_races_df.drop(columns_to_remove, axis=1)
new_df.head()

In [None]:
# Report the (rows, columns) size after the unneeded columns were removed.
print(f"Dataset shape: {new_df.shape}")

In [None]:
# Missing-data analysis on the reduced frame: null count per remaining column.
print("Missing data per column:", new_df.isnull().sum(), sep="\n")

In [None]:
# Discard rows whose Position is missing. The remaining rows keep their
# original index labels, so the index is no longer contiguous after this step.
new_df = new_df.dropna(subset=['Position'])

In [None]:
# Re-check the null count per column now that rows without Position are gone.
print("Missing data per column:", new_df.isnull().sum(), sep="\n")

In [None]:
# Count rows that are exact duplicates of an earlier row.
print("\nNumber of duplicates:", new_df.duplicated().sum(), sep="\n")

In [None]:
# Strategy for missing values

# Fill null values in every numeric column with that column's median
# (the median is computed over all rows currently in new_df).
numerical_columns = new_df.select_dtypes(include=[np.number]).columns
new_df.loc[:, numerical_columns] = new_df[numerical_columns].fillna(new_df[numerical_columns].median())

# Verify missing values after treatment. Non-numeric columns are not touched
# by the fill above, so they can still report nulls here.
print("\nMissing values after treatment:")
print(new_df.isnull().sum())

In [None]:
# Number of rows for each distinct value of Compound.
new_df['Compound'].value_counts()

## 2. Scale the features

In [None]:
# Numerical variables
# NOTE(review): WindSpeed, SpeedI1, SpeedI2, SpeedFL, SpeedST, WindDirection and
# Delta_Lap were already dropped from new_df via columns_to_remove, and this
# list is not referenced by the scaling cells below. It looks like leftover
# documentation; confirm before relying on it.
numerical_features = [
    'LapTime',            # Lap time in seconds
    'TyreLife',           # Tyre life
    'AirTemp',            # Air temperature
    'TrackTemp',          # Track temperature
    'WindSpeed',          # Wind speed
    'SpeedI1',            # Speed in the first sector
    'SpeedI2',            # Speed in the second sector
    'SpeedFL',            # Speed at the finish line (presumably; confirm against the data source)
    'SpeedST',            # Speed at the speed trap (presumably; confirm against the data source)
    'Position',           # Position in the race
    'Humidity',           # Relative humidity of the air
    'Pressure',           # Atmospheric pressure
    'WindDirection',      # Wind direction
    'Sector1Time',        # Time in sector 1
    'Sector2Time',        # Time in sector 2
    'Sector3Time',        # Time in sector 3
    'LapNumber',          # Number of the lap
    'Delta_Lap'           # Time difference between two consecutive laps for each driver
]

In [None]:
# Categorical variables
# NOTE(review): FreshTyre was already dropped from new_df via columns_to_remove,
# and this list is not referenced by the encoding cells below (they use
# categoric_encodable_features instead); confirm whether it is still needed.
categorical_features = [
    'Compound',    # Tire type (SOFT, MEDIUM, HARD)
    'TrackStatus', # Track status (green flag, yellow flag, etc.)
    'FreshTyre',   # True or false
    'Rainfall'     # True or false
]

In [None]:
# Summary statistics of the numeric columns before outlier removal.
new_df.describe()

In [None]:
# Remove outliers from LapTime using the 1.5 * IQR rule.

# First and third quartiles of the lap times
Q1 = new_df['LapTime'].quantile(0.25)
Q3 = new_df['LapTime'].quantile(0.75)

# Interquartile range
IQR = Q3 - Q1

# Laps outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are treated as outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the in-range laps (both bounds inclusive). Take an explicit copy:
# a later cell adds the LapPct column to new_df, and writing into a
# boolean-filtered selection is what triggers pandas' SettingWithCopyWarning.
new_df = new_df[(new_df['LapTime'] >= lower_bound) & (new_df['LapTime'] <= upper_bound)].copy()

new_df.describe()


In [None]:
# # Remove Outliers from Pressure

# # Calculate Q1 and Q3
# Q1 = new_df['Pressure'].quantile(0.25)
# Q3 = new_df['Pressure'].quantile(0.75)

# # Calculate the IQR
# IQR = Q3 - Q1

# # Define boundaries for outliers
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR

# # Filter out outliers
# new_df = new_df[(new_df['Pressure'] >= lower_bound) & (new_df['Pressure'] <= upper_bound)]

# new_df.describe()


In [None]:

# Highest lap number seen for each (Event_Year, GrandPrix) pair, broadcast
# back onto every row of that pair.
laps_in_race = new_df.groupby(['Event_Year', 'GrandPrix'])['LapNumber'].transform('max')

# LapPct: the lap number as a fraction of that per-race maximum.
new_df['LapPct'] = new_df['LapNumber'] / laps_in_race

# Display a sample
new_df[['Driver','LapNumber','LapPct','Event_Year','GrandPrix']].head(200)


In [None]:
# Column dtypes before scaling.
new_df.dtypes

In [None]:
# Keep a snapshot of the cleaned frame before scaling: new_df is later replaced
# by the numeric transformer output, while the one-hot encoding step takes its
# categorical columns (Driver, GrandPrix, Compound) from this copy.

new_df_copy = new_df.copy()

In [None]:
# # Columns for Robust Scaler
# columns_for_robust_scaler = ['LapTime',
#                              'TyreLife',
#                              'AirTemp',
#                              'TrackTemp',
#                              'Humidity',
#                              'Pressure',
#                              'Sector1Time',
#                              'Sector2Time',
#                              'Sector3Time']

# # Columns for MinMax Scaler
# columns_for_minmax_scaler = ['Position', 'LapNumber']

In [None]:
# Median-impute missing values, then apply RobustScaler.
distribution_pipeline = Pipeline(
    steps=[
        ('Median_Imputer', SimpleImputer(strategy='median')),
        ('Robust_Scaler', RobustScaler()),
    ]
)

In [None]:
# Median-impute missing values, then rescale with MinMaxScaler (default range).
range_pipeline = Pipeline(
    steps=[
        ('Median_Imputer', SimpleImputer(strategy='median')),
        ('Minmax_Scaler', MinMaxScaler()),
    ]
)

In [None]:
# Median-impute missing values, then standardise with StandardScaler.
# NOTE(review): this pipeline is not referenced by numeric_preprocessor below.
normal_pipeline = Pipeline(
    steps=[
        ('Median_imputer', SimpleImputer(strategy='median')),
        ('Standard_Scaler', StandardScaler()),
    ]
)

In [None]:
# Median-impute, apply RobustScaler, then squeeze the result into [-1, 1]
# with MinMaxScaler (the target range is set via feature_range).
scaling_pipeline = Pipeline(
    steps=[
        ('Median_imputer', SimpleImputer(strategy='median')),
        ('robust_scaler', RobustScaler()),
        ('minmax_scaler', MinMaxScaler(feature_range=(-1, 1))),
    ]
)

In [None]:
# Route each numeric column to its scaling pipeline. LapPct is passed through
# unchanged; columns not listed here are left out of the output
# (ColumnTransformer's default remainder is 'drop').
numeric_preprocessor = ColumnTransformer(
    transformers=[
        (
            'robust_transformer',
            distribution_pipeline,
            ['LapTime', 'AirTemp', 'TrackTemp', 'Humidity'],
        ),
        (
            'range_transformer',
            range_pipeline,
            ['Position', 'Stint', 'TyreLife'],
        ),
        (
            'robust_and_scale',
            scaling_pipeline,
            ['Pressure', 'Sector1Time', 'Sector2Time', 'Sector3Time'],
        ),
        (
            'passthrough_cols',
            'passthrough',
            ['LapPct'],
        ),
    ]
)

In [None]:
# Display the ColumnTransformer defined above.
numeric_preprocessor

In [None]:
# Fit the column transformer on new_df and transform it in one step.
# NOTE(review): the imputation and scaling statistics are fitted on every row
# of new_df; if a train/test split happens later, test rows will have
# influenced them. Confirm this is intended.
transformed_df = numeric_preprocessor.fit_transform(new_df)

In [None]:
# Rebuild a labelled DataFrame from the transformer output.
# Reuse the original row index: new_df_copy (snapshotted before scaling) still
# carries it, and the final pd.concat(axis=1) aligns rows by index label.
# Rows were dropped earlier (missing Position, LapTime outliers), so a default
# RangeIndex here would not match that index and the concat would produce
# misaligned rows padded with NaN.
new_df = pd.DataFrame(
    transformed_df,
    columns=numeric_preprocessor.get_feature_names_out(),
    index=new_df.index,
)

In [None]:
# Summary statistics of the scaled numeric columns.
new_df.describe()

In [None]:
# Column dtypes of the scaled frame.
new_df.dtypes

In [None]:
# Summary statistics of the scaled numeric columns (same call as two cells above).
new_df.describe()

## 3. Encode features (OneHotEncoder)

In [None]:
# Categorical columns that are one-hot encoded in the cells below.
categoric_encodable_features = ['Driver', 'GrandPrix', 'Compound']

In [None]:
# Keep only the columns to be one-hot encoded. Take an explicit copy: the
# following cells add the encoded columns to this frame, and assigning into a
# column-subset selection is what triggers pandas' SettingWithCopyWarning.
new_df_copy = new_df_copy[categoric_encodable_features].copy()

In [None]:
# One-hot encode Driver: fit a dense encoder, append one indicator column per
# category, then remove the original column.
ohe = OneHotEncoder(sparse_output=False).fit(new_df_copy[['Driver']])

new_df_copy[ohe.get_feature_names_out()] = ohe.transform(new_df_copy[['Driver']])

new_df_copy = new_df_copy.drop("Driver", axis=1)
new_df_copy.head(3)

In [None]:
# One-hot encode GrandPrix: fit a dense encoder, append one indicator column
# per category, then remove the original column.
ohe = OneHotEncoder(sparse_output=False).fit(new_df_copy[['GrandPrix']])

new_df_copy[ohe.get_feature_names_out()] = ohe.transform(new_df_copy[['GrandPrix']])

new_df_copy = new_df_copy.drop("GrandPrix", axis=1)
new_df_copy.head(3)

In [None]:
# One-hot encode Compound: fit a dense encoder, append one indicator column
# per category, then remove the original column.
ohe = OneHotEncoder(sparse_output=False).fit(new_df_copy[['Compound']])

new_df_copy[ohe.get_feature_names_out()] = ohe.transform(new_df_copy[['Compound']])

new_df_copy = new_df_copy.drop("Compound", axis=1)

In [None]:
# First 20 rows of the one-hot encoded frame.
new_df_copy.head(20)

In [None]:
# (rows, columns) of the scaled numeric frame.
new_df.shape

In [None]:
# First rows of the scaled numeric frame.
new_df.head()

In [None]:
# Column dtypes of the scaled numeric frame.
new_df.dtypes

In [None]:
# Summary statistics of the scaled numeric frame.
new_df.describe()

In [None]:
# Summary statistics of the one-hot encoded frame.
new_df_copy.describe()

In [None]:
# Place the scaled numeric columns and the one-hot columns side by side.
# Column-wise concat aligns rows by index label, so both frames must carry
# the same index for their rows to line up.
final_df = pd.concat([new_df, new_df_copy], axis="columns")

In [None]:
# First rows of the combined frame.
final_df.head()

In [None]:
# Column dtypes of the combined frame.
final_df.dtypes