In [None]:
# Import standard libraries
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader

# Import custom modules
from data_preparation import load_and_preprocess_data, prepare_sequence_data, split_data_by_race, save_data_splits, prepare_regression_data
from features import RaceFeatures
from lstm import F1PredictionModel, F1Dataset, F1DataPreprocessor, train_model, save_model_with_preprocessor
from evaluation import evaluate_model, plot_predictions

# Run the project's loading/preprocessing helper (imported from
# data_preparation) and keep its result as `df` for the cells below.
print("Loading and preprocessing data...")
df = load_and_preprocess_data()

# Last expression of the cell: renders a preview of the first rows.
df.head()


In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA


# Build the feature matrix X and the regression target y from df.
# Everything listed here is removed from the features; "milliseconds" is
# included because it is the target itself.
non_feature_columns = [
    "cumulative_milliseconds",
    "positionOrder",
    "date",
    "driverRef",
    "number",
    "date_race",
    "time_race",
    "time",
    "forename",
    "surname",
    "dob",
    "url_race",
    "location",
    "circuitRef",
    "milliseconds",
]
X = df.drop(columns=non_feature_columns)
y = df["milliseconds"]

# Not the last expression of the cell, so this preview is not rendered.
X.head()

# Prints the column dtypes and non-null counts of the feature frame.
X.info()

In [None]:
# Replace each listed categorical column with one indicator column per
# category value.
categorical_columns = ["code", "nationality", "status", "circuit_type", "country"]
X = pd.get_dummies(X, columns=categorical_columns)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fit on the full feature matrix, i.e. before the
# train/validation/test split performed later in the notebook, so held-out
# rows contribute to the scaling statistics. Consider fitting on the training
# split only.
scaler = StandardScaler()

# fit_transform returns a bare array; wrap it back into a DataFrame that keeps
# both the column names and the original row index, so X_normalized stays
# label-aligned with y (which is taken from df and carries df's index).
X_normalized = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

# Display the first few rows of the normalized data
X_normalized.head()

In [None]:
# Exploratory PCA on the standardized feature matrix (X_normalized), keeping
# every component.
pca = PCA(n_components=None)
dfx_pca = pca.fit(X_normalized)  # fit() hands back the fitted PCA object

# Principal axes, one row per component: shape (n_components, n_features).
eigenvectors = pca.components_

# Variance captured by each component, and the same as a share of the total.
eigenvalues = pca.explained_variance_
explained_variance_ratio = pca.explained_variance_ratio_

# Loadings table: features as rows, principal components as columns.
loadings = pd.DataFrame(
    eigenvectors.T,
    index=X.columns,
    columns=[f"PC{k}" for k in range(1, eigenvectors.shape[0] + 1)],
)

print("PCA Loadings (Feature Contributions):")
print(loadings)

print("\nExplained Variance Ratio:")
variance_ratio_by_pc = pd.Series(
    explained_variance_ratio,
    index=[f"PC{k}" for k in range(1, len(explained_variance_ratio) + 1)],
)
print(variance_ratio_by_pc)

# Rank features by the magnitude of their loading on the first component.
pc1_abs_loadings = loadings["PC1"].abs()
important_features_pc1 = pc1_abs_loadings.sort_values(ascending=False)
print("\nMost Important Features for PC1:")
print(important_features_pc1)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of absolute PC1 loadings, drawn with white text on a black canvas.
fig, ax = plt.subplots(figsize=(20, 6))
important_features_pc1.plot(kind="bar", color="skyblue", ax=ax)

# Title and axis labels in white so they remain readable on black.
ax.set_title("Feature Importance for PC1", fontsize=16, color="white")
ax.set_ylabel("Loading Score (Absolute Value)", fontsize=12, color="white")
ax.set_xlabel("Features", fontsize=12, color="white")

# Tick labels: white, with the feature names rotated to vertical.
plt.setp(ax.get_xticklabels(), rotation=90, fontsize=10, color="white")
plt.setp(ax.get_yticklabels(), fontsize=10, color="white")

# Black background for both the axes area and the surrounding figure.
ax.set_facecolor("black")
fig.set_facecolor("black")

fig.tight_layout()
plt.show()

In [None]:
# Scree plot: eigenvalue (explained variance) of each principal component,
# in component order.
plt.figure(figsize=(20, 6))
plt.plot(
    range(1, len(eigenvalues) + 1), eigenvalues, marker="o", linestyle="--", color="r"
)
plt.title("Scree Plot", color="white")
plt.xlabel("Principal Component Index", color="white")
plt.ylabel("Eigenvalue", color="white")
plt.xticks(range(1, len(eigenvalues) + 1), color="white")
plt.yticks(color="white")
plt.grid(axis="y", linestyle="--", alpha=0.7)

# The title, axis labels and tick labels are white and are drawn on the figure
# area outside the axes, so the figure background must be black as well as the
# axes background (same treatment as the PC1 bar chart).
plt.gca().set_facecolor("black")
plt.gcf().set_facecolor("black")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized, y, test_size=0.2, random_state=42
)

# Second cut: take 25% of the remaining 80% as validation
# (0.25 * 0.8 = 0.2 of the total data).
X_train, X_validation, y_train, y_validation = train_test_split(
    X_train, y_train, test_size=0.25, random_state=42
)

# Report the shape of every resulting split.
splits_to_report = (
    ("X_train", X_train),
    ("X_validation", X_validation),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_validation", y_validation),
    ("y_test", y_test),
)
for split_name, split_data in splits_to_report:
    print(f"{split_name} shape:", split_data.shape)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.decomposition import PCA

# Sweep the number of principal components fed to a linear regression and
# keep the setting with the lowest validation RMSE.
#
# The sweep covers 1..164 components, capped at what the training data can
# support: PCA cannot extract more components than min(n_samples, n_features).
max_components = min(164, *X_train.shape)
n_components_range = range(1, max_components + 1)

# Validation RMSE for each n_components tried
rmse_dict = {}
best_n_components = None

for n_components in n_components_range:
    # Project train/validation onto the first n_components principal axes
    # (axes are estimated from the training split only).
    candidate_pca = PCA(n_components=n_components)
    X_train_pca = candidate_pca.fit_transform(X_train)
    X_validation_pca = candidate_pca.transform(X_validation)

    # Fit the linear regression on the projected training data
    candidate_reg = LinearRegression()
    candidate_reg.fit(X_train_pca, y_train)

    # Score on the validation split
    y_pred_pca = candidate_reg.predict(X_validation_pca)
    rmse = root_mean_squared_error(y_validation, y_pred_pca)
    rmse_dict[n_components] = rmse
    print(f"n_components: {n_components}, RMSE: {rmse}")

    # Keep the best fitted PCA/regressor pair bound to `pca` / `log_reg_pca`.
    # Later cells use these two names for test-set evaluation; without this
    # they would hold the *last* candidate rather than the best one.
    # Strict "<" keeps the smallest n_components on ties, matching min().
    if best_n_components is None or rmse < rmse_dict[best_n_components]:
        best_n_components = n_components
        pca, log_reg_pca = candidate_pca, candidate_reg

# Report the best n_components (lowest validation RMSE)
print(f"Best n_components: {best_n_components}, RMSE: {rmse_dict[best_n_components]}")

In [None]:
# Save the trained model to a .pth file
#from linear_regression_utils import save

#torch.save(log_reg_pca, 'models/linear_regression_model.pth')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

# Random forest baseline: 100 trees, fixed seed, trained on the training split.
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Score the forest on the validation split.
y_pred_rf = rf_regressor.predict(X_validation)
rmse_rf = root_mean_squared_error(y_validation, y_pred_rf)
print(f"Random Forest RMSE: {rmse_rf}")

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV


# XGBoost baseline: squared-error objective, 200 boosting rounds, fixed seed.
xgb_regressor = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=200,
    random_state=42,
)
xgb_regressor.fit(X_train, y_train)

# Score the booster on the validation split.
y_pred_xgb = xgb_regressor.predict(X_validation)
rmse_xgb = root_mean_squared_error(y_validation, y_pred_xgb)
print(f"XGBoost RMSE: {rmse_xgb}")

In [None]:
# Hyperparameter search with FLAML, restricted to the XGBoost learner.
# The search trains on the training split and is scored on the explicit
# validation split passed as X_val / y_val.
from flaml import AutoML

automl = AutoML()
settings = dict(
    time_budget=200,  # total running time in seconds
    metric="mse",  # loss the search minimizes
    estimator_list=["xgboost"],  # only XGBoost is tuned
    task="regression",  # task type
    log_file_name="f1regression.log",  # flaml log file
    seed=7654321,  # random seed
)
automl.fit(
    X_train=X_train,
    y_train=y_train,
    X_val=X_validation,
    y_val=y_validation,
    **settings,
)

In [None]:
# Summarize the FLAML search result.
print("Best hyperparmeter config:", automl.best_config)
# The search was configured with metric "mse", so best_loss is the best
# validation MSE itself (reporting "1 - best_loss" as r2 only makes sense when
# the metric is r2).
print("Best MSE on validation data: {0:.4g}".format(automl.best_loss))
print("Training duration of best run: {0:.4g} s".format(automl.best_config_train_time))

# Underlying fitted estimator selected by the search
model = automl.model.estimator

# Predict on the validation split so the predictions line up with the
# y_validation targets they are scored against below (scoring X_test
# predictions against y_validation compares unrelated rows).
y_pred_flaml = automl.predict(X_validation)
print("Predicted labels", y_pred_flaml)

# Validation RMSE, comparable with the other models' validation RMSE
rmse_flaml = root_mean_squared_error(y_validation, y_pred_flaml)
print(f"FLAML RMSE: {rmse_flaml}")


In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the FLAML model's feature importances, one bar per
# input feature (tall figure so the feature names have room).
fig, ax = plt.subplots(figsize=(6, 20))
ax.barh(automl.feature_names_in_, automl.feature_importances_)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Final evaluation of every fitted model on the held-out test split.
# The test features are `X_test` from the train/test split cell; they are
# already standardized because the split was taken from X_normalized.
# (There is no `X_test_normalized` variable anywhere in the notebook.)

# Linear regression on PCA features: project with the `pca` object currently
# bound by the PCA sweep cell, then predict with its matching regressor.
X_test_pca = pca.transform(X_test)
y_pred_pca_test = log_reg_pca.predict(X_test_pca)

# Random Forest
y_pred_rf_test = rf_regressor.predict(X_test)

# XGBoost
y_pred_xgb_test = xgb_regressor.predict(X_test)

# FLAML
y_pred_flaml_test = automl.predict(X_test)

# Test RMSE per model.
# NOTE: rmse_rf, rmse_xgb and rmse_flaml are rebound here, replacing the
# validation RMSE values computed in the earlier cells.
rmse_linear_pca = np.sqrt(mean_squared_error(y_test, y_pred_pca_test))
print(f"Linear Regression with PCA RMSE: {rmse_linear_pca}")

rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf_test))
print(f"Random Forest RMSE: {rmse_rf}")

rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb_test))
print(f"XGBoost RMSE: {rmse_xgb}")

rmse_flaml = np.sqrt(mean_squared_error(y_test, y_pred_flaml_test))
print(f"FLAML RMSE: {rmse_flaml}")

In [None]:
# Switch matplotlib to the 'dark_background' style, then plot the test targets
# against the FLAML model's test predictions using the project's
# plot_predictions helper (imported from evaluation).
# NOTE(review): the label says "XGBoost" while the predictions come from the
# FLAML AutoML object (whose estimator list is restricted to xgboost) —
# confirm this is the intended label.
plt.style.use('dark_background')
plot_predictions(y_test, y_pred_flaml_test, model_name="XGBoost")