In [None]:
import pandas as pd
import numpy as np
# Load the combined export into `df`; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv("../data/processed/export_combined_v1.csv")

In [None]:
def n_to_nan(x):
    """
    Translate the backslash-N missing-value marker into NaN.

    Parameters:
        x: Any single cell value.

    Returns:
        ``np.nan`` when ``x`` equals the two-character string made of a
        backslash followed by ``N``; otherwise ``x`` itself, unchanged.
    """
    return np.nan if x == "\\N" else x


def race_time_to_milliseconds(race_time_str):
    """
    Turn a lap-time string such as "1:20.8888" into a millisecond count.

    Parameters:
        race_time_str (str): Lap time written as "M:SS.ssss". Any value that
            is not a ``str`` (for example a float NaN) is treated as missing.

    Returns:
        float: Total time in milliseconds, or NaN when the input is not a
        string.

    Raises:
        ValueError: If the string cannot be parsed as "M:SS.ssss".
    """
    if isinstance(race_time_str, str):
        try:
            # Exactly two colon-separated fields are expected; any other count
            # fails the unpacking and is reported through the handler below.
            minute_part, second_part = race_time_str.split(":")
            return int(minute_part) * 60 * 1000 + float(second_part) * 1000
        except Exception as e:
            raise ValueError(f"Invalid race time format: {race_time_str}") from e

    # Non-string input (e.g. an already-missing cell) stays missing.
    return np.nan


In [None]:
def unfuck_data(df):
    """
    Clean a raw results frame into a numeric feature frame.

    Steps, in order:
      1. Parse ``race_date`` and ``driver_dob`` as datetimes and derive
         ``driver_age`` as the difference between them, in milliseconds.
      2. Replace ``race_date`` with integer milliseconds since the Unix epoch.
      3. Drop the practice/qualifying schedule columns and ``driver_dob``.
      4. Run every cell through ``n_to_nan`` to turn the missing-value marker
         into NaN.
      5. Cast ``number`` to float and convert the ``q1``/``q2``/``q3`` lap-time
         strings to milliseconds.
      6. Label-encode every column that is still of object dtype.

    Parameters:
        df (pd.DataFrame): Raw frame. It is not modified; a copy is cleaned.

    Returns:
        pd.DataFrame: The cleaned copy.

    Raises:
        KeyError: If one of the columns named above is absent.
        ValueError: If a ``q1``/``q2``/``q3`` string is malformed (raised by
            ``race_time_to_milliseconds``).

    Note:
        The label encoders are fitted on the frame passed in, so the integer
        codes are only comparable between rows cleaned in the same call.
    """
    from sklearn.preprocessing import LabelEncoder

    df = df.copy()

    # Parse both date columns so they can be subtracted from each other.
    for date_column in ("race_date", "driver_dob"):
        df[date_column] = pd.to_datetime(df[date_column])

    # Calculate age in milliseconds
    df["driver_age"] = (df["race_date"] - df["driver_dob"]).dt.total_seconds() * 1000

    # Pin the resolution to nanoseconds before the integer cast: the division
    # by 10**6 only yields epoch milliseconds for nanosecond data, and the
    # unit chosen by pd.to_datetime is not guaranteed to be nanoseconds.
    df["race_date"] = (
        df["race_date"].astype("datetime64[ns]").astype("int64") // 10**6
    )

    df = df.drop(
        columns=[
            "fp1_date",
            "fp1_time",
            "fp2_date",
            "fp2_time",
            "fp3_date",
            "fp3_time",
            "quali_time",
            "quali_date",
            "driver_dob",
        ]
    )

    # Cell-by-cell replacement of the missing-value marker with NaN.
    df = df.apply(lambda column: column.apply(n_to_nan))

    df["number"] = df["number"].astype(float)

    for quali_column in ("q1", "q2", "q3"):
        df[quali_column] = df[quali_column].apply(race_time_to_milliseconds)

    # Encode whatever is still object-typed. The previous check compared the
    # dtype against `type(object)` (i.e. the metaclass `type`), which matched
    # object columns only through numpy's dtype coercion.
    labelencoder = LabelEncoder()
    for column in df.columns:
        if pd.api.types.is_object_dtype(df[column]):
            df[column] = labelencoder.fit_transform(df[column])

    return df

In [None]:
# Preview the first rows of the frame as loaded, before any cleaning is applied.
df.head()

In [None]:
# Replace the raw frame with its cleaned version, then summarise the resulting
# column dtypes and non-null counts.
# NOTE(review): this cell is not idempotent. unfuck_data reads `driver_dob`
# and also drops it, so running the cell a second time on the already-cleaned
# frame fails; reload `df` from the CSV before re-running.
df = unfuck_data(df)
df.info()

In [None]:
import matplotlib.pyplot as plt

# One box plot per column, grouped by `positionOrder`.
# The first column is skipped (presumably an identifier — TODO confirm).
for c in df.columns[1:]:
    if c == "positionOrder":
        # Plotting the grouping column against itself is degenerate.
        continue
    df.boxplot(c, by="positionOrder", figsize=(7, 4), fontsize=12)
    plt.title("{}\n".format(c), fontsize=16)
    plt.xlabel("positionOrder", fontsize=16)
    # Show each figure as soon as it is drawn instead of keeping every figure
    # open until the end of the cell; with many columns the accumulated
    # figures trigger matplotlib's open-figure warning and hold memory.
    plt.show()

In [None]:
def correlation_matrix(df):
    """
    Draw the pairwise correlation matrix of ``df`` as a colour-coded image.

    The matrix is computed once with ``df.corr()`` and the axis tick labels
    are taken from that result, so labels and cells always refer to the same
    columns. The colour scale is fixed to the full correlation range [-1, 1].

    Parameters:
        df (pd.DataFrame): Frame whose columns are correlated.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    from matplotlib import pyplot as plt

    corr = df.corr()

    fig = plt.figure(figsize=(16, 12))
    ax1 = fig.add_subplot(111)
    cmap = plt.get_cmap("jet", 30)
    # vmin/vmax pin the colour scale so a given colour means the same
    # correlation value regardless of the data's own min/max.
    cax = ax1.imshow(corr, interpolation="nearest", cmap=cmap, vmin=-1, vmax=1)
    ax1.grid(True)
    ax1.set_title("F1 data set features correlation\n", fontsize=15)

    # Label from the correlation result, not from df, in case corr() kept a
    # different set of columns than the input frame has.
    labels = corr.columns
    ax1.set_xticks(range(len(labels)))
    ax1.set_yticks(range(len(labels)))
    ax1.set_xticklabels(labels, fontsize=9, rotation=90)
    ax1.set_yticklabels(labels, fontsize=9)

    # Colorbar ticks every 0.1 across the valid correlation range -1.0 .. 1.0.
    fig.colorbar(cax, ticks=[i / 10 for i in range(-10, 11)])
    plt.show()


# Draw the correlation heatmap for the cleaned frame.
correlation_matrix(df)

In [None]:
from ydata_profiling import ProfileReport

# Build a profiling report for `df` and write it as a standalone HTML file;
# the relative path is resolved against the kernel's current working directory.
profile = ProfileReport(df, title="Pandas Profiling Report")
profile.to_file("profile_report.html")

## PCA

In [None]:
# Clean train and test in a single pass. unfuck_data fits its label encoders
# on whatever frame it receives, so cleaning the two splits separately could
# assign the same category different integer codes in each split.
raw_splits = pd.concat(
    [
        pd.read_csv("../data/train/train.csv"),
        pd.read_csv("../data/test/test.csv"),
    ],
    keys=["train", "test"],
)
cleaned_splits = unfuck_data(raw_splits)

# Selecting on the outer key restores each split's original row index; copy so
# that later in-place edits do not act on a slice of the combined frame.
X_train = cleaned_splits.loc["train"].copy()
X_test = cleaned_splits.loc["test"].copy()

y_train = pd.read_csv("../data/train/train_labels.csv")
y_test = pd.read_csv("../data/test/test_labels.csv")

In [None]:
# (rows, columns) of the cleaned training features, for reference.
X_train.shape

In [None]:
# Drop training rows that contain any missing value, and keep the labels in
# step with the surviving rows so features and targets stay aligned.
# Assumes train_labels.csv lines up row-for-row with train.csv (same default
# index) — TODO confirm.
X_train = X_train.dropna()
y_train = y_train.loc[X_train.index]
X_train.shape

In [None]:
# StandardScaler lives in sklearn.preprocessing; sklearn.discriminant_analysis
# merely happens to import it for its own use.
from sklearn.preprocessing import StandardScaler


# Fit the scaling statistics on the training features only and reuse them for
# the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.decomposition import PCA


# Fit a PCA that keeps all components on the scaled training features.
# NOTE: fit() returns the estimator itself, so `dfx_pca` refers to the same
# fitted object as `pca`; it is not the projected data.
pca = PCA(n_components=None)
dfx_pca = pca.fit(X_train_scaled)

# Principal axes (eigenvectors) and the variance explained along each of them
# (eigenvalues).
eigenvectors, eigenvalues = pca.components_, pca.explained_variance_

In [None]:
# Scree plot: eigenvalue of each principal component, in component order.
component_index = range(1, len(eigenvalues) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_index, eigenvalues, marker="o", linestyle="--", color="b")
ax.set_title("Scree Plot")
ax.set_xlabel("Principal Component Index")
ax.set_ylabel("Eigenvalue")
ax.set_xticks(component_index)
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [None]:
# Running total of the explained-variance ratios, one entry per component.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components_axis = range(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    n_components_axis,
    cumulative_variance,
    marker="o",
    linestyle="-",
    color="g",
)
ax.set_title("Cumulative Explained Variance")
ax.set_xlabel("Number of Principal Components")
ax.set_ylabel("Cumulative Explained Variance")
ax.set_xticks(n_components_axis)
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Lay the eigenvalues out as a single-row matrix so they render as one strip.
eigenvalues_array = np.array(eigenvalues).reshape(1, -1)

# Wide, short figure to suit the one-row heatmap.
fig, ax = plt.subplots(figsize=(20, 2))
sns.heatmap(
    eigenvalues_array,
    annot=True,
    fmt=".2f",
    cmap="viridis",
    cbar=True,
    ax=ax,
)

ax.set_title("Heatmap of Eigenvalues")
ax.set_xlabel("Principal Component Index")
ax.set_ylabel("Eigenvalues")

# The single row needs no tick marks on the y-axis.
ax.set_yticks([])

plt.show()