## Preprocess data in this file

In [None]:
# Third party imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Local imports
from utils import scroll_df, calculate_class

#### Overview operational training data

In [None]:
# Read the raw operational readouts and report the table dimensions
train_operational_df = pd.read_csv("raw_data/train_operational_readouts.csv")
print(train_operational_df.shape)
# Separate copy ordered by vehicle id; the preview below shows the unsorted frame
train_operational_sorted_df = train_operational_df.sort_values("vehicle_id")
train_operational_df.head(n=10)

In [None]:
# Percentage of missing values in each column of the operational readouts
operational_missing_pct = (train_operational_df.isna().sum() / len(train_operational_df)) * 100
scroll_df(operational_missing_pct.to_frame())

#### Overview time to event training data

In [None]:
# Read the time-to-event table and report the table dimensions
train_tte_df = pd.read_csv("raw_data/train_tte.csv")
print(train_tte_df.shape)
train_tte_df.head(n=5)

In [None]:
# Percentage of missing values in each column of the time-to-event table
tte_missing_pct = (train_tte_df.isna().sum() / len(train_tte_df)) * 100
scroll_df(tte_missing_pct.to_frame())

#### Plot the number of healthy and repaired vehicles

In [None]:
# Legend text and wedge colours used by the pie chart
labels = ["Healthy", "Repaired"]
colors = ["skyblue", "lightgreen", "salmon"]

# Number of vehicles per in_study_repair value
healthy_and_repaired_vehicles = train_tte_df["in_study_repair"].value_counts()

def autopct_format(pct, all_vals):
    """Build a pie-wedge label showing the percentage and the absolute count.

    pct is the wedge's percentage of the whole; all_vals are the plotted
    values, whose sum converts that percentage back into a count.
    Returns the percentage with one decimal, a line break, and the count in parentheses.
    """
    total = sum(all_vals)
    # Round before truncating so float noise cannot push the count down by one
    count = int(round(pct / 100. * total))
    return "{:.1f}%\n({:d})".format(pct, count)

# Pie chart of the healthy/repaired split; every wedge shows its percentage
# and absolute vehicle count via autopct_format.
plt.figure(figsize=(6, 6))
plt.pie(healthy_and_repaired_vehicles,
        labels=healthy_and_repaired_vehicles.index,
        autopct=lambda pct: autopct_format(pct, healthy_and_repaired_vehicles),
        startangle=180,
        colors=colors)

# value_counts() orders the wedges by frequency, not by label value, so the
# legend text is derived from the index instead of assuming "Healthy" is first.
status_names = {0: "Healthy", 1: "Repaired"}
legend_labels = [status_names.get(status, str(status)) for status in healthy_and_repaired_vehicles.index]
plt.legend(legend_labels, title="Vehicle Status", loc="upper right")

plt.title("Category Distribution")
plt.show()

#### Merge operational and time-to-event dataframes

In [None]:
# Inner join on vehicle_id: only vehicles present in both tables are kept
combined_train_df = train_operational_df.merge(train_tte_df, how="inner", on="vehicle_id")
print(combined_train_df.shape)
combined_train_df.head(n=5)

#### Calculate the class label, remove readouts of healthy vehicles that lie within 48 time steps of length_of_study_time_step, and plot the class distribution

In [None]:
def _class_for_row(row):
    # Pass the repair flag, the readout's time step and the study length to calculate_class
    return calculate_class(row["in_study_repair"], row["time_step"], row["length_of_study_time_step"])


# Label every readout row with its class
combined_train_df["class"] = combined_train_df.apply(_class_for_row, axis=1)

In [None]:
# Rows of vehicles that were not repaired during the study
is_healthy_row = combined_train_df["in_study_repair"] == 0
# Rows recorded fewer than 48 time steps before the end of the study
steps_until_study_end = combined_train_df["length_of_study_time_step"] - combined_train_df["time_step"]
is_near_study_end = steps_until_study_end < 48

# Keep everything except healthy rows that fall inside that final window
combined_train_df = combined_train_df.loc[~(is_healthy_row & is_near_study_end)]

combined_train_df.shape

#### Count the class occurrences per vehicle sequence

In [None]:
# Order readouts by time step within each vehicle so "last" means the final readout
combined_train_df = combined_train_df.sort_values(["vehicle_id", "time_step"]).reset_index(drop=True)
# Class of the final readout of every vehicle's sequence (vehicle ids discarded)
class_by_vehicle = combined_train_df.groupby("vehicle_id")["class"]
last_class_df = class_by_vehicle.last().reset_index(drop=True)

In [None]:
# Number of vehicles per final class and the corresponding share in percent
class_counts = last_class_df.value_counts()
class_percentages = (class_counts / class_counts.sum()) * 100

bar_colors = ["skyblue", "lightgreen", "salmon", "orange", "purple"]

# One bar per class, height = percentage of vehicles
plt.figure(figsize=(8, 5))
plt.bar(class_counts.index, class_percentages.values, color=bar_colors)

# Axis labels, title, fixed class ticks and a light horizontal grid
plt.xlabel("Class")
plt.ylabel("Percentage (%)")
plt.title("Distribution of Classes (in % and Count)")
plt.xticks([0, 1, 2, 3, 4])
plt.grid(axis="y", linestyle="--", alpha=0.7)

# Annotate each bar with its percentage and count, slightly above the bar top
for class_label, count, pct in zip(class_counts.index, class_counts.values, class_percentages.values):
    plt.text(class_label, pct + 1, f"{pct:.1f}% ({count})", ha="center", fontsize=10)

plt.show()

#### Rebalance the data for the classes

In [None]:
# Downsample the majority class: randomly choose healthy vehicles
# (in_study_repair == 0) to drop so that only healthy_vehicles_to_keep remain.
healthy_vehicles = train_tte_df.query("in_study_repair == 0")["vehicle_id"]
print("total healthy vehciles", healthy_vehicles.shape)
# Derived from the data rather than hard-coded, so the sample size stays
# correct if the input file changes.
total_healthy_vehicles = len(healthy_vehicles)
# NOTE(review): presumably chosen to match the number of repaired vehicles - confirm
healthy_vehicles_to_keep = 2272
# Never request a negative sample size when fewer healthy vehicles exist than should be kept
n_healthy_to_remove = max(total_healthy_vehicles - healthy_vehicles_to_keep, 0)
# Fixed random_state keeps the selection reproducible between runs
healthy_vehicles_to_be_removed = healthy_vehicles.sample(n=n_healthy_to_remove, random_state=50)
print("healthy_vehicles_to_be_removed", healthy_vehicles_to_be_removed.shape)

# Time-to-event table without the removed healthy vehicles
rebalanced_train_tte_df = train_tte_df[~train_tte_df["vehicle_id"].isin(healthy_vehicles_to_be_removed)]

In [None]:
# Legend text and wedge colours used by the pie chart
labels = ["Healthy", "Repaired"]
colors = ["skyblue", "lightgreen", "salmon"]

# Number of vehicles per in_study_repair value after downsampling
rebalanced_healthy_and_repaired_vehicles = rebalanced_train_tte_df["in_study_repair"].value_counts()

def autopct_format(pct, all_vals):
    """Build a pie-wedge label showing the percentage and the absolute count.

    pct is the wedge's percentage of the whole; all_vals are the plotted
    values, whose sum converts that percentage back into a count.
    Returns the percentage with one decimal, a line break, and the count in parentheses.
    """
    total = sum(all_vals)
    # Round before truncating so float noise cannot push the count down by one
    count = int(round(pct / 100. * total))
    return "{:.1f}%\n({:d})".format(pct, count)

# Pie chart of the healthy/repaired split after downsampling; every wedge
# shows its percentage and absolute vehicle count via autopct_format.
plt.figure(figsize=(6, 6))
plt.pie(rebalanced_healthy_and_repaired_vehicles,
        labels=rebalanced_healthy_and_repaired_vehicles.index,
        autopct=lambda pct: autopct_format(pct, rebalanced_healthy_and_repaired_vehicles),
        startangle=180,
        colors=colors)

# value_counts() orders the wedges by frequency, not by label value; with
# (near-)equal class sizes either class can come first, so the legend text is
# derived from the index instead of a fixed Healthy/Repaired order.
status_names = {0: "Healthy", 1: "Repaired"}
legend_labels = [status_names.get(status, str(status)) for status in rebalanced_healthy_and_repaired_vehicles.index]
plt.legend(legend_labels, title="Vehicle Status", loc="upper right")

plt.title("Category Distribution")
plt.show()

In [None]:
# Drop every readout that belongs to a healthy vehicle selected for removal
belongs_to_removed_vehicle = combined_train_df["vehicle_id"].isin(healthy_vehicles_to_be_removed)
rebalanced_df = combined_train_df[~belongs_to_removed_vehicle]

# Order readouts by time step within each vehicle and renumber the rows
rebalanced_df = rebalanced_df.sort_values(["vehicle_id", "time_step"]).reset_index(drop=True)
# Class of the final readout of every vehicle's sequence (vehicle ids discarded)
rebalanced_class_by_vehicle = rebalanced_df.groupby("vehicle_id")["class"]
rebalanced_last_class_df = rebalanced_class_by_vehicle.last().reset_index(drop=True)

In [None]:
# Number of vehicles per final class after rebalancing, and the share in percent
rebalanced_class_counts = rebalanced_last_class_df.value_counts()
rebalanced_class_percentages = (rebalanced_class_counts / rebalanced_class_counts.sum()) * 100

rebalanced_bar_colors = ["skyblue", "lightgreen", "salmon", "orange", "purple"]

# One bar per class, height = percentage of vehicles
plt.figure(figsize=(8, 5))
plt.bar(rebalanced_class_counts.index, rebalanced_class_percentages.values, color=rebalanced_bar_colors)

# Axis labels, title, fixed class ticks and a light horizontal grid
plt.xlabel("Class")
plt.ylabel("Percentage (%)")
plt.title("Distribution of Classes (in % and Count)")
plt.xticks([0, 1, 2, 3, 4])
plt.grid(axis="y", linestyle="--", alpha=0.7)

# Annotate each bar with its percentage and count, slightly above the bar top
for class_label, count, pct in zip(
    rebalanced_class_counts.index,
    rebalanced_class_counts.values,
    rebalanced_class_percentages.values,
):
    plt.text(class_label, pct + 1, f"{pct:.1f}% ({count})", ha="center", fontsize=10)

plt.show()

#### Split data into train/validation/test 80/10/10%

In [None]:
# Preview the rebalanced frame
scroll_df(rebalanced_df.head(60))

# One row per vehicle holding the class of its final readout; this relies on
# rebalanced_df already being ordered by vehicle_id and time_step.
last_class_df = rebalanced_df.groupby("vehicle_id")["class"].last().reset_index()
print(last_class_df["class"].value_counts())

# 80% train, then the remaining 20% halved into val and test; both splits are
# stratified on the final class and use a fixed seed for reproducibility.
train_ids, tmp_ids = train_test_split(last_class_df, test_size=0.2, stratify=last_class_df["class"], random_state=50)
val_ids, test_ids = train_test_split(tmp_ids, test_size=0.5, stratify=tmp_ids["class"], random_state=50)

# Report size and class balance of each split
for split_title, split_ids in (("Train: ", train_ids), ("Val: ", val_ids), ("Test: ", test_ids)):
    print(split_title, split_ids.shape)
    print(split_ids["class"].value_counts())

#### Pick out the vehicle ids of each set from the main df

In [None]:
# Vehicle ids assigned to the train, val and test sets
train_vehicle_ids = train_ids["vehicle_id"]
val_vehicle_ids = val_ids["vehicle_id"]
test_vehicle_ids = test_ids["vehicle_id"]

# All readouts of the vehicles in each set, each frame renumbered from zero
readout_vehicle_ids = rebalanced_df["vehicle_id"]
train_df = rebalanced_df[readout_vehicle_ids.isin(train_vehicle_ids)].reset_index(drop=True)
val_df = rebalanced_df[readout_vehicle_ids.isin(val_vehicle_ids)].reset_index(drop=True)
test_df = rebalanced_df[readout_vehicle_ids.isin(test_vehicle_ids)].reset_index(drop=True)

In [None]:
# Inspect the first 100 rows to confirm the index was reset
scroll_df(val_df.iloc[:100])

#### Imputation strategies to create two datasets

In [None]:
# Identifier, time and label columns are excluded from imputation
train_non_feature_columns = ["vehicle_id", "time_step", "length_of_study_time_step", "in_study_repair", "class"]
train_impute_features_df = train_df.drop(columns=train_non_feature_columns)
train_impute_features_df.shape

#### Create the mean imputer and fit and apply on training data, and only apply on val and test

In [None]:
# Fit the mean imputer on the training features and fill their missing values
mean_imputer = SimpleImputer(strategy="mean")
mean_imputed_train_values = mean_imputer.fit_transform(train_impute_features_df)
# Wrap the returned array in a frame again, restoring the feature names
mean_imputed_train_df = pd.DataFrame(mean_imputed_train_values, columns=train_impute_features_df.columns)
mean_imputed_train_df.shape

In [None]:
# Per-column percentage of NaN values in the mean-imputed training data
mean_train_missing_pct = (mean_imputed_train_df.isna().sum() / len(mean_imputed_train_df)) * 100
scroll_df(mean_train_missing_pct.to_frame())

#### Create the median imputer and fit and apply on training data

In [None]:
# Fit the median imputer on the training features and fill their missing values
median_imputer = SimpleImputer(strategy="median")
median_imputed_train_values = median_imputer.fit_transform(train_impute_features_df)
# Wrap the returned array in a frame again, restoring the feature names
median_imputed_train_df = pd.DataFrame(median_imputed_train_values, columns=train_impute_features_df.columns)
median_imputed_train_df.shape

In [None]:
# Check that no NaN values are left in the median-imputed training data
# (shows the per-column percentage of missing values).
scroll_df(((median_imputed_train_df.isna().sum() / len(median_imputed_train_df)) * 100).to_frame())

#### Apply the mean and median imputer fitted on training to val and test

In [None]:
# Identifier, time and label columns are excluded from imputation
eval_non_feature_columns = ["vehicle_id", "time_step", "length_of_study_time_step", "in_study_repair", "class"]
val_impute_features_df = val_df.drop(columns=eval_non_feature_columns)
test_impute_features_df = test_df.drop(columns=eval_non_feature_columns)

# Validation set: reuse the imputers fitted on the training data (transform only)
val_feature_names = val_impute_features_df.columns
mean_imputed_val_df = pd.DataFrame(mean_imputer.transform(val_impute_features_df), columns=val_feature_names)
median_imputed_val_df = pd.DataFrame(median_imputer.transform(val_impute_features_df), columns=val_feature_names)

# Test set: same treatment
test_feature_names = test_impute_features_df.columns
mean_imputed_test_df = pd.DataFrame(mean_imputer.transform(test_impute_features_df), columns=test_feature_names)
median_imputed_test_df = pd.DataFrame(median_imputer.transform(test_impute_features_df), columns=test_feature_names)

In [None]:
# Per-column percentage of NaN values in the mean-imputed validation data
mean_val_missing_pct = (mean_imputed_val_df.isna().sum() / len(mean_imputed_val_df)) * 100
scroll_df(mean_val_missing_pct.to_frame())

In [None]:
# Per-column percentage of NaN values in the median-imputed validation data
median_val_missing_pct = (median_imputed_val_df.isna().sum() / len(median_imputed_val_df)) * 100
scroll_df(median_val_missing_pct.to_frame())

In [None]:
# Per-column percentage of NaN values in the mean-imputed test data
mean_test_missing_pct = (mean_imputed_test_df.isna().sum() / len(mean_imputed_test_df)) * 100
scroll_df(mean_test_missing_pct.to_frame())

In [None]:
# Per-column percentage of NaN values in the median-imputed test data
median_test_missing_pct = (median_imputed_test_df.isna().sum() / len(median_imputed_test_df)) * 100
scroll_df(median_test_missing_pct.to_frame())

#### Standardize the data with z-score normalization

In [None]:
# Dataset 1: z-score scaling of the mean-imputed features.
# The scaler is fitted on the training data only and reused for val and test.
scaler_1 = StandardScaler()
scaled_train_1 = scaler_1.fit_transform(mean_imputed_train_df)
scaled_val_1 = scaler_1.transform(mean_imputed_val_df)
scaled_test_1 = scaler_1.transform(mean_imputed_test_df)

# Back to frames with the original feature names
dataset_1_train_df = pd.DataFrame(scaled_train_1, columns=mean_imputed_train_df.columns)
dataset_1_val_df = pd.DataFrame(scaled_val_1, columns=mean_imputed_val_df.columns)
dataset_1_test_df = pd.DataFrame(scaled_test_1, columns=mean_imputed_test_df.columns)

print(dataset_1_train_df.head())

In [None]:
# Dataset 2: z-score scaling of the median-imputed features.
# The scaler is fitted on the training data only and reused for val and test.
scaler_2 = StandardScaler()
scaled_train_2 = scaler_2.fit_transform(median_imputed_train_df)
scaled_val_2 = scaler_2.transform(median_imputed_val_df)
scaled_test_2 = scaler_2.transform(median_imputed_test_df)

# Back to frames with the original feature names
dataset_2_train_df = pd.DataFrame(scaled_train_2, columns=median_imputed_train_df.columns)
dataset_2_val_df = pd.DataFrame(scaled_val_2, columns=median_imputed_val_df.columns)
dataset_2_test_df = pd.DataFrame(scaled_test_2, columns=median_imputed_test_df.columns)

print(dataset_2_train_df.head())

#### Add back the necessary non-feature columns

In [None]:
# Identifier, time-step and label columns to re-attach to the scaled features
id_and_label_columns = ["vehicle_id", "time_step", "class"]
train_columns_to_add_back = train_df[id_and_label_columns]
val_columns_to_add_back = val_df[id_and_label_columns]
test_columns_to_add_back = test_df[id_and_label_columns]

In [None]:
# Place the identifier/label columns in front of the scaled features (rows matched by index)
dataset_1_train_df = pd.concat([train_columns_to_add_back, dataset_1_train_df], axis="columns")
print(dataset_1_train_df.shape)
print(dataset_1_train_df.head())
# NaN values here would indicate that the two frames' indexes did not line up
dataset_1_train_missing_pct = (dataset_1_train_df.isna().sum() / len(dataset_1_train_df)) * 100
scroll_df(dataset_1_train_missing_pct.to_frame())

In [None]:
# Place the identifier/label columns in front of the scaled features (rows matched by index)
dataset_2_train_df = pd.concat([train_columns_to_add_back, dataset_2_train_df], axis="columns")
print(dataset_2_train_df.shape)
print(dataset_2_train_df.head())
# NaN values here would indicate that the two frames' indexes did not line up
dataset_2_train_missing_pct = (dataset_2_train_df.isna().sum() / len(dataset_2_train_df)) * 100
scroll_df(dataset_2_train_missing_pct.to_frame())

In [None]:
# Validation set, dataset 1: identifier/label columns followed by the scaled features
dataset_1_val_df = pd.concat([val_columns_to_add_back, dataset_1_val_df], axis="columns")
print(dataset_1_val_df.shape)
print(dataset_1_val_df.head())

In [None]:
# Validation set, dataset 2: identifier/label columns followed by the scaled features
dataset_2_val_df = pd.concat([val_columns_to_add_back, dataset_2_val_df], axis="columns")
print(dataset_2_val_df.shape)
print(dataset_2_val_df.head())

In [None]:
# Test set, dataset 1: identifier/label columns followed by the scaled features
dataset_1_test_df = pd.concat([test_columns_to_add_back, dataset_1_test_df], axis="columns")
print(dataset_1_test_df.shape)
print(dataset_1_test_df.head())

In [None]:
# Test set, dataset 2: identifier/label columns followed by the scaled features
dataset_2_test_df = pd.concat([test_columns_to_add_back, dataset_2_test_df], axis="columns")
print(dataset_2_test_df.shape)
print(dataset_2_test_df.head())

#### Save datasets to files

In [None]:
import os

# to_csv does not create missing parent directories, so make sure the output
# folder exists before the first file is written.
os.makedirs("datasets", exist_ok=True)

# Training sets: dataset 1 is mean-imputed, dataset 2 is median-imputed (both z-scored).
# index=False keeps the row index out of the files.
dataset_1_train_df.to_csv("datasets/train_dataset_1.csv", index=False)
dataset_2_train_df.to_csv("datasets/train_dataset_2.csv", index=False)

In [None]:
# Validation sets: dataset 1 is mean-imputed, dataset 2 is median-imputed (both z-scored).
# index=False keeps the row index out of the files; the datasets/ folder must already exist.
dataset_1_val_df.to_csv("datasets/val_dataset_1.csv", index=False)
dataset_2_val_df.to_csv("datasets/val_dataset_2.csv", index=False)

In [None]:
# Test sets: dataset 1 is mean-imputed, dataset 2 is median-imputed (both z-scored).
# index=False keeps the row index out of the files; the datasets/ folder must already exist.
dataset_1_test_df.to_csv("datasets/test_dataset_1.csv", index=False)
dataset_2_test_df.to_csv("datasets/test_dataset_2.csv", index=False)