# Modelling and Evaluation

## Use cross-validation to find the best model for this task

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

In [2]:
# Pick the compute device once: CUDA when PyTorch reports it as available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice; on the CUDA path the name of device index 0 is shown.
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

Using GPU: NVIDIA GeForce RTX 5060 Ti


## Load the dataset

In [3]:
# Relative path of the preprocessed feature table.
data_path = "../datasets/preprocessed/final_df.csv"

# Read the CSV into a DataFrame and print its dimensions as a quick sanity check.
df = pd.read_csv(filepath_or_buffer=data_path)
print("Dataset loaded with shape:", df.shape)

# Preview the first five rows (the cell's displayed value).
df.head(n=5)

Dataset loaded with shape: (49181, 42)


Unnamed: 0,battery_id,cycle_index,protocol_id,discharge_capacity_ah_max,discharge_capacity_ah_mean,discharge_capacity_ah_min,charge_capacity_ah_max,charge_capacity_ah_mean,charge_capacity_ah_min,voltage_v_max,...,rolling_mean_soh_percent,rolling_std_discharge_capacity_ah_max,rolling_std_charge_capacity_ah_max,rolling_std_voltage_v_max,rolling_std_current_a_mean,rolling_std_discharge_energy_wh_max,rolling_std_charge_energy_wh_max,rolling_std_aux_temperature_1_c_mean,rolling_std_current_a_abs_mean,rolling_std_soh_percent
0,1,1,1,2.421769,0.84949,3.200319e-05,2.394174,2.115371,0.00333,4.200036,...,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2,1,2.400751,0.442225,2.473989e-10,2.416324,1.819254,0.00333,4.200032,...,99.566061,0.014862,0.015662,3e-06,0.003854,0.228743,0.055605,0.524013,0.435901,0.613682
2,1,3,1,2.399412,0.442961,2.987066e-05,2.396731,1.805943,0.00333,4.200024,...,99.402985,0.012539,0.012118,6e-06,0.002735,0.187687,0.043485,0.444561,0.355911,0.517769
3,1,4,1,2.397874,0.443705,3.193769e-05,2.394747,1.803746,0.003329,4.200034,...,99.30557,0.011273,0.01061,5e-06,0.002262,0.163473,0.038482,0.387616,0.308554,0.465491
4,1,5,1,2.397038,0.443911,3.313753e-05,2.393583,1.80277,0.003329,4.200028,...,99.240217,0.010384,0.009694,5e-06,0.001984,0.146851,0.035502,0.347763,0.276177,0.428797


In [4]:
# Display the column labels of the DataFrame loaded above
df.columns

Index(['battery_id', 'cycle_index', 'protocol_id', 'discharge_capacity_ah_max',
       'discharge_capacity_ah_mean', 'discharge_capacity_ah_min',
       'charge_capacity_ah_max', 'charge_capacity_ah_mean',
       'charge_capacity_ah_min', 'voltage_v_max', 'voltage_v_mean',
       'voltage_v_min', 'voltage_v_std', 'current_a_mean', 'current_a_std',
       'discharge_energy_wh_max', 'charge_energy_wh_max',
       'aux_temperature_1_c_mean', 'aux_temperature_1_c_max',
       'aux_temperature_1_c_min', 'aux_temperature_1_c_std',
       'current_a_abs_mean', 'soh_percent', 'rul',
       'rolling_mean_discharge_capacity_ah_max',
       'rolling_mean_charge_capacity_ah_max', 'rolling_mean_voltage_v_max',
       'rolling_mean_current_a_mean', 'rolling_mean_discharge_energy_wh_max',
       'rolling_mean_charge_energy_wh_max',
       'rolling_mean_aux_temperature_1_c_mean',
       'rolling_mean_current_a_abs_mean', 'rolling_mean_soh_percent',
       'rolling_std_discharge_capacity_ah_max',
     

In [5]:
# Print a per-column summary (non-null counts and dtypes) to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49181 entries, 0 to 49180
Data columns (total 42 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   battery_id                              49181 non-null  int64  
 1   cycle_index                             49181 non-null  int64  
 2   protocol_id                             49181 non-null  int64  
 3   discharge_capacity_ah_max               49181 non-null  float64
 4   discharge_capacity_ah_mean              49181 non-null  float64
 5   discharge_capacity_ah_min               49181 non-null  float64
 6   charge_capacity_ah_max                  49181 non-null  float64
 7   charge_capacity_ah_mean                 49181 non-null  float64
 8   charge_capacity_ah_min                  49181 non-null  float64
 9   voltage_v_max                           49181 non-null  float64
 10  voltage_v_mean                          49181 non-null  fl

In [6]:
# Columns that should be treated as categorical rather than numeric.
categorical_features = ['protocol_id']

# Cast all listed columns to pandas' category dtype with a single astype mapping.
df = df.astype({name: 'category' for name in categorical_features})

# Display the dtypes so the cast can be checked by eye.
df.dtypes

battery_id                                   int64
cycle_index                                  int64
protocol_id                               category
discharge_capacity_ah_max                  float64
discharge_capacity_ah_mean                 float64
discharge_capacity_ah_min                  float64
charge_capacity_ah_max                     float64
charge_capacity_ah_mean                    float64
charge_capacity_ah_min                     float64
voltage_v_max                              float64
voltage_v_mean                             float64
voltage_v_min                              float64
voltage_v_std                              float64
current_a_mean                             float64
current_a_std                              float64
discharge_energy_wh_max                    float64
charge_energy_wh_max                       float64
aux_temperature_1_c_mean                   float64
aux_temperature_1_c_max                    float64
aux_temperature_1_c_min        

## Split Train and Test data

In [7]:
# Split at battery level: the hold-out is drawn over the unique battery ids, and rows
# are then assigned to train or test according to which id list their battery is in.
battery_ids = df['battery_id'].unique()
train_ids, test_ids = train_test_split(battery_ids, test_size=0.2, random_state=42)

train_df = df[df['battery_id'].isin(train_ids)]
test_df = df[df['battery_id'].isin(test_ids)]

# Columns removed from the feature matrices: the target ('rul') plus
# 'battery_id' and 'cycle_index'.
_non_feature_cols = ['rul', 'battery_id', 'cycle_index']
X_train, X_test = (part.drop(columns=_non_feature_cols) for part in (train_df, test_df))
y_train, y_test = train_df['rul'], test_df['rul']

print("Train set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Train set shape: (39923, 39) (39923,)
Test set shape: (9258, 39) (9258,)


In [8]:
# Build the preprocessing step: separate pipelines for numeric and categorical columns,
# combined in a ColumnTransformer.
#
# Numeric columns are selected with the generic 'number' selector instead of an explicit
# ['int64', 'float64'] list. ColumnTransformer drops every column it is not handed
# (remainder='drop' is its default), so a numeric column stored with another width
# (e.g. int32 or float32) would otherwise disappear from the features without warning.
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

# Surface any column that falls into neither branch, since it will be dropped below.
_unassigned = X_train.columns.difference(numeric_features.union(categorical_features))
if len(_unassigned) > 0:
    print("Warning: columns excluded from preprocessing:", list(_unassigned))

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Categorical branch: fill missing values with the most frequent value, then one-hot
# encode. handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit on the training split only, then apply the fitted transformer to the test split
# so that imputation and scaling statistics come from training data alone.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)
print("Processed train set shape:", X_train_processed.shape)
print("Processed test set shape:", X_test_processed.shape)

Processed train set shape: (39923, 54)
Processed test set shape: (9258, 54)


## Train the model