In [24]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): 'ignore' with no category or module filter suppresses every
# warning from this point on, including deprecation and dtype warnings raised
# by pandas / scikit-learn -- confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

In [25]:
# Read the raw CSV into a DataFrame and report its dimensions
dataset_path = '../01_Dataset/EV_Predictive_Maintenance_Dataset.csv'
df = pd.read_csv(dataset_path)
n_rows, n_cols = df.shape
print(f"Dataset loaded: {n_rows:,} rows × {n_cols} columns")

Dataset loaded: 175,393 rows × 30 columns


In [26]:
# Parse the Timestamp column into pandas datetime values (replaces the column in place)
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
print("Timestamp converted to datetime")

# Report the earliest and latest timestamps present in the data
timestamps = df['Timestamp']
earliest, latest = timestamps.min(), timestamps.max()
print(f"Date range: {earliest} to {latest}")

Timestamp converted to datetime
Date range: 2020-01-01 00:00:00 to 2025-01-01 00:00:00


In [27]:
# Drop rows whose readings fall outside the accepted ranges and report how many were removed
initial_rows = len(df)

# One boolean mask per rule; a row is kept only if it passes all three.
# A NaN in any of these columns fails its comparison, so such rows are dropped as well.
voltage_ok = df['Battery_Voltage'].gt(0)  # strictly positive: zero is rejected along with negatives
soc_ok = df['SoC'].between(0, 100)        # 0 and 100 are both accepted
soh_ok = df['SoH'].between(0, 100)        # 0 and 100 are both accepted
df = df[voltage_ok & soc_ok & soh_ok]

removed_rows = initial_rows - len(df)
print(f"Removed {removed_rows} rows with non-physical values")
print(f"Remaining: {len(df):,} rows")

Removed 0 rows with non-physical values
Remaining: 175,393 rows


In [28]:
# Sort by Timestamp for time-series consistency, then renumber the index 0..n-1.
# A stable algorithm is requested explicitly: sort_values defaults to
# kind='quicksort', which gives no guarantee about the relative order of rows
# that share the same Timestamp. Row order matters here because the rolling
# Voltage_per_Cycle feature and the positional 80/20 split both depend on it.
# With 'mergesort', tied rows keep the order they had before the sort.
df = df.sort_values('Timestamp', kind='mergesort').reset_index(drop=True)
print("Data sorted chronologically by Timestamp")

Data sorted chronologically by Timestamp


In [29]:
# Feature Engineering: Power_Load
# Row-wise product of the Battery_Voltage and Battery_Current columns
voltage = df['Battery_Voltage']
current = df['Battery_Current']
df['Power_Load'] = voltage.mul(current)
print("Created Power_Load feature")

Created Power_Load feature


In [30]:
# Feature Engineering: Temp_Diff
# Battery_Temperature minus Ambient_Temperature, row by row
# (positive when the battery reading is above the ambient reading)
df['Temp_Diff'] = df['Battery_Temperature'].sub(df['Ambient_Temperature'])
print("Created Temp_Diff feature")

Created Temp_Diff feature


In [31]:
# Feature Engineering: Voltage_per_Cycle
# Trailing rolling mean of Battery_Voltage over the current row and up to 9
# preceding rows (in the DataFrame's current row order). min_periods=1 means the
# first rows are averaged over fewer than 10 values instead of producing NaN.
# NOTE(review): despite the name, the window is row-based and is not grouped by
# Charge_Cycles -- confirm that this is the intended definition.
voltage_window = df['Battery_Voltage'].rolling(window=10, min_periods=1)
df['Voltage_per_Cycle'] = voltage_window.mean()
print("Created Voltage_per_Cycle feature (rolling window=10)")

Created Voltage_per_Cycle feature (rolling window=10)


In [32]:
# Feature Selection: assemble the hybrid feature set from named groups.
# Dict insertion order determines the final column order.
feature_groups = {
    'core_battery': [
        'Battery_Voltage', 'Battery_Current', 'Battery_Temperature', 'Charge_Cycles', 'SoC',
    ],
    'vehicle_context': ['Driving_Speed', 'Motor_Torque', 'Power_Consumption'],
    'environment': ['Ambient_Temperature', 'Ambient_Humidity'],
    'engineered': ['Power_Load', 'Temp_Diff', 'Voltage_per_Cycle'],
}
feature_columns = [column for group in feature_groups.values() for column in group]

# Feature matrix and regression target (SoH)
X = df[feature_columns]
y = df['SoH']

print(f"Features selected: {len(feature_columns)}")
print(f"Feature set shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features selected: 13
Feature set shape: (175393, 13)
Target shape: (175393,)


In [33]:
# Time-series train-test split (80/20), done by position with no shuffling:
# the first 80% of rows form the training set and the remaining rows the test set.
split_index = int(len(X) * 0.8)

X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

n_train = X_train.shape[0]
n_test = X_test.shape[0]
print(f"Training set: {n_train:,} samples")
print(f"Test set: {n_test:,} samples")
print(f"Split ratio: {n_train/len(X)*100:.1f}% train, {n_test/len(X)*100:.1f}% test")

Training set: 140,314 samples
Test set: 35,079 samples
Split ratio: 80.0% train, 20.0% test


In [34]:
# Feature Scaling using StandardScaler.
# The scaler is fitted on the training rows only; the same fitted scaler is then
# applied to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)

# Wrap the scaled arrays in DataFrames that keep the feature names and the
# original row index of each split
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=feature_columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=feature_columns, index=X_test.index
)

print("Features scaled using StandardScaler")
print("\nFirst 5 rows of preprocessed training set:")
X_train_scaled.head()

Features scaled using StandardScaler

First 5 rows of preprocessed training set:


Unnamed: 0,Battery_Voltage,Battery_Current,Battery_Temperature,Charge_Cycles,SoC,Driving_Speed,Motor_Torque,Power_Consumption,Ambient_Temperature,Ambient_Humidity,Power_Load,Temp_Diff,Voltage_per_Cycle
0,-2.577029,0.554037,-0.721915,-0.416382,0.156951,2.192703,-0.864396,-0.195489,-0.099159,2.66444,0.738302,-0.425864,-8.213814
1,0.20462,0.444791,2.346075,-0.279497,-2.457114,-0.592865,-0.966686,-0.380888,-2.302464,-0.340638,0.41462,3.276945,-3.78078
2,0.654045,0.247292,-0.442976,-0.158398,0.320185,0.064771,-0.783921,-0.553589,1.102311,-0.712264,0.160201,-1.100199,-1.825611
3,0.323427,0.22603,-0.427791,-0.643246,0.249342,-0.681203,0.03619,-0.662181,-0.238865,-0.147461,0.179324,-0.122463,-1.111475
4,0.674962,0.741203,-0.523392,-0.327704,0.573901,-0.819444,-0.084204,-0.48987,0.535557,-0.242515,0.69032,-0.746853,-0.4589
