# Import Module

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import BaseCrossValidator

# Read Dataset

In [2]:
# Load the merged battery dataset and list the distinct values of the
# `source` column (displayed as this cell's output).
df = pd.read_csv("../Datasets/merged_battery_dataset.csv")
pd.unique(df["source"])

array(['nasa', 'oxford', 'isu'], dtype=object)

In [3]:
# Reload the raw CSV so that this cell starts from the unmodified file every
# time it is executed.
df = pd.read_csv("../Datasets/merged_battery_dataset.csv")

# Clean the data of NaN, -inf, and inf values.
print(f"Initial Dataset entries {len(df)}")

# Pipeline: drop the selected columns, turn +/-inf into NaN, then discard
# every row that still holds a NaN.
# The original per-column note reads "reduces from 6034" -- presumably keeping
# any of these columns shrinks the post-dropna row count below 6034;
# TODO confirm.
# All other columns (battery_id, soh, computed_q_max_Ah, q_gap_abs, q_gap_pct,
# source, test_condition, ...) are deliberately kept.
df = (
    df.drop(
        columns=[
            'capacity_reported_Ah',
            'dT_dt_mean',
            'dT_dt_std',
            'dT_dt_max',
            'dT_dt_min',
        ]
    )
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

print(f"Dataset entries after drop NaN, inf, -inf {len(df)}")


Initial Dataset entries 6855
Dataset entries after drop NaN, inf, -inf 6034


# Custom Battery-Wise Stratified Split

In [4]:
class BatteryWiseSplit(BaseCrossValidator):
    """Cross-validator that keeps every battery entirely inside one fold.

    Batteries are grouped by the value found in ``column_group`` (``'source'``
    by default). Inside each group the battery ids are shuffled and dealt out
    round-robin to the folds, so each fold receives batteries from every group
    that has enough of them. The dealing restarts at fold 0 for every group,
    which means the lower-numbered folds receive the surplus batteries when a
    group does not divide evenly.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds to generate.
    random_state : int or None, default=None
        Seed passed to ``np.random.RandomState`` for shuffling the batteries.
    column_group : str, default='source'
        Name of the column used to stratify the batteries.
    """

    def __init__(self, n_splits=5, random_state=None, column_group='source'):
        self.n_splits = n_splits
        self.random_state = random_state
        self.column_group = column_group

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; all arguments are ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_idx, val_idx)`` arrays of positional row indices.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``'battery_id'`` column and the ``column_group``
            column.
        y, groups : ignored
            Present only for scikit-learn API compatibility.

        Yields
        ------
        train_idx, val_idx : numpy.ndarray
            Integer row *positions* (suitable for ``.iloc``), independent of
            the DataFrame's index labels.

        Raises
        ------
        ValueError
            If a required column is missing from ``X``.
        """
        if ('battery_id' not in X.columns) or (self.column_group not in X.columns):
            raise ValueError(f"DataFrame must contain 'battery_id' and '{self.column_group}' columns.")

        rng = np.random.RandomState(self.random_state)
        unique_batteries = X[["battery_id", self.column_group]].drop_duplicates()
        batteries_by_source = unique_batteries.groupby(self.column_group)["battery_id"].apply(list)
        folds = [[] for _ in range(self.n_splits)]

        # Deal each group's shuffled batteries round-robin across the folds.
        for _, battery_list in batteries_by_source.items():
            rng.shuffle(battery_list)
            for i, battery_id in enumerate(battery_list):
                folds[i % self.n_splits].append(battery_id)

        battery_ids = X["battery_id"]
        for fold_batteries in folds:
            # Use a boolean mask and np.flatnonzero so the yielded values are
            # row positions. Index labels would only coincide with positions
            # on a fresh 0..n-1 index and would otherwise select the wrong
            # rows when used with .iloc.
            val_mask = battery_ids.isin(set(fold_batteries)).to_numpy()
            train_idx = np.flatnonzero(~val_mask)
            val_idx = np.flatnonzero(val_mask)

            yield train_idx, val_idx

# Modeling

## Define Features and Target

In [5]:
# Show the column labels currently present in `df`.
df.columns

Index(['battery_id', 'cycle_idx', 'rated_capacity', 'ambient_temperature',
       'capacity', 'soh', 'computed_q_max_Ah', 'duration', 'q_gap_abs',
       'q_gap_pct', 'dv_dt_mean', 'dv_dt_std', 'dv_dt_max', 'dv_dt_min',
       'dq_dt_mean', 'dq_dt_std', 'dq_dt_max', 'dq_dt_min', 'test_condition',
       'source'],
      dtype='object')

In [6]:
# Column names used as model inputs, and the column the model predicts.
# NOTE(review): 'battery_id' is also the key the splitter groups on, and
# 'computed_q_max_Ah' / 'q_gap_abs' / 'q_gap_pct' look capacity-derived --
# confirm that none of them leak the 'soh' target before trusting the scores.
features_column = [
    'battery_id',
    'cycle_idx',
    'rated_capacity',
    'ambient_temperature',
    'computed_q_max_Ah',
    'duration',
    'q_gap_abs',
    'q_gap_pct',
    'dv_dt_mean',
    'dv_dt_std',
    'dv_dt_max',
    'dv_dt_min',
    'dq_dt_mean',
    'dq_dt_std',
    'dq_dt_max',
    'dq_dt_min',
    'test_condition',
    'source',
]
target_column = 'soh'

X, y = df[features_column], df[target_column]

## Split Train Test

In [7]:
# Hold-out split: the first fold produced by a 5-fold battery-wise splitter
# becomes the test set, the remaining batteries form the training set.
splitter = BatteryWiseSplit(column_group='source', n_splits=5, random_state=42)

# Give `df` a fresh 0..n-1 index before the indices are used with .iloc.
df = df.reset_index(drop=True)
holdout_folds = splitter.split(df)
train_idx, test_idx = next(holdout_folds)

df1 = df.iloc[train_idx]
df_test = df.iloc[test_idx]
print(len(train_idx), len(test_idx), sep="\n")


4633
1401


## Initialize custom CV

In [8]:
# Battery-wise 5-fold splitter for cross-validation; it is seeded differently
# (1) from the hold-out splitter (42).
cv = BatteryWiseSplit(
    column_group='source',
    n_splits=5,
    random_state=1,
)

## Initialize Model

In [9]:
# Import simple models from sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

model = XGBRegressor(
    n_estimators=50,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1
)

# CHOOSE YOUR MODEL HERE 👇
# model = RandomForestRegressor(
#     n_estimators=100,
#     random_state=42,
#     n_jobs=-1
# )
# model = SVR(kernel='rbf', C=10)
model = LinearRegression()
# model = KNeighborsRegressor(n_neighbors=5)

## Cross Validation loop

In [10]:
# === Initialize Metrics ===
rmse_scores, mae_scores, r2_scores = [], [], []

# === Reset Index for Consistent iloc Access ===
df1.reset_index(drop=True, inplace=True)

# === Cross-Validation Loop ===
for fold, (train_idx, val_idx) in enumerate(cv.split(df1), 1):
    # Select Features and Target
    X = df1[features_column]
    y = df1[target_column]

    # One-Hot Encode Categorical Features (avoiding leakage by using entire X, not split parts)
    cat_cols = X.select_dtypes(include='object').columns.tolist()
    X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)
    X_encoded = X_encoded.fillna(0)

    # Split into Training and Validation Sets
    X_train, X_val = X_encoded.iloc[train_idx], X_encoded.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train Model and Predict
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # Evaluate
    rmse_scores.append(root_mean_squared_error(y_val, y_pred))
    mae_scores.append(mean_absolute_error(y_val, y_pred))
    r2_scores.append(r2_score(y_val, y_pred))

    print(f"Fold {fold} — RMSE: {rmse_scores[-1]:.4f}, MAE: {mae_scores[-1]:.4f}, R²: {r2_scores[-1]:.4f}")

# === Final Evaluation Summary ===
print("\n=== Average Performance ===")
print(f"Average RMSE: {np.mean(rmse_scores):.4f}")
print(f"Average MAE : {np.mean(mae_scores):.4f}")
print(f"Average R²  : {np.mean(r2_scores):.4f}")


Fold 1 — RMSE: 0.2405, MAE: 0.1181, R²: 0.0171
Fold 2 — RMSE: 0.0653, MAE: 0.0367, R²: 0.8735
Fold 3 — RMSE: 0.0632, MAE: 0.0313, R²: 0.9132
Fold 4 — RMSE: 0.1530, MAE: 0.0702, R²: 0.7344
Fold 5 — RMSE: 0.0692, MAE: 0.0345, R²: 0.8798

=== Average Performance ===
Average RMSE: 0.1182
Average MAE : 0.0582
Average R²  : 0.6836
