# 2.Feature Engineering and SOH Quantification

This notebook focuses on transforming cleaned battery time-series data into
quantitative health indicators.

The objectives of this notebook are:
- To derive discharge-based capacity features
- To compute State-of-Health (SOH) using industry-standard definitions
- To prepare a cycle-level dataset for degradation analysis and prediction

## 2.1 Set Up & Data Loading

### 2.1.1 Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

import warnings
warnings.filterwarnings("ignore")

### 2.1.2 Data Loading

The processed dataset generated in Notebook 1 is loaded.
This dataset contains validated time-series measurements
for charge and discharge phases.


In [2]:
# Path to processed data
data_path = Path("../data/processed")

# Load latest processed file manually or by name
data_file = "battery_timeseries_ac01_ac02_processed.csv"

data = pd.read_csv(data_path / data_file)

# Basic sanity check
print("Dataset loaded successfully")
print(f"Rows    : {len(data):,}")
print(f"Columns : {list(data.columns)}")
print(f"Cells   : {data['cell_id'].unique()}")

Dataset loaded successfully
Rows    : 105,095
Columns : ['time_s', 'current_a', 'voltage_v', 'temp_cell_c', 'temp_connector_c', 'cell_id', 'checkup_num', 'source_file', 'test_phase']
Cells   : ['AC01' 'AC02']


### 2.1.3 Schema Validation

In [3]:
required_columns = {
    "time_s",
    "current_a",
    "voltage_v",
    "cell_id",
    "checkup_num"
}

missing_cols = required_columns - set(data.columns)

if missing_cols:
    raise ValueError(f"Missing required columns: {missing_cols}")

print("Schema validation passed")


Schema validation passed


### 2.1.4 Ensure Correct Sorting

In [4]:
data = (
    data
    .sort_values(["cell_id", "checkup_num", "time_s"])
    .reset_index(drop=True)
)

print("Data sorted by cell_id → checkup_num → time_s")

Data sorted by cell_id → checkup_num → time_s


## 2.2 Data Preprocessing

### 2.2.1 Charge / Discharge Segmentation

In [5]:
data["test_phase"] = np.where(
    data["current_a"] < 0,
    "discharge",
    "charge"
)

data["test_phase"].value_counts()

test_phase
discharge    54086
charge       51009
Name: count, dtype: int64

### 2.2.2 Extract Discharge Data Only

In [6]:
discharge_df = data[data["test_phase"] == "discharge"].copy()

print(f"Discharge rows: {len(discharge_df):,}")
discharge_df.head()

Discharge rows: 54,086


Unnamed: 0,time_s,current_a,voltage_v,temp_cell_c,temp_connector_c,cell_id,checkup_num,source_file,test_phase
1919,9581.712,-0.009591,4.199609,25.449566,25.389814,AC01,0,AC01_CheckUp00_17-Jul-2020_Cap_raw.csv,discharge
1920,9582.717,-46.129978,4.154029,25.449566,25.389814,AC01,0,AC01_CheckUp00_17-Jul-2020_Cap_raw.csv,discharge
1921,9583.717,-49.83112,4.14847,25.442125,25.367395,AC01,0,AC01_CheckUp00_17-Jul-2020_Cap_raw.csv,discharge
1922,9584.724,-50.118862,4.145969,25.434622,25.382341,AC01,0,AC01_CheckUp00_17-Jul-2020_Cap_raw.csv,discharge
1923,9585.731,-50.110401,4.144301,25.464512,25.359922,AC01,0,AC01_CheckUp00_17-Jul-2020_Cap_raw.csv,discharge


### 2.2.3 Time-Step Calculation

In [7]:
discharge_df["delta_t"] = (
    discharge_df
    .groupby(["cell_id", "checkup_num"])["time_s"]
    .diff()
)

discharge_df["delta_t"] = discharge_df["delta_t"].fillna(0)

## 2.3 Feature Extraction

### 2.3.1 Instantaneous Capacity Increment (Ah)

In [8]:
# Convert current (A) and time (s) → Ah
discharge_df["dQ_ah"] = (
    discharge_df["current_a"].abs() *
    discharge_df["delta_t"] / 3600
)

### 2.3.2 Discharge Capacity per Checkup

In [9]:
# Aggregate discharge information per checkup
discharge_summary = (
    discharge_df
    .groupby(["cell_id", "checkup_num"], as_index=False)
    .agg(
        discharge_capacity_ah=("dQ_ah", "sum"),
        duration_s=("delta_t", "sum"),
        mean_current_a=("current_a", lambda x: x.abs().mean()),
        min_voltage_v=("voltage_v", "min")
    )
)

print(f"Total discharge checkups found: {len(discharge_summary)}")

Total discharge checkups found: 25


### 2.3.3 Valid Capacity Cycle Selection

In [10]:
MIN_CAPACITY_AH = 30
MAX_CAPACITY_AH = 70
MIN_DURATION_S = 1000
MIN_MEAN_CURRENT_A = 5

valid_capacity_cycles = discharge_summary[
    (discharge_summary["discharge_capacity_ah"].between(
        MIN_CAPACITY_AH, MAX_CAPACITY_AH
    )) &
    (discharge_summary["duration_s"] > MIN_DURATION_S) &
    (discharge_summary["mean_current_a"] > MIN_MEAN_CURRENT_A)
].copy()

print(f"Valid capacity cycles retained: {len(valid_capacity_cycles)}")

Valid capacity cycles retained: 21


## 2.4 SOH Calculation

### 2.4.1 Beginning of Life (BOL) Capacity

In [11]:
bol_capacity = (
    valid_capacity_cycles
    .sort_values("checkup_num")
    .groupby("cell_id", as_index=False)
    .first()
    .rename(columns={
        "discharge_capacity_ah": "bol_capacity_ah"
    })
)

bol_capacity

Unnamed: 0,cell_id,checkup_num,bol_capacity_ah,duration_s,mean_current_a,min_voltage_v
0,AC01,0,56.481591,11070.408,27.547249,2.50009
1,AC02,0,56.502266,11078.829,32.693205,2.500163


### 2.4.2 State of Health (SOH) Calculation

In [12]:
soh_df = valid_capacity_cycles.merge(
    bol_capacity,
    on="cell_id",
    how="left"
)

soh_df["soh"] = (
    soh_df["discharge_capacity_ah"] /
    soh_df["bol_capacity_ah"]
)

soh_df.head()

Unnamed: 0,cell_id,checkup_num_x,discharge_capacity_ah,duration_s_x,mean_current_a_x,min_voltage_v_x,checkup_num_y,bol_capacity_ah,duration_s_y,mean_current_a_y,min_voltage_v_y,soh
0,AC01,0,56.481591,11070.408,27.547249,2.50009,0,56.481591,11070.408,27.547249,2.50009,1.0
1,AC01,1,55.913008,11975.144,27.936964,2.50009,0,56.481591,11070.408,27.547249,2.50009,0.989933
2,AC01,2,55.450377,18275.024,27.62031,2.50009,0,56.481591,11070.408,27.547249,2.50009,0.981742
3,AC01,4,52.188337,10700.773,27.74321,2.50009,0,56.481591,11070.408,27.547249,2.50009,0.923988
4,AC01,5,50.617659,10563.948,25.874018,2.50009,0,56.481591,11070.408,27.547249,2.50009,0.89618


## 2.5 Validation & Export

### 2.5.1 Validation & Sanity Checks

In [13]:
assert soh_df["soh"].notna().all(), "SOH contains NaN values"
assert soh_df["soh"].between(0.5, 1.05).all(), "SOH outside physical range"

print("SOH validation passed")

SOH validation passed


### 2.5.2 Save Feature Dataset

In [14]:
folder_path = Path("../data/features")

feature_path = (
    folder_path /
    "battery_features_soh_ac01_ac02.csv"
)

soh_df.to_csv(feature_path, index=False)

print("Feature dataset saved")
print(feature_path)

Feature dataset saved
../data/features/battery_features_soh_ac01_ac02.csv
