# Data Acquisition and EDA

## Import necessary libraries

In [1]:
# Import necessary libraries

# Data manipulation and analysis
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path
import psutil # For system and process utilities

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical analysis
from scipy import stats
from scipy.signal import find_peaks

# Time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Machine learning for additional insights
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

# Progress tracking
from tqdm.auto import tqdm

# Warnings management
import warnings
warnings.filterwarnings("ignore")

# Plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

print("All libraries imported successfully.")

All libraries imported successfully.


  from .autonotebook import tqdm as notebook_tqdm


## Load the dataset

In [2]:
# Setup the path to the dataset
ZENODO_PRIMARY_PATH = "../datasets/Primary_use_phase/"

# Load the dataset
primary_files = glob.glob(os.path.join(ZENODO_PRIMARY_PATH, "*.csv"))
zenodo_primary_data = {}
for file in primary_files:
    file_name = Path(file).stem
    zenodo_primary_data[file_name] = pd.read_csv(file)
    print(f"Loaded {file_name} with shape: {zenodo_primary_data[file_name].shape}")
print("Dataset loaded successfully.")

Loaded 1 with shape: (728434, 15)
Loaded 10 with shape: (655813, 15)
Loaded 11 with shape: (654196, 15)
Loaded 12 with shape: (811734, 15)
Loaded 13 with shape: (110918, 15)
Loaded 14 with shape: (103443, 15)
Loaded 15 with shape: (131426, 15)
Loaded 16 with shape: (110471, 15)
Loaded 17 with shape: (1727609, 15)
Loaded 18 with shape: (1664229, 15)
Loaded 19 with shape: (1746745, 15)
Loaded 2 with shape: (763299, 15)
Loaded 20 with shape: (1656244, 15)
Loaded 21 with shape: (2051653, 15)
Loaded 22 with shape: (2144592, 15)
Loaded 23 with shape: (2183081, 15)
Loaded 24 with shape: (1904977, 15)
Loaded 25 with shape: (186045, 15)
Loaded 26 with shape: (286461, 15)
Loaded 27 with shape: (137904, 15)
Loaded 28 with shape: (116591, 15)
Loaded 29 with shape: (1627854, 15)
Loaded 3 with shape: (980242, 15)
Loaded 30 with shape: (1526947, 15)
Loaded 31 with shape: (1430848, 15)
Loaded 32 with shape: (1510949, 15)
Loaded 33 with shape: (263433, 15)
Loaded 34 with shape: (187392, 15)
Loaded 35 w

In [3]:
# Set the dataframe for primary use phase (all file combined)
primary_df = pd.concat(zenodo_primary_data.values(), ignore_index=True)
print("Primary use phase dataframe created successfully with shape:.", primary_df.shape)

Primary use phase dataframe created successfully with shape:. (87484642, 15)


In [4]:
# Check the dataframe info
primary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87484642 entries, 0 to 87484641
Data columns (total 15 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   Unnamed: 0                int64  
 1   Test_Time(s)              float64
 2   Step_Time(s)              float64
 3   Cycle_Index               int64  
 4   Step_Index                int64  
 5   Current(A)                float64
 6   Voltage(V)                float64
 7   Charge_Capacity(Ah)       float64
 8   Discharge_Capacity(Ah)    float64
 9   Charge_Energy(Wh)         float64
 10  Discharge_Energy(Wh)      float64
 11  Internal_Resistance(Ohm)  float64
 12  Aux_Temperature_1(C)      float64
 13  Battery_ID                int64  
 14  Protocol_ID               int64  
dtypes: float64(10), int64(5)
memory usage: 9.8 GB


In [5]:
# Drop internal resistance columns if they exist
if 'Internal_Resistance(Ohm)' in primary_df.columns:
    primary_df = primary_df.drop(columns=['Internal_Resistance(Ohm)'])
    print("Dropped 'Internal_Resistance(Ohm)' column.")
print("Current dataframe shape:", primary_df.shape)

Dropped 'Internal_Resistance(Ohm)' column.
Current dataframe shape: (87484642, 14)


In [6]:
primary_df.head()

Unnamed: 0.1,Unnamed: 0,Test_Time(s),Step_Time(s),Cycle_Index,Step_Index,Current(A),Voltage(V),Charge_Capacity(Ah),Discharge_Capacity(Ah),Charge_Energy(Wh),Discharge_Energy(Wh),Aux_Temperature_1(C),Battery_ID,Protocol_ID
0,863,8564.156,10.001,1,12,1.2,3.3,0.003,0.0,0.011,0.0,25.282,9,2
1,864,8574.157,20.001,1,12,1.2,3.322,0.007,0.0,0.022,0.0,25.251,9,2
2,865,8584.157,30.002,1,12,1.2,3.342,0.01,0.0,0.033,0.0,25.247,9,2
3,866,8594.156,40.001,1,12,1.2,3.359,0.013,0.0,0.044,0.0,25.211,9,2
4,867,8604.157,50.002,1,12,1.2,3.375,0.017,0.0,0.055,0.0,25.169,9,2


## Understanding Battery Degradation and RUL

RUL (Remaining Useful Life) represents the number of cycles a battery can still perform before it reaches its end-of-life(EOL) criteria. Typically, a battery is considered at EOL when its capacity degrades to 80% of its initial capacity.

In [7]:
# Analyze battery-level degradation patterns
# Extract cycle-level statistics for each battery

def extract_cycle_features(df):
    """Extract key features at the cycle level for RUL prediction"""
    cycle_stats = df.groupby(['Battery_ID', 'Cycle_Index', 'Protocol_ID']).agg({
        'Discharge_Capacity(Ah)': ['max', 'mean', 'min'],
        'Charge_Capacity(Ah)': ['max', 'mean', 'min'],
        'Voltage(V)': ['max', 'mean', 'min', 'std'],
        'Current(A)': ['mean', 'std'],
        'Discharge_Energy(Wh)': 'max',
        'Charge_Energy(Wh)': 'max',
        'Aux_Temperature_1(C)': ['mean', 'max', 'min', 'std']
    }).reset_index()
    
    # Flatten column names
    cycle_stats.columns = ['_'.join(col).strip('_') if col[1] else col[0] 
                           for col in cycle_stats.columns.values]
    
    # Include absolute current mean to better capture C-rate severity
    abs_current = (df.assign(Current_abs=lambda x: x['Current(A)'].abs())
                     .groupby(['Battery_ID', 'Cycle_Index'])['Current_abs']
                     .mean()
                     .reset_index(name='Current(A)_abs_mean'))
    cycle_stats = cycle_stats.merge(abs_current, on=['Battery_ID', 'Cycle_Index'], how='left')
    
    return cycle_stats

print("Processing cycle-level features for primary use phase...")
print("This will process ALL batteries in batches of 25...")

# Get all unique batteries
all_batteries = primary_df['Battery_ID'].unique()
total_batteries = len(all_batteries)
batch_size = 25
num_batches = (total_batteries + batch_size - 1) // batch_size  # Ceiling division

print(f"Total batteries: {total_batteries}")
print(f"Batch size: {batch_size}")
print(f"Number of batches: {num_batches}")

# Process in batches and store results
all_cycle_features = []

for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min((batch_idx + 1) * batch_size, total_batteries)
    
    batch_batteries = all_batteries[start_idx:end_idx]
    
    print(f"\nProcessing batch {batch_idx + 1}/{num_batches} (Batteries {start_idx + 1}-{end_idx})...")
    
    # Filter data for current batch
    batch_df = primary_df[primary_df['Battery_ID'].isin(batch_batteries)]
    
    # Extract features for this batch
    batch_features = extract_cycle_features(batch_df)
    all_cycle_features.append(batch_features)
    
    print(f"  ✓ Batch {batch_idx + 1} completed: {len(batch_batteries)} batteries, {len(batch_features)} records")
    
    # Free memory
    del batch_df, batch_features

# Combine all batches
cycle_features = pd.concat(all_cycle_features, ignore_index=True)
del all_cycle_features

print(f"\n{'='*80}")
print(f"✓ All batches processed!")
print(f"✓ Total cycle features shape: {cycle_features.shape}")
print(f"✓ Total unique batteries processed: {cycle_features['Battery_ID'].nunique()}")
print(f"{'='*80}")

Processing cycle-level features for primary use phase...
This will process ALL batteries in batches of 25...
Total batteries: 86
Batch size: 25
Number of batches: 4

Processing batch 1/4 (Batteries 1-25)...
  ✓ Batch 1 completed: 25 batteries, 16190 records

Processing batch 2/4 (Batteries 26-50)...
  ✓ Batch 2 completed: 25 batteries, 8580 records

Processing batch 3/4 (Batteries 51-75)...
  ✓ Batch 3 completed: 25 batteries, 14355 records

Processing batch 4/4 (Batteries 76-86)...
  ✓ Batch 4 completed: 11 batteries, 10056 records

✓ All batches processed!
✓ Total cycle features shape: (49181, 22)
✓ Total unique batteries processed: 86


In [8]:
cycle_features.head()

Unnamed: 0,Battery_ID,Cycle_Index,Protocol_ID,Discharge_Capacity(Ah)_max,Discharge_Capacity(Ah)_mean,Discharge_Capacity(Ah)_min,Charge_Capacity(Ah)_max,Charge_Capacity(Ah)_mean,Charge_Capacity(Ah)_min,Voltage(V)_max,Voltage(V)_mean,Voltage(V)_min,Voltage(V)_std,Current(A)_mean,Current(A)_std,Discharge_Energy(Wh)_max,Charge_Energy(Wh)_max,Aux_Temperature_1(C)_mean,Aux_Temperature_1(C)_max,Aux_Temperature_1(C)_min,Aux_Temperature_1(C)_std,Current(A)_abs_mean
0,1,1,1,2.422,0.849,0.0,2.394,2.115,0.003,4.2,3.825,3.0,0.288,-0.003,0.671,9.163,9.37,25.004,25.438,24.398,0.213,0.557
1,1,2,1,2.401,0.442,0.0,2.416,1.819,0.003,4.2,3.869,3.0,0.295,0.002,1.441,8.839,9.449,25.745,29.539,24.382,1.491,1.174
2,1,3,1,2.399,0.443,0.0,2.397,1.806,0.003,4.2,3.87,3.0,0.293,-0.001,1.442,8.836,9.377,25.8,29.626,24.458,1.469,1.174
3,1,4,1,2.398,0.444,0.0,2.395,1.804,0.003,4.2,3.87,3.0,0.293,-0.002,1.443,8.832,9.369,25.788,29.583,24.368,1.496,1.176
4,1,5,1,2.397,0.444,0.0,2.394,1.803,0.003,4.2,3.87,3.0,0.293,-0.002,1.444,8.83,9.364,25.787,29.498,24.376,1.478,1.176


In [9]:
# Check protocol ID with highest cycle index
protocol_cycle_counts = cycle_features.groupby(['Battery_ID','Protocol_ID'])['Cycle_Index'].max().reset_index()
protocol_summary = protocol_cycle_counts.groupby('Protocol_ID')['Cycle_Index'].agg(['mean', 'max', 'min', 'count']).reset_index()
protocol_summary

Unnamed: 0,Protocol_ID,mean,max,min,count
0,1,941.0,1116,711,8
1,2,512.25,586,416,8
2,3,445.167,491,416,6
3,4,475.375,546,441,8
4,5,153.5,196,111,8
5,6,76.0,91,66,6
6,7,1096.0,1126,1061,4
7,8,1363.5,1436,1251,4
8,9,149.75,196,126,4
9,10,86.0,91,76,4


## Calculate SOH and RUL

**1. SOH (State of Health)**
This formula calculates the battery's current health as a percentage of its "new" condition.

$$
SOH(\%) = \left( \frac{\text{Current\_Capacity}}{\text{Nominal\_Capacity}} \right) \times 100
$$

* **Current\_Capacity:** `Discharge_Capacity(Ah)_max` for the current cycle.
* **Nominal\_Capacity:** The `Discharge_Capacity(Ah)_max` from `Cycle_Index = 1`.

**2. RUL (Remaining Useful Life)**
This formula calculates the "answer" (target variable) for model.

$$
RUL = \text{EOL\_Cycle} - \text{Current\_Cycle\_Index}
$$

* **EOL\_Cycle:** The first `Cycle_Index` where the `SOH(%)` drops to or below the threshold (e.g., 80.0).
* **Current\_Cycle\_Index:** The `Cycle_Index` of the current row.


In [10]:
# Find the nominal capacity for each battery id
nominal_capacities = {}
for battery_id in cycle_features['Battery_ID'].unique():
    battery_data = cycle_features[cycle_features['Battery_ID'] == battery_id]
    nominal_capacity = battery_data[battery_data['Cycle_Index'] == 1]['Discharge_Capacity(Ah)_max'].values[0]
    nominal_capacities[battery_id] = nominal_capacity
print("Nominal capacities for each battery ID calculated successfully.")
print(nominal_capacities)

Nominal capacities for each battery ID calculated successfully.
{np.int64(1): np.float64(2.421769), np.int64(2): np.float64(2.419718), np.int64(3): np.float64(2.427028), np.int64(9): np.float64(2.430357), np.int64(10): np.float64(2.433783), np.int64(11): np.float64(2.429017), np.int64(24): np.float64(2.4384), np.int64(25): np.float64(2.436674), np.int64(26): np.float64(2.4378), np.int64(39): np.float64(2.427216), np.int64(40): np.float64(2.434681), np.int64(41): np.float64(2.440794), np.int64(42): np.float64(2.438203), np.int64(45): np.float64(2.43206), np.int64(46): np.float64(2.430735), np.int64(47): np.float64(2.435594), np.int64(48): np.float64(2.44081), np.int64(49): np.float64(2.434667), np.int64(50): np.float64(2.432485), np.int64(51): np.float64(2.437141), np.int64(52): np.float64(2.438994), np.int64(53): np.float64(2.42512), np.int64(54): np.float64(2.432669), np.int64(57): np.float64(2.440253), np.int64(58): np.float64(2.447194), np.int64(4): np.float64(2.429459), np.int64(12

In [11]:
# Calculate the State of Health (SOH) for each cycle
cycle_features['SOH(%)'] = cycle_features.apply(
    lambda row: (row['Discharge_Capacity(Ah)_max'] / nominal_capacities[row['Battery_ID']]) * 100,
    axis=1
)
cycle_features.head()

Unnamed: 0,Battery_ID,Cycle_Index,Protocol_ID,Discharge_Capacity(Ah)_max,Discharge_Capacity(Ah)_mean,Discharge_Capacity(Ah)_min,Charge_Capacity(Ah)_max,Charge_Capacity(Ah)_mean,Charge_Capacity(Ah)_min,Voltage(V)_max,Voltage(V)_mean,Voltage(V)_min,Voltage(V)_std,Current(A)_mean,Current(A)_std,Discharge_Energy(Wh)_max,Charge_Energy(Wh)_max,Aux_Temperature_1(C)_mean,Aux_Temperature_1(C)_max,Aux_Temperature_1(C)_min,Aux_Temperature_1(C)_std,Current(A)_abs_mean,SOH(%)
0,1,1,1,2.422,0.849,0.0,2.394,2.115,0.003,4.2,3.825,3.0,0.288,-0.003,0.671,9.163,9.37,25.004,25.438,24.398,0.213,0.557,100.0
1,1,2,1,2.401,0.442,0.0,2.416,1.819,0.003,4.2,3.869,3.0,0.295,0.002,1.441,8.839,9.449,25.745,29.539,24.382,1.491,1.174,99.132
2,1,3,1,2.399,0.443,0.0,2.397,1.806,0.003,4.2,3.87,3.0,0.293,-0.001,1.442,8.836,9.377,25.8,29.626,24.458,1.469,1.174,99.077
3,1,4,1,2.398,0.444,0.0,2.395,1.804,0.003,4.2,3.87,3.0,0.293,-0.002,1.443,8.832,9.369,25.788,29.583,24.368,1.496,1.176,99.013
4,1,5,1,2.397,0.444,0.0,2.394,1.803,0.003,4.2,3.87,3.0,0.293,-0.002,1.444,8.83,9.364,25.787,29.498,24.376,1.478,1.176,98.979


In [12]:
# Calculate and display RUL for each battery cycle
# Set a threshold for end-of-life (EOL) SOH
EOL_SOH_THRESHOLD = 80.0  # 80% of nominal capacity
def calculate_rul(df, eol_threshold):
    """Calculate Remaining Useful Life (RUL) based on SOH threshold"""
    rul_list = []
    for battery_id in df['Battery_ID'].unique():
        battery_data = df[df['Battery_ID'] == battery_id].sort_values('Cycle_Index')
        eol_cycle = battery_data[battery_data['SOH(%)'] <= eol_threshold]['Cycle_Index'].min()
        if pd.isna(eol_cycle):
            eol_cycle = battery_data['Cycle_Index'].max() + 1  # If never reaches EOL, set beyond max cycle
        battery_data = battery_data.copy()
        battery_data['RUL'] = eol_cycle - battery_data['Cycle_Index']
        rul_list.append(battery_data)
    return pd.concat(rul_list, ignore_index=True)
cycle_features = calculate_rul(cycle_features, EOL_SOH_THRESHOLD)
cycle_features.head()

Unnamed: 0,Battery_ID,Cycle_Index,Protocol_ID,Discharge_Capacity(Ah)_max,Discharge_Capacity(Ah)_mean,Discharge_Capacity(Ah)_min,Charge_Capacity(Ah)_max,Charge_Capacity(Ah)_mean,Charge_Capacity(Ah)_min,Voltage(V)_max,Voltage(V)_mean,Voltage(V)_min,Voltage(V)_std,Current(A)_mean,Current(A)_std,Discharge_Energy(Wh)_max,Charge_Energy(Wh)_max,Aux_Temperature_1(C)_mean,Aux_Temperature_1(C)_max,Aux_Temperature_1(C)_min,Aux_Temperature_1(C)_std,Current(A)_abs_mean,SOH(%),RUL
0,1,1,1,2.422,0.849,0.0,2.394,2.115,0.003,4.2,3.825,3.0,0.288,-0.003,0.671,9.163,9.37,25.004,25.438,24.398,0.213,0.557,100.0,808
1,1,2,1,2.401,0.442,0.0,2.416,1.819,0.003,4.2,3.869,3.0,0.295,0.002,1.441,8.839,9.449,25.745,29.539,24.382,1.491,1.174,99.132,807
2,1,3,1,2.399,0.443,0.0,2.397,1.806,0.003,4.2,3.87,3.0,0.293,-0.001,1.442,8.836,9.377,25.8,29.626,24.458,1.469,1.174,99.077,806
3,1,4,1,2.398,0.444,0.0,2.395,1.804,0.003,4.2,3.87,3.0,0.293,-0.002,1.443,8.832,9.369,25.788,29.583,24.368,1.496,1.176,99.013,805
4,1,5,1,2.397,0.444,0.0,2.394,1.803,0.003,4.2,3.87,3.0,0.293,-0.002,1.444,8.83,9.364,25.787,29.498,24.376,1.478,1.176,98.979,804


In [13]:
# Show some RUL < 0
cycle_features[cycle_features['RUL'] < 0].head()

Unnamed: 0,Battery_ID,Cycle_Index,Protocol_ID,Discharge_Capacity(Ah)_max,Discharge_Capacity(Ah)_mean,Discharge_Capacity(Ah)_min,Charge_Capacity(Ah)_max,Charge_Capacity(Ah)_mean,Charge_Capacity(Ah)_min,Voltage(V)_max,Voltage(V)_mean,Voltage(V)_min,Voltage(V)_std,Current(A)_mean,Current(A)_std,Discharge_Energy(Wh)_max,Charge_Energy(Wh)_max,Aux_Temperature_1(C)_mean,Aux_Temperature_1(C)_max,Aux_Temperature_1(C)_min,Aux_Temperature_1(C)_std,Current(A)_abs_mean,SOH(%),RUL
809,1,810,1,1.933,0.34,0.0,1.932,1.505,0.003,4.2,3.922,3.0,0.284,-0.001,1.348,6.977,7.668,26.251,32.148,24.814,1.816,1.036,79.807,-1
810,1,811,1,2.08,0.717,0.0,1.929,1.724,0.003,4.2,3.836,3.0,0.292,-0.02,0.634,7.792,7.655,25.271,26.112,24.817,0.284,0.516,85.896,-2
811,1,812,1,1.938,0.327,0.0,2.069,1.601,0.003,4.2,3.923,3.0,0.284,0.033,1.334,6.996,8.19,26.229,32.221,24.817,1.825,1.028,80.026,-3
812,1,813,1,1.933,0.34,0.0,1.936,1.508,0.003,4.2,3.923,3.0,0.284,0.0,1.347,6.977,7.683,26.291,32.102,24.838,1.837,1.035,79.81,-4
813,1,814,1,1.929,0.34,0.0,1.93,1.504,0.003,4.2,3.923,3.0,0.284,-0.001,1.347,6.96,7.658,26.257,32.091,24.791,1.818,1.034,79.635,-5


In [14]:
# Save the processed cycle features to a CSV file
output_path = "../datasets/preprocessed/primary_use_phase_rul.csv"
cycle_features.to_csv(output_path, index=False)
print(f"Processed cycle features with RUL saved to {output_path}.")

Processed cycle features with RUL saved to ../datasets/preprocessed/primary_use_phase_rul.csv.
