In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import scipy.io
from pathlib import Path

print("Libraries imported successfully.")

# --- Configuration ---
# Define the path to our data folder
DATA_PATH = Path('../data/')
# Define the failure threshold for battery capacity (e.g., 70% of initial capacity)
# For this dataset, the nominal capacity is 2 Ah. 70% is 1.4 Ah.
FAILURE_THRESHOLD = 1.4 

# --- Data Loading and Processing ---
all_battery_data = []

# Get a list of all .mat files in the data directory
mat_files = list(DATA_PATH.glob('*.mat'))
print(f"Found {len(mat_files)} battery data files.")

for file_path in mat_files:
    print(f"Processing file: {file_path.name}...")
    
    # Load the .mat file
    mat = scipy.io.loadmat(file_path)
    
    # Extract battery ID from the filename (e.g., 'B0005')
    battery_id = file_path.stem
    
    # Navigate the nested structure to get to the cycle data
    cycle_data = mat[battery_id][0, 0]['cycle'][0]
    
    # Store data for this specific battery
    battery_cycles = []
    
    # Initialize a "healthy" voltage curve from one of the first few cycles
    # This will be our "Digital Twin" baseline to compare against
    healthy_voltage_curve = None
    
    for i, cycle in enumerate(cycle_data):
        cycle_type = cycle['type'][0]
        
        # We are interested in 'discharge' cycles as they show capacity degradation
        if cycle_type == 'discharge':
            measurement = cycle['data'][0, 0]
            
            # --- Feature Calculation for the current cycle ---
            # 1. Capacity (The most important health indicator)
            # Capacity is the integral of current over time.
            capacity = measurement['Capacity'][0][0]
            
            # 2. Mean Temperature, Voltage, and Current
            temp_mean = np.mean(measurement['Temperature_measured'])
            voltage_mean = np.mean(measurement['Voltage_measured'])
            current_mean = np.mean(measurement['Current_measured'])
            
            # --- INNOVATIVE FEATURE: Degradation Anomaly Score ---
            # We compare the current cycle's voltage curve to a "healthy" one.
            current_voltage_curve = measurement['Voltage_measured']
            
            # Set the second cycle's curve as our "healthy" baseline
            if i == 1: 
                healthy_voltage_curve = current_voltage_curve
            
            anomaly_score = 0.0
            if healthy_voltage_curve is not None:
                # To compare curves of different lengths, we resample them to a fixed length (e.g., 100 points)
                resampled_healthy = np.interp(np.linspace(0, 1, 100), np.linspace(0, 1, len(healthy_voltage_curve)), healthy_voltage_curve.flatten())
                resampled_current = np.interp(np.linspace(0, 1, 100), np.linspace(0, 1, len(current_voltage_curve)), current_voltage_curve.flatten())
                
                # Anomaly score is the Mean Squared Error between the healthy and current curves
                anomaly_score = np.mean((resampled_healthy - resampled_current) ** 2)

            battery_cycles.append({
                'battery_id': battery_id,
                'cycle': i + 1,
                'capacity': capacity,
                'temp_mean': temp_mean,
                'voltage_mean': voltage_mean,
                'current_mean': current_mean,
                'degradation_anomaly_score': anomaly_score
            })

    # Convert the list of cycle data for this battery into a DataFrame
    df_battery = pd.DataFrame(battery_cycles)
    
    # --- Target Variable Calculation (RUL) ---
    # Find the total life of the battery in cycles (when capacity drops below threshold)
    failure_cycle = df_battery[df_battery['capacity'] <= FAILURE_THRESHOLD].iloc[0]
    total_life_in_cycles = failure_cycle['cycle']
    
    # Calculate RUL for each cycle
    df_battery['RUL'] = total_life_in_cycles - df_battery['cycle']
    
    # Add this battery's processed data to our master list
    all_battery_data.append(df_battery)

# Concatenate all battery DataFrames into a single master DataFrame
final_df = pd.concat(all_battery_data, ignore_index=True)

# --- Save the Processed Data ---
# Save the final, clean dataset to a CSV file for easy access later
output_path = DATA_PATH / 'processed_battery_data.csv'
final_df.to_csv(output_path, index=False)

print("\n--- Processing Complete ---")
print(f"Final DataFrame has {len(final_df)} rows and {len(final_df.columns)} columns.")
print(f"Processed data saved to: {output_path}")

final_df.head()

Libraries imported successfully.
Found 4 battery data files.
Processing file: B0053.mat...
Processing file: B0054.mat...
Processing file: B0055.mat...
Processing file: B0056.mat...

--- Processing Complete ---
Final DataFrame has 363 rows and 8 columns.
Processed data saved to: ..\data\processed_battery_data.csv


Unnamed: 0,battery_id,cycle,capacity,temp_mean,voltage_mean,current_mean,degradation_anomaly_score,RUL
0,B0053,1,1.069142,11.988577,3.082839,-1.993561,0.0,0
1,B0053,5,1.154073,12.059863,3.110329,-1.995265,0.0,-4
2,B0053,7,1.150153,11.967885,3.110463,-1.995272,0.0,-6
3,B0053,9,1.130586,11.985901,3.115028,-1.994739,0.0,-8
4,B0053,11,1.12179,12.004377,3.109376,-1.994717,0.0,-10


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import scipy.io
from pathlib import Path

print("Libraries imported successfully.")

# --- Configuration ---
DATA_PATH = Path('../data/')
# NEW: We will calculate the failure threshold dynamically per battery.
# We'll define failure as dropping below 80% of the initial capacity.
FAILURE_PERCENTAGE = 0.80

# --- Data Loading and Processing ---
all_battery_data = []
mat_files = list(DATA_PATH.glob('*.mat'))
print(f"Found {len(mat_files)} battery data files.")

for file_path in mat_files:
    print(f"Processing file: {file_path.name}...")
    
    mat = scipy.io.loadmat(file_path)
    battery_id = file_path.stem
    cycle_data = mat[battery_id][0, 0]['cycle'][0]
    
    battery_cycles = []
    healthy_voltage_curve = None
    
    for i, cycle in enumerate(cycle_data):
        cycle_type = cycle['type'][0]
        
        if cycle_type == 'discharge':
            measurement = cycle['data'][0, 0]
            capacity = measurement['Capacity'][0][0]
            temp_mean = np.mean(measurement['Temperature_measured'])
            voltage_mean = np.mean(measurement['Voltage_measured'])
            current_mean = np.mean(measurement['Current_measured'])
            
            current_voltage_curve = measurement['Voltage_measured']
            if i == 1: 
                healthy_voltage_curve = current_voltage_curve
            
            anomaly_score = 0.0
            if healthy_voltage_curve is not None:
                resampled_healthy = np.interp(np.linspace(0, 1, 100), np.linspace(0, 1, len(healthy_voltage_curve)), healthy_voltage_curve.flatten())
                resampled_current = np.interp(np.linspace(0, 1, 100), np.linspace(0, 1, len(current_voltage_curve)), current_voltage_curve.flatten())
                anomaly_score = np.mean((resampled_healthy - resampled_current) ** 2)

            battery_cycles.append({
                'battery_id': battery_id,
                'cycle': i + 1,
                'capacity': capacity,
                'temp_mean': temp_mean,
                'voltage_mean': voltage_mean,
                'current_mean': current_mean,
                'degradation_anomaly_score': anomaly_score
            })

    df_battery = pd.DataFrame(battery_cycles)
    
    # --- CORRECTED: Target Variable Calculation (RUL) ---
    # 1. Get the initial capacity of this specific battery
    initial_capacity = df_battery['capacity'].iloc[0]
    
    # 2. Calculate the dynamic failure threshold for this battery
    dynamic_failure_threshold = initial_capacity * FAILURE_PERCENTAGE
    
    # 3. Find the cycle number where the capacity first drops below our dynamic threshold
    failure_cycle_df = df_battery[df_battery['capacity'] <= dynamic_failure_threshold]
    
    if not failure_cycle_df.empty:
        total_life_in_cycles = failure_cycle_df['cycle'].iloc[0]
        # 4. Calculate RUL for each cycle
        df_battery['RUL'] = total_life_in_cycles - df_battery['cycle']
        all_battery_data.append(df_battery)
    else:
        print(f"WARNING: Battery {battery_id} did not reach failure threshold. It will be excluded.")


# Concatenate all battery DataFrames into a single master DataFrame
final_df = pd.concat(all_battery_data, ignore_index=True)

# --- Save the Processed Data ---
output_path = DATA_PATH / 'processed_battery_data.csv'
final_df.to_csv(output_path, index=False)

print("\n--- Processing Complete ---")
print(f"Final DataFrame has {len(final_df)} rows and {len(final_df.columns)} columns.")
print(f"Processed data saved to: {output_path}")

# Display the first few rows of the final dataset
print("\n--- Corrected Data Preview ---")
print(final_df.head())

Libraries imported successfully.
Found 4 battery data files.
Processing file: B0053.mat...
Processing file: B0054.mat...
Processing file: B0055.mat...
Processing file: B0056.mat...

--- Processing Complete ---
Final DataFrame has 159 rows and 8 columns.
Processed data saved to: ..\data\processed_battery_data.csv

--- Corrected Data Preview ---
  battery_id  cycle  capacity  temp_mean  voltage_mean  current_mean  \
0      B0053      1  1.069142  11.988577      3.082839     -1.993561   
1      B0053      5  1.154073  12.059863      3.110329     -1.995265   
2      B0053      7  1.150153  11.967885      3.110463     -1.995272   
3      B0053      9  1.130586  11.985901      3.115028     -1.994739   
4      B0053     11  1.121790  12.004377      3.109376     -1.994717   

   degradation_anomaly_score  RUL  
0                        0.0  136  
1                        0.0  132  
2                        0.0  130  
3                        0.0  128  
4                        0.0  126  
