In [5]:
from scipy.io import loadmat
import os
import pandas as pd
import numpy as np # Import numpy for calculations

# Define nominal capacity (update if needed for different batteries)
NOMINAL_CAPACITY = 2.0 # Ampere-hours (Ah) for B0005, B0006, B0007, B0018

# MATLAB per-sample field name -> output column name, listed in output column order.
_MEASUREMENT_COLUMNS = {
    'Voltage_measured': 'voltage_measured',
    'Current_measured': 'current_measured',
    'Temperature_measured': 'temperature_measured',
    'Current_load': 'current_load',
    'Voltage_load': 'voltage_load',
    'Time': 'time',
}


def _coulomb_counting_soc(current, elapsed, capacity, cycle_number):
    """
    Computes the per-sample State of Charge (%) of one discharge cycle.

    SoC starts at 100 % and is reduced by the cumulative charge removed
    (|current| * dt / 3600) divided by ``capacity``; the result is clamped
    to [0, 100]. NaN samples propagate as NaN.

    Args:
        current (array-like): Measured current per sample (sign is ignored).
        elapsed (array-like): Time stamp per sample (assumed to be seconds
                              since the start of the cycle).
        capacity (float): Capacity of this cycle in Ah.
        cycle_number (int): 1-based cycle entry number, used in warnings only.

    Returns:
        numpy.ndarray: SoC in percent, one value per sample.
    """
    current = np.asarray(current, dtype=float)
    elapsed = np.asarray(elapsed, dtype=float)

    # The first step is measured from t = 0, so dt[0] == elapsed[0].
    dt = np.diff(elapsed, prepend=0.0)
    negative = dt < 0
    for step in np.flatnonzero(negative):
        print(f"Warning: Negative time step dt={dt[step]} encountered in cycle {cycle_number}, step {step}. Resetting dt to 0.")
    dt[negative] = 0.0 # Avoid issues with non-monotonic time

    # A (near-)zero or NaN capacity cannot be used as a divisor; warn once per cycle.
    if not capacity > 1e-6:
        print(f"Warning: Cycle {cycle_number} has near-zero capacity ({capacity}). Setting SoC to 0.")
        return np.zeros(elapsed.shape)

    # Current (A) * time (s) / 3600 (s/h) = Ah; abs() because discharge current is negative.
    charge_removed_ah = np.cumsum(np.abs(current) * dt / 3600.0)
    soc = (1.0 - (charge_removed_ah / capacity)) * 100.0
    return np.clip(soc, 0.0, 100.0)


def _discharge_cycle_frame(row, cycle_number, discharge_index, nominal_capacity):
    """
    Builds the per-sample DataFrame for one discharge cycle entry.

    Args:
        row (numpy.void): One element of the 'cycle' struct array.
        cycle_number (int): 1-based position of the entry, used in warnings.
        discharge_index (int): Value written to the 'cycle' column.
        nominal_capacity (float): Nominal capacity in Ah used for SoH.

    Returns:
        pandas.DataFrame or None: One row per sample, or None (after printing
        a warning) when the entry fails an integrity check.
    """
    # dtype.names is None for non-struct values; treat that as "no fields".
    row_fields = row.dtype.names or ()
    if 'ambient_temperature' not in row_fields or row['ambient_temperature'].size == 0:
        print(f"Warning: Missing or empty 'ambient_temperature' in cycle {cycle_number}. Skipping measurement extraction for this cycle.")
        return None
    if 'data' not in row_fields or row['data'].size == 0:
        print(f"Warning: Missing or empty 'data' in discharge cycle {cycle_number}. Skipping measurement extraction for this cycle.")
        return None

    data_struct = np.ravel(row['data'])[0]
    data_fields = data_struct.dtype.names or ()
    required_fields = ('Capacity', *_MEASUREMENT_COLUMNS)
    if not all(field in data_fields for field in required_fields):
        print(f"Warning: Missing one or more required data fields in discharge cycle {cycle_number}. Skipping measurement extraction.")
        return None
    if data_struct['Capacity'].size == 0:
        print(f"Warning: Missing or empty 'Capacity' in discharge cycle {cycle_number}. Skipping measurement extraction.")
        return None

    ambient_temperature = np.ravel(row['ambient_temperature'])[0]
    cycle_capacity = np.ravel(data_struct['Capacity'])[0] # Capacity for this specific cycle

    # SoH = (Current Cycle Capacity / Nominal Capacity) * 100
    soh = (cycle_capacity / nominal_capacity) * 100.0

    # ravel() also copes with the 0x0 arrays loadmat yields for empty fields.
    measurements = {
        column: np.ravel(data_struct[field])
        for field, column in _MEASUREMENT_COLUMNS.items()
    }
    num_measurements = len(measurements['voltage_measured'])
    if any(len(arr) != num_measurements for arr in measurements.values()):
        print(f"Warning: Mismatched measurement array lengths in discharge cycle {cycle_number}. Skipping measurement extraction.")
        return None
    if num_measurements == 0:
        print(f"Warning: Zero measurements found in discharge cycle {cycle_number}. Skipping.")
        return None

    soc = _coulomb_counting_soc(measurements['current_measured'],
                                measurements['time'],
                                cycle_capacity, cycle_number)

    # Scalars are broadcast over the sample arrays; .item() yields plain
    # Python numbers so the column dtypes do not depend on the stored dtype.
    return pd.DataFrame({
        'cycle': discharge_index,
        'ambient_temperature': ambient_temperature.item(),
        'capacity': cycle_capacity.item(),
        **measurements,
        'SoH': soh,
        'SoC': soc,
    })


def load_data(mat_path, battery, nominal_capacity=NOMINAL_CAPACITY):
    """
    Loads battery data from MATLAB file, extracts discharge cycles,
    and calculates SoH and SoC.

    Every problem (unreadable file, missing key, malformed cycle) is reported
    with a printed message instead of an exception; malformed discharge
    cycles are skipped and do not consume a cycle number.

    Args:
        mat_path (str): Path to the .mat file.
        battery (str): The name of the battery struct in the .mat file (e.g., 'B0005').
        nominal_capacity (float): The nominal capacity of the battery in Ah.

    Returns:
        pandas.DataFrame: DataFrame containing processed battery data including
                          cycle, ambient_temperature, capacity, voltage_measured,
                          current_measured, temperature_measured, current_load,
                          voltage_load, time, SoH, and SoC. Empty when the file
                          cannot be used or holds no valid discharge cycle.
    """
    try:
        mat = loadmat(mat_path)
    except FileNotFoundError:
        print(f"Error: File not found at {mat_path}")
        return pd.DataFrame() # Return empty DataFrame on error
    except Exception as e: # Deliberately broad: this loader is best-effort
        print(f"Error loading {mat_path}: {e}")
        return pd.DataFrame()

    print(f"Processing: {mat_path} for battery {battery}")

    # Check if the battery key exists
    if battery not in mat:
        print(f"Error: Battery key '{battery}' not found in {mat_path}. Available keys: {list(mat.keys())}")
        return pd.DataFrame()

    # Check structure validity (basic checks)
    battery_array = mat[battery]
    if not isinstance(battery_array, np.ndarray) or battery_array.shape != (1, 1):
        print(f"Error: Unexpected structure for mat['{battery}'] in {mat_path}")
        return pd.DataFrame()
    battery_struct = battery_array[0, 0]
    if 'cycle' not in (battery_struct.dtype.names or ()):
        print(f"Error: 'cycle' field not found within mat['{battery}'][0, 0] in {mat_path}")
        return pd.DataFrame()

    cycles = np.ravel(battery_struct['cycle'])
    print(f"Found {len(cycles)} cycle entries.")

    counter = 0 # Number of discharge cycles successfully processed so far
    frames = []
    for cycle_number, row in enumerate(cycles, start=1):
        # Only entries whose 'type' field is 'discharge' are processed.
        if 'type' not in (row.dtype.names or ()):
            continue
        cycle_type = row['type']
        if cycle_type.size == 0 or cycle_type[0] != 'discharge':
            continue

        frame = _discharge_cycle_frame(row, cycle_number, counter + 1, nominal_capacity)
        if frame is not None:
            frames.append(frame)
            counter += 1

    print(f"Processed {counter} discharge cycles.")

    if not frames:
        print("Warning: No valid discharge data found.")
        return pd.DataFrame() # Return empty if no data

    return pd.concat(frames, ignore_index=True)

def calculate_RUL(df):
    """
    Adds a Remaining Useful Life (RUL) column counted in cycles.

    The largest value in the 'cycle' column is taken as the end of life, and
    each row's RUL is that value minus the row's own cycle number. The column
    is written onto the DataFrame that was passed in (no copy is made).

    Args:
        df (pandas.DataFrame): DataFrame with a 'cycle' column.

    Returns:
        pandas.DataFrame: The same DataFrame object, with 'RUL' added; returned
                          unchanged when it is empty or has no 'cycle' column.
    """
    has_cycle_data = 'cycle' in df.columns and not df.empty
    if not has_cycle_data:
        print("Warning: Cannot calculate RUL. DataFrame is empty or missing 'cycle' column.")
        return df

    cycle_numbers = df['cycle']
    last_cycle = cycle_numbers.max()
    print(f"End of Life (Max Cycle) detected: {last_cycle}")

    # Rows of the final cycle get RUL 0; earlier cycles count up from there.
    df['RUL'] = last_cycle - cycle_numbers
    return df

# --- Main Execution ---
# Batteries to convert; each is read from ../data/raw_data/<name>.mat and
# exported as one CSV into output_dir.
datasets_to_process = ["B0005", "B0006", "B0018"]
output_dir = "../data/processed_data" # Destination folder for the CSV exports

# Create the destination folder up front so the exports below can write to it.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    print(f"Created output directory: {output_dir}")

for name in datasets_to_process:
    mat_filename = f'../data/raw_data/{name}.mat'
    if os.path.exists(mat_filename):
        print(f"\n--- Processing battery: {name} ---")
    else:
        print(f"--- Skipping {name}: File '{mat_filename}' not found. ---")
        continue

    # The nominal capacity is passed explicitly even though it equals the default.
    dataset = load_data(mat_filename, name, nominal_capacity=NOMINAL_CAPACITY)
    if dataset.empty:
        print(f"--- No data processed for {name}. Skipping CSV export. ---")
        continue

    dataset_with_rul = calculate_RUL(dataset)
    csv_filename = os.path.join(output_dir, f'{name}_discharge_with_SoH_SoC_RUL.csv')

    # A failed export is reported and the loop moves on to the next battery.
    try:
        dataset_with_rul.to_csv(csv_filename, index=False)
        print(f"Successfully saved processed data to: {csv_filename}")
    except Exception as e:
        print(f"Error saving {csv_filename}: {e}")

print("\n--- Processing complete. ---")


--- Processing battery: B0005 ---
Processing: ../data/raw_data/B0005.mat for battery B0005
Found 616 cycle entries.
Processed 168 discharge cycles.
End of Life (Max Cycle) detected: 168
Successfully saved processed data to: ../data/processed_data/B0005_discharge_with_SoH_SoC_RUL.csv

--- Processing battery: B0006 ---
Processing: ../data/raw_data/B0006.mat for battery B0006
Found 616 cycle entries.
Processed 168 discharge cycles.
End of Life (Max Cycle) detected: 168
Successfully saved processed data to: ../data/processed_data/B0006_discharge_with_SoH_SoC_RUL.csv

--- Processing battery: B0018 ---
Processing: ../data/raw_data/B0018.mat for battery B0018
Found 319 cycle entries.
Processed 132 discharge cycles.
End of Life (Max Cycle) detected: 132
Successfully saved processed data to: ../data/processed_data/B0018_discharge_with_SoH_SoC_RUL.csv

--- Processing complete. ---
