### imports

In [3]:
import scipy.io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys

### initial data inspection

In [3]:
# Folder holding the raw NASA battery .mat files
RAW_DATA_DIR = '../data/raw_data/' # Adjust path if your notebook is elsewhere

# Show which files are available before loading anything
raw_dir_listing = os.listdir(RAW_DATA_DIR)
print("Files in raw_data directory:", raw_dir_listing)

# --- Inspect B0005.mat ---
file_path_b0005 = os.path.join(RAW_DATA_DIR, 'B0005.mat')

# Walk the nested structure one level at a time, reporting the first level
# that does not look as expected. Failure branches come first so the
# "happy path" reads top to bottom.
try:
    mat_b0005 = scipy.io.loadmat(file_path_b0005)
    print(f"\nSuccessfully loaded {os.path.basename(file_path_b0005)}")
    print("Type of loaded data:", type(mat_b0005))

    # Top-level keys of the loaded mapping
    print("\nTop-level keys:", mat_b0005.keys())

    # Key expected to hold the battery record (usually the filename without extension)
    main_key = 'B0005' # Adjust if necessary based on the keys printed above

    if main_key not in mat_b0005:
        print(f"\nCould not find the expected key '{main_key}' in the loaded .mat file.")
        print("Please check the 'Top-level keys' output and adjust the 'main_key' variable.")
    else:
        data_b0005 = mat_b0005[main_key]
        print(f"\nData structure within '{main_key}':")
        # The dtype describes the layout of the (possibly structured) array
        print("Data type (dtype):", data_b0005.dtype)

        top_level_fields = data_b0005.dtype.names
        if not top_level_fields:
            print(f"'{main_key}' data does not appear to be a structured array with named fields.")
        else:
            print("Field names:", top_level_fields)
            if 'cycle' not in top_level_fields:
                print(f"'{main_key}' does not contain a 'cycle' field directly.")
            else:
                # The field is wrapped in a 2-D container, hence the [0, 0] unwrap
                cycles_b0005 = data_b0005['cycle'][0, 0]
                print("\nShape of 'cycle' data:", cycles_b0005.shape)
                print("Data type of 'cycle' data:", cycles_b0005.dtype)

                # Only look inside a cycle when at least one is present
                if cycles_b0005.shape[1] > 0:
                    first_cycle = cycles_b0005[0, 0]
                    print("\nStructure of the first cycle (dtype):", first_cycle.dtype)
                    print("Field names in the first cycle:", first_cycle.dtype.names)

                    # Measurement payload of the first cycle, if it has one
                    if 'data' in first_cycle.dtype.names:
                        first_cycle_data = first_cycle['data'][0,0]
                        print("\nStructure of data within the first cycle (dtype):", first_cycle_data.dtype)
                        print("Field names in first cycle data:", first_cycle_data.dtype.names)
                        print("Shape of data within first cycle:", first_cycle_data.shape)

except FileNotFoundError:
    print(f"Error: File not found at {file_path_b0005}")
except Exception as exc:
    print(f"An error occurred while loading or processing the file: {exc}")

Files in raw_data directory: ['B0018.mat', 'B0005.mat', 'B0006.mat']

Successfully loaded B0005.mat
Type of loaded data: <class 'dict'>

Top-level keys: dict_keys(['__header__', '__version__', '__globals__', 'B0005'])

Data structure within 'B0005':
Data type (dtype): [('cycle', 'O')]
Field names: ('cycle',)

Shape of 'cycle' data: (1, 616)
Data type of 'cycle' data: [('type', 'O'), ('ambient_temperature', 'O'), ('time', 'O'), ('data', 'O')]

Structure of the first cycle (dtype): [('type', 'O'), ('ambient_temperature', 'O'), ('time', 'O'), ('data', 'O')]
Field names in the first cycle: ('type', 'ambient_temperature', 'time', 'data')

Structure of data within the first cycle (dtype): [('Voltage_measured', 'O'), ('Current_measured', 'O'), ('Temperature_measured', 'O'), ('Current_charge', 'O'), ('Voltage_charge', 'O'), ('Time', 'O')]
Field names in first cycle data: ('Voltage_measured', 'Current_measured', 'Temperature_measured', 'Current_charge', 'Voltage_charge', 'Time')
Shape of data w

In [5]:
# --- Re-refined Inspection of Measurement Data Shape & Type ---
# Uses main_key, mat_b0005 and data_b0005 as left behind by the B0005
# inspection cell; run that cell first.

if main_key in mat_b0005 and 'cycle' in data_b0005.dtype.names:
    cycles_b0005 = data_b0005['cycle'][0, 0]
    if cycles_b0005.shape[1] > 0:
        # First cycle record
        first_cycle = cycles_b0005[0, 0]
        if 'data' in first_cycle.dtype.names:
            # Measurement structure stored inside that cycle
            first_cycle_data_struct = first_cycle['data'][0,0]
            measurement_fields = first_cycle_data_struct.dtype.names

            if 'Voltage_measured' not in measurement_fields:
                print("Field 'Voltage_measured' not found in the first cycle's data.")
            else:
                # Read the field as-is, without a trailing [0,0] unwrap
                voltage_data_alt = first_cycle_data_struct['Voltage_measured']

                # Report what kind of object came back
                print(f"\nType of accessed 'Voltage_measured': {type(voltage_data_alt)}")

                if not isinstance(voltage_data_alt, np.ndarray):
                    # Not an array: just show the raw value
                    print(f"'Voltage_measured' accessed directly is not a NumPy array. Value: {voltage_data_alt}")
                else:
                    print(f"Shape of 'Voltage_measured' (accessed directly): {voltage_data_alt.shape}")
                    if voltage_data_alt.shape != (1,1):
                        # Already the sample array itself; use it directly
                        print(f"Data type: {voltage_data_alt.dtype}")
                        print(f"First 5 voltage values: {voltage_data_alt.flatten()[:5]}")
                    else:
                        # A (1,1) wrapper: the real samples sit one level deeper
                        voltage_values = voltage_data_alt[0,0]
                        print(f"Shape of actual data array inside: {voltage_values.shape}")
                        print(f"Data type: {voltage_values.dtype}")
                        print(f"First 5 voltage values: {voltage_values.flatten()[:5]}")


Type of accessed 'Voltage_measured': <class 'numpy.ndarray'>
Shape of 'Voltage_measured' (accessed directly): (1, 789)
Data type: float64
First 5 voltage values: [3.87301722 3.47939356 4.00058782 4.01239519 4.01970806]


### inspect B0005, B0006, B0018

In [8]:
# Define data directory
RAW_DATA_DIR = '../data/raw_data/' # Adjust path if your notebook is elsewhere

# Levels collected for each inspected battery file, outermost first
INSPECTION_KEYS = ('mat', 'data', 'cycles', 'first_cycle', 'first_cycle_data')


def summarize_battery_struct(mat, battery_id):
    """Print and collect the nested layout of one loaded battery record.

    Walks ``mat[battery_id] -> 'cycle' -> first cycle -> 'data'`` and prints
    the field names found at each level. The walk stops silently at the first
    level that is missing or empty.

    Parameters
    ----------
    mat : dict
        Mapping as returned by ``scipy.io.loadmat``.
    battery_id : str
        Top-level key to inspect, e.g. ``'B0005'``.

    Returns
    -------
    dict
        Keys ``'data'``, ``'cycles'``, ``'first_cycle'`` and
        ``'first_cycle_data'``. A value is ``None`` for every level that was
        not reached.
    """
    found = dict.fromkeys(INSPECTION_KEYS[1:])

    print("Top-level keys:", mat.keys())
    if battery_id not in mat:
        return found

    data = found['data'] = mat[battery_id]
    print(f"{battery_id} Data dtype fields:", data.dtype.names)
    # dtype.names is None for non-structured arrays; treat that as "no fields"
    if 'cycle' not in (data.dtype.names or ()):
        return found

    # The field is wrapped in a 2-D container, hence the [0, 0] unwrap
    cycles = found['cycles'] = data['cycle'][0, 0]
    print("Shape of 'cycle' data:", cycles.shape)
    if cycles.ndim != 2 or cycles.size == 0:
        return found

    first_cycle = found['first_cycle'] = cycles[0, 0]
    print("First cycle dtype fields:", first_cycle.dtype.names)
    if 'data' not in (first_cycle.dtype.names or ()):
        return found

    first_cycle_data = found['first_cycle_data'] = first_cycle['data'][0, 0]
    print("First cycle data dtype fields:", first_cycle_data.dtype.names)
    return found


def inspect_battery_file(file_path, battery_id):
    """Load one battery ``.mat`` file and print a summary of its structure.

    This is a best-effort inspection: any error while loading or walking the
    file is printed rather than raised, so one bad file does not stop the
    remaining inspections.

    Parameters
    ----------
    file_path : str
        Path of the ``.mat`` file to load.
    battery_id : str
        Top-level key expected inside the file, e.g. ``'B0005'``.

    Returns
    -------
    dict
        One entry per name in ``INSPECTION_KEYS``; entries stay ``None`` for
        anything that could not be loaded or reached.
    """
    result = dict.fromkeys(INSPECTION_KEYS)
    try:
        result['mat'] = scipy.io.loadmat(file_path)
        print(f"\n--- {battery_id} ---")
        result.update(summarize_battery_struct(result['mat'], battery_id))
    except Exception as e:
        print(f"Error processing {battery_id}.mat: {e}")
    return result


# Each battery gets its own path variable. Previously the B0006 section reused
# the *_b0005 names, so after this cell they silently pointed at B0006 data.
file_path_b0005 = os.path.join(RAW_DATA_DIR, 'B0005.mat')
file_path_b0006 = os.path.join(RAW_DATA_DIR, 'B0006.mat')
file_path_b0018 = os.path.join(RAW_DATA_DIR, 'B0018.mat')

# --- Inspect B0005.mat ---
inspection_b0005 = inspect_battery_file(file_path_b0005, 'B0005')

print("-"*30)

# --- Inspect B0006.mat ---
inspection_b0006 = inspect_battery_file(file_path_b0006, 'B0006')

print("-"*30)

# --- Inspect B0018.mat ---
inspection_b0018 = inspect_battery_file(file_path_b0018, 'B0018')

# Keep the per-battery variable names this cell has always defined, now each
# bound to its own battery (None where a level could not be read).
(mat_b0005, data_b0005, cycles_b0005,
 first_cycle_b0005, first_cycle_data_b0005) = (inspection_b0005[k] for k in INSPECTION_KEYS)
(mat_b0006, data_b0006, cycles_b0006,
 first_cycle_b0006, first_cycle_data_b0006) = (inspection_b0006[k] for k in INSPECTION_KEYS)
(mat_b0018, data_b0018, cycles_b0018,
 first_cycle_b0018, first_cycle_data_b0018) = (inspection_b0018[k] for k in INSPECTION_KEYS)


--- B0005 ---
Top-level keys: dict_keys(['__header__', '__version__', '__globals__', 'B0005'])
B0005 Data dtype fields: ('cycle',)
Shape of 'cycle' data: (1, 616)
First cycle dtype fields: ('type', 'ambient_temperature', 'time', 'data')
First cycle data dtype fields: ('Voltage_measured', 'Current_measured', 'Temperature_measured', 'Current_charge', 'Voltage_charge', 'Time')
------------------------------

--- B0006 ---
Top-level keys: dict_keys(['__header__', '__version__', '__globals__', 'B0006'])
B0006 Data dtype fields: ('cycle',)
Shape of 'cycle' data: (1, 616)
First cycle dtype fields: ('type', 'ambient_temperature', 'time', 'data')
First cycle data dtype fields: ('Voltage_measured', 'Current_measured', 'Temperature_measured', 'Current_charge', 'Voltage_charge', 'Time')
------------------------------

--- B0018 ---
Top-level keys: dict_keys(['__header__', '__version__', '__globals__', 'B0018'])
B0018 Data dtype fields: ('cycle',)
Shape of 'cycle' data: (1, 319)
First cycle dtype 

### parse data with data_utils.py

In [1]:
# Add the 'scripts' directory to the Python path so data_utils can be imported
SCRIPTS_DIR = '../scripts'
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

from data_utils import parse_nasa_mat_file

# Define the path to the B0005 file again
RAW_DATA_DIR = '../data/raw_data/' # Make sure this path is correct relative to notebook
file_path_b0005 = os.path.join(RAW_DATA_DIR, 'B0005.mat')

# Call the parsing function.
# No "is it in globals()" guard is needed here: if the import above fails it
# raises, and execution never reaches this line.
df_b0005 = parse_nasa_mat_file(file_path_b0005)

# Report the outcome. A None result is treated as a failed parse and an empty
# frame as "nothing extracted"; otherwise show a quick overview.
if df_b0005 is None:
    print("\nParsing failed.")
elif df_b0005.empty:
    print("\nParsing completed, but the resulting DataFrame is empty.")
else:
    print("\nDataFrame Info (B0005):")
    df_b0005.info()
    print("\nDataFrame Head (B0005):")
    display(df_b0005.head())
    print("\nDataFrame Tail (B0005):")
    display(df_b0005.tail())
    print("\nCycle Types (B0005):")
    print(df_b0005['cycle_type'].value_counts())
    print("\nCapacity values (non-NaN) for discharge cycles (B0005):")
    # Distinct non-null capacity values among rows tagged as discharge
    print(df_b0005.loc[df_b0005['cycle_type'] == 'discharge', 'capacity'].dropna().unique())

Processing file: B0005.mat
Found 616 cycles.


Parsing B0005: 100%|██████████| 616/616 [00:00<00:00, 2175.71it/s]


Finished processing B0005. DataFrame shape: (591458, 11)

DataFrame Info (B0005):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 591458 entries, 0 to 591457
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   battery_id                 591458 non-null  object 
 1   cycle_number               591458 non-null  int64  
 2   cycle_type                 591458 non-null  object 
 3   ambient_temperature        591458 non-null  int64  
 4   measurement_time_relative  591458 non-null  float64
 5   voltage_measured           591458 non-null  float64
 6   current_measured           591458 non-null  float64
 7   temperature_measured       591458 non-null  float64
 8   voltage_load_or_charge     591458 non-null  float64
 9   current_load_or_charge     591458 non-null  float64
 10  capacity                   50285 non-null   float64
dtypes: float64(7), int64(2), object(2)
memory usage: 49.6+ MB

Da

Unnamed: 0,battery_id,cycle_number,cycle_type,ambient_temperature,measurement_time_relative,voltage_measured,current_measured,temperature_measured,voltage_load_or_charge,current_load_or_charge,capacity
0,B0005,1,charge,24,0.0,3.873017,-0.001201,24.655358,0.003,0.0,
1,B0005,1,charge,24,2.532,3.479394,-4.030268,24.66648,1.57,-4.036,
2,B0005,1,charge,24,5.5,4.000588,1.512731,24.675394,4.726,1.5,
3,B0005,1,charge,24,8.344,4.012395,1.509063,24.693865,4.742,1.5,
4,B0005,1,charge,24,11.125,4.019708,1.511318,24.705069,4.753,1.5,



DataFrame Tail (B0005):


Unnamed: 0,battery_id,cycle_number,cycle_type,ambient_temperature,measurement_time_relative,voltage_measured,current_measured,temperature_measured,voltage_load_or_charge,current_load_or_charge,capacity
591453,B0005,616,charge,24,0.0,0.236356,-0.003484,23.372048,0.003,0.0,
591454,B0005,616,charge,24,2.547,0.003365,-0.001496,23.369434,0.003,0.0,
591455,B0005,616,charge,24,5.5,4.985137,0.000506,23.386535,5.002,0.0,
591456,B0005,616,charge,24,8.312,4.98472,0.000442,23.386983,5.002,-0.002,
591457,B0005,616,charge,24,12.656,4.21344,-0.000734,23.385061,4.229,-0.002,



Cycle Types (B0005):
cycle_type
charge       541173
discharge     50285
Name: count, dtype: int64

Capacity values (non-NaN) for discharge cycles (B0005):
[1.85648742 1.84632725 1.83534919 1.83526253 1.83464551 1.83566166
 1.83514614 1.82575679 1.82477385 1.82461327 1.82461955 1.81420194
 1.81375216 1.81344049 1.802598   1.8021069  1.8025795  1.80306831
 1.80277762 1.84702599 1.84741731 1.83617742 1.82578075 1.82511364
 1.8255815  1.81403113 1.81476919 1.81396939 1.80276567 1.80407704
 1.85180255 1.83070385 1.81990411 1.80930796 1.8046099  1.79937707
 1.78844323 1.78292305 1.77303372 1.77303776 1.76787211 1.76231507
 1.76761729 1.76266836 1.75173049 1.7418496  1.73609135 1.79362401
 1.78318902 1.76736421 1.75701779 1.74687062 1.74171725 1.73642251
 1.72632172 1.71580654 1.71053335 1.7060145  1.70031103 1.69457986
 1.68490291 1.67447416 1.67456925 1.66371638 1.65901387 1.65385406
 1.64265378 1.63785784 1.63273504 1.62775289 1.62212549 1.61132566
 1.60656314 1.60151422 1.59036923 1.5857

In [5]:
# --- Parse B0006 and B0018 ---

file_path_b0006 = os.path.join(RAW_DATA_DIR, 'B0006.mat')
file_path_b0018 = os.path.join(RAW_DATA_DIR, 'B0018.mat')

df_b0006 = None
df_b0018 = None

if 'parse_nasa_mat_file' in globals():
    df_b0006 = parse_nasa_mat_file(file_path_b0006)
    df_b0018 = parse_nasa_mat_file(file_path_b0018)
else:
    # Say why nothing was parsed instead of skipping silently
    print("\nparse_nasa_mat_file is not defined; run the B0005 parsing cell first.")

# --- Combine DataFrames ---
# df_b0005 is produced by the B0005 parsing cell. Look it up defensively so
# that this cell degrades the same way as the parser guard above (a missing
# frame is skipped) rather than failing with a NameError.
all_dfs = [globals().get('df_b0005'), df_b0006, df_b0018]
valid_dfs = [df for df in all_dfs if df is not None and not df.empty]

df_all = None
if valid_dfs:
    print(f"\nCombining {len(valid_dfs)} DataFrames...")
    # ignore_index gives the combined frame one continuous row index
    df_all = pd.concat(valid_dfs, ignore_index=True)
    print("Combined DataFrame Info:")
    df_all.info()
    print("\nBattery IDs in combined DataFrame:")
    print(df_all['battery_id'].value_counts())
else:
    print("\nNo valid DataFrames were parsed to combine.")


# --- Save Combined DataFrame ---
PROCESSED_DATA_DIR = '../data/processed_data/'
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True) # Ensure directory exists

if df_all is not None:
    output_file_csv = os.path.join(PROCESSED_DATA_DIR, 'nasa_battery_data_combined.csv')
    try:
        print(f"\nSaving combined data to {output_file_csv}...")
        # index=False: the row index is not written to the file
        df_all.to_csv(output_file_csv, index=False)
        print("Save complete.")
    except Exception as e_csv:
        print(f"Error saving as CSV: {e_csv}")

Processing file: B0006.mat
Found 616 cycles.


Parsing B0006: 100%|██████████| 616/616 [00:00<00:00, 2326.07it/s]


Finished processing B0006. DataFrame shape: (591458, 11)
Processing file: B0018.mat
Found 319 cycles.


Parsing B0018: 100%|██████████| 319/319 [00:00<00:00, 2302.79it/s]


Finished processing B0018. DataFrame shape: (314676, 11)

Combining 3 DataFrames...
Combined DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1497592 entries, 0 to 1497591
Data columns (total 11 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   battery_id                 1497592 non-null  object 
 1   cycle_number               1497592 non-null  int64  
 2   cycle_type                 1497592 non-null  object 
 3   ambient_temperature        1497592 non-null  int64  
 4   measurement_time_relative  1497592 non-null  float64
 5   voltage_measured           1497590 non-null  float64
 6   current_measured           1497590 non-null  float64
 7   temperature_measured       1497590 non-null  float64
 8   voltage_load_or_charge     1497592 non-null  float64
 9   current_load_or_charge     1497592 non-null  float64
 10  capacity                   135436 non-null   float64
dtypes: float64(7), int6