# Data Acquisition and EDA

## Import necessary libraries

In [1]:
# Import necessary libraries

# Data manipulation and analysis
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path
import psutil # For system and process utilities

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical analysis
from scipy import stats
from scipy.signal import find_peaks

# Time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Machine learning for additional insights
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

# Progress tracking
from tqdm.auto import tqdm

# Warnings management
import warnings
warnings.filterwarnings("ignore")

# Notebook-wide look and feel: matplotlib style sheet and seaborn palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# pandas display: never truncate columns, render floats with three decimals
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.3f}'.format)

print("All libraries imported successfully.")

All libraries imported successfully.


  from .autonotebook import tqdm as notebook_tqdm


## Load the dataset

In [6]:
# Setup the path to the dataset
ZENODO_PRIMARY_PATH = "../datasets/Primary_use_phase/"

# Collect the CSV files in sorted order. glob.glob() returns matches in an
# arbitrary, filesystem-dependent order, so sorting keeps the load order (and
# the insertion order of the dict below) identical on every machine and run.
primary_files = sorted(glob.glob(os.path.join(ZENODO_PRIMARY_PATH, "*.csv")))
if not primary_files:
    # Fail here instead of reporting success with nothing loaded.
    raise FileNotFoundError(
        f"No CSV files found in {os.path.abspath(ZENODO_PRIMARY_PATH)!r}"
    )

# Load each file into a dict keyed by the file name without its extension
zenodo_primary_data = {}
for file in primary_files:
    file_name = Path(file).stem
    zenodo_primary_data[file_name] = pd.read_csv(file)
    print(f"Loaded {file_name} with shape: {zenodo_primary_data[file_name].shape}")
print("Dataset loaded successfully.")

Loaded 1 with shape: (728434, 15)
Loaded 10 with shape: (655813, 15)
Loaded 11 with shape: (654196, 15)
Loaded 12 with shape: (811734, 15)
Loaded 13 with shape: (110918, 15)
Loaded 14 with shape: (103443, 15)
Loaded 15 with shape: (131426, 15)
Loaded 16 with shape: (110471, 15)
Loaded 17 with shape: (1727609, 15)
Loaded 18 with shape: (1664229, 15)
Loaded 19 with shape: (1746745, 15)
Loaded 2 with shape: (763299, 15)
Loaded 20 with shape: (1656244, 15)
Loaded 21 with shape: (2051653, 15)
Loaded 22 with shape: (2144592, 15)
Loaded 23 with shape: (2183081, 15)
Loaded 24 with shape: (1904977, 15)
Loaded 25 with shape: (186045, 15)
Loaded 26 with shape: (286461, 15)
Loaded 27 with shape: (137904, 15)
Loaded 28 with shape: (116591, 15)
Loaded 29 with shape: (1627854, 15)
Loaded 3 with shape: (980242, 15)
Loaded 30 with shape: (1526947, 15)
Loaded 31 with shape: (1430848, 15)
Loaded 32 with shape: (1510949, 15)
Loaded 33 with shape: (263433, 15)
Loaded 34 with shape: (187392, 15)
Loaded 35 w

In [7]:
# Stack every loaded per-file frame into a single primary-use-phase frame,
# discarding the per-file row labels in favour of one fresh RangeIndex
primary_df = pd.concat(
    [*zenodo_primary_data.values()],
    ignore_index=True,
)
print("Primary use phase dataframe created successfully with shape:.", primary_df.shape)

Primary use phase dataframe created successfully with shape:. (87484642, 15)


In [8]:
# Summarise the combined frame: index range, column names, dtypes and memory usage
primary_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87484642 entries, 0 to 87484641
Data columns (total 15 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   Unnamed: 0                int64  
 1   Test_Time(s)              float64
 2   Step_Time(s)              float64
 3   Cycle_Index               int64  
 4   Step_Index                int64  
 5   Current(A)                float64
 6   Voltage(V)                float64
 7   Charge_Capacity(Ah)       float64
 8   Discharge_Capacity(Ah)    float64
 9   Charge_Energy(Wh)         float64
 10  Discharge_Energy(Wh)      float64
 11  Internal_Resistance(Ohm)  float64
 12  Aux_Temperature_1(C)      float64
 13  Battery_ID                int64  
 14  Protocol_ID               int64  
dtypes: float64(10), int64(5)
memory usage: 9.8 GB


In [None]:
# Remove the internal-resistance column when the frame still carries it,
# then report the resulting shape either way
if 'Internal_Resistance(Ohm)' in primary_df:
    primary_df = primary_df.drop('Internal_Resistance(Ohm)', axis=1)
    print("Dropped 'Internal_Resistance(Ohm)' column.")
print("Current dataframe shape:", primary_df.shape)

In [9]:
# Preview the first five rows of the combined frame
primary_df.head()

Unnamed: 0.1,Unnamed: 0,Test_Time(s),Step_Time(s),Cycle_Index,Step_Index,Current(A),Voltage(V),Charge_Capacity(Ah),Discharge_Capacity(Ah),Charge_Energy(Wh),Discharge_Energy(Wh),Internal_Resistance(Ohm),Aux_Temperature_1(C),Battery_ID,Protocol_ID
0,863,8564.156,10.001,1,12,1.2,3.3,0.003,0.0,0.011,0.0,,25.282,9,2
1,864,8574.157,20.001,1,12,1.2,3.322,0.007,0.0,0.022,0.0,,25.251,9,2
2,865,8584.157,30.002,1,12,1.2,3.342,0.01,0.0,0.033,0.0,,25.247,9,2
3,866,8594.156,40.001,1,12,1.2,3.359,0.013,0.0,0.044,0.0,,25.211,9,2
4,867,8604.157,50.002,1,12,1.2,3.375,0.017,0.0,0.055,0.0,,25.169,9,2


## Understanding Battery Degradation and RUL

RUL (Remaining Useful Life) is the number of charge/discharge cycles a battery can still complete before it reaches its end-of-life (EOL) criterion. A battery is typically considered to be at EOL when its capacity has degraded to 80% of its initial capacity.

In [None]:
# Reference specification of the cells under study, kept as a plain mapping.
# NOTE(review): units are not stated here; capacity is presumably in Ah and
# the voltages in V, to be confirmed against the dataset documentation.
battery_details = dict([
    ('Model', '86 NMC(532)/graphite batteries'),
    ('Capacity', 2.5),
    ('Nominal Voltage', 3.7),
    ('Max Voltage', 4.2),
])