In [None]:
import pickle
import numpy as np
import pandas as pd

from scipy.stats import skew, kurtosis
from sklearn.linear_model import LinearRegression

In [None]:
# change working directory to base, to make all imports and file paths work
import os

# Step up only when the `trainer` package directory is not already visible from
# the current working directory. Without this guard every re-run of the cell
# would climb one directory higher and break the imports and relative paths.
if not os.path.isdir("trainer"):
    os.chdir(os.pardir)
print("Current directory: %s" % os.getcwd())

In [None]:
import trainer.constants as cst

In [None]:
# Windowing configuration.
#   window_size: number of consecutive cycle keys grouped into one window
#   shift:       offset between the starts of two successive windows
window_size, shift = 20, 20
# stride and batch_size are not referenced by the cells visible in this notebook section.
stride, batch_size = 1, 16

In [None]:
# Load the preprocessed dataset. The context manager closes the file handle as
# soon as the pickle has been read.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open(cst.PROCESSED_DATA, "rb") as processed_file:
    preprocessed_pkl = pickle.load(processed_file)  # dict

In [None]:
# Top-level keys of the loaded pickle; each key identifies one cell (e.g. 'b1c0').
preprocessed_pkl.keys()

In [None]:
# Fields available in a cell's summary, using cell 'b1c0' as the example.
preprocessed_pkl['b1c0']['summary'].keys()

In [None]:
# Keys of the per-cycle records stored for cell 'b1c0'.
preprocessed_pkl['b1c0']['cycles'].keys()

In [None]:
# Fields stored for one cycle record (cycle key '1' of cell 'b1c0').
preprocessed_pkl['b1c0']['cycles']['1'].keys()

In [None]:
# fullmod_features = [
#     "minimum_dQ_100_10",
#     "variance_dQ_100_10",
#     "slope_lin_fit_2_100",
#     "intercept_lin_fit_2_100",
#     "discharge_capacity_2",
#     "mean_charge_time_2_6",
#     "minimum_IR_2_100",
#     "diff_IR_100_2",
# ]

In [None]:
# Dry run over all cells: report how many full windows each cell yields.
cell_dfs = []
for cell_k, cell_v in preprocessed_pkl.items():
    print('processing', cell_k)
    # init arrays for values
    total_cycles = len(cell_v['cycles'])
    # number of full windows of `window_size` cycles whose starts are `shift` apart
    num_windows = (total_cycles - window_size) // shift + 1
    print('num_windows', num_windows, 'total_cycles', total_cycles)
    # total_cycles // shift equals the formula above only for non-overlapping,
    # gap-free windows, so the cross-check is applied only in that configuration.
    if window_size == shift:
        sanity_check = total_cycles // shift
        assert num_windows == sanity_check, (
            "window count mismatch for cell %s: %d != %d"
            % (cell_k, num_windows, sanity_check)
        )

In [None]:
# Prototype on a single cell before generalising to the whole dataset.
cell_k = 'b1c0'
cell_v = preprocessed_pkl[cell_k]
# per-cell summary and the per-cycle records
cell_summary = cell_v['summary']
cell_cycles = cell_v['cycles']
total_cycles = len(cell_cycles)

In [None]:
# Cut the ordered cycle keys into windows: one candidate window of up to
# `window_size` keys starting every `shift` keys.
cycle_keys = list(cell_cycles.keys())
candidate_windows = (
    cycle_keys[start : start + window_size]
    for start in range(0, len(cycle_keys), shift)
)
# keep only candidates whose length is a multiple of window_size (drops the remainder)
window_cycle_keys = [
    keys for keys in candidate_windows if len(keys) % window_size == 0
]

In [None]:
num_windows = len(window_cycle_keys)
# sanity check: the sliced window count must match the closed-form count
assert (total_cycles - window_size) // shift + 1 == num_windows
print('num_windows', num_windows, 'total_cycles', total_cycles)

# One independent zero-filled array (one slot per window) for each feature and target.
(
    minimum_dQ_window,
    variance_dQ_window,
    slope_lin_fit_window,
    intercept_lin_fit_window,
    discharge_capacity_2,
    mean_charge_time_2_6,
    minimum_IR_window,
    diff_IR_window,
    target_remaining,
    target_current,
) = (np.zeros(num_windows) for _ in range(10))

In [None]:
# Prototype of the per-window feature extraction for the single selected cell.
# NOTE(review): summary rows are addressed as int(cycle_key) - 1, which presumes
# consecutive 1-based integer cycle keys — confirm for this dataset.
for i, window_keys in enumerate(window_cycle_keys):
    key_c2 = window_keys[1]
    key_clast = window_keys[-1]
    c2 = cell_cycles[key_c2]
    clast = cell_cycles[key_clast]
    # summary keys are 0 indexed
    summary_key_c2 = int(key_c2) - 1
    summary_key_clast = int(key_clast) - 1
    print("{}: [{}, {}], summary_keys [{}, {}]".format(i, key_c2, key_clast, summary_key_c2, summary_key_clast))

    # 1. delta_Q_100_10(V) -> delta_Q_last_2(V)
    dQ_window = clast['Qdlin'] - c2['Qdlin']
    minimum_dQ_window[i] = np.log(np.abs(np.min(dQ_window)))
    variance_dQ_window[i] = np.log(np.var(dQ_window))

    # 2. Discharge capacity fade curve features
    # Linear fit over cycles 2 to last of the window, both ends included
    # (slice ends carry +1 because Python slices exclude the stop index).
    # discharge capacities; q.shape = (window_size-1, 1) for consecutive keys
    q = cell_summary['QD'][summary_key_c2:summary_key_clast + 1].reshape(-1, 1).astype(np.float64)
    # cycle numbers 2 to last, converted from the (string) cycle keys so the
    # regressor receives numeric input; X.shape = (window_size-1, 1)
    X = np.array([int(key) for key in window_keys[1:]], dtype=np.float64).reshape(-1, 1)

    linear_regressor_window = LinearRegression()
    linear_regressor_window.fit(X, q)
    # q is 2-D, so coef_ and intercept_ are size-1 arrays; extract the scalar explicitly
    slope_lin_fit_window[i] = linear_regressor_window.coef_.item()
    intercept_lin_fit_window[i] = linear_regressor_window.intercept_.item()
    discharge_capacity_2[i] = q[0][0]

    # 3. Other features
    summary_key_c6 = int(window_keys[5]) - 1
    # despite the variable name, the source field is 'Discharge_time'; cycles 2..6 inclusive
    mean_charge_time_2_6[i] = np.mean(cell_summary['Discharge_time'][summary_key_c2:summary_key_c6 + 1])
    minimum_IR_window[i] = np.min(cell_summary['IR'][summary_key_c2:summary_key_clast + 1])
    diff_IR_window[i] = cell_summary['IR'][summary_key_clast] - cell_summary['IR'][summary_key_c2]

    # 4. Targets
    target_remaining[i] = cell_summary['Remaining_cycles'][summary_key_clast]
    target_current[i] = int(key_clast)

In [None]:
# Fields available in the selected cell's summary.
cell_summary.keys()

In [None]:
# assemble cell-level df: one column per feature/target array computed for the
# selected cell, plus the cell key
cell_df = pd.DataFrame(
    dict(
        cell_key=np.array(cell_k),
        minimum_dQ_100_10=minimum_dQ_window,
        variance_dQ_window=variance_dQ_window,
        slope_lin_fit_window=slope_lin_fit_window,
        intercept_lin_fit_window=intercept_lin_fit_window,
        discharge_capacity_2=discharge_capacity_2,
        mean_charge_time_2_6=mean_charge_time_2_6,
        minimum_IR_window=minimum_IR_window,
        diff_IR_window=diff_IR_window,
        target_remaining=np.array(target_remaining),
        target_current=np.array(target_current),
    )
)

In [None]:
# Preview the prototype feature table built for the selected cell.
cell_df.head()

In [None]:
# Build the window-level feature table for every cell in the dataset:
# one DataFrame per cell (one row per window), concatenated into `df`.
cell_dfs = []

# x values shared by every per-window linear fit: the cycle's position inside
# the window (0 .. window_size-1); X.shape = (window_size, 1)
X = np.arange(window_size).reshape(-1, 1).astype(np.int32)

for cell_k, cell_v in preprocessed_pkl.items():
    print('processing', cell_k)
    cell_cycles = cell_v['cycles']
    cell_summary = cell_v['summary']
    total_cycles = len(cell_cycles)

    # slice cycle keys into windows, remembering where each window starts
    cycle_keys = list(cell_cycles.keys())
    window_starts = []
    window_cycle_keys = []
    for w_slice in range(0, len(cycle_keys), shift):
        cycle_keys_slice = cycle_keys[w_slice : w_slice + window_size]
        if len(cycle_keys_slice) == window_size:  # drop remainder
            window_starts.append(w_slice)
            window_cycle_keys.append(cycle_keys_slice)

    # init value arrays
    num_windows = len(window_cycle_keys)
    assert num_windows == int((total_cycles-window_size) // shift) + 1
    print('num_windows', num_windows, 'total_cycles', total_cycles)
    minimum_dQ_window = np.zeros(num_windows)
    variance_dQ_window = np.zeros(num_windows)
    slope_lin_fit_window = np.zeros(num_windows)
    intercept_lin_fit_window = np.zeros(num_windows)
    discharge_capacity_2 = np.zeros(num_windows)
    mean_discharge_time = np.zeros(num_windows)
    minimum_IR_window = np.zeros(num_windows)
    diff_IR_window = np.zeros(num_windows)
    target_remaining = np.zeros(num_windows)
    target_current = np.zeros(num_windows)

    # build cell-level df
    for i, (w_start, window_keys) in enumerate(zip(window_starts, window_cycle_keys)):
        key_c1 = window_keys[0]
        key_clast = window_keys[-1]
        # summary keys may not line up with cycle keys, since cycles could be cleaned up;
        # summary rows are therefore addressed by the position of the cycle key in
        # cycle_keys, which is known directly from where the window was sliced
        summary_key_c1 = w_start
        summary_key_clast = w_start + window_size - 1
        print("{}: [{}, {}], summary_keys: [{}, {}]".format(i, key_c1, key_clast, summary_key_c1, summary_key_clast))

        # 1. delta_Q_100_10(V) -> delta_Q_window(V)
        dQ_window = cell_cycles[key_clast]['Qdlin'] - cell_cycles[key_c1]['Qdlin']
        minimum_dQ_window[i] = np.log(np.abs(np.min(dQ_window)))
        variance_dQ_window[i] = np.log(np.var(dQ_window))

        # 2. Discharge capacity fade curve features
        # Linear fit over all cycles of the window (first to last, inclusive):
        # discharge capacities; q.shape = (window_size, 1)
        q = cell_summary['QD'][summary_key_c1:summary_key_clast+1].reshape(-1, 1).astype(np.float64)

        linear_regressor_window = LinearRegression()
        linear_regressor_window.fit(X, q)
        # q is 2-D, so coef_ and intercept_ are size-1 arrays; extract the scalar explicitly
        slope_lin_fit_window[i] = linear_regressor_window.coef_.item()
        intercept_lin_fit_window[i] = linear_regressor_window.intercept_.item()
        # capacity at the first cycle of the window
        discharge_capacity_2[i] = q[0][0]

        # 3. Other features
        mean_discharge_time[i] = np.mean(cell_summary['Discharge_time'][summary_key_c1:summary_key_clast+1])
        minimum_IR_window[i] = np.min(cell_summary['IR'][summary_key_c1:summary_key_clast+1])
        diff_IR_window[i] = cell_summary['IR'][summary_key_clast] - cell_summary['IR'][summary_key_c1]

        # 4. Targets
        target_remaining[i] = cell_summary['Remaining_cycles'][summary_key_clast]
        target_current[i] = int(key_clast)

    # assemble cell-level df
    cell_dfs.append(
        pd.DataFrame({
            "cell_key": np.array(cell_k, dtype=str),
            "minimum_dQ_100_10": minimum_dQ_window,
            "variance_dQ_window": variance_dQ_window,
            "slope_lin_fit_window": slope_lin_fit_window,
            "intercept_lin_fit_window": intercept_lin_fit_window,
            "discharge_capacity_2": discharge_capacity_2,
            "mean_discharge_time": mean_discharge_time,
            "minimum_IR_window": minimum_IR_window,
            "diff_IR_window": diff_IR_window,
            "target_remaining": np.array(target_remaining, dtype=np.int32),
            "target_current": np.array(target_current, dtype=np.int32)
        }))

df = pd.concat(cell_dfs)

In [None]:
# Preview the first 100 rows of the combined feature table.
df.head(100)

In [None]:
# Inspect the rows (windows) that belong to cell 'b2c14'.
df[df.cell_key=="b2c14"]

In [None]:
# Number of rows (windows) contributed by each cell.
df.cell_key.value_counts()

In [None]:
# (rows, columns) of the combined feature table.
df.shape

In [None]:
# cycle_keys is a leftover loop variable: it holds the cycle keys of whichever
# cell was processed last by the loop that builds the feature table.
cycle_keys