In [1]:
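# Star-import the project helpers; the names used below without their own import
# (np, pd, scipy, pearsonr, the sklearn estimators, the CV/plot utilities) are expected to come from util_all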
from util_all import * 

## Loading the DATA

In [2]:
# Load every array stored in the MATLAB feature file.
data = scipy.io.loadmat('data/features_matrix.mat')

# `labels` is kept exactly as stored; `labels_for_ML` is 10**labels.
# NOTE(review): presumably the stored labels are log10(cycle life) -- confirm against the MATLAB export.
labels = np.squeeze(data['labels'])
labels_for_ML = 10**labels
Y = pd.DataFrame({'y': labels_for_ML})

features = data['features']
cycling_conditions = data['cycling_conditions']

# The name arrays are nested one level deep (row 0 holds the cells, each cell wraps
# the string), hence the double indexing to get plain lists of names.
features_names0 = data['features_names']
features_names = [cell[0] for cell in features_names0[0]]
cycling_conditions_names0 = data['cycling_conditions_names']
cycling_conditions_names = [cell[0] for cell in cycling_conditions_names0[0]]

# One slab of `features` (third axis) per class, in this order.
classes = ['charge', 'full', 'discharge']

# Concatenate the per-class slabs side by side into one array of
# dim num_cells x (num_classes * num_features), in a single call.
all_features = np.concatenate(
    [features[:, :, k] for k in range(len(classes))],
    axis=1,
)

# Feature names (handles for the features df columns and the metadata df index):
# class-major order, matching the column order produced by the concatenation above.
all_features_names = [my_class + '_' + s for my_class in classes for s in features_names]

# Feature and cycling-condition DataFrames
all_features_df = pd.DataFrame(all_features, columns=all_features_names)
cycling_conditions_df = pd.DataFrame(cycling_conditions, columns=cycling_conditions_names)

In [3]:
# Metadata DataFrame for all features: one row per feature column, filled in
# column by column from the feature name and its correlation with the target.
features_metadata_df = pd.DataFrame(index=all_features_names)
for name in all_features_names:
    features_metadata_df.loc[name, 'type'] = get_feature_type(name)
    features_metadata_df.loc[name, 'class'] = get_feature_class(name)
    features_metadata_df.loc[name, 'stream'] = get_feature_stream(name)

    # The cycle count is read as the 3 characters starting 9 characters after the
    # first 'y' in the name.
    # NOTE(review): relies on a fixed feature-naming scheme -- confirm against features_names.
    y_pos = name.find('y')
    features_metadata_df.loc[name, 'num_cycles_needed'] = int(name[y_pos + 9:y_pos + 12])

    my_min, my_max = get_min_max_percentile(name)
    features_metadata_df.loc[name, 'min_percentile'] = my_min
    features_metadata_df.loc[name, 'max_percentile'] = my_max

    # Absolute Pearson correlation with the target in linear (10**labels) and stored (labels) scale.
    features_metadata_df.loc[name, 'Pearson'] = abs(pearsonr(all_features_df[name], labels_for_ML)[0])
    features_metadata_df.loc[name, 'Log_Pearson'] = abs(pearsonr(all_features_df[name], labels)[0])

# Metadata DataFrame for the cycling conditions. Every condition gets class 0; the
# scalar is broadcast over the index so this works for any number of conditions.
cycling_conditions_metadata_df = pd.DataFrame(data=0, index=cycling_conditions_names, columns=['class'])
for name in cycling_conditions_names:
    cycling_conditions_metadata_df.loc[name, 'Pearson'] = abs(pearsonr(cycling_conditions_df[name], labels_for_ML)[0])
    cycling_conditions_metadata_df.loc[name, 'Log_Pearson'] = abs(pearsonr(cycling_conditions_df[name], labels)[0])
    


## Dummy Model benchmark


In [4]:
# Baseline: score a DummyRegressor on 10 random 80/20 train/test splits
# (the split seed is the loop index), one row of six metrics per split.
X = cycling_conditions_df
dummy_rows = []
for TTS in range(10):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=TTS)

    dummy_model = DummyRegressor()  # default is average
    dummy_model.fit(X_train, Y_train)
    train_predictions = dummy_model.predict(X_train)
    predictions = dummy_model.predict(X_test)

    # NOTE(review): the targets are passed as 2-D arrays (built from a one-column DataFrame);
    # confirm plot_n_MAPE_TTS handles that shape against the predictions as intended.
    dummy_iter_result = plot_n_MAPE_TTS(
        np.array(Y_train), train_predictions,
        np.array(Y_test), predictions,
        plot=False,
    )
    dummy_iter_result = np.array(dummy_iter_result).reshape((1, -1))
    dummy_rows.append(dummy_iter_result)

# Stack the per-split rows onto an empty (0, 6) float array in one call.
dummy_results = np.concatenate([np.empty((0, 6)), *dummy_rows], axis=0)

In [5]:
# Labels applied to the six columns of dummy_results.
# NOTE(review): assumes plot_n_MAPE_TTS returns its metrics in exactly this order -- confirm.
my_columns = [
    'train MAPE', 'test MAPE',
    'train RMSE', 'test RMSE',
    'train MAE', 'test MAE',
]
dummy_results_df = pd.DataFrame(data=dummy_results, columns=my_columns)

# Mean of each metric across the splits
dummy_results_df.mean(axis=0)

train MAPE      0.4038
test MAPE       0.4120
train RMSE    370.4056
test RMSE     403.9809
train MAE     270.9863
test MAE      298.3506
dtype: float64

In [6]:
# Spread of each metric across the splits (pandas .std defaults to ddof=1, i.e. the sample standard deviation)
dummy_results_df.std(axis=0)

train MAPE     0.013298
test MAPE      0.073740
train RMSE    19.976498
test RMSE     68.829514
train MAE     12.761092
test MAE      43.724007
dtype: float64

## Class 0 Model

In [9]:
# Class 0 model: predict 'y' from the cycling conditions only. For each of 10 random
# 80/20 splits (split seed = loop index), a cross-validated ElasticNet and a
# cross-validated RandomForest are selected, refit on the scaled training set and scored.

# Data selection and preparation (identical for every split)
X = cycling_conditions_df
Y_col = ['y']
X_col = X.columns.tolist()

# Hyper-parameter grids handed to make_cv_model_object
ENet_hyperparams = {'alpha': [0.00001, 0.00005, 0.0001, 0.001, 0.005, 0.01, 0.02, 0.03, 0.04, 0.1],
                    'l1_ratio': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]}
RF_hyperparams = {'n_estimators': [40, 80, 160],
                  'min_samples_leaf': [2, 4],  # [1,2,4,8]
                  'min_samples_split': [2, 4, 8]}

# Installed once rather than on every iteration. This is a process-wide filter: it
# silences every warning from here on, not only those raised in this cell.
warnings.simplefilter('ignore')

ENet_rows = []
RF_rows = []
for TTS in range(10):
    print(f'doing iteration TTS {TTS} ---------------')
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=TTS)
    df_train = Y_train.join(X_train)
    df_test = Y_test.join(X_test)  # not used in this cell; kept so the name stays defined

    # ---- ElasticNet: cross-validated search for the best hyper-parameters ----
    opt_model, X_train_scaled, Y_train_scaled, X_scaler, Y_scaler = make_cv_model_object(
        df_train,
        X_col=X_col, Y_col=Y_col,
        cv_splits=10,
        model=ElasticNet(random_state=25),
        model_hyperparams=ENet_hyperparams,
    )

    # Recover the best ENet and train it on the totality of the scaled training set
    best_ENet = opt_model.best_estimator_
    best_ENet.fit(X_train_scaled, Y_train_scaled.values.ravel())

    # Only the first six returned values (the train/test metrics) are kept
    ENet_iter_result = plot_train_test_model_predictions(
        best_ENet,
        X_train_scaled=X_train_scaled, X_test=X_test, X_scaler=X_scaler,
        Y_train=Y_train, Y_test=Y_test, Y_scaler=Y_scaler,
        X_col=X_col, Y_col=Y_col,
        plot_bounds=[0, 2000], plot=False,
    )[0:6]
    ENet_iter_result = np.array(ENet_iter_result).reshape((1, -1))
    ENet_rows.append(ENet_iter_result)

    # ---- RandomForest: same procedure; the scaled data and scalers are rebound ----
    opt_model, X_train_scaled, Y_train_scaled, X_scaler, Y_scaler = make_cv_model_object(
        df_train,
        X_col=X_col, Y_col=Y_col,
        cv_splits=10,
        model=RandomForestRegressor(random_state=0),
        model_hyperparams=RF_hyperparams,
    )

    # Best RF, refit on the totality of the scaled training set
    best_RF = opt_model.best_estimator_
    best_RF.fit(X_train_scaled, Y_train_scaled.values.ravel())

    RF_iter_result = plot_train_test_model_predictions(
        best_RF,
        X_train_scaled=X_train_scaled, X_test=X_test, X_scaler=X_scaler,
        Y_train=Y_train, Y_test=Y_test, Y_scaler=Y_scaler,
        X_col=X_col, Y_col=Y_col,
        plot_bounds=[0, 2000], plot=False,
    )[0:6]
    RF_iter_result = np.array(RF_iter_result).reshape((1, -1))
    RF_rows.append(RF_iter_result)

# One row of six metrics per split, stacked once onto an empty (0, 6) float array
class0_ENet_results = np.concatenate([np.empty((0, 6)), *ENet_rows], axis=0)
class0_RF_results = np.concatenate([np.empty((0, 6)), *RF_rows], axis=0)
        

doing iteration TTS 0 ---------------
doing iteration TTS 1 ---------------
doing iteration TTS 2 ---------------
doing iteration TTS 3 ---------------
doing iteration TTS 4 ---------------
doing iteration TTS 5 ---------------
doing iteration TTS 6 ---------------
doing iteration TTS 7 ---------------
doing iteration TTS 8 ---------------
doing iteration TTS 9 ---------------


In [10]:
# Labels applied to the six metric columns of both result arrays.
# NOTE(review): assumes the first six values returned by plot_train_test_model_predictions
# come in exactly this order -- confirm.
my_columns = [
    'train MAPE', 'test MAPE',
    'train RMSE', 'test RMSE',
    'train MAE', 'test MAE',
]
class0_ENet_results_df = pd.DataFrame(data=class0_ENet_results, columns=my_columns)
class0_RF_results_df = pd.DataFrame(data=class0_RF_results, columns=my_columns)

# Mean of each metric across the splits: ElasticNet is printed first,
# then the RandomForest Series is the cell's displayed value.
print(class0_ENet_results_df.mean(axis=0))
class0_RF_results_df.mean(axis=0)

train MAPE      0.296472
test MAPE       0.296754
train RMSE    273.817342
test RMSE     299.486306
train MAE     206.078215
test MAE      221.547571
dtype: float64


train MAPE      0.168508
test MAPE       0.264454
train RMSE    200.047991
test RMSE     280.175617
train MAE     125.051852
test MAE      194.440774
dtype: float64

In [12]:
# Spread of each metric across the splits (pandas .std defaults to ddof=1, i.e. the sample
# standard deviation): ElasticNet is printed first, then the RandomForest Series is the cell's output.
print(class0_ENet_results_df.std(axis=0))
class0_RF_results_df.std(axis=0)

train MAPE     0.012088
test MAPE      0.043983
train RMSE     8.456005
test RMSE     39.205220
train MAE      7.698141
test MAE      32.613813
dtype: float64


train MAPE     0.014638
test MAPE      0.042548
train RMSE    14.612636
test RMSE     39.145225
train MAE     10.928223
test MAE      28.373068
dtype: float64