In [1]:
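# Import the project's shared helpers; the names used below (np, pd, scipy, pearsonr, ...) are presumably all provided by this star-import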
from util_all import * 

## Loading the DATA

In [2]:
# Loading all the data from Matlab file
data=scipy.io.loadmat(f'data/features_matrix.mat')
labels= np.squeeze(data['labels'])
labels_for_ML = 10**labels
Y = pd.DataFrame({'y': labels_for_ML})
features = data['features']
cycling_conditions = data['cycling_conditions']
features_names0 = data['features_names']
features_names = [subarray[0] for subarray in features_names0[0]]
cycling_conditions_names0 = data['cycling_conditions_names']
cycling_conditions_names = [subarray[0] for subarray in cycling_conditions_names0[0]]

#Concatenating all features in one big array of dim num_cells x num_features
all_features = features[:,:,0]
for i in range(1,3):
    all_features=np.concatenate((all_features,features[:,:,i]),axis=1)
    
# Creating features names (handles for features df column and metadata df )
classes = ['charge', 'full', 'discharge']
all_features_names = [my_class+'_'+s  for my_class in classes for s in features_names]

# Creating the Features Dataframes
all_features_df = pd.DataFrame(all_features, columns = all_features_names)
cycling_conditions_df = pd.DataFrame(cycling_conditions, columns = cycling_conditions_names)

In [3]:
#creating metadata Dataframe for all features
features_metadata_df = pd.DataFrame(index = all_features_names)
for name, row in features_metadata_df.iterrows():
    features_metadata_df.loc[name,'type']=get_feature_type(name)
    features_metadata_df.loc[name,'class']=get_feature_class(name)
    features_metadata_df.loc[name,'stream']=get_feature_stream(name)
    features_metadata_df.loc[name,'num_cycles_needed']=int(name[name.find('y')+9:name.find('y')+12]) 
    my_min, my_max = get_min_max_percentile(name)
    features_metadata_df.loc[name,'min_percentile']=my_min
    features_metadata_df.loc[name,'max_percentile']=my_max
    features_metadata_df.loc[name,'Pearson']=abs(pearsonr(all_features_df[name],labels_for_ML)[0])
    features_metadata_df.loc[name,'Log_Pearson']=abs(pearsonr(all_features_df[name],labels)[0])
    
#creating metadata Dataframe for cycling conditions
cycling_conditions_metadata_df = pd.DataFrame(data = [0,0,0,0],index=cycling_conditions_names,columns=['class'])
for name, row in cycling_conditions_metadata_df.iterrows():
    cycling_conditions_metadata_df.loc[name,'Pearson']=abs(pearsonr(cycling_conditions_df[name],labels_for_ML)[0])
    cycling_conditions_metadata_df.loc[name,'Log_Pearson']=abs(pearsonr(cycling_conditions_df[name],labels)[0])
    


## Feature Importance Analysis

In [4]:
def get_datasets(all_features_df, features_metadata_df, dataset_type):
    time_regions = [1,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150]

    #Getting a list of x datasets, 1 per time_region (TR)
    metadata_per_TR = [features_metadata_df[(features_metadata_df['num_cycles_needed']==y)&(features_metadata_df['type']==dataset_type)] for y in time_regions]  
    datasets = [all_features_df[meta_x.index.values.tolist()] for meta_x in metadata_per_TR]
    return  datasets, metadata_per_TR

charge_datasets, charge_meta = get_datasets(all_features_df, features_metadata_df, "charge")
discharge_datasets, discharge_meta = get_datasets(all_features_df, features_metadata_df, "discharge")
full_datasets, full_meta = get_datasets(all_features_df, features_metadata_df, "full")

## Use Single Discharge Features to Predict

In [None]:
Y = pd.DataFrame({'y': labels_for_ML})

performance_by_split = []
for TTS in range(10):
    discharge_mapes = []
    for j, Z in enumerate(discharge_datasets):
        discharge_mapes.append([])
        feature_order = []
        for i, X in Z.iteritems():
            print(f"------{TTS, j, i}--------")
            feature_order.append(i)
            X = pd.DataFrame(X)
            X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=TTS)

            ## Make unified dataframes for train and test
            df_train = Y_train.join(X_train)
            df_test = Y_test.join(X_test)

            Y_col = ['y']
            X_col =  X.columns.tolist()

            #finding best RF
            opt_model, X_train_scaled, Y_train_scaled, X_scaler, Y_scaler = make_cv_model_object(df_train,
                                                     Y_col=Y_col,
                                                     X_col=X_col,
                                                    cv_splits = 10,                                       
                                                     model=RandomForestRegressor(random_state=0),
                                                     model_hyperparams={'n_estimators': [40,80,160],
                                                                        'min_samples_leaf':[2,4],#[1,2,4,8]
                                                                        'min_samples_split':[2,4,8]})

            #Best RF:
            best_RF= opt_model.best_estimator_
            best_RF.fit(X_train_scaled,Y_train_scaled.values.ravel())

            RF_iter_result= plot_train_test_model_predictions(best_RF,
                                                  X_train_scaled = X_train_scaled, X_test = X_test, X_scaler = X_scaler, 
                                                 Y_train = Y_train, Y_test = Y_test, Y_scaler = Y_scaler,
                                                    X_col=X_col,Y_col=['y'],
                                                 plot_bounds = [0,2000], plot=False)[0:6]

            discharge_mapes[-1].append(RF_iter_result[1])
    performance_by_split.append(discharge_mapes)

In [8]:
def plot_discharge_fit(discharge_mapes):
    dis_p = pd.DataFrame(discharge_mapes).T   
    dis_p = dis_p.reindex(dis_p.mean(axis=1).sort_values().index) #sorting dataframe by features, according to average MAPE across 16 time regions (increasingly).
    dis_p.index = discharge_meta[0].index[dis_p.index]  # placing the features names as the index
    return dis_p    

In [9]:
average_discharges = []
for i in range(10):
    average_discharges.append(plot_discharge_fit(performance_by_split[i]))

In [10]:
# from all the results stored in average_discharge (1 dataframe per TTS), we recover the average and std MAPEs across all TTS.
index = average_discharges[0].index
feature_averages = []
feature_stds =[]
for feature in index:
    feature_df = [] #our list to store all values across the 10 TTS, for each feature
    for TTS in range(10):
        feature_df.append(average_discharges[TTS].loc[feature])
    x = pd.DataFrame(feature_df).mean(axis=0)
    x.name = feature
    feature_averages.append(x)
    z= pd.DataFrame(feature_df).std(axis=0)
    z.name = feature
    feature_stds.append(z)

feature_averages = pd.DataFrame(feature_averages)
feature_stds = pd.DataFrame(feature_stds)

#creating a dict mapping to rename the columns of the dataframes with the time_regions
feature_stds.columns.tolist()
time_regions = [1,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150]
renaming = dict(zip(feature_stds.columns.tolist(),time_regions))

#renaming
feature_stds= feature_stds.rename(columns=renaming)
feature_averages = feature_averages.rename(columns=renaming)