# Baseline Models and Classifiers

This notebook adapts the feature engineering from the original paper to our windowed approach. We train linear and other simple regression models, as well as logistic-regression classifiers, to serve as baselines for the deep learning models.

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet, LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score

In [None]:
# Load the precomputed windowed feature table; the path is relative to the
# notebook's working directory.
df = pd.read_csv("../data/rebuild_windowed_features_20_5_1.csv")

In [None]:
# List every column name available in the loaded table.
df.columns.values

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Peek at the first few rows.
df.head()

# Preprocessing and feature selection

In [None]:
# Rows with cell_batch == 3 form the secondary test set. The cells of the
# remaining batches are divided by alternating over their unique keys:
# even positions go to the primary test set, odd positions to training.
batch_1_2_keys = df.loc[df['cell_batch'] != 3, 'cell_key'].unique()
test_keys, train_keys = batch_1_2_keys[0::2], batch_1_2_keys[1::2]

# Index labels of the rows belonging to each split.
train_ind = df.index[df['cell_key'].isin(train_keys).values]
test_ind = df.index[df['cell_key'].isin(test_keys).values]
secondary_test_ind = df.index[(df['cell_batch'] == 3).values]

# Order matters: downstream tables read the results as [train, test, secondary].
splits = [train_ind, test_ind, secondary_test_ind]

In [None]:
# Define feature and target columns for regression models
# (one feature list per regression model trained further down).

# Variance model: a single feature.
varmod_features = ["variance_dQ_window"]
# Discharge model features.
dismod_features = [
    "variance_dQ_window",
    "minimum_dQ_window",
    "skewness_dQ_window",
    "kurtosis_dQ_window",
    "discharge_capacity_1",
    "diff_discharge_capacity_max_1",
]
# Full model features.
fullmod_features = [
    "minimum_dQ_window",
    "variance_dQ_window",
    "slope_lin_fit_window",
    "intercept_lin_fit_window",
    "discharge_capacity_1",
    "mean_discharge_time",
    "minimum_IR_window",
    "diff_IR_window",
]
# Regression target, kept as a one-element list so that selecting it yields
# a single-column DataFrame. "target_current" is currently disabled.
targetmod = ["target_remaining"]  # , "target_current"

# Define feature and target columns for classifiers

# Variance classifier: a single feature.
varclf_features = ["variance_dQ_window"]
# Full classifier features.
fullclf_features = [
    "minimum_dQ_window",
    "variance_dQ_window",
    "discharge_capacity_1",
    "diff_IR_window",
]
# Classification target, also kept as a one-element list.
targetclf = ["target_classifier"]

In [None]:
def get_split(data, features, target, split):
    """Return the feature and target frames for one data split.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both feature and target columns.
    features : list of str
        Names of the feature columns.
    target : list of str
        Names of the target columns.
    split : index-like
        Index *labels* of the rows to select (e.g. ``data[mask].index``).

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        The selected rows restricted to ``features`` and ``target``.
    """
    # The splits are built from ``.index`` and therefore contain labels, so
    # they must be resolved with ``.loc``. Positional ``.iloc`` only gives the
    # same rows while ``data`` still has its default 0..n-1 index.
    X = data.loc[split, features]
    y = data.loc[split, target]
    return X, y

def eval_model(model, data, features, target, splits):
    """Score a fitted regressor on every split.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(X)``.
    data : pandas.DataFrame
        Table holding both feature and target columns.
    features : list of str
        Names of the feature columns.
    target : list of str
        Names of the target columns.
    splits : iterable
        Row selections, each forwarded to ``get_split``.

    Returns
    -------
    (mse, mae, mpe) : tuple of list of float
        One entry per split, in the order of ``splits``: mean squared error,
        mean absolute error and mean absolute error relative to the target,
        expressed in percent.
    """
    mse, mae, mpe = [], [], []
    for split in splits:
        X, y = get_split(data, features, target, split)
        # Flatten truth and prediction to 1-D float arrays. This keeps the
        # arithmetic independent of whether the estimator returns (n,) or
        # (n, 1) predictions, and of how pandas reduces a DataFrame.
        y_true = np.asarray(y, dtype=float).ravel()
        y_pred = np.asarray(model.predict(X), dtype=float).ravel()
        abs_err = np.abs(y_true - y_pred)

        # scikit-learn documents the argument order as (y_true, y_pred).
        mse.append(float(mean_squared_error(y_true, y_pred)))
        mae.append(float(abs_err.mean()))
        # NOTE: a target value of 0 makes this entry inf/nan.
        mpe.append(float(np.mean(abs_err / y_true * 100)))
    return mse, mae, mpe

def eval_classifier(model, data, features, target, splits):
    """Compute the accuracy of a fitted classifier on every split.

    Returns a list with one accuracy value per entry of ``splits``, in the
    same order.
    """
    def _split_accuracy(split):
        # Accuracy is symmetric in its two arguments, so passing the
        # prediction first does not affect the value.
        X, y = get_split(data, features, target, split)
        return accuracy_score(model.predict(X), y.values.ravel())

    return [_split_accuracy(split) for split in splits]

# Variance Model

In [None]:
# Train Elastic net
x_train, y_train = get_split(df, varmod_features, targetmod, train_ind)

alphas = np.linspace(0.0001,1,30)
parameters = {
    "alpha": alphas,
    "l1_ratio": [0.01, 0.25, 0.5, 0.75, 1.]
}
enet = ElasticNet(random_state=54)
# The `iid` keyword was deprecated in scikit-learn 0.22 and removed in 0.24;
# passing it raises a TypeError on current releases. Since 0.22 the search
# behaves as `iid=False` did, so the keyword is simply dropped.
regr = GridSearchCV(enet, parameters, cv=4)
print("Elastic Net: %s" % regr.fit(x_train, y_train).score(x_train, y_train))

# Because an elastic net with alpha = 0 is technically a linear regression
# and elastic net produces inaccuracies with a small alpha,
# we also train a linear regression model.
# Linear regression performs slightly better at RMSE,
# elastic net performs slightly better at MPE.
# We decide to take the linear regression scores.
lin_reg = LinearRegression()
print("Linear Regression: %s" % lin_reg.fit(x_train, y_train).score(x_train, y_train))

varmod_mse, varmod_mae, varmod_mpe = eval_model(lin_reg, df, varmod_features, targetmod, splits)


# Add Random Forest
rf_params = {
    "max_depth": [2, 3],
    "n_estimators": [10, 100]
}
rfst = RandomForestRegressor(random_state=54)
rfst_grid = GridSearchCV(rfst, rf_params, cv=4)
# The forest expects a 1-D target; a single-column frame triggers a
# DataConversionWarning for every fit of the grid search.
y_train_flat = y_train.values.ravel()
print("Random Forest: %s" % rfst_grid.fit(x_train, y_train_flat).score(x_train, y_train_flat))

varmod_rf_mse, varmod_rf_mae, varmod_rf_mpe = eval_model(rfst_grid, df, varmod_features, targetmod, splits)
print('varmod_rf_mse', varmod_rf_mse)
print('varmod_rf_mae', varmod_rf_mae)
# Report the random forest's own MPE (previously the linear regression's
# `varmod_mpe` was printed under this label).
print('varmod_rf_mpe', varmod_rf_mpe)

# Discharge Model

In [None]:
# Train Elastic net
x_train, y_train = get_split(df, dismod_features, targetmod, train_ind)

alphas = np.linspace(0.1,1,20)
parameters = {
    "alpha": alphas,
    "l1_ratio": [0.01, 0.25, 0.5, 0.75, 1.]
}
enet = ElasticNet(random_state=54)
# The `iid` keyword was deprecated in scikit-learn 0.22 and removed in 0.24;
# passing it raises a TypeError on current releases. Since 0.22 the search
# behaves as `iid=False` did, so the keyword is simply dropped.
regr = GridSearchCV(enet, parameters, cv=4)
print("Elastic Net: %s" % regr.fit(x_train, y_train).score(x_train, y_train))

# Score the refitted best estimator on train, primary test and secondary test.
dismod_mse, dismod_mae, dismod_mpe = eval_model(regr, df, dismod_features, targetmod, splits)

# Full Model

In [None]:
# Train Elastic net model
# TODO: raising the alpha minimum to 0.59 silences the convergence warnings,
# but decreases the score significantly - what's wrong here?

x_train, y_train = get_split(df, fullmod_features, targetmod, train_ind)

alphas = np.linspace(0.001,1,20)
parameters = {
    "alpha": alphas,
    "l1_ratio": [0.001, 0.75, 1.]
}
enet = ElasticNet(random_state=54)
# The `iid` keyword was deprecated in scikit-learn 0.22 and removed in 0.24;
# passing it raises a TypeError on current releases. Since 0.22 the search
# behaves as `iid=False` did, so the keyword is simply dropped.
regr = GridSearchCV(enet, parameters, cv=4)
print("Elastic Net: %s" % regr.fit(x_train, y_train).score(x_train, y_train))

# Score the refitted best estimator on train, primary test and secondary test.
fullmod_mse, fullmod_mae, fullmod_mpe = eval_model(regr, df, fullmod_features, targetmod, splits)

# Evaluate all linear regression models

In [None]:
# One row per regression model, one column per (metric, split) pair.
# The per-model score lists are ordered [train, primary test, secondary test].
pd.DataFrame({
    "Model": ["Variance model", "Discharge model", "Full model"],
    **{
        "%s - %s" % (metric, split_label): [scores[pos] for scores in per_model]
        for metric, per_model in (
            ("MAE", (varmod_mae, dismod_mae, fullmod_mae)),
            ("MSE", (varmod_mse, dismod_mse, fullmod_mse)),
            ("MPE", (varmod_mpe, dismod_mpe, fullmod_mpe)),
        )
        for pos, split_label in enumerate(("Train", "Primary test", "Secondary test"))
    },
})

# Variance Classifier

In [None]:
# Train Logistic Regression
x_train, y_train = get_split(df, varclf_features, targetclf, train_ind)
# Classifiers expect a 1-D label array rather than a single-column frame.
y_train_flat = y_train.values.ravel()

parameters = {"C": [0.01,0.1,0.5,0.75,1]}

logreg = LogisticRegression(solver="liblinear", random_state=54)
# The `iid` keyword was deprecated in scikit-learn 0.22 and removed in 0.24;
# passing it raises a TypeError on current releases. Since 0.22 the search
# behaves as `iid=False` did, so the keyword is simply dropped.
clf = GridSearchCV(logreg, parameters, cv=4)
print("Logreg: %s" % clf.fit(x_train, y_train_flat).score(x_train, y_train_flat))

# Accuracy on train, primary test and secondary test.
varclf_acc = eval_classifier(clf, df, varclf_features, targetclf, splits)

# Full Classifier

In [None]:
# Train Logistic Regression
# TODO: Why is the full classifier worse than the variance classifier?
x_train, y_train = get_split(df, fullclf_features, targetclf, train_ind)
# Classifiers expect a 1-D label array rather than a single-column frame.
y_train_flat = y_train.values.ravel()

parameters = {"C": [0.01,0.1,0.5,0.75,1]}

logreg = LogisticRegression(solver="liblinear", random_state=54)
# The `iid` keyword was deprecated in scikit-learn 0.22 and removed in 0.24;
# passing it raises a TypeError on current releases. Since 0.22 the search
# behaves as `iid=False` did, so the keyword is simply dropped.
clf = GridSearchCV(logreg, parameters, cv=4)
print("Logreg: %s" % clf.fit(x_train, y_train_flat).score(x_train, y_train_flat))

# Accuracy on train, primary test and secondary test.
fullclf_acc = eval_classifier(clf, df, fullclf_features, targetclf, splits)

# Evaluate all classifiers

In [None]:
# One row per classifier, one accuracy column per split.
# The accuracy lists are ordered [train, primary test, secondary test].
pd.DataFrame({
    "Classifier": ["Variance classifier", "Full classifier"],
    **{
        "Acc - %s" % split_label: [varclf_acc[pos], fullclf_acc[pos]]
        for pos, split_label in enumerate(("Train", "Primary test", "Secondary test"))
    },
})