# Model Selection

In this notebook, we will train models with the parameters we got from hiperparametrization, compare the performance of said models and select and train the model that will be used for deployment

In [4]:
import pickle
import pandas as pd
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
filename = "../Models/ML/Hyperparametrized/hyper_models.pkl"
with open(filename, "rb") as file:
    mlp_grid, xgb_grid, rf_grid, mlp_bayes, xgb_bayes, rf_bayes = pickle.load(file)

In [7]:
filename = "../Models/Parametrization/train.pkl"
X_train, y_train = pickle.load(open(filename, "rb"))
filename = "../Models/Parametrization/test.pkl"
X_test, y_test = pickle.load(open(filename, "rb"))

In [8]:
filename = "../Models/Transformations/transformations.pkl"
minmax, labelEncoder = pickle.load(open(filename, "rb"))

In [5]:
hyper_models = pd.DataFrame(index=["Acuracy", "ROC AUC", "F1 Score", "Recall Poisonous"])

### MLPClassifier

In [6]:
mlp_grid.best_params_

{'solver': 'adam',
 'max_iter': 500,
 'learning_rate': 'adaptive',
 'hidden_layer_sizes': (50, 50),
 'alpha': 0.01,
 'activation': 'relu'}

In [10]:
mlp_grid_model = MLPClassifier(
    activation="relu",
    hidden_layer_sizes=(50, 50),
    solver="adam",
    learning_rate="adaptive",
    alpha=0.01,
    max_iter=500,
)

mlp_grid_model.fit(X_train, y_train)

Y_pred = mlp_grid_model.predict(X_test)
print(
    metrics.classification_report(
        y_true=y_test, y_pred=Y_pred, target_names=labelEncoder.classes_
    )
)

hyper_models["NN Grid"] = [
    metrics.accuracy_score(y_test, Y_pred),
    metrics.roc_auc_score(y_test, Y_pred),
    metrics.f1_score(y_test, Y_pred),
    metrics.recall_score(y_test, Y_pred, pos_label=1),
]

              precision    recall  f1-score   support

           e       0.98      0.99      0.98     57394
           p       0.99      0.98      0.99     68606

    accuracy                           0.98    126000
   macro avg       0.98      0.98      0.98    126000
weighted avg       0.98      0.98      0.98    126000



In [9]:
mlp_bayes.best_params_

OrderedDict([('alpha', 0.001),
             ('learning_rate_init', 0.1),
             ('max_iter', 909),
             ('momentum', 0.4)])

In [11]:
mlp_bayes_model = MLPClassifier(
    activation="relu",
    hidden_layer_sizes=(50, 50),
    solver="adam",
    learning_rate="adaptive",
    alpha=0.001,
    learning_rate_init=0.1,
    momentum=0.4,
    max_iter=909,
)

mlp_bayes_model.fit(X_train, y_train)

Y_pred = mlp_bayes_model.predict(X_test)
print(
    metrics.classification_report(
        y_true=y_test, y_pred=Y_pred, target_names=labelEncoder.classes_
    )
)

hyper_models["NN Bayes"] = [
    metrics.accuracy_score(y_test, Y_pred),
    metrics.roc_auc_score(y_test, Y_pred),
    metrics.f1_score(y_test, Y_pred),
    metrics.recall_score(y_test, Y_pred, pos_label=1),
]

              precision    recall  f1-score   support

           e       0.96      0.98      0.97     57394
           p       0.98      0.97      0.98     68606

    accuracy                           0.97    126000
   macro avg       0.97      0.97      0.97    126000
weighted avg       0.97      0.97      0.97    126000



### XGBoost

In [12]:
xgb_grid.best_params_

{'subsample': 0.8,
 'n_estimators': 100,
 'max_depth': 7,
 'learning_rate': 0.2,
 'colsample_bytree': 0.7}

In [14]:
xgb_grid_model = XGBClassifier(
    n_estimators=100,
    max_depth=7,
    learning_rate=0.2,
    subsample=0.8,
    colsample_bytree=0.7,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
)
xgb_grid_model.fit(X_train, y_train)

Y_pred = xgb_grid_model.predict(X_test)
Y_pred_proba = xgb_grid_model.predict_proba(X_test)[:, 1]

print(
    metrics.classification_report(
        y_true=y_test, y_pred=Y_pred, target_names=labelEncoder.classes_
    )
)

hyper_models["XGBoost Grid"] = [
    metrics.accuracy_score(y_test, Y_pred),
    metrics.roc_auc_score(y_test, Y_pred_proba),
    metrics.f1_score(y_test, Y_pred),
    metrics.recall_score(y_test, Y_pred, pos_label=1),
]

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           e       0.98      0.99      0.98     57394
           p       0.99      0.98      0.99     68606

    accuracy                           0.98    126000
   macro avg       0.98      0.99      0.98    126000
weighted avg       0.98      0.98      0.98    126000



In [13]:
xgb_bayes.best_params_

OrderedDict([('colsample_bytree', 0.9),
             ('learning_rate', 0.2),
             ('max_depth', 7),
             ('n_estimators', 200),
             ('subsample', 0.7)])

In [15]:
xgb_bayes_model = XGBClassifier(
    n_estimators=200,
    max_depth=7,
    learning_rate=0.2,
    subsample=0.7,
    colsample_bytree=0.9,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
)
xgb_bayes_model.fit(X_train, y_train)

Y_pred = xgb_bayes_model.predict(X_test)
Y_pred_proba = xgb_bayes_model.predict_proba(X_test)[:, 1]

print(
    metrics.classification_report(
        y_true=y_test, y_pred=Y_pred, target_names=labelEncoder.classes_
    )
)

hyper_models["XGBoost Bayes"] = [
    metrics.accuracy_score(y_test, Y_pred),
    metrics.roc_auc_score(y_test, Y_pred_proba),
    metrics.f1_score(y_test, Y_pred),
    metrics.recall_score(y_test, Y_pred, pos_label=1),
]

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           e       0.98      0.99      0.98     57394
           p       0.99      0.99      0.99     68606

    accuracy                           0.99    126000
   macro avg       0.99      0.99      0.99    126000
weighted avg       0.99      0.99      0.99    126000



### RandomForest

In [16]:
rf_grid.best_params_

{'n_estimators': 100,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'log2',
 'max_depth': 20}

In [18]:
rf_grid_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    max_features="log2",
    n_jobs=-1,
)
rf_grid_model.fit(X_train, y_train)

Y_pred = rf_grid_model.predict(X_test)
Y_pred_proba = rf_grid_model.predict_proba(X_test)[:, 1]

print(
    metrics.classification_report(
        y_true=y_test, y_pred=Y_pred, target_names=labelEncoder.classes_
    )
)

hyper_models["Random Forest Grid"] = [
    metrics.accuracy_score(y_test, Y_pred),
    metrics.roc_auc_score(y_test, Y_pred_proba),
    metrics.f1_score(y_test, Y_pred),
    metrics.recall_score(y_test, Y_pred, pos_label=1),
]

              precision    recall  f1-score   support

           e       0.96      0.97      0.96     57394
           p       0.97      0.96      0.97     68606

    accuracy                           0.97    126000
   macro avg       0.96      0.97      0.97    126000
weighted avg       0.97      0.97      0.97    126000



In [17]:
rf_bayes.best_params_

OrderedDict([('max_depth', 20),
             ('min_samples_split', 3),
             ('n_estimators', 139)])

In [19]:
rf_bayes_model = RandomForestClassifier(
    n_estimators=139,
    max_depth=20,
    min_samples_split=3,
    min_samples_leaf=2,
    random_state=42,
    max_features="log2",
    n_jobs=-1,
)
rf_bayes_model.fit(X_train, y_train)

Y_pred = rf_bayes_model.predict(X_test)
Y_pred_proba = rf_bayes_model.predict_proba(X_test)[:, 1]

print(
    metrics.classification_report(
        y_true=y_test, y_pred=Y_pred, target_names=labelEncoder.classes_
    )
)

hyper_models["Random Forest Bayes"] = [
    metrics.accuracy_score(y_test, Y_pred),
    metrics.roc_auc_score(y_test, Y_pred_proba),
    metrics.f1_score(y_test, Y_pred),
    metrics.recall_score(y_test, Y_pred, pos_label=1),
]

              precision    recall  f1-score   support

           e       0.98      0.99      0.98     57394
           p       0.99      0.98      0.99     68606

    accuracy                           0.99    126000
   macro avg       0.98      0.99      0.99    126000
weighted avg       0.99      0.99      0.99    126000



## Evaluation and model selection

In [20]:
hyper_models

Unnamed: 0,NN Grid,NN Bayes,XGBoost Grid,XGBoost Bayes,Random Forest Grid,Random Forest Bayes
Acuracy,0.984222,0.973492,0.98496,0.98569,0.965278,0.985151
ROC AUC,0.984374,0.973943,0.995359,0.995463,0.990396,0.995164
F1 Score,0.98547,0.975492,0.986161,0.986841,0.967948,0.986335
Recall Poisonous,0.982669,0.96888,0.984156,0.985453,0.962904,0.9842


## Conclusion

After training the models and evaluating them, we get 4 models with high performance:

1. Neural Network Grid
2. XGBoost Grid
3. XGBoost Bayes
4. RandomForest Bayes

In the end, we selected XGBoost Bayes, as it is the model with the best performance in all of the metrics we evaluated and is one of the fastest to train.

We will now train it with the entire dataset and save it for deployment.

In [21]:
X = pd.concat([X_train, X_test])
Y = pd.concat([y_train, y_test])

xgb_bayes_model.fit(X, Y)

Parameters: { "use_label_encoder" } are not used.



In [22]:
filename = "../Models/ML/Deployment/xgb_bayes.pkl"
with open(filename, "wb") as file:
    pickle.dump([xgb_bayes_model], file)