# 1. Imports

In [1]:
import warnings

import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import scale
from sklearn.svm import SVC

from utils import *

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised by the
# model-fitting cells further down; consider narrowing it to specific
# categories once the notebook is stable.
warnings.filterwarnings("ignore")

# One seed shared by Python's `random`, NumPy and TensorFlow so that a
# Restart & Run All reproduces the same numbers.
# `random`, `np` and `tf` are not imported explicitly in this cell; presumably
# they arrive through `from utils import *` -- confirm.
SEED = 1032021
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 2. Loading The Dataset

In [2]:
# Read the "All BC" sheet and transpose it, then drop the first transposed row
# and the rows labelled "LN Equinox" and "Mystery" before renumbering the
# remaining rows from 0.
# `pd` and `LabelEncoder` are not imported explicitly in the notebook;
# presumably they arrive through `from utils import *` -- confirm.
ds = (
    pd.read_excel("data/07_14_22_data.xls", sheet_name="All BC")
    .T
    .iloc[1:, :]
    .drop(["LN Equinox", "Mystery"], axis=0)
    .reset_index(drop=True)
)

# The column labelled 0 holds the label of each sample; keep both the raw
# labels (`y`) and an integer-encoded copy (`y_num`, a one-column DataFrame).
y = ds[0]
label_encoder = LabelEncoder()
y_num = pd.DataFrame(label_encoder.fit_transform(y))

# Everything after the first column is feature data.
ds = ds.iloc[:, 1:]

# Normalize the dataset: axis=1 standardizes every row on its own.
# `scale` returns a plain array, so wrapping it in a DataFrame yields
# positional integer column labels (0..n-1). The former `x = ds.copy()` /
# `x.columns = ...` lines were overwritten by this statement before being
# read, so they have been removed.
x = pd.DataFrame(scale(ds, axis=1))

# Only keep the peaks (positional column indices into `x`).
peaks = [198, 262, 1506, 1669, 1967, 4564, 4708, 4760, 4970]
x_peaks = x.iloc[:, peaks]

# 3. K-Fold

In [3]:
# Cross-validation scheme: 5 stratified folds, repeated 3 times, with a fixed
# random_state so the splits are reproducible.
num_splits = 5
num_repeats = 3
kfold = RepeatedStratifiedKFold(
    n_splits=num_splits,
    n_repeats=num_repeats,
    random_state=1032021,
)

# From this point on `num_splits` is the total number of train/validation
# splits produced by `kfold` (folds x repeats = 15).
num_splits = num_splits * num_repeats

# One (initially empty) list per model family.
models = {name: [] for name in ("pca_lda", "svm", "forest", "xgb", "mlp")}

# Hyper-parameter grids handed to the grid-search cells below.
svm_param = dict(
    C=[0.01, 0.1, 1, 10, 100, 1000],
    gamma=[100, 10, 1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

xgb_param = dict(
    n_estimators=[20, 50, 100, 200, 300, 400, 500],
    learning_rate=[0.001, 0.01, 0.05, 0.1, 0.2, 0.3],
    gamma=[0, 0.5, 1, 1.5, 2, 5],
    max_depth=[2, 4, 6],
)

rf_param = dict(
    n_estimators=[20, 50, 100, 200, 300, 400, 500],
    min_samples_leaf=[1, 3, 5, 7, 9, 11],
    max_depth=[2, 4, 6, 8, 10],
    max_features=['sqrt', 'log2'],
)

## 3.1 PCA + Linear Discriminant Analysis

In [4]:

# Run the PCA/LDA search helper on the peak features with the raw string
# labels and the repeated stratified folds.
pca_lda_grid_search(x=x_peaks, y=y, kfold=kfold)
# Result noted from an earlier run: 3 0.9460317460317461
# NOTE(review): the saved cell output reports "4" with the same score;
# presumably the first number is the selected setting and the second the
# cross-validated score -- confirm which run is current.

4 0.9460317460317461


## 3.2 Support Vector Machine

In [5]:

# Grid-search an SVC over `svm_param` on the peak features, using the raw
# string labels and the repeated stratified folds.
svm = SVC()
svm_grid = general_grid_search(
    model=svm,
    param=svm_param,
    x=x_peaks,
    y=y,
    kfold=kfold,
)

# Show the winning parameter combination followed by its best score.
# Recorded run: {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'} 0.9380952380952382
print(svm_grid.best_params_, svm_grid.best_score_)

Fitting 15 folds for each of 42 candidates, totalling 630 fits
{'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'} 0.9380952380952382


## 3.3 Random Forest

In [6]:
# Grid-search a seeded random forest over `rf_param` on the peak features,
# using the raw string labels and the repeated stratified folds.
forest = RandomForestClassifier(random_state=1032021)
forest_grid = general_grid_search(
    model=forest,
    param=rf_param,
    x=x_peaks,
    y=y,
    kfold=kfold,
)

# Show the winning parameter combination followed by its best score.
# Recorded run: {'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 300} 0.8968253968253969
print(forest_grid.best_params_, forest_grid.best_score_)

Fitting 15 folds for each of 420 candidates, totalling 6300 fits
{'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 300} 0.8968253968253969


## 3.4 Extreme Gradient Boosting

In [7]:
# Grid-search a seeded XGBoost classifier over `xgb_param` on the peak
# features. Unlike the SVM and forest cells, this one is given the
# integer-encoded labels (`y_num`) -- presumably because XGBClassifier needs
# numeric class labels; confirm.
xgboost = xgb.XGBClassifier(random_state=1032021)
xgb_grid = general_grid_search(
    model=xgboost,
    param=xgb_param,
    x=x_peaks,
    y=y_num,
    kfold=kfold,
)

# Show the winning parameter combination followed by its best score.
# Recorded run: {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 500} 0.8968253968253969
print(xgb_grid.best_params_, xgb_grid.best_score_)

Fitting 15 folds for each of 756 candidates, totalling 11340 fits
{'gamma': 0, 'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 500} 0.8968253968253969


## 3.5 Multilayer Perceptron

In [8]:
# Build a single MLP through the project helper `create_model`.
# NOTE(review): `input_shape` receives the full x_peaks.shape tuple, i.e.
# (number of rows, number of columns) -- confirm that create_model expects
# this rather than only the feature dimension.
# NOTE(review): output_shape=4 is hard-coded; presumably the number of classes
# in the labels -- confirm (e.g. against label_encoder.classes_).
mlp = create_model(num_neurons=128,
                   drop_out_rate=0.3,
                   input_shape=x_peaks.shape,
                   output_shape=4)

# Train/evaluate the MLP on the repeated stratified folds for up to 200 epochs
# per fold, using the integer-encoded labels. The training log shows model
# checkpoints being saved under `folder` (e.g. mlp_model_2/model_1.h5).
# NOTE(review): the same `mlp` instance is passed in for every fold. The saved
# output of the summary cell that follows shows the best validation accuracy
# at index 113 for the first fold but at index 0 with value 1.0 for all 14
# remaining folds, which is consistent with trained weights carrying over from
# one fold to the next (later validation samples were already seen in
# training). Verify that mlp_kfold re-initialises or clones the model per
# fold; if it does not, the reported accuracies are optimistic.
accuracy = mlp_kfold(x=x_peaks,
                     y=y_num,
                     model=mlp,
                     kfold=kfold,
                     epochs=200,
                     folder="mlp_model_2/")

Epoch 1/200

Epoch 00001: val_accuracy improved from -inf to 0.57143, saving model to mlp_model_2/model_1.h5
Epoch 2/200

Epoch 00002: val_accuracy did not improve from 0.57143
Epoch 3/200

Epoch 00003: val_accuracy did not improve from 0.57143
Epoch 4/200

Epoch 00004: val_accuracy did not improve from 0.57143
Epoch 5/200

Epoch 00005: val_accuracy did not improve from 0.57143
Epoch 6/200

Epoch 00006: val_accuracy did not improve from 0.57143
Epoch 7/200

Epoch 00007: val_accuracy did not improve from 0.57143
Epoch 8/200

Epoch 00008: val_accuracy did not improve from 0.57143
Epoch 9/200

Epoch 00009: val_accuracy did not improve from 0.57143
Epoch 10/200

Epoch 00010: val_accuracy did not improve from 0.57143
Epoch 11/200

Epoch 00011: val_accuracy did not improve from 0.57143
Epoch 12/200

Epoch 00012: val_accuracy did not improve from 0.57143
Epoch 13/200

Epoch 00013: val_accuracy did not improve from 0.57143
Epoch 14/200

Epoch 00014: val_accuracy did not improve from 0.57143
Ep

In [9]:
# Convert the value returned by mlp_kfold into an array (presumably one row
# per fold and one column per epoch -- confirm), then display, for each row,
# the position of its maximum and the maximum itself.
accuracy = np.array(accuracy)
accuracy.argmax(axis=1), np.max(accuracy, axis=1)

(array([113,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0]),
 array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))