# Tutorial 4: ML on the best [2~200] features

---

### Introduction

Hello, this notebook will show how to use machine learning (ML) to classify apples using the best [2, .. , 50] wavelength. The notebook will show how to visualize the final accuracy and their relative performance in two different graphs.

first, let us call the data

In [36]:
%store -r df_input_GS
%store -r y_GS
%store -r df_GS

In [37]:
y_GS = y_GS.map({'S': 1, 'B': 0})

---

calling some packages

In [38]:
# Python packages 
import pandas as pd # for importing data into data frame format
import seaborn as sns # For drawing useful graphs, such as bar graphs
import numpy as np
import matplotlib.pyplot as plt

---

<b><i> Data splitting </i></b> 

In [39]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, Ytrain, Ytest = train_test_split( df_input_GS, y_GS, test_size = 0.3, random_state=3, stratify=y_GS) # train and valid sets

In [40]:
print(Xtrain.shape)
print(Xtest.shape)

(352, 2074)
(151, 2074)


---

<b><i> get the best N features </i></b> 

In [41]:
%store -r featImp_GS

In [42]:
arrimp = np.array(featImp_GS).mean(0)
sorted_idx = arrimp.argsort()
print(sorted_idx[-10:])

[1793 1775 2048   31    3 1771  544 1781   17   45]


In [43]:
cols_ordered = df_input_GS.columns[sorted_idx][-400:]
cols_ordered

Index([1841.364,  980.575, 1133.147, 2337.819,  911.297, 1075.338, 1155.366,
        974.677, 1447.594, 1613.342,
       ...
       1968.596, 1942.053, 2441.281,   842.04,  834.451, 1936.251, 1010.382,
       1950.821,  838.229,  845.886],
      dtype='object', length=400)

---

<b><i> prepare diffrint feature set sizes </i></b> 

In [44]:
number_features = [2, 4, 6, 8, 10, 20, 50, 100, 200, 400] # define the number features. 


---

<b><i> ML model and thier hyper-parameters </i></b> 

In [45]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from numpy import random as np_random

first let us define the model


In [46]:
lr = LogisticRegression()
svm = SVC(kernel="rbf")
rf = RandomForestClassifier(random_state=1)
xgb = XGBClassifier(random_state=1)
knns = KNeighborsClassifier()
anns = MLPClassifier(max_iter=3000, random_state=1, activation = 'relu', solver = 'sgd')

defining the hyper-paramter per model

In [47]:
solvers = ['newton-cg', 'liblinear']
penalty = ['l2']
c_values = [1000, 100, 10, 1.0, 0.1, 0.01, 0.001]
lr_par = dict(solver=solvers,penalty=penalty,C=c_values)

##############################
C_range = np.logspace(-1, 10, 20) # define a set of values for the parameter C
gamma_range = np.logspace(-9, 3, 20) # define a set of values for the parameter gamma

svm_par = dict(gamma=gamma_range, C=C_range)

#########################

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid

#max_features = ['sqrt', 'log2']

rf_par = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
#######################
# https://www.kaggle.com/code/tilii7/hyperparameter-grid-search-with-xgboost/notebook
xgb_par = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5, 9],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5, 8]
        }
##################################
n_neighbors = range(1, 30, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

knns_par = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
################
anns_par = {
    'hidden_layer_sizes': [(3,3),(5,5),(8,3)],
#     'activation': ['relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001,0.01,0.1, 0.05, 0.005 , 0.005, 0.00005],
#     'learning_rate': ['constant','adaptive'],
}
################

group the above into two lists

In [48]:
models = [[lr, 'lr', 14], [svm, 'svm', 15], [rf, 'rf', 30], [xgb, 'xgb', 30], [knns, 'knns', 30], [anns, 'anns', 30]]
par = [lr_par, svm_par, rf_par, xgb_par, knns_par, anns_par]

# models = [[lr, 'lr', 14]]
# par = [lr_par]

---

<b><i> train and test </i></b> 

In [49]:
from source.ml_acc import get_accuracy_ml

In [None]:
# num_ml_tools = len(par)
# ml_dicts = {}

# for m, par in zip(models, par):
#     key0 = str(m[1])
#     ml_dicts[key0] = {}
#     for f in number_features:
#         print(f)
#         print(cols_ordered[-f:])
#         print(type(Xtrain))
#         xtr =  Xtrain[cols_ordered[-f:]]
#         xte =  Xtest[cols_ordered[-f:]]
#         results = get_accuracy_ml (m[0], m[2], par, np.array(xtr), np.array(Ytrain), np.array(xte), np.array(Ytest)) # to get the accuracies for the ml model

#         key = str(m[1])+","+str((f))
#         ml_dicts[key0][key] = {}

#         ml_dicts[key0][key]['tot_acc'] = results[0]
#         ml_dicts[key0][key]['jack_train'] = results[1]
#         ml_dicts[key0][key]['jack_test'] = results[2]

2
Index([838.229, 845.886], dtype='object')
<class 'pandas.core.frame.DataFrame'>
4
Index([1010.382, 1950.821, 838.229, 845.886], dtype='object')
<class 'pandas.core.frame.DataFrame'>
6
Index([834.451, 1936.251, 1010.382, 1950.821, 838.229, 845.886], dtype='object')
<class 'pandas.core.frame.DataFrame'>
8
Index([2441.281, 842.04, 834.451, 1936.251, 1010.382, 1950.821, 838.229,
       845.886],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
10
Index([1968.596, 1942.053, 2441.281,   842.04,  834.451, 1936.251, 1010.382,
       1950.821,  838.229,  845.886],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
20
Index([  968.85,  844.508, 1959.668, 1926.182, 1521.503,  836.875,  849.768,
        984.672,  835.796,  842.587, 1968.596, 1942.053, 2441.281,   842.04,
        834.451, 1936.251, 1010.382, 1950.821,  838.229,  845.886],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
50
Index([2280.247, 2011.358, 1437.162,  835.258,  851.721,  840.675,  1956.71,


In [None]:
import json

In [None]:
# with open('ml_gs.txt', 'w') as file:
#      file.write(json.dumps(ml_dicts)) # use `json.loads` to do the reverse

---

<b><i> accuracy visualisation </i></b> 

In [63]:
import json
with open('base_gs.txt') as f:
    data = f.read()
    
ml_dicts = json.loads(data)

with open('ml_gs.txt') as f:
    data = f.read()
    
baseDict = json.loads(data)
baseDict['lr'].keys()    

dict_keys(['lr,2', 'lr,4', 'lr,6', 'lr,8', 'lr,10', 'lr,20', 'lr,50', 'lr,100', 'lr,200', 'lr,400'])

In [64]:
from source.calculate_jack import jack_SD # importing the baseline code from source.basline file

In [65]:
arr_all = []
for m, d in zip (models, ml_dicts.keys()):
    acc_arr = [] 
    sd_arr = [] 

    # print(ml_dicts[d])
    for key in ml_dicts[d].keys():
        acc_arr.append(ml_dicts[d][key][ 'tot_acc' ]) # append total accuracy to an array
        sd_train = jack_SD(np.zeros( len(ml_dicts[d][key][ 'jack_train' ]) ), ml_dicts[d][key][ 'jack_train' ])[0]
        sd_test = jack_SD(np.zeros( len(ml_dicts[d][key][ 'jack_test' ]) ), ml_dicts[d][key][ 'jack_test' ])[0]
        sd = np.sqrt( np.array((sd_train**2)) + np.array((sd_test**2)))
        sd_arr.append(sd) # append sd_arr to an array
    arr_all.append([ list(ml_dicts[d].keys()), acc_arr, sd_arr])    

In [67]:
print(arr_all)

[[['lr, all'], [0.8160919540229885], [0.10236490642455316]]]
