# Parameters tuning

In [2]:
# Imports
import pandas as pd
from sklearn.preprocessing import Normalizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split


import sys
sys.path.append('../utils')
from utils import perf, thomas_parser

In [3]:
gt = pd.read_csv('../../dumps/various_sizes/8K.csv')
cols = [col for col in gt.columns if col not in ['label']]
data = gt[cols]
target = gt['label']

data_train, data_test, target_train, target_test = train_test_split(data,target, test_size = 0.20, random_state = 0)
scaler = Normalizer()
scaler.fit(data_train)
data_train = scaler.transform(data_train)
data_test = scaler.transform(data_test)

#### Solver

The solver for weight optimization.
- ‘lbfgs’ is an optimizer in the family of quasi-Newton methods.
- ‘sgd’ refers to stochastic gradient descent.
- ‘adam’ refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba

Note: The default solver ‘adam’ works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, ‘lbfgs’ can converge faster and perform better.

In [4]:
solver = ['lbfgs','sgd', 'adam']
for i in solver:
    print("Solver : %s" % i)
    mlp = MLPClassifier(solver=i, random_state=0, max_iter=10000) 
    mlp.fit(data_train, target_train)
    print("Accuracy on training set: {:.3f}".format(mlp.score(data_train, target_train))) 
    print("Accuracy on test set: {:.3f}".format(mlp.score(data_test, target_test)))

Solver : lbfgs
Accuracy on training set: 0.925
Accuracy on test set: 0.924
Solver : sgd
Accuracy on training set: 0.899
Accuracy on test set: 0.902
Solver : adam
Accuracy on training set: 0.963
Accuracy on test set: 0.956


As we could expect, the 'adam' algorithm performs quite well on our dataset. Still, the performance on the test set might be improved by tuning other parameters.

#### Activation

Activation function for the hidden layer.
- ‘identity’, no-op activation, useful to implement linear bottleneck, returns f(x) = x
- ‘logistic’, the logistic sigmoid function, returns f(x) = 1 / (1 + exp(-x)).
- ‘tanh’, the hyperbolic tan function, returns f(x) = tanh(x).
- ‘relu’, the rectified linear unit function, returns f(x) = max(0, x)

In [5]:
act = ['identity','logistic', 'tanh', 'relu']
for i in act:
    print("function : %s" % i)
    mlp = MLPClassifier(activation=i, random_state=0, max_iter=10000) 
    mlp.fit(data_train, target_train)
    print("Accuracy on training set: {:.3f}".format(mlp.score(data_train, target_train))) 
    print("Accuracy on test set: {:.3f}".format(mlp.score(data_test, target_test)))

function : identity
Accuracy on training set: 0.906
Accuracy on test set: 0.911
function : logistic
Accuracy on training set: 0.908
Accuracy on test set: 0.914
function : tanh
Accuracy on training set: 0.916
Accuracy on test set: 0.912
function : relu
Accuracy on training set: 0.963
Accuracy on test set: 0.956


The 'relu' activation performed better on our training and test set, which is the default activation function.

#### Learning rate

Learning rate schedule for weight updates (only for 'sgd' solver).
- ‘constant’ is a constant learning rate given by ‘learning_rate_init’.
- ‘invscaling’ gradually decreases the learning rate at each time step ‘t’ using an inverse scaling exponent of ‘power_t’. effective_learning_rate = learning_rate_init / pow(t, power_t)
- ‘adaptive’ keeps the learning rate constant to ‘learning_rate_init’ as long as training loss keeps decreasing. Each time two consecutive epochs fail to decrease training loss by at least tol, or fail to increase validation score by at least tol if ‘early_stopping’ is on, the current learning rate is divided by 5.

In [6]:
learning_rate = ['constant','invscaling', 'adaptive']
for i in learning_rate:
    print("function : %s" % i)
    mlp = MLPClassifier(solver='sgd', learning_rate=i, random_state=0, max_iter=10000) 
    mlp.fit(data_train, target_train)
    print("Accuracy on training set: {:.3f}".format(mlp.score(data_train, target_train))) 
    print("Accuracy on test set: {:.3f}".format(mlp.score(data_test, target_test)))

function : constant
Accuracy on training set: 0.899
Accuracy on test set: 0.902
function : invscaling
Accuracy on training set: 0.899
Accuracy on test set: 0.902
function : adaptive
Accuracy on training set: 0.899
Accuracy on test set: 0.902


For the 'sgd' solver, the learning rate doesn't really matter.

#### Alpha

L2 penalty (regularization term) parameter (default is 0.0001).

In [7]:
alpha = [0.0001,0.001,0.1,1,10,100,1000]
for i in alpha:
    print("alpha : %s" % i)
    mlp = MLPClassifier(alpha=i, random_state=0, max_iter=10000) 
    mlp.fit(data_train, target_train)
    print("Accuracy on training set: {:.3f}".format(mlp.score(data_train, target_train))) 
    print("Accuracy on test set: {:.3f}".format(mlp.score(data_test, target_test)))

alpha : 0.0001
Accuracy on training set: 0.963
Accuracy on test set: 0.956
alpha : 0.001
Accuracy on training set: 0.937
Accuracy on test set: 0.933
alpha : 0.1
Accuracy on training set: 0.915
Accuracy on test set: 0.915
alpha : 1
Accuracy on training set: 0.899
Accuracy on test set: 0.902
alpha : 10
Accuracy on training set: 0.899
Accuracy on test set: 0.902
alpha : 100
Accuracy on training set: 0.899
Accuracy on test set: 0.902
alpha : 1000
Accuracy on training set: 0.899
Accuracy on test set: 0.902


The default value seems to provide the best result.

#### Hidden layers

This parameter allows us to set the number of layers and the number of nodes we wish to have in the Neural Network Classifier. Each element in the tuple represents the number of nodes at the ith position where i is the index of the tuple.

In [8]:
hidden_layers_size = [(50,50,50), (100,100,100), (50,100,50), (100,50,50), (50,50,100), (100,)]
for i in hidden_layers_size:
    print("layer size : %s" % (i,))
    mlp = MLPClassifier(hidden_layer_sizes=i, random_state=0, max_iter=10000) 
    mlp.fit(data_train, target_train)
    print("Accuracy on training set: {:.3f}".format(mlp.score(data_train, target_train))) 
    print("Accuracy on test set: {:.3f}".format(mlp.score(data_test, target_test)))

layer size : (50, 50, 50)
Accuracy on training set: 0.976
Accuracy on test set: 0.968
layer size : (100, 100, 100)
Accuracy on training set: 0.985
Accuracy on test set: 0.973
layer size : (50, 100, 50)
Accuracy on training set: 0.977
Accuracy on test set: 0.968
layer size : (100, 50, 50)
Accuracy on training set: 0.968
Accuracy on test set: 0.965
layer size : (50, 50, 100)
Accuracy on training set: 0.978
Accuracy on test set: 0.971
layer size : (100,)
Accuracy on training set: 0.963
Accuracy on test set: 0.956


#### Final choice

In [9]:
gt = pd.read_csv('../../dumps/various_sizes/8K.csv')
cols = [col for col in gt.columns if col not in ['label']]
data = gt[cols]
target = gt['label']
data_train, data_test, target_train, target_test = train_test_split(data,target, test_size = 0.20, random_state = 0)
scaler = Normalizer()
scaler.fit(data_train)
data_train = scaler.transform(data_train)
data_test = scaler.transform(data_test)

In [13]:
mlp = MLPClassifier(solver='adam',hidden_layer_sizes=(100, 100, 100),max_iter=10000)
mlp.fit(data_train, target_train)
print("Accuracy on training set: {:.3f}".format(mlp.score(data_train, target_train))) 
print("Accuracy on test set: {:.3f}".format(mlp.score(data_test, target_test)))

Accuracy on training set: 0.969
Accuracy on test set: 0.965
