# Analysis

### Helper functions

In [7]:
%run utils/helper_functions.ipynb

In [8]:
%run utils/MLP_utils.ipynb

### Load training data

In [6]:
# Load data
fpath = '/tmp/training.pkl'
X, y = load_pickle(fpath)

# Preview
print("Sample records:")
for r in X[:10]:
    print(r)
print("...")
print("\nSample labels")
y[:10]

Sample records:
(2, 2, 2, 0, 2, 2, 1, 3, 0)
(3, 0, 2, 0, 1, 0, 1, 1, 0)
(2, 2, 6, 0, 2, 2, 1, 2, 1)
(2, 2, 5, 4, 2, 1, 0, 5, 0)
(3, 2, 4, 4, 1, 1, 1, 3, 1)
(3, 0, 7, 0, 1, 2, 0, 3, 0)
(2, 2, 1, 0, 1, 1, 0, 3, 0)
(2, 2, 0, 0, 1, 1, 1, 4, 0)
(2, 0, 7, 2, 2, 1, 1, 3, 1)
(3, 2, 4, 0, 1, 1, 0, 2, 0)
...

Sample labels


[1, 0, 0, 1, 0, 0, 0, 0, 0, 0]

### Transform pipeline

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

steps = [
    ('scaler', StandardScaler()), # data scaling
    ('clf', MLPClassifier()) # Multilayer Perceptron
]

pipeline = Pipeline(steps)

### Hyperparameter space

In [11]:
import numpy as np
from itertools import product

max_layers = 5
max_neurons = 20
layer_space_list = []
for i in range(1, max_layers+1):
    i_layers = list(product(list(range(1, max_neurons+1)), repeat=i))
    layer_space_list.extend(i_layers) # all config: i layers
layer_space = tuple(layer_space_list)

param_grid = {
    'clf__hidden_layer_sizes' : layer_space, # (100,)
    'clf__max_iter' : (800, 1000, 1200),
    'clf__activation' : ['identity', 'logistic', 'tanh', 'relu'], # 'relu'
    'clf__solver' : ['lbfgs', 'sgd', 'adam'], #'adam'
    'clf__alpha' : np.linspace(start=0.00001, stop=0.001, num=50), #0.0001
    'clf__learning_rate' : ['constant', 'invscaling', 'adaptive'], #'constant'
    'clf__learning_rate_init' : np.linspace(start=0.0001, stop=0.01, num=50), #0.001
    'clf__momentum' : np.linspace(start=0.1, stop=1, num=10) #0.9
}

### Hyperparameter optimization (random), with KFold (K=10)

In [12]:
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_grid,
    n_iter=100, # search-iterations
    n_jobs=8, # parallel jobs
    refit=True,
    cv=10, # 10-fold cross-validation
    verbose=0,
    random_state=None
)

random_search.fit(X, y)

# .../.local/lib/python3.6/site-packages/sklearn/neural_network/multilayer_perceptron.py:564:
# ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
#   % self.max_iter, ConvergenceWarning)



RandomizedSearchCV(cv=10, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rat...=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))]),
          fit_params=None, iid=True, n_iter=100, n_jobs=8,
          param_distributions={'clf__hidden_layer_sizes': ((1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,), (9,), (10,), (11,), (12,), (13,), (14,), (15,), (16,), (17,), (18,), (19,), (20,), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), (1, 14), (1,... ,  0.01   ]), 'clf__momentum': array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ])},
          pre_dispatch='2*

### Retrieve best model

In [13]:
best_params_pipeline = random_search.best_params_
best_score_pipeline = random_search.best_score_

clf_args = {
    'activation' : best_params_pipeline['clf__activation'],
    'alpha' : best_params_pipeline['clf__alpha'],
    'hidden_layer_sizes' : best_params_pipeline['clf__hidden_layer_sizes'],
    'learning_rate' : best_params_pipeline['clf__learning_rate'],
    'learning_rate_init' : best_params_pipeline['clf__learning_rate_init'],
    'max_iter' : best_params_pipeline['clf__max_iter'],
    'max_iter' : best_params_pipeline['clf__max_iter'],
    'momentum' : best_params_pipeline['clf__momentum'],
    'solver' : best_params_pipeline['clf__solver']
}


save_json(clf_args, '/tmp/clf_args_cancer.json')
print("best params [score={}]:".format(best_score_pipeline))
pprint(clf_args)

best params [score=0.7508771929824561]:
{
  "activation": "tanh",
  "alpha": 7.061224489795919e-05,
  "hidden_layer_sizes": [
    20,
    20,
    5,
    16,
    12
  ],
  "learning_rate": "constant",
  "learning_rate_init": 0.008181632653061224,
  "max_iter": 800,
  "momentum": 1.0,
  "solver": "sgd"
}
