# Block 6 Exercise 1: Non-Linear Classification

## MNIST Data
We return to the MNIST data set on handwritten digits to compare non-linear classification algorithms ...   

In [1]:
#imports 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml

In [2]:
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)


In [3]:
#the full MNIST data set contains 70k samples of digits 0-9 as 28*28 gray scale images (represented as 784 dim vectors)
np.shape(X)

(70000, 784)

In [4]:
X.min()

0.0

In [5]:
#look at max/min value in the data
X.max()

255.0

### E1.1: Cross-Validation and Support Vector Machines
Train and optimize  C-SVM classifier on MNIST (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC)
* use a RBF kernel
* use *random search* with cross-validation to find the best settings for *gamma* and *C* (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html#sklearn.model_selection.RandomizedSearchCV)
* use max_iter in the SVM to avoid long training times

In [11]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.stats import uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn import svm

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [13]:
svm =svm.SVC(kernel='rbf', max_iter=50)

In [14]:
svm.fit(X_train, y_train)



SVC(max_iter=50)

In [19]:
#Train
svm.score(X_train, y_train)

0.8743035714285714

In [20]:
#Test
svm.score(X_test,y_test)

0.8651428571428571

In [23]:
svmCV = SVC(kernel='rbf', max_iter=50)

In [47]:
parameter = dict(gamma=[0.0000000001, 0.000000001, 0.00000001, 0.0000001, 0.000001, 0.00001,0.0001, 0.001, 0.01, 0,1], C=np.arange(0.1,10,0.1))

In [48]:
search = RandomizedSearchCV(svmCV, parameter, random_state=0)

In [52]:
result = search.fit(X_train[:800,:],y_train[:800])



In [53]:
result.best_params_

{'gamma': 1e-06, 'C': 7.0}

In [54]:
result.best_score_

0.8800000000000001

### E1.2: Pipelines and simple Neural Networks
Split the MNIST data into  train- and test-sets and then train and evaluate a simple Multi Layer Perceptron (MLP) network. Since the non-linear activation functions of MLPs are sensitive to the scaling on the input (recall the *sigmoid* function), we need to scale all input values to [0,1] 

* combine all steps of your training in a SKL pipeline (https://scikit-learn.org/stable/modules/compose.html#pipeline)
* use a SKL-scaler to scale the data (https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)
* MLP Parameters: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier
    * use a *SGD* solver
    * use *tanh* as activation function
    * compare networks with 1, 2 and 3 layers, use different numbers of neurons per layer
    * adjust training parameters *alpha* (regularization) and *learning rate* - how sensitive is the model to these parameters?
    * Hint: do not change all parameters at the same time, split into several experiments
* How hard is it to find the best parameters? How many experiments would you need to find the best parameters?
    


In [55]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Test 1 - 1 Layer and 100 units per layer

In [71]:
mlp = make_pipeline(StandardScaler(), MLPClassifier(solver='sgd',activation='tanh', hidden_layer_sizes=(100,1), alpha=0.0001, learning_rate='constant'))

In [72]:
mlp

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', hidden_layer_sizes=(100, 1),
                               solver='sgd'))])

In [73]:
mlp.fit(X_train,y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', hidden_layer_sizes=(100, 1),
                               solver='sgd'))])

In [74]:
mlp.score(X_test,y_test)

0.38242857142857145

### Test 2 - 2 Layer and 120 units per layer

In [76]:
mlp2 = make_pipeline(StandardScaler(), MLPClassifier(solver='sgd',activation='tanh', hidden_layer_sizes=(120,2), alpha=0.0001, learning_rate='constant', max_iter=400))

In [77]:
mlp2.fit(X_train,y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', hidden_layer_sizes=(120, 2),
                               max_iter=400, solver='sgd'))])

In [78]:
mlp2.score(X_test,y_test)

0.7795

### Test 3 - 3 Layer and 100 units per layer

In [80]:
mlp3 = make_pipeline(StandardScaler(), MLPClassifier(solver='sgd',activation='tanh', hidden_layer_sizes=(100,3), alpha=0.0001, learning_rate='constant', max_iter=200))

In [81]:
mlp3.fit(X_train,y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', hidden_layer_sizes=(100, 3),
                               solver='sgd'))])

In [82]:
mlp3.score(X_test,y_test)

0.9004285714285715

### Test 4 - 1 Layer and 150 units per layer

In [84]:
mlp4 = make_pipeline(StandardScaler(), MLPClassifier(solver='sgd',activation='tanh', hidden_layer_sizes=(150,1), alpha=0.0001, learning_rate='constant'))

In [85]:
mlp4.fit(X_train,y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', hidden_layer_sizes=(150, 1),
                               solver='sgd'))])

In [86]:
mlp4.score(X_test,y_test)

0.376

### Test 5 - 3 Layer and 150 units per layer

In [59]:
mlp5 = make_pipeline(StandardScaler(), MLPClassifier(solver='sgd',activation='tanh', hidden_layer_sizes=(150,3), alpha=0.0001, learning_rate='constant', max_iter=200))

In [60]:
mlp5.fit(X_train,y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', hidden_layer_sizes=(150, 3),
                               solver='sgd'))])

In [61]:
mlp5.score(X_test,y_test)

0.9003571428571429