## Import library and prepare dataset

In [72]:
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
feature = pd.DataFrame(datasets.load_iris()['data'],columns=['sepal_length','sepal_width','petal_length','petal_width'])
target = pd.DataFrame(datasets.load_iris()['target'],columns=['target'])
dataset = pd.concat([feature,target],axis=1)
dataset

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


## Describe the statistical data 

In [55]:
dataset.describe()
#notice that all the measurements are in cm

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


## Split dataset
we split into 20% test set and 80% training set. We will also introduce a function that will print the full results of the training

In [56]:
X_train, X_test, y_train, y_test = train_test_split(dataset[['sepal_length','sepal_width','petal_length','petal_width']], dataset['target'], test_size=0.2, train_size = 0.8,shuffle=True)

In [68]:
def print_training(model):
    print('BEST PARAMETERS: {}\n'.format(model.best_params_))
    means = model.cv_results_['mean_test_score']
    stds = model.cv_results_['std_test_score']
    for mean, std, params in zip(means,stds,model.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean,3),round(std*2,3),params))

## Models to Consider
We will do K cross validation for the following models and test them to select the best model
1. KNN
2. Logistic Regression
3. SVM
4. Random Forest
5. Multi-layer Perceptron

# K-Nearest Neighbour (KNN)

In [69]:
from sklearn.neighbors import KNeighborsClassifier

In [70]:
model_knn = KNeighborsClassifier()
parameters_knn = {
    'n_neighbors' : [3,5,8,10,15],
    'p' : [1,2,4]
}
knn_cv = GridSearchCV(model_knn, parameters_knn,cv=5)
knn_cv.fit(X_train,y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [3, 5, 8, 10, 15], 'p': [1, 2, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [71]:
print_training(knn_cv)

BEST PARAMETERS: {'n_neighbors': 3, 'p': 4}

0.967 (+/-0.06) for {'n_neighbors': 3, 'p': 1}
0.975 (+/-0.065) for {'n_neighbors': 3, 'p': 2}
0.983 (+/-0.065) for {'n_neighbors': 3, 'p': 4}
0.975 (+/-0.065) for {'n_neighbors': 5, 'p': 1}
0.967 (+/-0.06) for {'n_neighbors': 5, 'p': 2}
0.975 (+/-0.065) for {'n_neighbors': 5, 'p': 4}
0.95 (+/-0.031) for {'n_neighbors': 8, 'p': 1}
0.975 (+/-0.04) for {'n_neighbors': 8, 'p': 2}
0.983 (+/-0.04) for {'n_neighbors': 8, 'p': 4}
0.975 (+/-0.04) for {'n_neighbors': 10, 'p': 1}
0.975 (+/-0.042) for {'n_neighbors': 10, 'p': 2}
0.975 (+/-0.042) for {'n_neighbors': 10, 'p': 4}
0.975 (+/-0.04) for {'n_neighbors': 15, 'p': 1}
0.958 (+/-0.05) for {'n_neighbors': 15, 'p': 2}
0.95 (+/-0.078) for {'n_neighbors': 15, 'p': 4}


In [124]:
final_knn = knn_cv.best_estimator_

# Logistic Regression

In [79]:
from sklearn.linear_model import LogisticRegression

In [91]:
model_lr = LogisticRegression()
params_lr = {
    'C':[0.001,0.01,0.1,1,10,100,1000]
}
#higher C means lower regularization -> more likely to overfit
lr_cv = GridSearchCV(model_lr,params_lr,cv = 5)
lr_cv.fit(X_train,y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [92]:
print_training(lr_cv)

BEST PARAMETERS: {'C': 10}

0.367 (+/-0.014) for {'C': 0.001}
0.692 (+/-0.027) for {'C': 0.01}
0.725 (+/-0.074) for {'C': 0.1}
0.933 (+/-0.12) for {'C': 1}
0.983 (+/-0.039) for {'C': 10}
0.983 (+/-0.039) for {'C': 100}
0.975 (+/-0.065) for {'C': 1000}


In [120]:
lr_cv.best_estimator_.coef_ # num row corresponds to the class and cols to the preds

array([[ 0.63824704,  2.1484675 , -3.37489549, -1.56508602],
       [ 0.22776199, -2.27077952,  0.81452558, -2.01553158],
       [-3.3400591 , -3.06987745,  5.15514313,  4.78977941]])

In [125]:
final_log = lr_cv.best_estimator_

# SVM

In [93]:
from sklearn.svm import SVC

In [99]:
model_svc = SVC()
params_svc = {
    'C' : [0.01,0.1,1,10,100],
    'kernel':['rbf','linear']
}
svc_cv = GridSearchCV(model_svc,params_svc,cv=5)
svc_cv.fit(X_train,y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.01, 0.1, 1, 10, 100], 'kernel': ['rbf', 'linear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [100]:
print_training(svc_cv)

BEST PARAMETERS: {'C': 1, 'kernel': 'linear'}

0.367 (+/-0.014) for {'C': 0.01, 'kernel': 'rbf'}
0.833 (+/-0.151) for {'C': 0.01, 'kernel': 'linear'}
0.942 (+/-0.11) for {'C': 0.1, 'kernel': 'rbf'}
0.975 (+/-0.065) for {'C': 0.1, 'kernel': 'linear'}
0.975 (+/-0.065) for {'C': 1, 'kernel': 'rbf'}
0.992 (+/-0.032) for {'C': 1, 'kernel': 'linear'}
0.975 (+/-0.065) for {'C': 10, 'kernel': 'rbf'}
0.975 (+/-0.065) for {'C': 10, 'kernel': 'linear'}
0.958 (+/-0.05) for {'C': 100, 'kernel': 'rbf'}
0.958 (+/-0.05) for {'C': 100, 'kernel': 'linear'}


In [126]:
final_svm = svc_cv.best_estimator_

# Random Forest

In [101]:
from sklearn.ensemble import RandomForestClassifier

In [104]:
rf_model = RandomForestClassifier()
params_rf = {
    'min_samples_split':[2,4,6,8],
    'max_depth':[None,3,5,7,10]
}
rf_cv = GridSearchCV(rf_model,params_rf,cv=5)
rf_cv.fit(X_train,y_train)







GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'min_samples_split': [2, 4, 6, 8], 'max_depth': [None, 3, 5, 7, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [105]:
print_training(rf_cv)

BEST PARAMETERS: {'max_depth': None, 'min_samples_split': 8}

0.958 (+/-0.05) for {'max_depth': None, 'min_samples_split': 2}
0.967 (+/-0.061) for {'max_depth': None, 'min_samples_split': 4}
0.958 (+/-0.05) for {'max_depth': None, 'min_samples_split': 6}
0.975 (+/-0.065) for {'max_depth': None, 'min_samples_split': 8}
0.967 (+/-0.061) for {'max_depth': 3, 'min_samples_split': 2}
0.95 (+/-0.031) for {'max_depth': 3, 'min_samples_split': 4}
0.958 (+/-0.089) for {'max_depth': 3, 'min_samples_split': 6}
0.975 (+/-0.065) for {'max_depth': 3, 'min_samples_split': 8}
0.967 (+/-0.032) for {'max_depth': 5, 'min_samples_split': 2}
0.975 (+/-0.04) for {'max_depth': 5, 'min_samples_split': 4}
0.975 (+/-0.039) for {'max_depth': 5, 'min_samples_split': 6}
0.958 (+/-0.05) for {'max_depth': 5, 'min_samples_split': 8}
0.967 (+/-0.061) for {'max_depth': 7, 'min_samples_split': 2}
0.967 (+/-0.061) for {'max_depth': 7, 'min_samples_split': 4}
0.967 (+/-0.061) for {'max_depth': 7, 'min_samples_split': 6}
0

In [127]:
final_rf = rf_cv.best_estimator_

# Multi-layer Perceptron

In [128]:
from sklearn.neural_network import MLPClassifier

In [131]:
mlp_model = MLPClassifier()
params_mlp = {
    'activation':['relu','logistic','tanh'],
    'alpha':[0.0001,0.001,0.01,0.1], #this is the regularization
    'hidden_layer_sizes':[(10,),(50,),(100,),(200,),(500,)],
    'solver':['adam','sgd'],
    'learning_rate' : ['constant', 'invscaling', 'adaptive']
}
mlp_cv = GridSearchCV(mlp_model,params_mlp,cv=5)
mlp_cv.fit(X_train,y_train)







































































































GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'activation': ['relu', 'logistic', 'tanh'], 'alpha': [0.0001, 0.001, 0.01, 0.1], 'hidden_layer_sizes': [(10,), (50,), (100,), (200,), (500,)], 'solver': ['adam', 'sgd'], 'learning_rate': ['constant', 'invscaling', 'adaptive']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [133]:
print_training(mlp_cv)

BEST PARAMETERS: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}

0.642 (+/-0.249) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate': 'constant', 'solver': 'adam'}
0.833 (+/-0.152) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate': 'constant', 'solver': 'sgd'}
0.717 (+/-0.119) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling', 'solver': 'adam'}
0.4 (+/-0.428) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate': 'invscaling', 'solver': 'sgd'}
0.575 (+/-0.308) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate': 'adaptive', 'solver': 'adam'}
0.8 (+/-0.144) for {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'learning_rate': 'adaptive', 'solver': 'sgd'}
0.917 (+/-0.133) for {'activation': 'relu', 'a

In [134]:
final_mlp = mlp_cv.best_estimator_

# Model Evaluation

In [138]:
def evaluate(models,X_test,Y_test):
    result = {}
    for model in models:
        res = np.sum(model.predict(X_test)==y_test)/len(y_test)
        result[model] = res
    return result
        

In [143]:
result = evaluate([final_knn,final_log,final_svm,final_rf,final_mlp],X_test,y_test)

In [156]:
max_val = max(result.values())
for i in result.keys():
    if result[i] == max_val:
        print(i,max_val)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False) 0.9666666666666667
MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False) 0.9666666666666667


### Therefore, in our test cases, Multi Layer Perceptron and SVM perform the best among these 5 algorithms