# Exercise: Machine Learning Finding Optimal Model and Hyperparameters

For the digits dataset in `sklearn.datasets`, try the following classifiers and find the one that gives the best performance. Also find the optimal parameters for that classifier.

In [3]:
import pandas as pd

from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Seed passed to every estimator below that draws random numbers while fitting,
# so that repeated fits on the same data produce the same model.
RANDOM_STATE = 42

# Candidate classifiers and the hyperparameter values to try for each one.
# Each entry maps a short model name to:
#   'model'  - an unfitted estimator instance
#   'params' - dict of parameter name -> list of candidate values
#              (an empty dict means "evaluate the estimator as configured")
model_params = {
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        },
    },
    'random_forest': {
        # Bootstrap sampling and feature selection are random; seed them.
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_reg': {
        # liblinear is used because it supports both the 'l1' and 'l2' penalties
        # searched below; it shuffles the data internally, hence the seed.
        # NOTE(review): `multi_class` is reported as deprecated by recent
        # scikit-learn releases - confirm against the installed version.
        'model': LogisticRegression(
            solver='liblinear',
            multi_class='auto',
            random_state=RANDOM_STATE,
        ),
        'params': {
            'C': [1, 5, 10],
            'penalty': ['l1', 'l2']
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        # Features are permuted at each split (and splitter='random' is one of
        # the candidates), so the tree needs a seed to be repeatable.
        'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'params': {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random']
        }
    }
}

In [11]:
from sklearn.model_selection import GridSearchCV

digits = load_digits()

# One summary record per candidate model, in the order they appear in model_params.
scores = []
for model_name, mp in model_params.items():
    # Exhaustive, 5-fold cross-validated search over this model's parameter grid.
    clf = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=5,
        return_train_score=False,
    )
    clf.fit(digits.data, digits.target)

    # Keep only the winning configuration and its mean cross-validation score.
    scores.append(dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    ))

scores

[{'model': 'svm',
  'best_score': 0.9476973073351903,
  'best_params': {'C': 1, 'kernel': 'linear'}},
 {'model': 'random_forest',
  'best_score': 0.9037542556484061,
  'best_params': {'n_estimators': 10}},
 {'model': 'logistic_reg',
  'best_score': 0.9276787372330547,
  'best_params': {'C': 1, 'penalty': 'l1'}},
 {'model': 'naive_bayes_gaussian',
  'best_score': 0.8069281956050759,
  'best_params': {}},
 {'model': 'naive_bayes_multinomial',
  'best_score': 0.8703497369235531,
  'best_params': {}},
 {'model': 'decision_tree',
  'best_score': 0.8141488703187868,
  'best_params': {'criterion': 'entropy', 'splitter': 'best'}}]

In [12]:
# Tabulate the grid-search results, one row per model, with a fixed column order.
summary_columns = ['model', 'best_score', 'best_params']
df = pd.DataFrame(data=scores, columns=summary_columns)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.947697,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.903754,{'n_estimators': 10}
2,logistic_reg,0.927679,"{'C': 1, 'penalty': 'l1'}"
3,naive_bayes_gaussian,0.806928,{}
4,naive_bayes_multinomial,0.87035,{}
5,decision_tree,0.814149,"{'criterion': 'entropy', 'splitter': 'best'}"


The winner is SVM (C=1, kernel=linear) with a 94.7% score.

We can also use RandomizedSearchCV. Let's give it a try:

In [15]:
from sklearn.model_selection import ParameterGrid, RandomizedSearchCV

# Maximum number of parameter combinations to sample per model, and the seed
# that fixes which combinations get sampled from one run to the next.
N_ITER = 2
SEARCH_SEED = 42

scores = []

for model_name, mp in model_params.items():
    # Every search space here is a dict of lists, so it has a finite size.
    # Capping n_iter at that size avoids the UserWarning scikit-learn emits
    # when asked for more candidates than exist (the naive Bayes entries have
    # an empty space, i.e. exactly one candidate).
    n_candidates = len(ParameterGrid(mp['params']))

    clf = RandomizedSearchCV(
        mp['model'],
        mp['params'],
        cv=5,
        return_train_score=False,
        n_iter=min(N_ITER, n_candidates),
        random_state=SEARCH_SEED,
    )
    clf.fit(digits.data, digits.target)

    # Record the best sampled configuration and its mean cross-validation score.
    scores.append(
        {
            'model': model_name,
            'best_score': clf.best_score_,
            'best_params': clf.best_params_
        }
    )



In [16]:
# Tabulate the randomized-search results, one row per model, with a fixed column order.
summary_columns = ['model', 'best_score', 'best_params']
df = pd.DataFrame(data=scores, columns=summary_columns)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.947697,"{'kernel': 'linear', 'C': 10}"
1,random_forest,0.899864,{'n_estimators': 10}
2,logistic_reg,0.91822,"{'penalty': 'l2', 'C': 5}"
3,naive_bayes_gaussian,0.806928,{}
4,naive_bayes_multinomial,0.87035,{}
5,decision_tree,0.795215,"{'splitter': 'random', 'criterion': 'entropy'}"
