In [1]:
# choose the optimal model for digits classification
# identify the optimal values for each model's hyperparameters

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_digits

In [2]:
# Load the digits dataset via sklearn.datasets.load_digits and list the
# attributes available on the returned object.
digits = load_digits()
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [3]:
# Put the feature matrix into a DataFrame, append the labels as a 'target'
# column in the same expression, and preview the first rows.
df1 = pd.DataFrame(digits.data).assign(target=digits.target)
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [13]:
# Candidate classifiers and the hyperparameter grid to search for each one.
# Every entry maps a short label to:
#   'model'  - an unfitted estimator instance
#   'params' - the parameter grid handed to GridSearchCV
#
# RandomForestClassifier and DecisionTreeClassifier are randomized estimators.
# They are given a fixed random_state so that re-running the notebook
# reproduces the same scores; without it their results drift between runs.
RANDOM_STATE = 42

models_parameters = {
    'random_forest' : {
        'model' : RandomForestClassifier(random_state=RANDOM_STATE),
        'params' : {
            'n_estimators' : [1,5,10]
        }
    },
    'svm' : {
        'model' : SVC(gamma='auto'),
        'params' : {
            # 20 values of C crossed with 2 kernels -> 40 candidates
            'C' : range(1,21),
            'kernel' : ['linear',  'rbf']
        }
    },
    # The remaining single-value grids perform no tuning: each model is simply
    # cross-validated once with the listed setting.
    'logistic_regression'  : {
        'model' : LogisticRegression(),
        'params' : {
            'max_iter' : [500]
        }
    },
    'gaussian_nb' : {
        'model' : GaussianNB(),
        'params' : {
            'var_smoothing' : [1e-8]
        }
    },
    'multinomial_nb' : {
        'model' : MultinomialNB(),
        'params' : {
            'alpha' : [1.0]
        }
    },
    'decision_tree' : {
        'model' : DecisionTreeClassifier(random_state=RANDOM_STATE),
        'params' : {
            'criterion' : ['gini', 'entropy']
        }
    }
}

In [14]:
# Grid-search every candidate in `models_parameters` on the full digits data
# and keep one summary record (label, best CV score, best parameters) per model.
# `cv` is not passed, so GridSearchCV's default splitting is used.
pooled_scores = []

for model_name, spec in models_parameters.items():
    search = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        n_jobs=5,
        return_train_score=False,
    )
    search.fit(digits.data, digits.target)

    summary = dict(
        model=model_name,
        best_score=search.best_score_,
        best_parameters=search.best_params_,
    )
    pooled_scores.append(summary)

In [15]:
# Show the raw list of per-model result records gathered by the grid searches.
pooled_scores

[{'model': 'random_forest',
  'best_score': np.float64(0.9065289384091612),
  'best_parameters': {'n_estimators': 10}},
 {'model': 'svm',
  'best_score': np.float64(0.9476973073351903),
  'best_parameters': {'C': 1, 'kernel': 'linear'}},
 {'model': 'logistic_regression',
  'best_score': np.float64(0.9148777468276075),
  'best_parameters': {'max_iter': 500}},
 {'model': 'gaussian_nb',
  'best_score': np.float64(0.8197229959764778),
  'best_parameters': {'var_smoothing': 1e-08}},
 {'model': 'multinomial_nb',
  'best_score': np.float64(0.8703497369235531),
  'best_parameters': {'alpha': 1.0}},
 {'model': 'decision_tree',
  'best_score': np.float64(0.8052692664809656),
  'best_parameters': {'criterion': 'entropy'}}]

In [18]:
# Tabulate the per-model results: one row per model, with columns in a fixed order.
df2 = pd.DataFrame(
    data=pooled_scores,
    columns=['model', 'best_score', 'best_parameters'],
)
df2

Unnamed: 0,model,best_score,best_parameters
0,random_forest,0.906529,{'n_estimators': 10}
1,svm,0.947697,"{'C': 1, 'kernel': 'linear'}"
2,logistic_regression,0.914878,{'max_iter': 500}
3,gaussian_nb,0.819723,{'var_smoothing': 1e-08}
4,multinomial_nb,0.87035,{'alpha': 1.0}
5,decision_tree,0.805269,{'criterion': 'entropy'}


In [None]:
# SVM (linear kernel, C=1) achieves the highest cross-validated score (~0.948) of all the models compared.