# GridSearchCV on multiclass datasets

### Some algorithms may already inherently be usable by GridSearchCV on multiclass datasets. Others may need to declare explicitly using either 'ovr' or 'ovo' first, before you can use GridSearchCV.

Please check here for more details: [Multiclass algorithms (scikit-learn)](https://scikit-learn.org/stable/modules/multiclass.html)

In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

SEED = 0

In [2]:
# Load multiclass Iris dataset from scikit-learn
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

### Which scoring method to use with GridSearchCV on multiclass datasets

When identifying the relevant scoring method for GridSearchCV, choose one that can work with multiclass datasets. If F1 score is preferred, then choose between: f1_micro, f1_macro, f1_weighted. If roc_auc is preferred, then choose between any of the roc_auc methods with 'ovr' or 'ovo'. If 'None' is chosen, the cross-validation will default to the scoring method of selected estimator/classifier, which usually is the model accuracy score.

Please check here for more scoring methods: [Metrics and Scoring (scikit-learn)](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter)

In [3]:
# Hyperparameter fine-tuning for Logistic Regression on multi-class dataset
parameters = {'penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'C': np.logspace(-2, 2, 5)}

gs_clf = GridSearchCV(LogisticRegression(multi_class='ovr'),
                      parameters, 
                      cv=5,
                      # scoring='f1_macro',
                      scoring='roc_auc_ovr',
                      n_jobs=-1)
_ = gs_clf.fit(X_train, y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)

LogisticRegression(C=10.0, multi_class='ovr')
{'C': 10.0, 'penalty': 'l2'}
0.9921031746031748


In [4]:
# Hyperparameter fine-tuning for K-nearest Neigobors on multi-class dataset
parameters = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}

gs_clf = GridSearchCV(KNeighborsClassifier(),
                      parameters, 
                      cv=5,
                      # scoring='f1_macro',
                      scoring='roc_auc_ovr',
                      n_jobs=-1)
_ = gs_clf.fit(X_train, y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)

KNeighborsClassifier(n_neighbors=11)
{'n_neighbors': 11}
0.9946031746031746


In [5]:
# Hyperparameter fine-tuning for GassianNB on multi-class dataset
parameters = {'var_smoothing': np.logspace(0,-9, num=100)}

gs_clf = GridSearchCV(GaussianNB(),
                      parameters, 
                      cv=5,
                      # scoring='f1_macro',
                      scoring='roc_auc_ovr',
                      n_jobs=-1)
_ = gs_clf.fit(X_train, y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)

GaussianNB(var_smoothing=0.006579332246575682)
{'var_smoothing': 0.006579332246575682}
0.9933333333333334


In [6]:
# Hyperparameter fine-tuning for LinearSVC on multi-class dataset
parameters = {'penalty': ['l1', 'l2'],
              'C': np.logspace(-2, 2, 5),
              'max_iter': list(range(500, 3000, 500))}

gs_clf = GridSearchCV(LinearSVC(multi_class='ovr'),
                      parameters, 
                      cv=5,
                      # scoring='f1_macro',
                      # scoring='roc_auc_ovr',
                      n_jobs=-1)
_ = gs_clf.fit(X_train, y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)

LinearSVC(C=10.0, max_iter=1500)
{'C': 10.0, 'max_iter': 1500, 'penalty': 'l2'}
0.9557312252964426




In [7]:
# Hyperparameter fine-tuning for Decision Tree on multi-class dataset
parameters = {'criterion': ['gini', 'entropy'],
              'max_depth': [5, 10, 15, 20],
              'min_samples_split': [2, 3, 5]}

gs_clf = GridSearchCV(DecisionTreeClassifier(),
                      parameters, 
                      cv=5,
                      # scoring='f1_macro',
                      scoring='roc_auc_ovr',
                      n_jobs=-1)
_ = gs_clf.fit(X_train, y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)

DecisionTreeClassifier(criterion='entropy', max_depth=10, min_samples_split=5)
{'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5}
0.9634325396825398


In [8]:
# Hyperparameter fine-tuning for Random Forest on multi-class dataset
parameters = {'n_estimators': [50, 100, 150, 250],
              'criterion': ['gini', 'entropy'],
              'max_depth': [5, 10, 15, 20],
              'min_samples_split': [2, 3, 5]}

gs_clf = GridSearchCV(RandomForestClassifier(),
                      parameters, 
                      cv=5,
                      # scoring='f1_macro',
                      scoring='roc_auc_ovr',
                      n_jobs=-1)
_ = gs_clf.fit(X_train, y_train)

print(gs_clf.best_estimator_)
print(gs_clf.best_params_)
print(gs_clf.best_score_)

RandomForestClassifier(max_depth=15, min_samples_split=5, n_estimators=150)
{'criterion': 'gini', 'max_depth': 15, 'min_samples_split': 5, 'n_estimators': 150}
0.9915476190476191
