In [1]:
import numpy as np
np.random.seed(42)
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

#### HELPER

In [2]:
def print_grid_cv_results(grid_result):
    """Print a text summary of a fitted grid search.

    The first printed line reports ``best_score_`` and ``best_params_``.
    After that, one line is printed per entry of ``cv_results_``: the mean
    test score rounded to 4 decimals, twice the (rounded) standard deviation
    of the test score, and the parameter combination that produced them.

    Args:
        grid_result: Object exposing ``best_score_``, ``best_params_`` and a
            ``cv_results_`` mapping with the keys ``"mean_test_score"``,
            ``"std_test_score"`` and ``"params"``.

    Returns:
        None. Everything is written to standard output.
    """
    best_line = (
        f"Best model score: {grid_result.best_score_} "
        f"Best model params: {grid_result.best_params_} "
    )
    print(best_line)

    results = grid_result.cv_results_
    rows = zip(
        results["mean_test_score"],
        results["std_test_score"],
        results["params"],
    )
    for avg_score, score_std, candidate in rows:
        # The spread shown is 2x the standard deviation, rounded before doubling.
        print(
            f"{round(avg_score, 4)} (+/- {2 * round(score_std, 4)}) "
            f"with: {candidate}"
        )

#### LOAD DATASET

In [3]:
# Despite the variable name, load_digits() returns scikit-learn's bundled
# handwritten-digits dataset (small 8x8 images), not the full MNIST set.
# The name `mnist` is kept so other cells that reference it keep working.
mnist = load_digits()
x = mnist.data      # feature matrix, one row per sample
y = mnist.target    # class label per sample

# Hold out 30% of the samples for the final evaluation. The seed is pinned on
# the call itself so that the split no longer depends on how much of the
# global NumPy RNG was consumed before this cell ran (re-running the cell or
# executing cells out of order would otherwise produce a different split).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

#### NORMALIZE DATASET

In [4]:
# Fit the scaler on the training split only, so no statistics from the
# held-out test data are used; fit() returns the fitted scaler itself.
scaler = StandardScaler().fit(x_train)

# Apply the training-set statistics to both splits.
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

#### KNN CLASSIFICATION

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Candidate settings: even neighbour counts 2..20, with both weighting schemes.
params = dict(
    n_neighbors=list(range(2, 22, 2)),
    weights=["uniform", "distance"],
)

clf = KNeighborsClassifier()

# Exhaustive search over the grid using 3-fold cross-validation on the
# training split; fit() returns the fitted search object.
grid = GridSearchCV(estimator=clf, param_grid=params, cv=3)
grid_result = grid.fit(x_train, y_train)

print_grid_cv_results(grid_result)

Best model score: 0.9697692919649961 Best model params: {'n_neighbors': 2, 'weights': 'distance'} 
0.9547 (+/- 0.0156) with: {'n_neighbors': 2, 'weights': 'uniform'}
0.9698 (+/- 0.0022) with: {'n_neighbors': 2, 'weights': 'distance'}
0.9578 (+/- 0.009) with: {'n_neighbors': 4, 'weights': 'uniform'}
0.9682 (+/- 0.0082) with: {'n_neighbors': 4, 'weights': 'distance'}
0.9626 (+/- 0.0022) with: {'n_neighbors': 6, 'weights': 'uniform'}
0.9682 (+/- 0.0022) with: {'n_neighbors': 6, 'weights': 'distance'}
0.9586 (+/- 0.0098) with: {'n_neighbors': 8, 'weights': 'uniform'}
0.9682 (+/- 0.0082) with: {'n_neighbors': 8, 'weights': 'distance'}
0.9547 (+/- 0.0104) with: {'n_neighbors': 10, 'weights': 'uniform'}
0.9674 (+/- 0.0126) with: {'n_neighbors': 10, 'weights': 'distance'}
0.9499 (+/- 0.0038) with: {'n_neighbors': 12, 'weights': 'uniform'}
0.9602 (+/- 0.0098) with: {'n_neighbors': 12, 'weights': 'distance'}
0.9499 (+/- 0.017) with: {'n_neighbors': 14, 'weights': 'uniform'}
0.9539 (+/- 0.0158) w

#### RANDOM FOREST CLASSIFICATION

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Candidate settings for the forest: 200..450 trees in steps of 50, a few
# depth caps plus unlimited depth (None), both split criteria, and two values
# each for the minimum samples needed to split a node / form a leaf.
params = {
    "n_estimators": list(range(200, 500, 50)),
    "max_depth": list(range(20, 51, 10)) + [None],
    "criterion": ["gini", "entropy"],
    "min_samples_split": [2, 4],
    "min_samples_leaf": [1, 2]
}

# Pin the estimator's seed explicitly. Random forests are stochastic, and the
# fits below run in parallel workers (n_jobs=-1), so relying on the global
# np.random.seed(...) call is not a dependable way to make the reported
# scores repeatable. GridSearchCV clones the estimator, seed included, for
# every candidate/fold.
clf = RandomForestClassifier(random_state=42)

# 3-fold cross-validated exhaustive search using all available cores.
grid = GridSearchCV(clf, params, cv=3, n_jobs=-1)
grid_result = grid.fit(x_train, y_train)

print_grid_cv_results(grid_result)

Best model score: 0.9721559268098647 Best model params: {'criterion': 'gini', 'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300} 
0.9698 (+/- 0.009) with: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
0.969 (+/- 0.014) with: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 250}
0.9674 (+/- 0.0158) with: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
0.9682 (+/- 0.012) with: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 350}
0.9666 (+/- 0.0078) with: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
0.969 (+/- 0.0068) with: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 450}
0.965 (+/- 0.006) with: {'criterion': 'gini

#### GRADIENT BOOSTING CLASSIFICATION

In [7]:
from sklearn.ensemble import GradientBoostingClassifier

# Candidate settings: 200..450 boosting stages in steps of 50, and tree depth
# caps of 20..50 plus unlimited depth (None).
# NOTE(review): these depths are far above GradientBoostingClassifier's
# default max_depth of 3, and the recorded CV scores (~0.86) trail the other
# models in this notebook -- consider trying a shallower grid.
params = {
    "n_estimators": list(range(200, 500, 50)),
    "max_depth": list(range(20, 51, 10)) + [None]
}

# Pin the estimator's seed explicitly. The fits below run in parallel workers
# (n_jobs=-1), so the global np.random.seed(...) call is not a dependable way
# to make the reported scores repeatable. GridSearchCV clones the estimator,
# seed included, for every candidate/fold.
clf = GradientBoostingClassifier(random_state=42)

# 3-fold cross-validated exhaustive search using all available cores.
grid = GridSearchCV(clf, params, cv=3, n_jobs=-1)
grid_result = grid.fit(x_train, y_train)

print_grid_cv_results(grid_result)

Best model score: 0.8631662688941925 Best model params: {'max_depth': 40, 'n_estimators': 350} 
0.8536 (+/- 0.025) with: {'max_depth': 20, 'n_estimators': 200}
0.8568 (+/- 0.0256) with: {'max_depth': 20, 'n_estimators': 250}
0.8592 (+/- 0.0312) with: {'max_depth': 20, 'n_estimators': 300}
0.8624 (+/- 0.0316) with: {'max_depth': 20, 'n_estimators': 350}
0.8592 (+/- 0.0312) with: {'max_depth': 20, 'n_estimators': 400}
0.86 (+/- 0.0332) with: {'max_depth': 20, 'n_estimators': 450}
0.8544 (+/- 0.031) with: {'max_depth': 30, 'n_estimators': 200}
0.8576 (+/- 0.0276) with: {'max_depth': 30, 'n_estimators': 250}
0.8568 (+/- 0.0372) with: {'max_depth': 30, 'n_estimators': 300}
0.8608 (+/- 0.0312) with: {'max_depth': 30, 'n_estimators': 350}
0.8608 (+/- 0.0332) with: {'max_depth': 30, 'n_estimators': 400}
0.8592 (+/- 0.0312) with: {'max_depth': 30, 'n_estimators': 450}
0.8512 (+/- 0.0312) with: {'max_depth': 40, 'n_estimators': 200}
0.8568 (+/- 0.0256) with: {'max_depth': 40, 'n_estimators': 250

#### SVM CLASSIFICATION

In [8]:
from sklearn.svm import SVC

# Only the kernel is searched; every other SVC setting stays at its default.
params = dict(kernel=["linear", "sigmoid", "rbf", "poly"])

clf = SVC()

# 3-fold cross-validated exhaustive search using all available cores;
# fit() returns the fitted search object.
grid = GridSearchCV(estimator=clf, param_grid=params, cv=3, n_jobs=-1)
grid_result = grid.fit(x_train, y_train)

print_grid_cv_results(grid_result)

Best model score: 0.9737470167064438 Best model params: {'kernel': 'rbf'} 
0.9674 (+/- 0.0148) with: {'kernel': 'linear'}
0.9475 (+/- 0.0104) with: {'kernel': 'sigmoid'}
0.9737 (+/- 0.0104) with: {'kernel': 'rbf'}
0.9109 (+/- 0.0158) with: {'kernel': 'poly'}


#### BEST MODEL

In [6]:
from sklearn.svm import SVC

# Winning configuration, hard-coded from the SVM grid-search output.
best_classifier = SVC
best_params = dict(kernel="rbf")

# Refit on the whole training split (fit() returns the estimator itself),
# then predict the held-out test split.
# NOTE(review): `regr` holds a classifier, not a regressor; the name is kept
# unchanged in case other cells reference it.
regr = best_classifier(**best_params).fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Test-set metrics: overall accuracy and the per-class confusion matrix
# (rows are true labels, columns are predicted labels).
acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {acc}", f"Confusion matrix:\n{cm}", sep="\n")

Accuracy: 0.9796296296296296
Confusion matrix:
[[53  0  0  0  0  0  0  0  0  0]
 [ 0 50  0  0  0  0  0  0  0  0]
 [ 0  0 47  0  0  0  0  0  0  0]
 [ 0  0  2 51  0  1  0  0  0  0]
 [ 0  0  0  0 60  0  0  0  0  0]
 [ 0  0  0  0  0 66  0  0  0  0]
 [ 0  0  0  0  0  0 53  0  0  0]
 [ 0  0  0  0  1  0  0 53  0  1]
 [ 0  0  1  1  0  0  0  0 41  0]
 [ 0  0  0  0  0  1  1  0  2 55]]
