In [1]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import time


In [2]:
# Fetch the 'mnist_784' dataset (version 1) from OpenML.
mnist = fetch_openml(name='mnist_784', version=1, parser='auto')
X = mnist.data
y = mnist.target.astype(int)  # cast the targets to int so they can be used as class labels

# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# 1. Random Forest
# Fit a 100-tree forest (seed 42) on the training split, predict the test
# split, and report the elapsed time. The timing covers fit + predict +
# accuracy scoring together, not training alone.

# perf_counter() is the clock intended for measuring elapsed durations;
# time.time() is wall-clock time and can jump if the system clock is adjusted
# while a long cell is running.
start_time = time.perf_counter()

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

# Elapsed seconds (rounded to 2 d.p.), then split into whole minutes/seconds.
runtime = round(time.perf_counter() - start_time, 2)
minutes, seconds = divmod(int(runtime), 60)

print(f"Runtime is: {minutes} mins {seconds} secs ")

Runtime is: 1 mins 47 secs 


In [7]:
# Report the Random Forest test accuracy (rounded to 2 d.p.) followed by
# the confusion matrix of true vs. predicted test labels.
rf_cm = confusion_matrix(y_true=y_test, y_pred=rf_pred)
print("Random Forest Accuracy:", round(rf_accuracy, 2))
print(rf_cm)

Random Forest Accuracy: 0.97
[[1325    0    4    0    1    1    3    1    6    2]
 [   0 1573    6    8    2    0    0    6    3    2]
 [   4    5 1334    4    5    1    9    8    8    2]
 [   1    0   23 1366    0    9    0   14   12    8]
 [   4    1    3    0 1257    0    2    3    3   22]
 [   1    3    3   17    4 1226    8    1    8    2]
 [   4    1    0    0    6   10 1372    0    3    0]
 [   3    5   15    0    7    1    0 1455    2   15]
 [   1    6    9   15    6   11    5    7 1290    7]
 [   4    6    5   16   20    5    1   10    9 1344]]


In [8]:
# 2. Gradient Boosting Machines (GBM)
# Fit a 100-stage gradient boosting classifier (seed 42) on the training
# split, predict the test split, and report the elapsed time. The timing
# covers fit + predict + accuracy scoring together.
# NOTE(review): this cell is very slow on the full training split (a recorded
# run took over an hour); consider a faster estimator or a subsample when
# iterating.

# perf_counter() is the clock intended for measuring elapsed durations;
# time.time() is wall-clock time and can jump if the system clock is adjusted
# during a run this long.
start_time = time.perf_counter()

gbm_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbm_model.fit(X_train, y_train)
gbm_pred = gbm_model.predict(X_test)
gbm_accuracy = accuracy_score(y_test, gbm_pred)

# Elapsed seconds (rounded to 2 d.p.), then split into whole minutes/seconds.
runtime = round(time.perf_counter() - start_time, 2)
minutes, seconds = divmod(int(runtime), 60)

print(f"Runtime is: {minutes} mins {seconds} secs ")

Runtime is: 73 mins 8 secs 


In [10]:
# Report the GBM test accuracy (rounded to 2 d.p.); the confusion matrix of
# true vs. predicted test labels is left as the cell's last expression so the
# notebook displays it.
gbm_cm = confusion_matrix(y_true=y_test, y_pred=gbm_pred)
print("Gradient Boosting Machines Accuracy:", round(gbm_accuracy, 2))
gbm_cm

Gradient Boosting Machines Accuracy: 0.95


array([[1314,    1,    3,    2,    2,    2,    6,    1,   12,    0],
       [   0, 1572,    5,    5,    4,    3,    0,    5,    4,    2],
       [   4,    7, 1304,    8,   11,    1,   10,   10,   19,    6],
       [   2,    8,   21, 1317,    1,   29,    1,   16,   17,   21],
       [   2,    1,    7,    3, 1227,    1,    5,    4,    3,   42],
       [   6,    7,    5,   29,    8, 1183,   10,    1,   18,    6],
       [   4,    3,    5,    0,   14,   17, 1344,    0,    9,    0],
       [   6,    8,   16,    4,    8,    3,    0, 1418,    3,   37],
       [   4,    9,   10,   22,    8,   18,    7,    7, 1253,   19],
       [   7,    9,    4,   19,   25,    7,    1,   34,   10, 1304]],
      dtype=int64)

In [None]:
# 3. Support Vector Machines (SVM)
# Fit a linear-kernel SVC on the training split, predict the test split, and
# report the elapsed time. The timing covers fit + predict + accuracy scoring
# together.
# NOTE(review): per the scikit-learn SVC docs, random_state only affects the
# probability estimates and is ignored when probability=False (the default),
# so it presumably has no effect here — confirm before relying on it.

# perf_counter() is the clock intended for measuring elapsed durations;
# time.time() is wall-clock time and can jump if the system clock is adjusted
# while the cell is running.
start_time = time.perf_counter()

svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)

# Elapsed seconds (rounded to 2 d.p.), then split into whole minutes/seconds.
runtime = round(time.perf_counter() - start_time, 2)
minutes, seconds = divmod(int(runtime), 60)

print(f"Runtime is: {minutes} mins {seconds} secs ")

In [None]:
# Report the SVM test accuracy (rounded to 2 d.p.); the confusion matrix of
# true vs. predicted test labels is left as the cell's last expression so the
# notebook displays it.
# Fix: the original called `rond(...)`, an undefined name that raises NameError;
# the built-in is `round`.
print("Support Vector Machines Accuracy:", round(svm_accuracy, 2))
svm_cm = confusion_matrix(y_test, svm_pred)
svm_cm

In [None]:
# 4. k-Nearest Neighbors (k-NN)
# Base estimator with default settings; only n_neighbors is tuned below.
knn_model = KNeighborsClassifier()

# Candidate K values 1..6 (widen or narrow this range as needed).
param_grid = {'n_neighbors': list(range(1, 7))}

# 3-fold cross-validated grid search over K, using all available cores.
grid_search = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:

# Plot the mean cross-validation accuracy obtained for each candidate K.
cv_results = grid_search.cv_results_
k_values = [candidate['n_neighbors'] for candidate in cv_results['params']]
cv_accuracies = cv_results['mean_test_score']

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, cv_accuracies, marker='o')
ax.set_title('Grid Search Results')
ax.set_xlabel('Number of Neighbors (K)')
ax.set_ylabel('Cross-Validation Accuracy')
ax.set_xticks(k_values)  # one tick per evaluated K
ax.grid(True)
plt.show()

In [None]:
# Pull the winning neighbor count and its mean cross-validated score out of
# the fitted grid search, then report both.
best_accuracy = grid_search.best_score_
best_k = grid_search.best_params_['n_neighbors']

print("Best K value:", best_k)
print("Best Cross-Validation Accuracy:", best_accuracy)

In [None]:
# Evaluate the best k-NN model on the held-out test set.
# The best K and its cross-validation score are already extracted and printed
# in the preceding cell, so that duplicated code is not repeated here.
#
# GridSearchCV was created with its default refit=True, so after fit() it has
# already retrained a KNeighborsClassifier with the best n_neighbors on the
# whole training split and exposes it as best_estimator_; building and
# fitting a second identical model is redundant.
best_knn_model = grid_search.best_estimator_
y_pred = best_knn_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy with Best K:", test_accuracy)
