### Question 1. Using the MNIST dataset (https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html), train a classifier using each of the following:
1. Decision Tree
2. Ensemble of trees
3. Random forest
4. MLP Neural Network a. With 1 hidden layer b. With 2 hidden layers c. With 3 hidden layers
You may experiment with choosing the other hyper-parameters for these models, however, anything that makes the model converge and work is sufficient for this exercise.
Use cross-fold validation in order to evaluate the performance of each classifier on the dataset.

In [None]:
#For imports
import numpy as np
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

### To load the data and visualize the images

In [None]:
# Load scikit-learn's bundled digits dataset and print the shape of its
# flattened feature matrix.
digits = load_digits()
print(digits.data.shape)

# Feature names of the flattened data (stored for reference, not printed).
features = digits.feature_names

In [None]:
# Display one sample from the dataset: the image at index 1
# (i.e. the second image, not the 0th).
plt.gray()  # use the gray colormap for the plot below
plt.matshow(digits.images[1])
plt.show()

In [None]:
# Flatten each 2-D image into a single feature row: the result has one row per
# image and as many columns as the image has pixels.
length = len(digits.images)
digits_data = digits.images.reshape(length, -1)
digits_data

In [None]:
# Sanity check: the feature matrix and the label vector must have the same
# number of rows.
print("Train Shape:", digits_data.shape)
print("Target shape:", digits.target.shape)


In [None]:
# Number of rows (samples) and columns (features) of the flattened data;
# both are reused later when building the bagging search grid.
n_samples, n_features = digits_data.shape

In [None]:
# Hold out the last 25% of the samples as the test set. shuffle=False keeps
# the original sample order, so the split is a plain head/tail cut.
# NOTE(review): random_state presumably has no effect while shuffle=False -
# confirm against the scikit-learn documentation.
X_train, X_test, y_train, y_test = train_test_split(
    digits_data,
    digits.target,
    test_size=0.25,
    shuffle=False,
    random_state=42,
)

### 1.1. Decision Tree Classifier

In [None]:
# Baseline decision tree: default hyper-parameters, fixed seed. It is fitted on
# the training split and then used to predict the held-out test split.
dtc = DecisionTreeClassifier(random_state=42)
dtc_model = dtc.fit(X_train, y_train)
dtc_predict = dtc.predict(X_test)

In [None]:
# Show the first 8 test images next to the baseline tree's predictions.
# dtc_predict was computed on X_test, so the images must be taken from X_test
# as well; pairing it with digits.images would show training images under
# unrelated labels. Each X_test row is a flattened image, so it is reshaped
# back to 8x8 for display.
_, axes = plt.subplots(nrows=1, ncols=8, figsize=(12, 3))
for ax, image, label in zip(axes, X_test, dtc_predict):
    ax.set_axis_off()
    ax.imshow(image.reshape(8, 8), cmap=plt.cm.gray_r, interpolation="nearest")
    ax.set_title("Prediction: %i" % label)

In [None]:
# Hyper-parameter tuning: exhaustive grid search over the split threshold,
# the tree depth and the split criterion of the baseline decision tree.
param_grid_dtc = {
    'min_samples_split': range(10, 500, 20),
    'max_depth': range(1, 20, 2),
    'criterion': ['gini', 'entropy'],
}

grid_dtc = GridSearchCV(
    dtc,
    param_grid=param_grid_dtc,
    refit=True,
    verbose=2,
    n_jobs=1,
)
hyper_dtc = grid_dtc.fit(X_train, y_train)

print("Best parameters for Decision Tree:", grid_dtc.best_params_)

# Mean cross-validated score of every parameter combination that was tried.
scores_dtc = grid_dtc.cv_results_['mean_test_score']


In [None]:
# Report the decision-tree grid-search scores. scores_dtc is the array produced
# by the decision-tree search; scores_rf belongs to the random-forest section
# and does not exist yet at this point of the notebook.
print("Mean Test score for each combination:", scores_dtc)

In [None]:
# Refit a decision tree with the selected hyper-parameters and score it on the
# held-out test split.
# NOTE(review): the values are hard-coded, presumably copied from a previous
# grid-search run - confirm they still match grid_dtc.best_params_.
# random_state matches the estimator used during the search so that the
# reported accuracy is reproducible between runs.
dtc_best = DecisionTreeClassifier(
    criterion='gini',
    max_depth=11,
    min_samples_split=10,
    random_state=42,
)

dtc_best_model = dtc_best.fit(X_train, y_train)

dtc_best_pred = dtc_best_model.predict(X_test)

dtc_test_accuracy = accuracy_score(y_test, dtc_best_pred)

print(f"Test accuracy with Decision Tree hyper parameter tuning: {dtc_test_accuracy: .2%}")

In [None]:
# Evaluate the tuned decision tree with 5-fold cross-validation over the
# complete (flattened) dataset.
dtc_score = cross_val_score(dtc_best, digits_data, digits.target, cv=5)
dtc_mean_score = np.mean(dtc_score)

# Per-fold accuracies first, then the average across folds.
for i, score in enumerate(dtc_score, start=1):
    print(f"Fold {i} Test Accuracy: {score: .2%}")

print(f"Mean Test Accuracy score:{dtc_mean_score:.2%}")

### 1.2. Ensemble of trees

In [None]:
# Baseline bagging ensemble with default settings (fixed seed, all CPU cores),
# fitted on the training split and used to predict the test split.
bag = BaggingClassifier(random_state=1, n_jobs=-1)
bag_model = bag.fit(X_train, y_train)
bag_predict = bag_model.predict(X_test)

In [None]:
# Show the first 8 test images next to the baseline bagging predictions.
# bag_predict was computed on X_test, so the images are taken from X_test too
# (pairing it with digits.images would show training images under unrelated
# labels). Each X_test row is a flattened image, reshaped to 8x8 for display.
_, axes = plt.subplots(nrows=1, ncols=8, figsize=(12, 3))
for ax, image, label in zip(axes, X_test, bag_predict):
    ax.set_axis_off()
    ax.imshow(image.reshape(8, 8), cmap=plt.cm.gray_r, interpolation="nearest")
    ax.set_title("Prediction: %i" % label)

In [None]:
# Search space for the bagging ensemble: base estimator, ensemble size, and
# how samples/features are drawn for each member.
# NOTE(review): n_samples counts the whole dataset rather than X_train, so the
# integer max_samples candidate is tied to the full data size - confirm it can
# never exceed the size of a cross-validation training fold.
# NOTE(review): estimator=None presumably falls back to a decision tree, which
# would duplicate the explicit DecisionTreeClassifier() candidate - confirm.
params_grid_bag = {
    'estimator': [None, KNeighborsClassifier(), DecisionTreeClassifier()],
    'n_estimators': [20, 50, 100],
    'max_samples': [0.5, 1.0, n_samples // 2],
    'max_features': [0.5, 1.0, n_features // 2],
    'bootstrap': [True, False],
    'bootstrap_features': [True, False],
}

grid_bag = GridSearchCV(
    bag,
    param_grid=params_grid_bag,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
hyper_bag = grid_bag.fit(X_train, y_train)

print("Best parameters for Ensemble of trees with KNN and DT is:", hyper_bag.best_params_)

# Mean cross-validated score of every parameter combination that was tried.
scores_bag = grid_bag.cv_results_['mean_test_score']

In [None]:
# Report the mean cross-validated score of each bagging configuration searched.
print("Mean Test score with Ensemble for each combination:", scores_bag)

In [None]:
# Refit a bagging ensemble with the selected configuration and score it on the
# held-out test split.
# NOTE(review): the values are hard-coded, presumably copied from a previous
# grid-search run - confirm they still match hyper_bag.best_params_. The base
# estimator here is k-nearest-neighbours, not a tree.
# bootstrap_features=True draws features at random, so random_state is fixed
# (same seed as the searched estimator) to make the accuracy reproducible.
bag_best = BaggingClassifier(
    KNeighborsClassifier(),
    bootstrap=False,
    bootstrap_features=True,
    max_features=1.0,
    max_samples=1.0,
    n_estimators=100,
    random_state=1,
)

bag_best_model = bag_best.fit(X_train, y_train)

bag_best_pred = bag_best_model.predict(X_test)

bag_test_accuracy = accuracy_score(y_test, bag_best_pred)

print(f"Test accuracy:{bag_test_accuracy:.2%}")

In [None]:
# Evaluate the tuned bagging ensemble with 5-fold cross-validation over the
# complete (flattened) dataset.
bag_score = cross_val_score(bag_best, digits_data, digits.target, cv=5)

# Per-fold accuracies (label typo "AccuracyL" corrected), then the average.
for i, score in enumerate(bag_score, start=1):
    print(f"Fold {i} Test Accuracy: {score:.2%}")

bag_mean_score = np.mean(bag_score)
print(f"Mean Test Accuracy score:{bag_mean_score:.2%}")

### 1.3. Random Forest

In [None]:
# Baseline random forest with default hyper-parameters and a fixed seed,
# fitted on the training split and used to predict the test split.
rf = RandomForestClassifier(random_state=42)
rf_model = rf.fit(X_train, y_train)
rf_predict = rf_model.predict(X_test)

In [None]:
# Show the first 8 test images next to the baseline random-forest predictions.
# rf_predict was computed on X_test, so the images are taken from X_test too
# (pairing it with digits.images would show training images under unrelated
# labels). Each X_test row is a flattened image, reshaped to 8x8 for display.
_, axes = plt.subplots(nrows=1, ncols=8, figsize=(12, 3))
for ax, image, label in zip(axes, X_test, rf_predict):
    ax.set_axis_off()
    ax.imshow(image.reshape(8, 8), cmap=plt.cm.gray_r, interpolation="nearest")
    ax.set_title("Prediction: %i" % label)

In [None]:
# Hyper-parameter tuning: 5-fold grid search over forest size, tree shape,
# sampling strategy and split criterion of the baseline random forest.
param_grid_rf = {
    "n_estimators": [10, 50, 100],
    "max_depth": [3, None],
    "max_features": ["sqrt"],
    "min_samples_split": [2, 11],
    "min_samples_leaf": [1, 11],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

grid_rf = GridSearchCV(
    rf,
    param_grid=param_grid_rf,
    cv=5,
    n_jobs=1,
    verbose=2,
)
hyper_rf = grid_rf.fit(X_train, y_train)

# Mean cross-validated score of every parameter combination that was tried.
scores_rf = grid_rf.cv_results_['mean_test_score']



In [None]:
# Report the mean cross-validated score of each random-forest configuration.
print("Mean Test score for each combination:", scores_rf)

In [None]:
# Display the winning random-forest parameter combination from the grid search.
hyper_rf.best_params_

In [None]:
# Refit a random forest with the selected hyper-parameters and score it on the
# held-out test split.
# NOTE(review): the values are hard-coded, presumably copied from a previous
# grid-search run - confirm they still match hyper_rf.best_params_.
# random_state matches the estimator used during the search so that the
# reported accuracy is reproducible between runs.
rf_best = RandomForestClassifier(
    bootstrap=False,
    criterion='entropy',
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    random_state=42,
)

rf_best_model = rf_best.fit(X_train, y_train)

rf_best_pred = rf_best_model.predict(X_test)

rf_test_accuracy = accuracy_score(y_test, rf_best_pred)

print(f"Test accuracy with Random Forest hyper parameter tuning: {rf_test_accuracy: .2%}")

In [None]:
# Evaluate the tuned random forest with 5-fold cross-validation over the
# complete (flattened) dataset.
rf_score = cross_val_score(rf_best, digits_data, digits.target, cv=5)
rf_mean_score = np.mean(rf_score)

# Per-fold accuracies first, then the average across folds.
for i, score in enumerate(rf_score, start=1):
    print(f"Fold {i} Test Accuracy:{score:.2%}")

print(f"Mean Test Accuracy score:{rf_mean_score:.2%}")

### 1.4. MLP Neural Network 
1.4.a. With 1 hidden layer
1.4.b. With 2 hidden layers
1.4.c. With 3 hidden layers

In [None]:
# Baseline multi-layer perceptron with default architecture and a fixed seed,
# fitted on the training split and used to predict the test split.
mlp = MLPClassifier(random_state=42)
mlp_model = mlp.fit(X_train, y_train)
mlp_predict = mlp_model.predict(X_test)

In [None]:
# Show the first 8 test images next to the baseline MLP predictions.
# mlp_predict was computed on X_test, so the images are taken from X_test too
# (pairing it with digits.images would show training images under unrelated
# labels). Each X_test row is a flattened image, reshaped to 8x8 for display.
_, axes = plt.subplots(nrows=1, ncols=8, figsize=(12, 3))
for ax, image, label in zip(axes, X_test, mlp_predict):
    ax.set_axis_off()
    ax.imshow(image.reshape(8, 8), cmap=plt.cm.gray_r, interpolation="nearest")
    ax.set_title("Prediction: %i" % label)

In [None]:
# Hyper-parameter tuning for an MLP with a single hidden layer of 150 units:
# 5-fold grid search over iteration budget, activation, solver, L2 penalty
# (alpha) and learning-rate schedule.
param_grid_mlp_1_layer = {
    'hidden_layer_sizes': [(150,)],
    'max_iter': [100, 200],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

grid_mlp_1_layer = GridSearchCV(
    mlp,
    param_grid=param_grid_mlp_1_layer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_mlp_1_layer.fit(X_train, y_train)

print("Best parameters with 1 hidden layer for MLP:", grid_mlp_1_layer.best_params_)

# Mean cross-validated score of every parameter combination that was tried.
scores_mlp_1_layer = grid_mlp_1_layer.cv_results_['mean_test_score']


In [None]:
# Report the mean cross-validated score of each 1-hidden-layer configuration.
print("Mean test scores for each combination with 1 layer:", scores_mlp_1_layer)

In [None]:
# Hyper-parameter tuning for an MLP with two hidden layers (150 and 100 units):
# 5-fold grid search over iteration budget, activation, solver, L2 penalty
# (alpha) and learning-rate schedule.
param_grid_mlp_2_layer = {
    'hidden_layer_sizes': [(150, 100)],
    'max_iter': [100, 200],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

grid_mlp_2_layer = GridSearchCV(
    mlp,
    param_grid=param_grid_mlp_2_layer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_mlp_2_layer.fit(X_train, y_train)

print("Best parameters with 2 hidden layers for MLP:", grid_mlp_2_layer.best_params_)

# Mean cross-validated score of every parameter combination that was tried.
scores_mlp_2_layer = grid_mlp_2_layer.cv_results_['mean_test_score']

In [None]:
# Report the mean cross-validated score of each 2-hidden-layer configuration.
print("Mean test scores for each combination with 2 layers:", scores_mlp_2_layer)

In [None]:
# Hyper-parameter tuning for an MLP with three hidden layers (150, 100 and 50
# units): 5-fold grid search over iteration budget, activation, solver,
# L2 penalty (alpha) and learning-rate schedule.
param_grid_mlp_3_layer = {
    'hidden_layer_sizes': [(150, 100, 50)],
    'max_iter': [100, 200],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant', 'adaptive'],
}

grid_mlp_3_layer = GridSearchCV(
    mlp,
    param_grid=param_grid_mlp_3_layer,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_mlp_3_layer.fit(X_train, y_train)

print("Best parameters with 3 hidden layer for MLP:", grid_mlp_3_layer.best_params_)

# Mean cross-validated score of every parameter combination that was tried.
scores_mlp_3_layer = grid_mlp_3_layer.cv_results_['mean_test_score']

In [None]:
# Report the mean cross-validated score of each 3-hidden-layer configuration.
print("Mean test scores for each combination with 3 layers:", scores_mlp_3_layer)

In [None]:
# Evaluate the three fitted MLP grid searches on the held-out test split.
# Each search object is scored directly, so no separate model is rebuilt here.
mlp_1_layer_test_accuracy = grid_mlp_1_layer.score(X_test, y_test)
print(f"Test accuracy for MLP with 1 layer:{mlp_1_layer_test_accuracy:.2%}")

mlp_2_layer_test_accuracy = grid_mlp_2_layer.score(X_test, y_test)
print(f"Test accuracy for MLP with 2 layers:{mlp_2_layer_test_accuracy:.2%}")

mlp_3_layer_test_accuracy = grid_mlp_3_layer.score(X_test, y_test)
print(f"Test accuracy for MLP with 3 layers:{mlp_3_layer_test_accuracy:.2%}")

In [None]:
# Build the MLP chosen as best (two hidden layers), fit it on the training
# split, and evaluate it with 5-fold cross-validation over the full dataset.
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from a
# previous grid-search run - confirm against grid_mlp_2_layer.best_params_.
# random_state matches the estimator used during the search so that the
# weight initialisation, and therefore the scores, are reproducible.
mlp_best = MLPClassifier(
    activation='relu',
    alpha=0.05,
    hidden_layer_sizes=(150, 100),
    learning_rate='constant',
    max_iter=200,
    solver='adam',
    random_state=42,
)

mlp_best_model = mlp_best.fit(X_train, y_train)

mlp_score = cross_val_score(mlp_best, digits_data, digits.target, cv=5)

# Per-fold accuracies first, then the average across folds.
for i, score in enumerate(mlp_score, start=1):
    print(f"Fold {i} Test Accuracy:{score:.2%}")

mlp_mean_score = np.mean(mlp_score)
print(f"Mean Test Accuracy score:{mlp_mean_score:.2%}")

### Question 2. Split a small amount of data off from the main training set for validation. For each classifier, find which label is predicted with the highest confidence over the validation set, and plot the image and its predicted class.

In [None]:
# Hold out a random 10% of the data as a validation set.
X_train, X_val, y_train, y_val = train_test_split(
    digits.data, digits.target, test_size=0.1, random_state=42
)

# The tuned models were fitted on the earlier train split, which is cut from
# the same samples and therefore overlaps X_val. Refit them on the new training
# portion so that the validation samples are genuinely unseen when the
# confidences are computed in the following cells.
# NOTE: X_test from the earlier split overlaps this new X_train, so the
# refitted models should not be scored on X_test afterwards.
for _model in (dtc_best, bag_best, rf_best, mlp_best):
    _model.fit(X_train, y_train)

### 2.1. Decision Tree

In [None]:
# Tuned decision tree on the validation set: hard predictions, per-class
# probabilities, and the column index of the most probable class per sample.
dtc_val_predict = dtc_best.predict(X_val)
dtc_confidence = dtc_best.predict_proba(X_val)
dtc_best_labels = dtc_confidence.argmax(axis=1)
dtc_best_labels

In [None]:
# Rank the validation samples by the tree's confidence (the highest class
# probability of each sample) and plot the five most confident predictions,
# rather than simply the first five samples. A stable sort keeps the original
# validation order among equally confident samples.
dtc_max_confidence = dtc_confidence.max(axis=1)
dtc_top = np.argsort(-dtc_max_confidence, kind="stable")[:5]

plt.figure(figsize=(12, 4))
for position, idx in enumerate(dtc_top, start=1):
    plt.subplot(1, 5, position)
    plt.imshow(X_val[idx].reshape(8, 8), cmap='gray')
    plt.title(
        f"Predicted: {dtc_val_predict[idx]}, Actual: {y_val[idx]}\n"
        f"Confidence: {dtc_max_confidence[idx]:.2%}"
    )

plt.suptitle("Decision Tree - Predicted vs Actual")
plt.show()

### 2.2. Ensemble of trees

In [None]:
# Tuned bagging ensemble on the validation set: hard predictions, per-class
# probabilities, and the column index of the most probable class per sample.
bag_val_predict = bag_best.predict(X_val)
bag_confidence = bag_best.predict_proba(X_val)
bag_best_labels = bag_confidence.argmax(axis=1)
bag_best_labels

In [None]:
# Rank the validation samples by the ensemble's confidence (the highest class
# probability of each sample) and plot the five most confident predictions,
# rather than simply the first five samples. A stable sort keeps the original
# validation order among equally confident samples.
bag_max_confidence = bag_confidence.max(axis=1)
bag_top = np.argsort(-bag_max_confidence, kind="stable")[:5]

plt.figure(figsize=(12, 4))
for position, idx in enumerate(bag_top, start=1):
    plt.subplot(1, 5, position)
    plt.imshow(X_val[idx].reshape(8, 8), cmap='gray')
    plt.title(
        f"Predicted: {bag_val_predict[idx]}, Actual: {y_val[idx]}\n"
        f"Confidence: {bag_max_confidence[idx]:.2%}"
    )

plt.suptitle("Ensemble of Trees (Bagging) - Predicted vs Actual")
plt.show()

### 2.3. Random Forest

In [None]:
# Tuned random forest on the validation set: hard predictions, per-class
# probabilities, and the column index of the most probable class per sample.
rf_val_predict = rf_best.predict(X_val)
rf_confidence = rf_best.predict_proba(X_val)
rf_best_labels = rf_confidence.argmax(axis=1)
rf_best_labels

In [None]:
# Rank the validation samples by the forest's confidence (the highest class
# probability of each sample) and plot the five most confident predictions,
# rather than simply the first five samples. A stable sort keeps the original
# validation order among equally confident samples.
rf_max_confidence = rf_confidence.max(axis=1)
rf_top = np.argsort(-rf_max_confidence, kind="stable")[:5]

plt.figure(figsize=(12, 4))
for position, idx in enumerate(rf_top, start=1):
    plt.subplot(1, 5, position)
    plt.imshow(X_val[idx].reshape(8, 8), cmap='gray')
    plt.title(
        f"Predicted: {rf_val_predict[idx]}, Actual: {y_val[idx]}\n"
        f"Confidence: {rf_max_confidence[idx]:.2%}"
    )

plt.suptitle("Random Forest - Predicted vs Actual")
plt.show()

### 2.4. MLP Neural Network
2.4.a. With 1 hidden layer
2.4.b. With 2 hidden layers
2.4.c. With 3 hidden layers


In [None]:
# Selected MLP on the validation set: hard predictions, per-class
# probabilities, and the column index of the most probable class per sample.
mlp_val_predict = mlp_best.predict(X_val)
mlp_confidence = mlp_best.predict_proba(X_val)
mlp_best_labels = mlp_confidence.argmax(axis=1)
mlp_best_labels

In [None]:
# Rank the validation samples by the MLP's confidence (the highest class
# probability of each sample) and plot the five most confident predictions,
# rather than simply the first five samples. A stable sort keeps the original
# validation order among equally confident samples.
# Titles use mlp_val_predict (actual class labels) instead of the argmax column
# index, and the figure title names the MLP rather than the random forest.
mlp_max_confidence = mlp_confidence.max(axis=1)
mlp_top = np.argsort(-mlp_max_confidence, kind="stable")[:5]

plt.figure(figsize=(12, 4))
for position, idx in enumerate(mlp_top, start=1):
    plt.subplot(1, 5, position)
    plt.imshow(X_val[idx].reshape(8, 8), cmap='gray')
    plt.title(
        f"Predicted: {mlp_val_predict[idx]}, Actual: {y_val[idx]}\n"
        f"Confidence: {mlp_max_confidence[idx]:.2%}"
    )

plt.suptitle("MLP Neural Network - Predicted vs Actual")
plt.show()