# MNIST Classifier

In this notebook you will create both an MNIST tabular dataset and a classifier.

## 1.- Import the Operating System (os) module in Python and any other library you need

In [None]:
import os
from PIL import Image
import numpy as np
import pandas as pd

import time
import numpy    as np
import pandas   as pd

from sklearn import model_selection
from sklearn import metrics

from sklearn.svm           import SVC
from sklearn.neighbors     import KNeighborsClassifier
from sklearn.tree          import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble      import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble      import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble      import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble      import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble      import HistGradientBoostingClassifier
from xgboost               import XGBClassifier, XGBRegressor
from lightgbm              import LGBMClassifier, LGBMRegressor
from catboost              import CatBoostClassifier, CatBoostRegressor


## 2.- As you can see each class has its own folder (Do it only for train). 

    - Iterate folder by folder ( os.listdir() )
    - Inside each folder: 
        1.- Read the image
        2.- Reshape it into a flat array (784,)
        3.- Save the data into a pandas DataFrame, appending the class as an additional column
    - Save the data into a CSV

    Note: if it takes too long, try using only 100 images per folder for the CSV.

In [None]:
# Build a tabular dataset from the image folders.
# The root folder holds one sub-folder per class; each sub-folder's name is
# parsed as the integer label for every image it contains.
path = "C:/Users/Igor/Documents/GitHub/AI-Engineering/Chapter 2/12. Images/MNIST Digit classifier/trainingSet/"
directories = os.listdir(path)    # names of every entry under the root folder

class_arrays = []    # one (n_images, 785) array per class folder

for directory in directories:
    class_dir = os.path.join(path, directory)
    if not os.path.isdir(class_dir):
        # Stray files next to the class folders (e.g. OS metadata) are not classes.
        continue

    images = os.listdir(class_dir)        # file names of the images in this class folder
    arr = np.zeros((len(images), 785))    # 784 pixel columns + 1 label column

    for i, img in enumerate(images):
        # The context manager releases the file handle as soon as the pixels are copied.
        with Image.open(os.path.join(class_dir, img)) as image:
            arr[i, :784] = np.array(image, dtype=float).flatten()   # flatten the image into 784 values
        arr[i, 784] = int(directory)      # label goes in the last column (ValueError if the name is not numeric)

    class_arrays.append(arr)

# Stack all classes once instead of re-concatenating a growing DataFrame on every
# iteration; this also gives the rows a single continuous 0..N-1 index.
# An empty root folder yields an empty (0, 785) frame instead of a NameError.
df = pd.DataFrame(np.vstack(class_arrays) if class_arrays else np.zeros((0, 785)))

df.shape

# df.to_csv('C:/Users/Igor/Documents/GitHub/AI-Engineering/Chapter 2/12. Images/training_set.csv', index=False)

## 3.- Load the CSV

In [None]:
# Read the tabular training set back from disk.
training_set = pd.read_csv('C:/Users/Igor/Documents/GitHub/AI-Engineering/Chapter 2/12. Images/training_set.csv')
training_set.head()

# The last column is used as the target; every column before it is a feature.
x, y = training_set.iloc[:, :-1], training_set.iloc[:, -1]

# Quick sanity check: feature/target shapes and the distinct target values.
x.shape, y.shape, y.unique()

## 4.- Create a dictionary of models (No preprocessing needed, it has already been done).
    
    Include both tree models and mult models.

In [None]:
# Candidate models, keyed by the display name used in the results table.
# NOTE: despite the variable name, this also holds non-tree models (kNN, SVC);
# the name is kept because later cells iterate over `tree_classifiers`.
# The randomized estimators are seeded (0, matching the split/CV seed used
# elsewhere in this notebook) so that repeated runs give the same scores.
tree_classifiers = {
  "Decision Tree": DecisionTreeClassifier(random_state=0),
  "Random Forest": RandomForestClassifier(verbose=0, random_state=0),
  "CatBoost": CatBoostClassifier(verbose=False, random_seed=0),
  "kNN": KNeighborsClassifier(),
  "SVC": SVC()
}

## 5.- Using either cross validation or stratification find out which is the best model
    - Base your code on the examples from the previous two days

In [None]:
# Stratified hold-out split. The cross-validation loop below scores on all of
# (x, y) and does not use it; it is kept so x_train/x_val/y_train/y_val stay defined.
x_train, x_val, y_train, y_val = model_selection.train_test_split(
        x, y,
        test_size=0.2,
        random_state=0,
        stratify=y
)

# 5-fold stratified CV, shuffled with a fixed seed so the folds are repeatable.
skf = model_selection.StratifiedKFold(
        n_splits=5,
        random_state=0,
        shuffle=True
)

result_columns = ['Model', 'Accuracy', 'Bal Acc.', 'Time']
result_rows = []                                   # one dict per evaluated model
results = pd.DataFrame(columns=result_columns)     # stays defined even if no model is evaluated

for model_name, model in tree_classifiers.items():

    # perf_counter is a monotonic clock, so the interval is immune to system clock changes.
    start_time = time.perf_counter()
    pred = model_selection.cross_val_predict(model, x, y, cv=skf)   # out-of-fold predictions for every row
    total_time = time.perf_counter() - start_time

    print(f'Finished {model_name}')

    result_rows.append({"Model":    model_name,
                        "Accuracy": metrics.accuracy_score(y, pred)*100,
                        "Bal Acc.": metrics.balanced_accuracy_score(y, pred)*100,
                        "Time":     total_time})

    # DataFrame.append was removed in pandas 2.0. Rebuilding the (tiny) table from
    # the row list each iteration keeps partial results if a slow model is interrupted.
    results = pd.DataFrame(result_rows, columns=result_columns)

### Without cross validation

In [8]:
# Single stratified 80/20 hold-out split with a fixed seed.
x_train, x_val, y_train, y_val = model_selection.train_test_split(
        x, y,
        test_size=0.2,
        random_state=0,
        stratify=y
)

result_columns = ['Model', 'Accuracy', 'Bal Acc.', 'Time']
result_rows = []                                   # one dict per evaluated model
results = pd.DataFrame(columns=result_columns)     # stays defined even if no model is evaluated

for model_name, model in tree_classifiers.items():

    # The timed interval covers both fitting on the training split and
    # predicting on the validation split.
    start_time = time.perf_counter()
    model.fit(x_train, y_train)
    pred = model.predict(x_val)
    total_time = time.perf_counter() - start_time

    print(f'Finished {model_name}')

    result_rows.append({"Model":    model_name,
                        "Accuracy": metrics.accuracy_score(y_val, pred)*100,
                        "Bal Acc.": metrics.balanced_accuracy_score(y_val, pred)*100,
                        "Time":     total_time})

    # DataFrame.append was removed in pandas 2.0. Rebuilding the (tiny) table from
    # the row list each iteration keeps partial results if a slow model is interrupted.
    results = pd.DataFrame(result_rows, columns=result_columns)

Finished Decision Tree
Finished Random Forest


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

In [None]:
# Rank the models from highest to lowest accuracy and renumber the rows from 0.
results_ord = (
    results
    .sort_values('Accuracy', ascending=False)
    .reset_index(drop=True)
)

print(results_ord)