In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import gzip
import random

In [2]:

def load_mnist(filename, type, n_datapoints):
    """Read images or labels from a gzip-compressed MNIST file.

    Parameters
    ----------
    filename : str or path-like
        Path of the ``.gz`` file to read.
    type : {'image', 'label'}
        Kind of file being read. (The name shadows the builtin ``type``
        inside this function; it is kept so existing callers keep working.)
    n_datapoints : int
        Number of records to read from the start of the file.

    Returns
    -------
    numpy.ndarray
        For ``'image'``: ``float32`` array of shape
        ``(n_datapoints, 28, 28, 1)``.
        For ``'label'``: ``int64`` array of shape ``(n_datapoints, 1)``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'image'`` nor ``'label'``. ``reshape`` also
        raises ``ValueError`` when the file holds fewer than
        ``n_datapoints`` records.
    """
    # Validate up front so a typo gives a clear error instead of an
    # UnboundLocalError at the return statement.
    if type not in ('image', 'label'):
        raise ValueError(f"type must be 'image' or 'label', got {type!r}")

    # MNIST images have 28*28 pixels dimension
    image_size = 28

    # The context manager closes the gzip handle even if reading fails.
    with gzip.open(filename) as f:
        if type == 'image':
            f.read(16)  # Skip the 16 header bytes that precede the pixel data
            buf = f.read(n_datapoints * image_size * image_size)
            data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
            data = data.reshape(n_datapoints, image_size, image_size, 1)
        else:
            f.read(8)  # Skip the 8 header bytes that precede the labels
            buf = f.read(n_datapoints)
            data = np.frombuffer(buf, dtype=np.uint8).astype(np.int64)
            data = data.reshape(n_datapoints, 1)
    return data

In [3]:

# Number of records to read from the training and test files
train_size, test_size = 60000, 10000

# Prefix prepended verbatim to every file name ('' = current working directory)
dirpath = ''

# Training images and labels
X = load_mnist(
    filename=dirpath + 'train-images-idx3-ubyte.gz',
    type='image',
    n_datapoints=train_size,
)
y = load_mnist(
    filename=dirpath + 'train-labels-idx1-ubyte.gz',
    type='label',
    n_datapoints=train_size,
)

# Test images and labels
X_test = load_mnist(
    filename=dirpath + 't10k-images-idx3-ubyte.gz',
    type='image',
    n_datapoints=test_size,
)
y_test = load_mnist(
    filename=dirpath + 't10k-labels-idx1-ubyte.gz',
    type='label',
    n_datapoints=test_size,
)

In [4]:
from sklearn.model_selection import train_test_split

# Take the first tenth of the training data and hold out 25% of it for
# validation; the fixed random_state keeps the split identical between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X[:train_size // 10],
    y[:train_size // 10],
    test_size=0.25,
    random_state=28,
)

# Show the shape of each of the four resulting arrays on one line
print(*(part.shape for part in (X_train, X_valid, y_train, y_valid)))

(4500, 28, 28, 1) (1500, 28, 28, 1) (4500, 1) (1500, 1)


We use three different models: k-nearest neighbors, a support vector machine, and a random forest. With hard voting, each classifier casts one vote for its predicted class, and the ensemble predicts the class that receives the most votes.

In [6]:
%%time
# Voting classifier: a multiple-model ensemble of an SVM, k-NN and a random forest
from sklearn.ensemble import VotingClassifier
from sklearn import svm
from sklearn import ensemble

# Support vector classifier with scikit-learn's default settings
sv = svm.SVC()

# k-nearest neighbours using the 5 closest training points
knn = KNeighborsClassifier(n_neighbors=5)

# Seed the forest so that re-running the notebook rebuilds the same model;
# 28 is the same seed value the train/validation split uses.
rfc = ensemble.RandomForestClassifier(random_state=28)

# Hard voting: the ensemble predicts the class chosen by most of the estimators
evc = VotingClassifier(
    estimators=[('sv', sv), ('knn', knn), ('rfc', rfc)],
    voting='hard',
)

print('Training the Model')
# Flatten every 28x28x1 image into a 784-long feature row and pass the labels
# as a 1-D vector, which is the layout the scikit-learn estimators expect.
evc.fit(X.reshape(X.shape[0], 28*28), y.ravel())

Training the Model
CPU times: total: 14min 47s
Wall time: 20min 34s


The trained model can be saved to disk with the joblib module. We use joblib's built-in gzip compression (level 3) to reduce the size of the saved file.

In [7]:
import joblib

# Persist the fitted ensemble to disk, gzip-compressed at level 3.
# joblib.dump returns the list of file names it wrote.
joblib.dump(
    value=evc,
    filename='evc_model.gzip',
    compress=('gzip', 3),
)

['evc_model.gzip']