In [50]:
import matplotlib.pyplot as plt
import cv2
import numpy as np
import tensorflow as tf
from mnist import MNIST
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import StratifiedKFold
# from tensorflow import keras
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Dense
from keras.layers import Flatten
from keras.optimizers import SGD
#import tensorflow_datasets as tfds

In [52]:
def load_data():
    """
    Read the MNIST training and test sets from the local folder
    ./Data/MNIST_Data and hand every piece back as a numpy ndarray.

    return: train_X, train_Y, test_X, test_Y (images and labels for the
            training split, then images and labels for the test split)
    """
    loader = MNIST('./Data/MNIST_Data')
    training_split = loader.load_training()
    testing_split = loader.load_testing()

    # each split is an (images, labels) pair; convert both members to ndarrays
    train_X, train_Y = (np.asarray(part) for part in training_split)
    test_X, test_Y = (np.asarray(part) for part in testing_split)
    return train_X, train_Y, test_X, test_Y

In [38]:
def load_my_data():
    """
    Load the ten hand-written digit images stored in Data/emily_data/numbers.

    Each file (zero.png ... nine.png) is read with OpenCV, converted to
    grayscale and binarized with an inverted threshold at 100: pixels
    brighter than 100 become 0, all others become 255.

    return myImages: float32 array of shape (10, 28, 28, 1) holding 0/255 values
    return myLabels: array with the digits 0..9; myLabels[i] labels myImages[i]
    raises FileNotFoundError: if one of the image files cannot be read
    raises ValueError: if one of the images is not 28x28 pixels
    """
    folderpath = "Data/emily_data/numbers"
    filename = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]

    myLabels = np.arange(len(filename))
    myImages = np.zeros((len(filename), 28, 28, 1), dtype=np.float32)

    for i, name in enumerate(filename):
        path = "%s/%s.png" % (folderpath, name)
        color_img = cv2.imread(path)
        # cv2.imread signals a missing/unreadable file by returning None
        # instead of raising, so fail here with a message naming the file.
        if color_img is None:
            raise FileNotFoundError("could not read image file: %s" % path)

        gray_img = cv2.cvtColor(color_img, cv2.COLOR_BGR2GRAY)
        # A blind reshape would silently scramble any image that merely has
        # 784 pixels (e.g. 14x56); insist on the exact 28x28 layout.
        if gray_img.shape != (28, 28):
            raise ValueError(
                "expected a 28x28 image in %s, got shape %s" % (path, gray_img.shape)
            )

        _, binarized_img = cv2.threshold(gray_img, 100, 255, cv2.THRESH_BINARY_INV)
        myImages[i, :, :, 0] = binarized_img

    return myImages, myLabels

In [39]:
# Load the custom digit images and their labels into notebook-level variables.
img, lab = load_my_data()

In [46]:
def preprocess_image_data(image_data):
    """
    Turn raw image rows into normalized 28x28 single-channel images.

    The input is reshaped to (n_images, 28, 28, 1), converted to float32 and
    divided by 255, so pixel values in 0..255 end up in the range 0..1.
    The input itself is not modified.

    param image_data: array-like whose first axis indexes the images and
                      which holds 28*28 values per image (a numpy ndarray,
                      or a nested list / sequence of rows)
    return image_array_norm: float32 array of shape (n_images, 28, 28, 1)
    """
    # np.asarray lets plain lists and other array-likes through; the original
    # read image_data.shape directly and so only worked for ndarrays.
    image_array = np.asarray(image_data, dtype=np.float32)
    image_array = image_array.reshape(image_array.shape[0], 28, 28, 1)
    image_array_norm = image_array / 255.0
    return image_array_norm

In [47]:
def preprocess_label_data(label_data, num_classes=10):
    """
    One hot encode integer class labels.

    Column k of the result always stands for class k. The mapping is fixed
    by num_classes rather than learned from the labels that happen to be
    present, so a batch that is missing some digit still produces
    num_classes columns that line up with the model's output units.

    param label_data: array-like of integer labels in 0..num_classes-1
                      (any shape; it is flattened)
    param num_classes: number of classes / output columns (default 10)
    return encoded_labels: (-1, num_classes) uint8 array of encoded labels
    raises ValueError: if a label lies outside 0..num_classes-1
    """
    labels = np.asarray(label_data).reshape(-1)

    # Validate explicitly: negative indices would otherwise wrap around
    # silently when used to index the identity matrix below.
    if labels.size and (labels.min() < 0 or labels.max() >= num_classes):
        raise ValueError(
            "labels must lie in the range 0..%d" % (num_classes - 1)
        )

    # Row k of the identity matrix is the one hot vector for class k.
    encoded_labels = np.eye(num_classes, dtype=np.uint8)[labels.astype(np.intp)]
    return encoded_labels

In [41]:
# find out if dataset is balanced 
def visualize_balance_of_dataset(y, name):
    """
    Draw a bar chart with the number of occurrences of every distinct
    label value in y and save it as "<name>_barChart.png".

    Used to visualize how balanced the data set is. The bars are blue when
    name is "Train" and red for any other name.

    param y: label array
    param name: data set name; used in the chart title and the file name
    raises ValueError: if y contains no labels
    """
    u, counts = np.unique(y, return_counts=True)
    if u.size == 0:
        # fail before a figure is created; there is nothing to plot and the
        # tick range below could not be computed anyway
        raise ValueError("cannot plot the distribution of an empty label array")

    col = "blue" if name == "Train" else "red"

    plt.figure(figsize=(10, 5))
    plt.bar(u, counts, color=col)
    plt.title(name + " Dataset Distribution")
    # one tick per unit step between the smallest and largest label value
    plt.xticks(np.arange(u.min(), u.max() + 1, 1.0))
    plt.xlabel("Label Values - Numerical Characters")
    plt.ylabel("Number of Label Value Occurrences")
    plt.savefig(name + "_barChart.png")

In [59]:
def create_CNNmodel():
    """
    Assemble and compile the CNN used for digit classification.

    Layers, in order: a 3x3 convolution with 32 filters (relu) on 28x28x1
    inputs, 2x2 max pooling, flatten, a 100 unit dense layer (relu) and a
    10 unit softmax output. The model is compiled with SGD (learning rate
    0.01, momentum 0.9), categorical cross-entropy loss and accuracy as
    the reported metric.

    return model: the compiled Sequential model
    """
    he_init = 'he_uniform'
    layer_stack = [
        Conv2D(32, (3, 3), activation='relu', kernel_initializer=he_init, input_shape=(28, 28, 1)),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(100, activation='relu', kernel_initializer=he_init),
        Dense(10, activation='softmax'),
    ]
    model = Sequential(layer_stack)

    # NOTE(review): newer Keras releases name this argument `learning_rate`;
    # `lr` is kept because it is what this notebook was run with - confirm
    # against the installed Keras version before changing it.
    model.compile(
        optimizer=SGD(lr=0.01, momentum=0.9),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [12]:
def cross_validate_model(Xtrain, Ytrain):
    """
    Run 5-fold cross validation of the CNN on the training data.

    The folds are shuffled with a fixed random_state of 10. For every fold
    a fresh model is built, trained for 10 epochs with batch size 32 on the
    training part and then evaluated on the held-out part.

    param Xtrain: training images
    param Ytrain: training labels, indexable with the same indices as Xtrain
    return history_list: the fit history of every fold
    return accuracy_list: the held-out accuracy of every fold
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=10)
    history_list, accuracy_list = [], []

    for train_idx, val_idx in splitter.split(Xtrain):
        fold_train = (Xtrain[train_idx], Ytrain[train_idx])
        fold_val = (Xtrain[val_idx], Ytrain[val_idx])

        # a new, untrained model per fold so no weights carry over
        cnn = create_CNNmodel()
        history = cnn.fit(
            fold_train[0], fold_train[1],
            epochs=10, batch_size=32,
            validation_data=fold_val, verbose=0,
        )
        _, acc = cnn.evaluate(fold_val[0], fold_val[1], verbose=1)

        history_list.append(history)
        accuracy_list.append(acc)

    return history_list, accuracy_list

In [60]:
def train_evaluate(Xtrain, Ytrain, Xtest, Ytest):
    """
    Train a fresh CNN on the training data, evaluate it on the test data
    and save the trained model under 'cnn_model'.

    Training runs for 15 epochs with batch size 32 and holds out 10% of
    the training data for validation.

    param Xtrain, Ytrain: training images and labels
    param Xtest, Ytest: test images and labels
    return history: the object returned by fit
    return results: the value returned by evaluate on the test data
    """
    model = create_CNNmodel()
    fit_options = dict(batch_size=32, epochs=15, validation_split=0.1)
    history = model.fit(Xtrain, Ytrain, **fit_options)
    results = model.evaluate(Xtest, Ytest, verbose=1)
    model.save('cnn_model')
    return history, results

In [42]:
# load MNIST data set
def run_from_saved_model_with_Test_Data():
    """
    Evaluate the model saved under 'cnn_model' on the MNIST test split.

    return results: the value returned by evaluate on the test data
    """
    # only the test split is needed here; the training split is discarded
    _, _, raw_test_images, raw_test_labels = load_data()

    # preprocess test images and labels
    Xtest = preprocess_image_data(raw_test_images)
    Ytest = preprocess_label_data(raw_test_labels)

    # load the pre-trained model and score it
    saved_cnn = load_model('cnn_model')
    return saved_cnn.evaluate(Xtest, Ytest, verbose=1)

In [None]:
def run_from_beginning():
    """
    Full pipeline: load MNIST, preprocess images and labels, then train
    and evaluate a new model via train_evaluate.

    return history, results: exactly what train_evaluate returns
    """
    raw_train_X, raw_train_Y, raw_test_X, raw_test_Y = load_data()

    # images first (train, then test), labels second - same order as before
    Xtrain = preprocess_image_data(raw_train_X)
    Xtest = preprocess_image_data(raw_test_X)
    Ytrain = preprocess_label_data(raw_train_Y)
    Ytest = preprocess_label_data(raw_test_Y)

    # train and evaluate the model using the training and test data
    history, results = train_evaluate(Xtrain, Ytrain, Xtest, Ytest)
    return history, results
   

In [52]:
def run_from_saved_model_with_my_data():
    """
    Evaluate the model saved under 'cnn_model' on the custom digit images
    returned by load_my_data, print the evaluation results and return them.

    return results: the value returned by evaluate on the custom data
    """
    custom_images, custom_labels = load_my_data()

    # preprocess the custom images and labels
    X_data = preprocess_image_data(custom_images)
    Y_data = preprocess_label_data(custom_labels)

    # load the pre-trained model
    saved_cnn = load_model('cnn_model')
    results = saved_cnn.evaluate(X_data, Y_data, verbose=1)

    # predictions are still computed (as before) but not used any further
    _predictions = saved_cnn.predict(X_data)

    print('results', results)
    return results

In [53]:
# Evaluate the saved model on the custom digit images; the call prints the
# results and the cell displays the returned value.
run_from_saved_model_with_my_data()


results [0.422191858291626, 0.800000011920929]
predictions [[9.99993563e-01 1.36723870e-11 1.79051085e-07 5.84513438e-10
  2.09913602e-13 5.47135492e-09 1.96993000e-09 5.52941538e-06
  6.53251107e-07 6.76099461e-08]
 [2.35393666e-11 9.99998450e-01 1.24889743e-09 8.14682349e-11
  1.37778818e-06 4.15465840e-09 1.39784335e-08 6.39289621e-10
  6.93748561e-08 5.16587554e-12]
 [1.98959288e-17 3.80857672e-11 1.00000000e+00 3.81361401e-11
  1.39120243e-25 5.32178124e-19 1.80802628e-22 9.06344277e-10
  1.21788656e-11 2.37931724e-15]
 [2.67076470e-22 5.65474113e-15 2.62395747e-11 1.00000000e+00
  5.57537104e-19 8.32973135e-11 1.17271560e-20 7.70321695e-10
  3.62009762e-11 1.25702096e-11]
 [2.88916941e-14 1.57988961e-05 1.42890658e-10 1.63627919e-06
  9.99963880e-01 4.66914019e-09 1.08661947e-10 8.24129458e-08
  4.34636462e-07 1.81955911e-05]
 [1.91965618e-21 1.45065928e-20 5.77287502e-22 5.35493871e-10
  1.05006007e-22 1.00000000e+00 3.39945615e-19 2.25828426e-23
  2.37832690e-16 2.14926910e-10]

[0.422191858291626, 0.800000011920929]

(48000, 28, 28, 1)
Y (48000, 10)
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
> 98.525
(48000, 28, 28, 1)
Y (48000, 10)
> 98.750
(48000, 28, 28, 1)
Y (48000, 10)
> 98.975
(48000, 28, 28, 1)
Y (48000, 10)
> 98.608
(48000, 28, 28, 1)
Y (48000, 10)
> 98.542


Train on 54000 samples, validate on 6000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
accuracy 0.99


Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 13, 13, 32)        0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 5408)              0         
_________________________________________________________________
dense_19 (Dense)             (None, 100)               540900    
_________________________________________________________________
dense_20 (Dense)             (None, 10)                1010      
Total params: 542,230
Trainable params: 542,230
Non-trainable params: 0
_________________________________________________________________
