# Inference - Large

This file evaluates the machine learning model on the full MNIST test set.  

See the bottom for the code you need to accelerate.  


In [1]:
%matplotlib inline
import cProfile
import json
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import truncnorm
import timeit

In [2]:
# Geometry of one MNIST image and the size of the label space.
image_size = 28  # images are image_size x image_size pixels
no_of_different_labels = 10  # the digits 0, 1, 2, ..., 9
image_pixels = image_size ** 2  # number of pixels in one image
data_path = "./data/"  # prefix prepended to data file names

In [3]:
# Load the test split: column 0 of each row is used as the label and the
# remaining columns as the image data.
# NOTE(review): read_csv treats the first line of the file as a header row by
# default. If mnist_test.csv has no header line, the first sample is silently
# dropped -- confirm, and pass header=None if that is the case.
test_data = pd.read_csv(data_path + "mnist_test.csv", delimiter=",").values

# Scale factor mapping 0 -> 0.01 and 255 -> 1.0 (assumes raw values lie in
# [0, 255] -- TODO confirm against the data file).
fac = 0.99 / 255
# np.asarray(..., dtype=np.float32) is the direct replacement for
# np.asfarray, which was removed in NumPy 2.0.
test_imgs = np.asarray(test_data[:, 1:], dtype=np.float32) * fac + 0.01
# Insert a length-1 axis so each sample is a (1, n_columns) row matrix.
test_imgs = test_imgs.reshape(test_imgs.shape[0], 1, test_imgs.shape[1])

# Labels are kept as an (n_samples, 1) float32 column.
test_labels = np.asarray(test_data[:, :1], dtype=np.float32)

lr = np.arange(no_of_different_labels)
# transform labels into one hot representation
test_labels_one_hot = (lr == test_labels).astype(np.float32)

# we don't want exact zeroes and ones in the labels either:
test_labels_one_hot[test_labels_one_hot == 0] = 0.001
test_labels_one_hot[test_labels_one_hot == 1] = 0.999

## Machine Learning Model

Based on: https://towardsdatascience.com/math-neural-network-from-scratch-in-python-d6da9f29ce65

All training (backward propagation) code has been removed.

In [4]:
# Base class
class Layer:
    """Common base class for the layers of the network.

    Declares the ``input`` and ``output`` attributes, both initialised to
    ``None``.
    """

    def __init__(self):
        self.input = self.output = None

    def forward_propagation(self, input):
        """Compute the output Y of the layer for a given input X.

        The base implementation always raises ``NotImplementedError``;
        concrete layers provide their own version.
        """
        raise NotImplementedError()

    # backward_propagation removed

In [5]:
# inherit from base class Layer
class FCLayer(Layer):
    """Fully connected layer computing ``dot(input, weights) + bias``.

    Parameters
    ----------
    input_size : int
        Number of input neurons.
    output_size : int
        Number of output neurons.
    """

    def __init__(self, input_size, output_size):
        # Run the base initialiser so `input` and `output` exist (as None)
        # before the first forward pass.
        super().__init__()
        # Weights and bias start out uniformly distributed in [-0.5, 0.5).
        self.weights = np.random.rand(input_size, output_size) - 0.5
        self.bias = np.random.rand(1, output_size) - 0.5

    def forward_propagation(self, input_data):
        """Return the layer output for ``input_data``.

        The input and the computed output are also stored on the instance as
        ``self.input`` and ``self.output``.
        """
        self.input = input_data
        self.output = np.dot(self.input, self.weights) + self.bias
        return self.output

    # backward_propagation removed

In [6]:
# inherit from base class Layer
class ActivationLayer(Layer):
    """Layer that applies an activation function to its input.

    Parameters
    ----------
    activation : callable
        Function applied to the input in ``forward_propagation``.
    activation_prime : callable
        Stored on the instance; not used by the forward pass.
    """

    def __init__(self, activation, activation_prime):
        # Run the base initialiser so `input` and `output` exist (as None)
        # before the first forward pass.
        super().__init__()
        self.activation = activation
        self.activation_prime = activation_prime

    def forward_propagation(self, input_data):
        """Return ``activation(input_data)``.

        The input and the activated output are also stored on the instance as
        ``self.input`` and ``self.output``.
        """
        self.input = input_data
        self.output = self.activation(self.input)
        return self.output

    # backward_propagation removed

In [7]:
class TanhLayer(ActivationLayer):
    """Activation layer that applies the hyperbolic tangent."""

    # static; no longer used by tanh() but kept so that existing references
    # to TanhLayer.e continue to work
    e = 2.71828182845904523536028747135266249775724709369995

    #http://www.plunk.org/~hatch/rightway.php
    #https://math.stackexchange.com/questions/518758/alternative-form-for-sinhx-coshx
    @staticmethod
    def tanh(x):
        """Return tanh(x) for a scalar or NumPy array.

        Delegates to ``np.tanh`` instead of evaluating
        ``(1 - e**(-2x)) / (1 + e**(-2x))`` directly: for large negative x the
        explicit form overflows in ``e**(-2x)`` and yields inf/inf = nan for
        arrays, whereas ``np.tanh`` saturates cleanly at -1 and +1.
        """
        return np.tanh(x)

    @staticmethod
    def tanh_prime(x):
        """Return the derivative of tanh at x, i.e. ``1 - tanh(x)**2``."""
        return 1 - TanhLayer.tanh(x) ** 2

    def __init__(self):
        super(TanhLayer, self).__init__(self.tanh, self.tanh_prime)

In [8]:
def mse():
    """Empty placeholder; takes no arguments and returns None.

    NOTE(review): presumably kept only so that a name referenced by the
    pickled network still resolves when it is loaded -- confirm.
    """
    return None


def mse_prime():
    """Empty placeholder; takes no arguments and returns None.

    NOTE(review): presumably kept only so that a name referenced by the
    pickled network still resolves when it is loaded -- confirm.
    """
    return None

In [9]:
class Network:
    """An ordered list of layers that are evaluated one after another."""

    def __init__(self):
        self.layers = []

    def _run_layers(self, sample):
        """Feed one sample through every layer in order; return the final output."""
        signal = sample
        for stage in self.layers:
            signal = stage.forward_propagation(signal)
        return signal

    def predict(self, input_data):
        """Predict the output for every sample in ``input_data``.

        The first dimension of ``input_data`` indexes the samples. Returns a
        list holding one network output per sample, in input order.
        """
        return [self._run_layers(input_data[idx])
                for idx in range(len(input_data))]

    # backward_propagation removed

    @classmethod
    def load(cls, fname):
        """Unpickle and return the object stored in the file ``fname``.

        Unpickling can execute arbitrary code, so only load files that come
        from a trusted source.
        """
        import pickle
        with open(fname, "br") as stream:
            restored = pickle.load(stream)
        return restored

## Load the model

In [10]:
# Restore the network from 'network.pkl' via Network.load, which unpickles the file.
net = Network.load('network.pkl')

In [11]:
def evaluate(net, data, labels):
    """Count how many samples ``net`` classifies correctly.

    For each index, ``net.predict(data[idx])`` is converted to an array and
    the position of its largest element is compared with ``labels[idx]``.

    Returns a ``(corrects, wrongs)`` tuple of counts.
    """
    corrects = 0
    wrongs = 0
    for idx in range(len(data)):
        predicted = np.array(net.predict(data[idx])).argmax()
        if predicted == labels[idx]:
            corrects += 1
            continue
        wrongs += 1
    return corrects, wrongs

In [12]:
def evaluate_hw(net, data, labels):
    """Skeleton for evaluating the test set on the FPGA.

    Loads the overlay, then walks over the samples the same way ``evaluate``
    does. The actual FPGA transfer is still a placeholder: ``res`` is always
    ``None``, so until it is filled in every sample is counted as wrong.

    Returns a ``(corrects, wrongs)`` tuple of counts.
    """
    # Imported locally because the notebook only imports pynq in a later cell.
    from pynq import Overlay

    corrects, wrongs = 0, 0
    overlay = Overlay('./verilog/bitstreams/784_10.bit')
    dma = overlay.axi_dma_0
    # NOTE(review): assumes this overlay exposes an axi_bitcount_0 IP --
    # confirm against the bitstream.
    mmio = overlay.axi_bitcount_0.S_AXI_LITE

    # The original compared against labels[i] without ever defining i; the
    # per-sample loop is restored here so the skeleton no longer raises
    # NameError.
    for i in range(len(data)):
        # Send data[i] to FPGA for prediction
        # FPGA returns results
        res = None  # Res = results from FPGA (TODO: not implemented yet)

        if res == labels[i]:
            corrects += 1
        else:
            wrongs += 1

    return corrects, wrongs

# ============================
# This is the part you need to accelerate:
# ============================

In [13]:
# Time one full evaluation of the software network on the test set.
start = timeit.default_timer()
corrects, wrongs = evaluate(net, test_imgs, test_labels)
stop = timeit.default_timer()

# Report the tallies, the accuracy (as a fraction, and as a percentage
# truncated to one decimal place) and the elapsed time.
print("Total Correct:", corrects, sep="")
print("Total Incorrect: ", wrongs, sep="")
print("Overall Accruracy: ", corrects / (corrects + wrongs), sep="")
print("Overall Accruracy (%): ", int(1000 * corrects / (corrects + wrongs)) / 10, "%", sep="")
print()
print('Run Time: ', stop - start, ' Seconds', sep="")

Total Correct:9565
Total Incorrect: 434
Overall Accruracy: 0.9565956595659566
Overall Accruracy (%): 95.6%

Run Time: 22.295984556999883 Seconds


### For reference, my Pynq takes ~22.4 seconds for the above evaluation.  

# Your mission:  Make this run faster, while keeping >95% overall accuracy!

### Please refer to the "Inference_Small" notebook for details about swapping out individual layers

In [14]:
from pynq import Xlnk
from pynq import Overlay

class hardwareLayer(Layer):
    """Layer whose forward pass is carried out by the FPGA overlay.

    The input is copied into a buffer, sent out through the overlay's
    ``axi_dma_0`` send channel, and the result is read back through the
    receive channel.

    Parameters
    ----------
    bit : str
        Path of the bitstream passed to ``Overlay``.
    input_sz : int
        Number of float32 values sent per forward pass.
    output_sz : int
        Number of float32 values received per forward pass.
    """

    def __init__(self, bit, input_sz, output_sz):
        # Run the base initialiser so `input` and `output` exist (as None)
        # before the first forward pass.
        super().__init__()
        self.overlay = Overlay(bit)
        self.dma = self.overlay.axi_dma_0

        # Both buffers are allocated once and reused for every transfer.
        xlnk = Xlnk()
        self.input_buffer = xlnk.cma_array(
                                shape=(input_sz,),
                                dtype=np.float32)
        self.output_buffer = xlnk.cma_array(
                                shape=(output_sz,),
                                dtype=np.float32)

    def forward_propagation(self, input):
        """Run ``input`` through the FPGA and return a ``(1, output_sz)`` array.

        The input and the returned output are also stored on the instance as
        ``self.input`` and ``self.output``, like the software layers do.
        """
        self.input = input

        # use this for first layer
        np.copyto(self.input_buffer, input)

        self.dma.sendchannel.transfer(self.input_buffer)
        self.dma.recvchannel.transfer(self.output_buffer)

        self.dma.sendchannel.wait()
        self.dma.recvchannel.wait()

        # Copy the result out of the shared buffer: returning a view would
        # let the next transfer overwrite results that callers still hold
        # (e.g. the list built by Network.predict).
        # output expects a [1,output_sz] matrix (not vector)
        self.output = np.array(self.output_buffer).reshape(
                                1, len(self.output_buffer))
        return self.output

In [15]:
# Network consisting of a single FPGA-backed layer.
hw_net = Network()

# 784 and 10 are hardcoded in bitstream
hw_net.layers.append(
    hardwareLayer(
        bit='./verilog/bitstreams/784_10.bit',
        input_sz=784,
        output_sz=10,
    )
)

In [16]:
# Time one full evaluation of the hardware-backed network on the test set.
start = timeit.default_timer()
corrects, wrongs = evaluate(hw_net, test_imgs, test_labels)
stop = timeit.default_timer()

# Report the tallies, the accuracy (as a fraction, and as a percentage
# truncated to one decimal place) and the elapsed time.
print("Total Correct:", corrects, sep="")
print("Total Incorrect: ", wrongs, sep="")
print("Overall Accruracy: ", corrects / (corrects + wrongs), sep="")
print("Overall Accruracy (%): ", int(1000 * corrects / (corrects + wrongs)) / 10, "%", sep="")
print()
print('Run Time: ', stop - start, ' Seconds', sep="")

Total Correct:9562
Total Incorrect: 437
Overall Accruracy: 0.9562956295629563
Overall Accruracy (%): 95.6%

Run Time: 11.303748370000449 Seconds
