# Library

In [3]:
import numpy as np
import torch
import torch.nn as nn

from utils import *
from dataset import CollisionDataset
from torch.utils.data import DataLoader

# Set up hardware

In [4]:
# Import libraries needed to use the accelerator
import pynq.lib.dma # For using the DMA
from pynq import Xlnk # Used for allocating contiguous arrays
import numpy as np # Xlnk uses numpy arrays
from pynq import Overlay # Used to download the bitstream
import struct
from pynq import DefaultIP # Used for AXI-Lite class

In [5]:
overlay = Overlay('/home/xilinx/Linear2x7_112420_2wayDMA_2/backward_lite_features.bit') # Download the bitstream onto the FPGA

# In this accelerator, we are accelerating two kernels and each has its own DMA which are assigned here:
dma1 = overlay.axi_dma_0 # Backward
# dma2 = overlay.axi_dma_1 # Equation Matrix

# Since this IP uses AXI-Lite for the output, we can associate that to a variable and then use our class defined above through 
# this
# backward_ip = overlay.backward_lite_0

xlnk = Xlnk() # Used for allocation
# Allocating the contiguous arrays of a fixed size:
in_stream = xlnk.cma_array(shape=(32*2+32*2+(32*7),1), dtype=np.float32)
out_stream = xlnk.cma_array(shape=((7+1)*2), dtype=np.float32)

in_buffer = xlnk.cma_array(shape=(32,2), dtype=np.float32)
out_buffer = xlnk.cma_array(shape=(32,7), dtype=np.float32)

# Model

In [6]:
class NaiveMLP(nn.Module):

    def __init__(self, in_dim, out_dim):
        super(NaiveMLP, self).__init__()
        self.hidden_dim = 128
        self.fc_1 = nn.Sequential(
            nn.Linear(in_dim, self.hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(self.hidden_dim, self.hidden_dim),
            nn.ReLU(inplace=True),
        )
        self.fc_out = nn.Linear(self.hidden_dim, out_dim)

    def forward(self, x):
        x = self.fc_1(x)
        x = self.fc_out(x)

        return x

In [62]:
def train_model(model, train_loader, test_loader, num_epochs, optimizer, scheduler, criterion):
    model.train()
        
    # Training the Model
    min_test_dif = float('inf')
    epoch_loss = []
    for epoch in range(num_epochs):
        batch_loss = []
        for i, data in enumerate(train_loader):
            # get the inputs
            x = data['x']
            y = data['y']
            
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            layer1 = net[0]
            z = layer1(x)
            y_hat = net(x)
            loss = criterion(y_hat, y)
            
            
            # Pre-processing for the other accelerator
            batch_x_stream = x.t()
            batch_x_stream = batch_x_stream.reshape(32*7, 1)
            y_stream = y.t()
            y_stream = y_stream.reshape(32 * 2, 1);
            y_hat_stream = y_hat.t()
            y_hat_stream = y_hat_stream.reshape(32 * 2, 1);
            z_stream = z.t()
            z_stream = z_stream.reshape(32*2,1)
            
            in_stream[:] = torch.cat(( y_stream.data, y_hat_stream.data, z_stream.data, batch_x_stream.data), 0).numpy()[:]
            
            t = time.time()
            loss.backward()
            pytorchLatency = time.time() - t

            # Transfer data to the DMA
            t = time.time()
            dma1.sendchannel.transfer(in_stream)
            dma1.recvchannel.transfer(out_stream)
            dma1.sendchannel.wait()
            dma1.recvchannel.wait()
            PYNQLatency = time.time() - t
            
#             manualGrad = backward_manual(y_hat, y, z, x)

            weight_grad = torch.reshape(torch.tensor(out_stream[0:14]), (2,7))
            bias_grad = torch.squeeze(torch.tensor(out_stream[14:16]))
            print()
            print()
            flag = True
            for param in model.parameters():
                if (flag):
                    RMSE = np.sqrt(np.mean(np.square((weight_grad - param.grad).numpy())))
                    print('PyTorch - PYNQ backprop RMSE for W:')
                    print(RMSE)
                    flag = False
                else:
                    RMSE = np.sqrt(np.mean(np.square((bias_grad - param.grad).numpy())))
                    print('PyTorch - PYNQ backprop RMSE for b:')
                    print(RMSE)
                    flag = True
                    
            
        
            print()
            print('Pytorch backprop latency:')
            print(str(round(pytorchLatency,5)) + " s")
            print('PYNQ backprop latency:')
            print(str(round(PYNQLatency,5)) + " s")
            print('Acceleration factor (CPU_Latency / PYNQ_Latency): ')
            print(round(pytorchLatency / PYNQLatency,5))
            print()
            
            optimizer.step()
            
            batch_loss.append(loss.item())

        # Results every epoch
        cur_epoch_loss = sum(batch_loss) / len(batch_loss)
        
        # Scheduler
        scheduler.step(cur_epoch_loss)
        
        # Test the network
        train_dif = test_model(model, train_loader, criterion)
        test_dif = test_model(model, test_loader, criterion)
        
        # Print the result
        print('Epoch: %d Train Loss: %f Train Dif: %f Test Dif: %f' 
              % (epoch, cur_epoch_loss, train_dif, test_dif))
        epoch_loss.append(cur_epoch_loss)
        
#         for param in net.parameters():
#         #     print(param)
#             print(param)
        print(torch.max(weight_grad))
        print(torch.max(bias_grad))
        
        if min_test_dif > test_dif:
            min_test_dif = test_dif
            print('Best')
        
    return epoch_loss

def test_model(model, test_loader, criterion):
    # Test the Model
    model.eval()
    
    batch_loss = []
    for i, data in enumerate(test_loader):

        # get the inputs
        x = data['x']
        y = data['y']

        # x = x.cuda()
        # y = y.cuda()

        y_hat = net(x)
        loss = criterion(y_hat, y)
        batch_loss.append(loss.item())

    # Results every epoch
    cur_epoch_loss = sum(batch_loss) / len(batch_loss)
    
    model.train()
    
    return cur_epoch_loss

In [63]:
%%time

import time

overlay = Overlay('/home/xilinx/Linear2x7_112420_2wayDMA_2/backward_lite_features.bit') # Download the bitstream onto the FPGA

dma1 = overlay.axi_dma_0 # Backward
xlnk = Xlnk() # Used for allocation

in_stream = xlnk.cma_array(shape=(32*(3 * 2 + 7),1), dtype=np.float32)
out_stream = xlnk.cma_array(shape=((7+1)*2,1), dtype=np.float32)

#################### Hyperparameters ####################
num_epochs = 4
learning_rate = 0.001
weight_decay = 0
in_frames_num = 3
pre_frames_num = 15
factor = 0.95
patience = 40
batch_size = 32
#################### Hyperparameters ####################
# net = NaiveMLP(in_dim=7, out_dim=2).cuda()
# net = NaiveMLP(in_dim=7, out_dim=2)
# net = nn.Linear(7,2)
net = nn.Sequential(
    nn.Linear(7, 2),
    nn.ReLU(inplace=True)
        )
criterion = torch.nn.MSELoss()
# criterion = torch.nn.functional.smooth_l1_loss

train_set = CollisionDataset(
    './dataset/dataset/uIsPoint3/train', 
    sample_num=32
)

test_set = CollisionDataset(
    './dataset/dataset/uIsPoint3/test',
)

print(len(train_set), len(test_set))

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=len(test_set), shuffle=False)

optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)
# optimizer = torch.optim.SGD(net.parameters(), lr=0.1, momentum=0.9)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 
    mode='min', 
    factor=factor, 
    patience=patience, 
    verbose=True, 
    threshold=1e-3
)

train_loss = train_model(
    net, 
    train_loader, 
    test_loader, 
    num_epochs, 
    optimizer, 
    scheduler, 
    criterion
)

32 512


PyTorch - PYNQ backprop RMSE for W:
0.000631185
PyTorch - PYNQ backprop RMSE for b:
0.000137269

Pytorch backprop latency:
0.0012 s
PYNQ backprop latency:
0.00048 s
Acceleration factor (CPU_Latency / PYNQ_Latency): 
2.53186

Epoch: 0 Train Loss: 66.412178 Train Dif: 66.134560 Test Dif: 57.004086
tensor(20.0771)
tensor(5.3026)
Best


PyTorch - PYNQ backprop RMSE for W:
0.000578955
PyTorch - PYNQ backprop RMSE for b:
0.00011917

Pytorch backprop latency:
0.00131 s
PYNQ backprop latency:
0.00046 s
Acceleration factor (CPU_Latency / PYNQ_Latency): 
2.828

Epoch: 1 Train Loss: 66.134560 Train Dif: 65.858490 Test Dif: 56.792515
tensor(19.9928)
tensor(5.2794)
Best


PyTorch - PYNQ backprop RMSE for W:
0.000654468
PyTorch - PYNQ backprop RMSE for b:
0.000150596

Pytorch backprop latency:
0.00128 s
PYNQ backprop latency:
0.00046 s
Acceleration factor (CPU_Latency / PYNQ_Latency): 
2.77686

Epoch: 2 Train Loss: 65.858490 Train Dif: 65.583984 Test Dif: 56.581882
tensor(19.9084)
tensor(5.

# Backprop Testbed

In [64]:
overlay = Overlay('/home/xilinx/Linear2x7_112420_2wayDMA_2/backward_lite_features.bit') # Download the bitstream onto the FPGA

# In this accelerator, we are accelerating two kernels and each has its own DMA which are assigned here:
dma1 = overlay.axi_dma_0 # Backward
# dma2 = overlay.axi_dma_1 # Equation Matrix

# Since this IP uses AXI-Lite for the output, we can associate that to a variable and then use our class defined above through 
# this
# backward_ip = overlay.backward_lite_0

xlnk = Xlnk() # Used for allocation
# Allocating the contiguous arrays of a fixed size:
in_stream = xlnk.cma_array(shape=(32*2+32*2+(32*7),1), dtype=np.float32)
out_stream = xlnk.cma_array(shape=((7+1)*2,1), dtype=np.float32)

in_buffer = xlnk.cma_array(shape=(32,2), dtype=np.float32)
out_buffer = xlnk.cma_array(shape=(32,7), dtype=np.float32)

# Set up pytorch network, get x and y data from train_loader

In [65]:
overlay = Overlay('/home/xilinx/Linear2x7_112420_2wayDMA_2/backward_lite_features.bit') # Download the bitstream onto the FPGA

# In this accelerator, we are accelerating two kernels and each has its own DMA which are assigned here:
dma1 = overlay.axi_dma_0 # Backward
# dma2 = overlay.axi_dma_1 # Equation Matrix

# Since this IP uses AXI-Lite for the output, we can associate that to a variable and then use our class defined above through 
# this
# backward_ip = overlay.backward_lite_0

xlnk = Xlnk() # Used for allocation
# Allocating the contiguous arrays of a fixed size:
in_stream = xlnk.cma_array(shape=(32*2+32*2+(32*7),1), dtype=np.float32)
out_stream = xlnk.cma_array(shape=((7+1)*2,1), dtype=np.float32)

in_buffer = xlnk.cma_array(shape=(32,2), dtype=np.float32)
out_buffer = xlnk.cma_array(shape=(32,7), dtype=np.float32)


In [66]:
#################### Hyperparameters ####################
num_epochs = 50
learning_rate = 0.001
weight_decay = 0
in_frames_num = 3
pre_frames_num = 15
factor = 0.95
patience = 40
batch_size = 32
#################### Hyperparameters ####################
# net = NaiveMLP(in_dim=7, out_dim=2).cuda()
# net = NaiveMLP(in_dim=7, out_dim=2)

net = nn.Linear(7,2)
criterion = torch.nn.MSELoss()
# criterion = torch.nn.functional.smooth_l1_loss

train_set = CollisionDataset(
    './dataset/dataset/uIsPoint3/train', 
    sample_num=32
)

test_set = CollisionDataset(
    './dataset/dataset/uIsPoint3/test',
)

print(len(train_set), len(test_set))

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=len(test_set), shuffle=False)

optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate, weight_decay=weight_decay)
# optimizer = torch.optim.SGD(net.parameters(), lr=0.1, momentum=0.9)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 
    mode='min', 
    factor=factor, 
    patience=patience, 
    verbose=True, 
    threshold=1e-3
)

32 512


In [67]:
for i, data in enumerate(train_loader):
    # get the inputs
    x = data['x']
    y = data['y']

# Run single forward/backward pass, compare FPGA to CPU results

In [68]:
overlay = Overlay('/home/xilinx/Linear2x7_112420_2wayDMA_2/backward_lite_features.bit') # Download the bitstream onto the FPGA

In [69]:
# In this accelerator, we are accelerating two kernels and each has its own DMA which are assigned here:
dma1 = overlay.axi_dma_0 # Backward
# dma2 = overlay.axi_dma_1 # Equation Matrix

# Since this IP uses AXI-Lite for the output, we can associate that to a variable and then use our class defined above through 
# this
# backward_ip = overlay.backward_lite_0

xlnk = Xlnk() # Used for allocation
# Allocating the contiguous arrays of a fixed size:
in_stream = xlnk.cma_array(shape=(32*(3 * 2 + 7),1), dtype=np.float32)
out_stream = xlnk.cma_array(shape=((7+1)*2,1), dtype=np.float32)

in_buffer = xlnk.cma_array(shape=(32,2), dtype=np.float32)
out_buffer = xlnk.cma_array(shape=(32,7), dtype=np.float32)



# model = nn.Linear(7,2)
# net = model
# model = net
model = nn.Sequential(
    nn.Linear(7, 2),
    nn.ReLU(inplace=True)
        )
# net = model
# model = net

# zero the parameter gradients
# optimizer.zero_grad()

# forward + backward + optimize
layer0 = model[0]
layer1 = model[1]
# z = layer1(x)
# y_hat = model(x)

# zero the parameter gradients
optimizer.zero_grad()

z = layer0(x)
y_hat = model(x)

loss = criterion(y_hat, y)
print('loss')
print(loss.item())

# Pre-processing for the other accelerator
batch_x_stream = x.t()
batch_x_stream = batch_x_stream.reshape(32*7, 1)

y_stream = y.t()
y_stream = y_stream.reshape(32 * 2, 1)
y_hat_stream = y_hat.t()
y_hat_stream = y_hat_stream.reshape(32 * 2, 1)
z_stream = z.t()
z_stream = z_stream.reshape(32*2,1)

in_stream = xlnk.cma_array(shape=(32*(3*2+7),1), dtype=np.float32)
out_stream = xlnk.cma_array(shape=((7+1)*2,1), dtype=np.float32)

in_stream_zeros = xlnk.cma_array(shape=(32*(3*2+7),1), dtype=np.float32)
in_stream_zeros[:] = np.zeros((32*(3*2+7),1), dtype=np.float32)

# print(y[1:10])
# print(y_stream)
in_stream[:] = torch.cat((y_stream.data, y_hat_stream.data, z_stream.data, batch_x_stream.data), 0).numpy()[:]
# print('in_stream')
# print(in_stream)

loss
55.46831130981445


In [70]:
import numpy as np
def column(matrix, i):
    return [row[i] for row in matrix]


def backward_manual(y_hat, y, z, x):
    diff = y_hat - y


    diff = diff * 1 / 32
    diff[abs(diff) > 1] = diff[abs(diff) > 1] / abs(diff[abs(diff) > 1])
    diff[z < 0] = 0;


    # print(diff)

    rowIndex = 0
    colIndex = 0

    manualGrad = np.zeros((2,7))


    for row in diff:
#         print(manualGrad[0,0])
#         print(x[rowIndex][0])
#         print(diff[rowIndex][0])
#         print(1/32 * (y_hat[rowIndex][0] - y[rowIndex][0]) / (-1 * ))
        
        for i in range(2):
            for j in range(7):
                manualGrad[i,j] = manualGrad[i,j] + row[i] * x[rowIndex][j]
                
        rowIndex = rowIndex + 1
    return torch.tensor(manualGrad)


In [71]:
# Compare backprop results
t = time.time()
loss.backward()
pytorchLatency = time.time() - t


# Transfer data to the DMA
t = time.time()
dma1.sendchannel.transfer(in_stream)
dma1.recvchannel.transfer(out_stream)
dma1.sendchannel.wait()
dma1.recvchannel.wait()
PYNQLatency = time.time() - t

weight_grad = torch.reshape(torch.tensor(out_stream[0:14]), (2,7))

bias_grad = torch.reshape(torch.tensor(out_stream[14:16]), (1,2))

# manualGrad = backward_manual(y_hat, y, z, x)

flag = True
for param in model.parameters():
    if (flag):
        RMSE = np.sqrt(np.mean(np.square((weight_grad - param.grad).numpy())))
        print('PyTorch - PYNQ backprop RMSE for W:')
        print(RMSE)
        flag = False
    else:
        RMSE = np.sqrt(np.mean(np.square((bias_grad - param.grad).numpy())))
        print('PyTorch - PYNQ backprop RMSE for b:')
        print(RMSE)
        flag = True



print()
print('Pytorch backprop latency:')
print(str(round(pytorchLatency,5)) + " s")
print('PYNQ backprop latency:')
print(str(round(PYNQLatency,5)) + " s")
print('Acceleration factor (CPU_Latency / PYNQ_Latency): ')
print(round(pytorchLatency / PYNQLatency,5))
print()

PyTorch - PYNQ backprop RMSE for W:
0.00047887
PyTorch - PYNQ backprop RMSE for b:
7.3935e-05

Pytorch backprop latency:
0.00252 s
PYNQ backprop latency:
0.00219 s
Acceleration factor (CPU_Latency / PYNQ_Latency): 
1.14872



# 7x7 --> 7x2

In [84]:
overlay = Overlay('/home/xilinx/Linear2x7_7x7_120220_2wayDMA/backward_lite_features.bit') # Download the bitstream onto the FPGA

dma1 = overlay.axi_dma_0 # Backward
xlnk = Xlnk() # Used for allocation

N_in = 7
N_hidden_0 = 7 
N_out = 2

# model = nn.Linear(7,2)
model = nn.Sequential(
    nn.Linear(N_in, N_hidden_0),
    nn.Linear(N_hidden_0, N_out)
        )
net = model

# zero the parameter gradients
optimizer.zero_grad()

# forward + backward + optimize
layer0 = model[0]
layer1 = model[1]

z = layer0(x)
y_hat = model(x)

# print('y_hat')
# print(y_hat)

loss = criterion(y_hat, y)
print('loss')
print(loss.item())

t = time.time()
loss.backward()
pytorchLatency = time.time() - t

# Merge y_hat, y, z, x, weights into in_stream vector

y_stream = y.t()
y_stream = y_stream.reshape(32 * 2, 1)

y_hat_stream = y_hat.t()
y_hat_stream = y_hat_stream.reshape(32 * 2, 1)

z_stream = z.t()
z_stream = z_stream.reshape(32*7,1)

w = layer1.weight
w_stream = w.t()
w_stream = w_stream.reshape(2*7,1)

batch_x_stream = x.t()
batch_x_stream = batch_x_stream.reshape(32*7, 1)

# Instream [y[32x2] y_hat[32x2] z[32x7] w[2x7] x[32x7]]
in_stream = xlnk.cma_array(shape=(32*(2+2+7+7) + 2*7, 1), dtype=np.float32)
# Outstream [W_2_7[14] b_2_7[2] W_7_7[49] b_7_7[7]]
out_stream = xlnk.cma_array(shape=((7+1)*2+(7+1)*7,1), dtype=np.float32)

# print(y[1:10])
# print(y_stream)
in_stream[:] = torch.cat((y_stream.data, y_hat_stream.data, z_stream.data, batch_x_stream.data, w_stream.data), 0).numpy()[:]
# print('in_stream')
# print(str(in_stream).replace('[','').replace(']',','))

loss
80.36248016357422


In [85]:
# print(w)
# print(w_stream)

# print(w[1,2])
# print(w_stream[2 * 2 + 1])

# print(z[2,1])
# print(z_stream[2 + 32 * 1])

# print(x[2,1])
# print(batch_x_stream[2 + 32 * 1])

# # x[i,j] = x_stream[i + j * height(x)]

In [86]:
import numpy as np
def column(matrix, i):
    return [row[i] for row in matrix]

def backward_manual_noReLU(y_hat, y, x):
    diff = y_hat - y


    diff = diff * 1 / 32
    diff[abs(diff) > 1] = diff[abs(diff) > 1] / abs(diff[abs(diff) > 1])
#     diff[z < 0] = 0;


    # print(diff)

    rowIndex = 0
    colIndex = 0

    manualGrad = np.zeros((2,7))


    for row in diff:
#         print(manualGrad[0,0])
#         print(x[rowIndex][0])
#         print(diff[rowIndex][0])
#         print(1/32 * (y_hat[rowIndex][0] - y[rowIndex][0]) / (-1 * ))
        
        for i in range(2):
            for j in range(7):
                manualGrad[i,j] = manualGrad[i,j] + row[i] * x[rowIndex][j]
                
        rowIndex = rowIndex + 1
    return torch.tensor(manualGrad), diff

def backward_manual_2_7_7_7(y_hat, y, z, x, w):
    diff = y_hat - y

    diff = diff * 1 / 32
    diff[abs(diff) > 1] = diff[abs(diff) > 1] / abs(diff[abs(diff) > 1])
#     diff[z < 0] = 0;
    
#     diff = diff * 1 / 32
#     diff[abs(diff) > 1] = diff[abs(diff) > 1] / abs(diff[abs(diff) > 1])

#     print(diff)
#     print(z)

    rowIndex = 0
    colIndex = 0

    manualGrad1 = np.zeros((2,7))
    dh = np.zeros((2,7))
    manualGrad0 = np.zeros((7,7))

    for row in diff:
        for i in range(2):
            for j in range(7):
                manualGrad1[i,j] = manualGrad1[i,j] + row[i] * z[rowIndex][j]
                for k in range(7):
                    manualGrad0[j,k] = manualGrad0[j,k] + row[i] * w[i][j] * x[rowIndex][k]
        rowIndex = rowIndex + 1
        
    return [torch.tensor(manualGrad0), torch.tensor(manualGrad1), diff]

# weights = layer1.weight
# print(weights)

manualGrad = backward_manual_2_7_7_7(y_hat, y, z, x, w)
manualGrad_1layer = backward_manual_noReLU(y_hat, y, z)



In [90]:
# Compare backprop results
# loss.backward()

# manualGrad = backward_manual(y_hat, y, z, x)

# Transfer data to the DMA
t = time.time()
dma1.sendchannel.transfer(in_stream)
dma1.recvchannel.transfer(out_stream)
dma1.sendchannel.wait()
dma1.recvchannel.wait()
PYNQLatency = time.time() - t

# Obtaining the output from the AXI-Lite interface, as well as post-processing
#             bias_grad = torch.tensor([backward_ip.bias1, backward_ip.bias2])
#             weight_grad = torch.tensor([[backward_ip.w1_1, backward_ip.w1_2, backward_ip.w1_3, backward_ip.w1_4, backward_ip.w1_5, backward_ip.w1_6, backward_ip.w1_7], 
#                                         [backward_ip.w2_1, backward_ip.w2_2, backward_ip.w2_3, backward_ip.w2_4, backward_ip.w2_5, backward_ip.w2_6, backward_ip.w2_7]])

# TODO Reshape
weight_grad = { }
bias_grad = { }
weight_grad[1] = torch.reshape(torch.tensor(out_stream[0:14]), (2,7))
bias_grad[1] = torch.reshape(torch.tensor(out_stream[14:16]), (1,2))

weight_grad[0] = torch.reshape(torch.tensor(out_stream[16:16+49]), (7,7))
bias_grad[0] = torch.reshape(torch.tensor(out_stream[16+49:16+49+7]), (1,7))


# bias_grad = torch.tensor(out_stream[14:16])
 
i = 0
flag = True
for param in model.parameters():
    if (flag):
        RMSE = np.sqrt(np.mean(np.square((weight_grad[i] - param.grad).numpy())))
        print('PyTorch - PYNQ backprop RMSE for W', str(i), ':')
        print(RMSE)
        flag = False
    else:
        RMSE = np.sqrt(np.mean(np.square((bias_grad[i] - param.grad).numpy())))
        print('PyTorch - PYNQ backprop RMSE for b', str(i), ':')
        print(RMSE)
        flag = True
        i = i + 1

print()
print('Pytorch backprop latency:')
print(str(round(pytorchLatency,5)) + " s")
print('PYNQ backprop latency:')
print(str(round(PYNQLatency,5)) + " s")
print('Acceleration factor (CPU_Latency / PYNQ_Latency): ')
print(round(pytorchLatency / PYNQLatency,5))

PyTorch - PYNQ backprop RMSE for W 0 :
0.000497502
PyTorch - PYNQ backprop RMSE for b 0 :
0.000530824
PyTorch - PYNQ backprop RMSE for W 1 :
16.7313
PyTorch - PYNQ backprop RMSE for b 1 :
0.000223968

Pytorch backprop latency:
0.00323 s
PYNQ backprop latency:
0.00204 s
Acceleration factor (CPU_Latency / PYNQ_Latency): 
1.58618


# 7x7 --> ReLU --> 7x2

In [94]:
def reshapeStreamList(w):
    for i in range(len(w)):
        if i == 0:
            w_stream = reshapeStream(w[i])
        else:
            w_stream = torch.cat((w_stream,reshapeStream(w[i])),0)
    return w_stream
    
def reshapeStream(x):
    x_stream = x.t()
    x_stream = x_stream.reshape(np.size(x.data.numpy()), 1)   
    return x_stream
        
def getNetworkFeatures(model, x):
    z = []
    a = []
    w = []
    zIndex = 0
    aIndex = 0
    reluFlag = 0
    for name, layer in model.named_modules():
        if isinstance(layer, torch.nn.Linear):
            if(zIndex == 0):
                z.append(layer(x))
            else:
                if(reluFlag):
                    z.append(layer(a[aIndex-1]))
                else:
                    z.append(layer(z[zIndex-1]))
                w.append(layer.weight)
            zIndex = zIndex + 1
        if isinstance(layer, torch.nn.ReLU):
            reluFlag = 1
            a.append(layer(z[zIndex - 1]))
            aIndex = aIndex + 1
            
    z = z[0:len(z) - 1] # Discard last element of z ( = y_hat)
    return a, z, w

In [95]:
overlay = Overlay('/home/xilinx/Linear7x7_ReLU_7x2/backward_lite_features.bit') # Download the bitstream onto the FPGA

dma1 = overlay.axi_dma_0 # Backward
xlnk = Xlnk() # Used for allocation

N_in = 7
N_hidden_0 = 7 
N_out = 2

# model = nn.Linear(7,2)
model = nn.Sequential(
    nn.Linear(N_in, N_hidden_0),
    nn.ReLU(inplace=True),
    nn.Linear(N_hidden_0, N_out)
        )
net = model

# zero the parameter gradients
optimizer.zero_grad()

# forward + backward + optimize
a,z,w = getNetworkFeatures(model, x)


a,z,w = getNetworkFeatures(model, x)
y_hat = model(x)

y_stream = reshapeStream(y)
y_hat_stream = reshapeStream(y_hat)
x_stream = reshapeStream(x)

a_stream = reshapeStreamList(a)
z_stream = reshapeStreamList(z)
w_stream = reshapeStreamList(w)

loss = criterion(y_hat, y)
print('loss')
print(loss.item())
t = time.time()
loss.backward()
pytorchLatency = time.time() - t

in_stream_data = torch.cat((y_stream.data, y_hat_stream.data, a_stream.data, z_stream.data, x_stream.data, w_stream.data), 0).numpy()[:]
# print(np.size(in_stream_data))
sizeInputData = np.size(in_stream_data)
numHiddenLayers = len(w) - 1

# Pre-processing for the other accelerator

# Instream [y[N_out * BATCH_SIZE] 
#           y_hat[N_out * BATCH_SIZE]
#           a[N_hidden * BATCH_SIZE * numReLULayers] 
#           z[N_hidden * BATCH_SIZE * numHiddenLayers] 
#           x[N_input * BATCH_SIZE]
#           w[N_hidden * N_hidden * numHiddenLayers + N_hidden * N_out]
in_stream = xlnk.cma_array(shape=(sizeInputData,1), dtype=np.float32)
# Outstream [W b W b ... W0 b0]
out_stream = xlnk.cma_array(shape=((N_hidden_0+1)*N_out + (N_hidden_0+1)*N_hidden_0 * numHiddenLayers + (N_in+1)*N_hidden_0,1), dtype=np.float32)

in_stream[:] = torch.cat((y_stream.data, y_hat_stream.data, a_stream.data, z_stream.data, x_stream.data, w_stream.data), 0).numpy()[:]
# print('in_stream')
# print(str(in_stream).replace('[','').replace(']',','))

loss
53.46218490600586


In [96]:
import numpy as np
def column(matrix, i):
    return [row[i] for row in matrix]

def backward_manual_noReLU(y_hat, y, x):
    diff = y_hat - y

    diff = diff * 1 / 32
    diff[abs(diff) > 1] = diff[abs(diff) > 1] / abs(diff[abs(diff) > 1])
#     diff[z < 0] = 0;


    # print(diff)

    rowIndex = 0
    colIndex = 0

    manualGrad = np.zeros((2,7))

    for row in diff:
#         print(manualGrad[0,0])
#         print(x[rowIndex][0])
#         print(diff[rowIndex][0])
#         print(1/32 * (y_hat[rowIndex][0] - y[rowIndex][0]) / (-1 * ))
        
        for i in range(2):
            for j in range(7):
                manualGrad[i,j] = manualGrad[i,j] + row[i] * x[rowIndex][j]
                
        rowIndex = rowIndex + 1
    return torch.tensor(manualGrad), diff

def backward_manual_2_7_ReLU_7_7(y_hat, y, a, z, x, w):
    diff = y_hat - y

    diff = diff * 1 / 32
    diff[abs(diff) > 1] = diff[abs(diff) > 1] / abs(diff[abs(diff) > 1])
#     diff[z < 0] = 0;
    
#     diff = diff * 1 / 32
#     diff[abs(diff) > 1] = diff[abs(diff) > 1] / abs(diff[abs(diff) > 1])

#     print(diff)
#     print(z)

    rowIndex = 0
    colIndex = 0

    manualGrad1 = np.zeros((2,7))
    dh = np.zeros((2,7))
    manualGrad0 = np.zeros((7,7))
#     print(z)
#     print(a)
    

    for row in diff:
        for i in range(2):
            for j in range(7):
                manualGrad1[i,j] = manualGrad1[i,j] + row[i] * a[rowIndex][j]
                for k in range(7):
                    manualGrad0[j,k] = manualGrad0[j,k] + np.heaviside(z[rowIndex][j].detach(),0) * row[i] * w[i][j] * x[rowIndex][k]
                
#             for i in range(7):        
#                 manualGrad0[i,j] = manualGrad0[i,j] + (row[i] * weights[i,j] * x[rowIndex][j])
        rowIndex = rowIndex + 1
        
                
#     for row in diff:
#         for i in range(7):
#             for j in range(7):
#                 manualGrad0[i,j] = manualGrad0[i,j] + (row[i] * weights[j,i] * x[rowIndex][j])
#         rowIndex = rowIndex + 1

#     dh = np.dot(manualGrad1, weights.detach().numpy())
#     print(dh)
#     manualGrad0 = np.dot(dh, x.T)
    
    return [torch.tensor(manualGrad0), torch.tensor(manualGrad1), diff]


manualGrad = backward_manual_2_7_ReLU_7_7(y_hat, y, a[0], z[0], x, w[0])
manualGrad_1layer = backward_manual_noReLU(y_hat, y, a[0])



In [97]:
# Transfer data to the DMA

t = time.time()
dma1.sendchannel.transfer(in_stream)
dma1.recvchannel.transfer(out_stream)
dma1.sendchannel.wait()
dma1.recvchannel.wait()
PYNQLatency = time.time() - t
# Obtaining the output from the AXI-Lite interface, as well as post-processing
#             bias_grad = torch.tensor([backward_ip.bias1, backward_ip.bias2])
#             weight_grad = torch.tensor([[backward_ip.w1_1, backward_ip.w1_2, backward_ip.w1_3, backward_ip.w1_4, backward_ip.w1_5, backward_ip.w1_6, backward_ip.w1_7], 
#                                         [backward_ip.w2_1, backward_ip.w2_2, backward_ip.w2_3, backward_ip.w2_4, backward_ip.w2_5, backward_ip.w2_6, backward_ip.w2_7]])

# TODO Reshape
weight_grad = { }
bias_grad = { }
weight_grad[1] = torch.reshape(torch.tensor(out_stream[0:14]), (2,7))

bias_grad[1] = torch.reshape(torch.tensor(out_stream[14:16]), (1,2))
weight_grad[0] = torch.reshape(torch.tensor(out_stream[16:16+49]), (7,7))

bias_grad[0] = torch.reshape(torch.tensor(out_stream[16+49:16+49+7]), (1,7))


PYNQ backprop latency:


In [98]:
i = 0
flag = True
for param in model.parameters():
    if (flag):
        RMSE = np.sqrt(np.mean(np.square((weight_grad[i] - param.grad).numpy())))
        print('PyTorch - PYNQ backprop RMSE for W', str(i), ':')
        print(RMSE)
        flag = False
    else:
        RMSE = np.sqrt(np.mean(np.square((bias_grad[i] - param.grad).numpy())))
        print('PyTorch - PYNQ backprop RMSE for b', str(i), ':')
        print(RMSE)
        flag = True
        i = i + 1

print()
print('Pytorch backprop latency:')
print(str(round(pytorchLatency,5)) + " s")
print('PYNQ backprop latency:')
print(str(round(PYNQLatency,5)) + " s")
print('Acceleration factor (CPU_Latency / PYNQ_Latency): ')
print(round(pytorchLatency / PYNQLatency,5))

PyTorch - PYNQ backprop RMSE for W 0 :
0.000176482
PyTorch - PYNQ backprop RMSE for b 0 :
0.000171727
PyTorch - PYNQ backprop RMSE for W 1 :
0.000280094
PyTorch - PYNQ backprop RMSE for b 1 :
0.000233868

Pytorch backprop latency:
0.00383 s
PYNQ backprop latency:
0.00214 s
Acceleration factor (CPU_Latency / PYNQ_Latency): 
1.79145


# 7x16 --> ReLU --> 16x16 --> ReLU --> 7x2
#### Similar to Naive_MLP: (7x128 --> ReLU --> 128x128 --> ReLU --> 128x2)

In [132]:
def reshapeStreamList(w):
    for i in range(len(w)):
        if i == 0:
            w_stream = reshapeStream(w[i])
        else:
            w_stream = torch.cat((w_stream,reshapeStream(w[i])),0)
    return w_stream
    
def reshapeStream(x):
    x_stream = x.t()
    x_stream = x_stream.reshape(np.size(x.data.numpy()), 1)   
    return x_stream
        
def getNetworkFeatures(model, x):
    z = []
    a = []
    w = []
    zIndex = 0
    aIndex = 0
    reluFlag = 0
    for name, layer in model.named_modules():
        if isinstance(layer, torch.nn.Linear):
            if(zIndex == 0):
                z.append(layer(x))
            else:
                if(reluFlag):
                    z.append(layer(a[aIndex-1]))
                else:
                    z.append(layer(z[zIndex-1]))
                w.append(layer.weight)
            zIndex = zIndex + 1
        if isinstance(layer, torch.nn.ReLU):
            reluFlag = 1
            a.append(layer(z[zIndex - 1]))
            aIndex = aIndex + 1
            
    z = z[0:len(z) - 1] # Discard last element of z ( = y_hat)
    return a, z, w

In [143]:
import numpy as np
def column(matrix, i):
    return [row[i] for row in matrix]

def backward_manual_noReLU(y_hat, y, x):
    diff = y_hat - y

    diff = diff * 1 / 32
    diff[abs(diff) > 1] = diff[abs(diff) > 1] / abs(diff[abs(diff) > 1])
#     diff[z < 0] = 0;


    # print(diff)

    rowIndex = 0
    colIndex = 0

    manualGrad = np.zeros((2,16))

    for row in diff:
#         print(manualGrad[0,0])
#         print(x[rowIndex][0])
#         print(diff[rowIndex][0])
#         print(1/32 * (y_hat[rowIndex][0] - y[rowIndex][0]) / (-1 * ))
        
        for i in range(2):
            for j in range(16):
                manualGrad[i,j] = manualGrad[i,j] + row[i] * x[rowIndex][j]
                
        rowIndex = rowIndex + 1
    return torch.tensor(manualGrad), diff

def backward_manual_2x16_ReLU_16x16_ReLU_7x7(in_stream):
    N_in = 7 
    N_hidden = 16
    N_out = 2 
    BATCH_SIZE = 32
    diff = y_hat - y
    numHiddenLayers = 1
    numReluLayers = 2
                   
                   
    size_y = BATCH_SIZE * (N_out);
    size_a = BATCH_SIZE * (N_hidden * numReluLayers);
    size_z = BATCH_SIZE * (N_hidden * (numHiddenLayers+1));
    size_x = BATCH_SIZE * (N_in);
    size_w = (N_out * N_hidden) + (N_hidden * N_hidden * numHiddenLayers);
    
    LIM1 = size_y;      
    LIM2 = size_y * 2;   
    LIM3 = size_y * 2 + size_a;   
    LIM4 = size_y * 2 + size_a + size_z;   
    LIM5 = size_y * 2 + size_a + size_z + size_x; 
    LIM6 = size_y * 2 + size_a + size_z + size_x + size_w; 
       
    
    LIM2W = (N_out) * (N_hidden)
    LIM2b = (N_out) * (N_hidden + 1)

    LIM1W = (N_out) * (N_hidden + 1) + (N_hidden) * (N_hidden)
    LIM1b = (N_out) * (N_hidden + 1) + (N_hidden) * (N_hidden + 1)

    LIM0W = (N_out) * (N_hidden + 1) + (N_hidden) * (N_hidden + 1) \
                                     + (N_hidden) * (N_in)
    LIM0b = (N_out) * (N_hidden + 1) + (N_hidden) * (N_hidden + 1) \
                                     + (N_hidden) * (N_in + 1)
               
    nn_out_mat = in_stream[0:LIM1][:]
    batch_y_mat = in_stream[LIM1:LIM2][:]  
    batch_a_mat = in_stream[LIM2:LIM3][:]
    batch_z_mat = in_stream[LIM3:LIM4][:]
    batch_x_mat = in_stream[LIM4:LIM5][:]
    batch_w_mat = in_stream[LIM5:LIM6][:]
               
    dataOut = np.zeros([(N_out) * (N_hidden + 1) + \
                        (N_hidden) * (N_hidden + 1) + \
                        (N_hidden) * (N_in + 1)])
               
    diff = batch_y_mat - nn_out_mat
    diff = diff * 1 / 32
    diff[abs(diff) > 1] = diff[abs(diff) > 1] / abs(diff[abs(diff) > 1])

    rowIndex = 0
    colIndex = 0

    manualGrad2 = np.zeros((N_out,N_hidden))
    manualGrad1 = np.zeros((N_hidden,N_hidden))
    manualGrad0 = np.zeros((N_hidden,N_in))
#     print(z)
#     print(a)
    
    dRelu = 0
    
    
#     print(np.size(manualGrad1),0)

    for batchIndex in range(BATCH_SIZE):
        for i in range(N_out):
            for j in range(N_hidden):
                manualGrad2[i,j] = manualGrad2[i,j] \
                                   + diff[batchIndex + i * BATCH_SIZE] \
                                   * batch_a_mat[batchIndex + j * BATCH_SIZE + N_hidden * BATCH_SIZE]
                for k in range(N_hidden):
                    manualGrad1[j,k] = manualGrad1[j,k] \
                                       + np.heaviside(batch_z_mat[batchIndex + j * BATCH_SIZE + BATCH_SIZE * N_hidden],0) \
                                       * diff[batchIndex + i * BATCH_SIZE] \
                                       * batch_w_mat[i + j * N_out + N_hidden * N_hidden] \
                                       * batch_z_mat[batchIndex + k * BATCH_SIZE]

                    for k0 in range(N_in):
                        manualGrad0[k,k0] = manualGrad0[k,k0] \
                                           + np.heaviside(batch_z_mat[batchIndex + k * BATCH_SIZE],0) \
                                           * diff[batchIndex + i * BATCH_SIZE] \
                                           * np.heaviside(batch_z_mat[batchIndex + j * BATCH_SIZE + BATCH_SIZE * N_hidden],0) \
                                           * batch_w_mat[i + j * N_out + N_hidden * N_hidden] \
                                           * batch_w_mat[j + k * N_hidden] * batch_x_mat[batchIndex + k0 * BATCH_SIZE]
    
    return [torch.tensor(manualGrad0), torch.tensor(manualGrad1), torch.tensor(manualGrad2), diff]

# in_stream_vector = torch.cat((y_stream.data, y_hat_stream.data, a_stream.data, z_stream.data, batch_x_stream.data, w_stream.data), 0).numpy()[:]

# manualGrad = backward_manual_2x16_ReLU_16x16_ReLU_7x7(in_stream_vector)
# manualGrad_1layer = backward_manual_noReLU(y_hat, y, a[1])

# print(manualGrad[0])
# print(manualGrad[1])
# print(manualGrad[1] - pytorchGrad[1])
# print(manualGrad_1layer[0])
# print(manualGrad[3])

In [144]:
overlay = Overlay('/home/xilinx/Linear7x16_ReLU_16x16_ReLU_16x2/backward_lite_features.bit') # Download the bitstream onto the FPGA
import sys
np.set_printoptions(threshold=sys.maxsize)

dma1 = overlay.axi_dma_0 # Backward
xlnk = Xlnk() # Used for allocation

N_in = 7
N_hidden_0 = 16 
N_hidden = N_hidden_0
N_out = 2

# model = nn.Linear(7,2)
model = nn.Sequential(
    nn.Linear(N_in, N_hidden_0),
    nn.ReLU(inplace=True),
    nn.Linear(N_hidden_0, N_hidden_0),
    nn.ReLU(inplace=True),
    nn.Linear(N_hidden_0, N_out)
        )
net = model

# zero the parameter gradients
optimizer.zero_grad()

a,z,w = getNetworkFeatures(model, x)
y_hat = model(x)

y_stream = reshapeStream(y)
y_hat_stream = reshapeStream(y_hat)
x_stream = reshapeStream(x)

a_stream = reshapeStreamList(a)
z_stream = reshapeStreamList(z)
w_stream = reshapeStreamList(w)

loss = criterion(y_hat, y)
print('loss')
print(loss.item())
t = time.time()
loss.backward()
pytorchLatency = time.time() - t

# Store pytorch grad in list
pytorchGrad = []
flag = True
layerNum = 0
for param in model.parameters():
    if (flag):
        pytorchGrad.append(param.grad)
        flag = False
    else:
        flag = True
        layerNum = layerNum + 1

in_stream_data = torch.cat((y_stream.data, y_hat_stream.data, a_stream.data, z_stream.data, x_stream.data, w_stream.data), 0).numpy()[:]
sizeInputData = np.size(in_stream_data)
numHiddenLayers = len(w) - 1

# Pre-processing for the other accelerator
# Instream [y[N_out * BATCH_SIZE] 
#           y_hat[N_out * BATCH_SIZE]
#           a[N_hidden * BATCH_SIZE * numReLULayers] 
#           z[N_hidden * BATCH_SIZE * numHiddenLayers] 
#           x[N_input * BATCH_SIZE]
#           w[N_hidden * N_hidden * numHiddenLayers + N_hidden * N_out]
in_stream = xlnk.cma_array(shape=(sizeInputData,1), dtype=np.float32)

# Outstream [W b W b ... W0 b0]
out_stream = xlnk.cma_array(shape=((N_hidden_0+1)*N_out + (N_hidden_0+1)*N_hidden_0 * numHiddenLayers + (N_in+1)*N_hidden_0,1), dtype=np.float32)

in_stream[:] = torch.cat((y_stream.data, y_hat_stream.data, a_stream.data, z_stream.data, x_stream.data, w_stream.data), 0).numpy()[:]
# print('in_stream')
# print(str(in_stream).replace('[','').replace(']',','))

loss
54.51510238647461


In [145]:
# Transfer data to the DMA

t = time.time()
dma1.sendchannel.transfer(in_stream)
dma1.recvchannel.transfer(out_stream)
dma1.sendchannel.wait()
dma1.recvchannel.wait()
PYNQLatency = time.time() - t

# TODO Reshape
weight_grad = { }
bias_grad = { }

LIM2W = N_out * N_hidden;
LIM2b = LIM2W + N_out;

LIM1W = LIM2b + N_hidden * N_hidden;
LIM1b = LIM1W + N_hidden;

LIM0W = LIM1b + N_in * N_hidden;
LIM0b = LIM0W + N_hidden;

weight_grad[2] = torch.reshape(torch.tensor(out_stream[0:LIM2W]), (N_out,N_hidden))
bias_grad[2] = torch.reshape(torch.tensor(out_stream[LIM2W:LIM2b]), (1,N_out))

weight_grad[1] = torch.reshape(torch.tensor(out_stream[LIM2b:LIM1W]), (N_hidden,N_hidden))
bias_grad[1] = torch.reshape(torch.tensor(out_stream[LIM1W:LIM1b]), (1,N_hidden))

weight_grad[0] = torch.reshape(torch.tensor(out_stream[LIM1b:LIM0W]), (N_hidden,N_in))
bias_grad[0] = torch.reshape(torch.tensor(out_stream[LIM0W:LIM0b]), (1,N_hidden))


In [146]:
i = 0
flag = True
for param in model.parameters():
    if (flag):
        RMSE = np.sqrt(np.mean(np.square((weight_grad[i] - param.grad).numpy())))
        print('PyTorch - PYNQ backprop RMSE for W', str(i), ':')
        print(RMSE)
        flag = False
    else:
        RMSE = np.sqrt(np.mean(np.square((bias_grad[i] - param.grad).numpy())))
        print('PyTorch - PYNQ backprop RMSE for b', str(i), ':')
        print(RMSE)
        flag = True
        i = i + 1

print()
print('Pytorch backprop latency:')
print(str(round(pytorchLatency,5)) + " s")
print('PYNQ backprop latency:')
print(str(round(PYNQLatency,5)) + " s")
print('Acceleration factor (CPU_Latency / PYNQ_Latency): ')
print(round(pytorchLatency / PYNQLatency,5))

PyTorch - PYNQ backprop RMSE for W 0 :
0.0934093
PyTorch - PYNQ backprop RMSE for b 0 :
0.0124329
PyTorch - PYNQ backprop RMSE for W 1 :
0.11091
PyTorch - PYNQ backprop RMSE for b 1 :
0.0434416
PyTorch - PYNQ backprop RMSE for W 2 :
0.327921
PyTorch - PYNQ backprop RMSE for b 2 :
0.223616

Pytorch backprop latency:
0.00396 s
PYNQ backprop latency:
0.01155 s
Acceleration factor (CPU_Latency / PYNQ_Latency): 
0.34314


# 7x16 --> ReLU --> 16x16 --> ReLU --> 7x2 No Opt
#### Similar to Naive_MLP: (7x128 --> ReLU --> 128x128 --> ReLU --> 128x2)
### No Loop unrolling or pipelining

In [147]:
overlay = Overlay('/home/xilinx/Linear7x16_ReLU_16x16_ReLU_16x2_NoOpt/backward_lite_features.bit') # Download the bitstream onto the FPGA
import sys
np.set_printoptions(threshold=sys.maxsize)

dma1 = overlay.axi_dma_0 # Backward
xlnk = Xlnk() # Used for allocation

N_in = 7
N_hidden_0 = 16 
N_hidden = N_hidden_0
N_out = 2

# model = nn.Linear(7,2)
model = nn.Sequential(
    nn.Linear(N_in, N_hidden_0),
    nn.ReLU(inplace=True),
    nn.Linear(N_hidden_0, N_hidden_0),
    nn.ReLU(inplace=True),
    nn.Linear(N_hidden_0, N_out)
        )
net = model

# zero the parameter gradients
optimizer.zero_grad()

a,z,w = getNetworkFeatures(model, x)
y_hat = model(x)

y_stream = reshapeStream(y)
y_hat_stream = reshapeStream(y_hat)
x_stream = reshapeStream(x)

a_stream = reshapeStreamList(a)
z_stream = reshapeStreamList(z)
w_stream = reshapeStreamList(w)

loss = criterion(y_hat, y)
print('loss')
print(loss.item())
t = time.time()
loss.backward()
pytorchLatency = time.time() - t

# Store pytorch grad in list
pytorchGrad = []
flag = True
layerNum = 0
for param in model.parameters():
    if (flag):
        pytorchGrad.append(param.grad)
        flag = False
    else:
        flag = True
        layerNum = layerNum + 1

in_stream_data = torch.cat((y_stream.data, y_hat_stream.data, a_stream.data, z_stream.data, x_stream.data, w_stream.data), 0).numpy()[:]
sizeInputData = np.size(in_stream_data)
numHiddenLayers = len(w) - 1

# Pre-processing for the other accelerator
# Instream [y[N_out * BATCH_SIZE] 
#           y_hat[N_out * BATCH_SIZE]
#           a[N_hidden * BATCH_SIZE * numReLULayers] 
#           z[N_hidden * BATCH_SIZE * numHiddenLayers] 
#           x[N_input * BATCH_SIZE]
#           w[N_hidden * N_hidden * numHiddenLayers + N_hidden * N_out]
in_stream = xlnk.cma_array(shape=(sizeInputData,1), dtype=np.float32)

# Outstream [W b W b ... W0 b0]
out_stream = xlnk.cma_array(shape=((N_hidden_0+1)*N_out + (N_hidden_0+1)*N_hidden_0 * numHiddenLayers + (N_in+1)*N_hidden_0,1), dtype=np.float32)

in_stream[:] = torch.cat((y_stream.data, y_hat_stream.data, a_stream.data, z_stream.data, x_stream.data, w_stream.data), 0).numpy()[:]
# print('in_stream')
# print(str(in_stream).replace('[','').replace(']',','))

loss
57.34963607788086


In [148]:
# Transfer data to the DMA

t = time.time()
dma1.sendchannel.transfer(in_stream)
dma1.recvchannel.transfer(out_stream)
dma1.sendchannel.wait()
dma1.recvchannel.wait()
PYNQLatency = time.time() - t

# TODO Reshape
weight_grad = { }
bias_grad = { }

LIM2W = N_out * N_hidden;
LIM2b = LIM2W + N_out;

LIM1W = LIM2b + N_hidden * N_hidden;
LIM1b = LIM1W + N_hidden;

LIM0W = LIM1b + N_in * N_hidden;
LIM0b = LIM0W + N_hidden;

weight_grad[2] = torch.reshape(torch.tensor(out_stream[0:LIM2W]), (N_out,N_hidden))
bias_grad[2] = torch.reshape(torch.tensor(out_stream[LIM2W:LIM2b]), (1,N_out))

weight_grad[1] = torch.reshape(torch.tensor(out_stream[LIM2b:LIM1W]), (N_hidden,N_hidden))
bias_grad[1] = torch.reshape(torch.tensor(out_stream[LIM1W:LIM1b]), (1,N_hidden))

weight_grad[0] = torch.reshape(torch.tensor(out_stream[LIM1b:LIM0W]), (N_hidden,N_in))
bias_grad[0] = torch.reshape(torch.tensor(out_stream[LIM0W:LIM0b]), (1,N_hidden))


In [149]:
i = 0
flag = True
for param in model.parameters():
    if (flag):
        RMSE = np.sqrt(np.mean(np.square((weight_grad[i] - param.grad).numpy())))
        print('PyTorch - PYNQ backprop RMSE for W', str(i), ':')
        print(RMSE)
        flag = False
    else:
        RMSE = np.sqrt(np.mean(np.square((bias_grad[i] - param.grad).numpy())))
        print('PyTorch - PYNQ backprop RMSE for b', str(i), ':')
        print(RMSE)
        flag = True
        i = i + 1

print()
print('Pytorch backprop latency:')
print(str(round(pytorchLatency,5)) + " s")
print('PYNQ backprop latency:')
print(str(round(PYNQLatency,5)) + " s")
print('Acceleration factor (CPU_Latency / PYNQ_Latency): ')
print(round(pytorchLatency / PYNQLatency,5))

PyTorch - PYNQ backprop RMSE for W 0 :
0.0740186
PyTorch - PYNQ backprop RMSE for b 0 :
0.0131191
PyTorch - PYNQ backprop RMSE for W 1 :
0.119391
PyTorch - PYNQ backprop RMSE for b 1 :
0.0404381
PyTorch - PYNQ backprop RMSE for W 2 :
0.411409
PyTorch - PYNQ backprop RMSE for b 2 :
0.353297

Pytorch backprop latency:
0.00377 s
PYNQ backprop latency:
0.01141 s
Acceleration factor (CPU_Latency / PYNQ_Latency): 
0.33003
