# Test functional correctness of quant layer implementation

In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import math

from src.quant_layer import attention, ffn, layer_kernel_gt
from src import quant_layer
from src.quant_ops import tensor_quant_scale

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
text_512 = 'This project aims to implement a transformer layer on a cluster of FPGAs. In recent years transformers have outperformed traditional convolutional neural networks in many fields, but serial performance is dismal and parallel GPU performance is power-intensive. Specialized architectures have been studied little, especially using FPGA platforms. This research will improve transformer inference performance by offloading computationally intensive sections of the network to reconfigurable accelerators running on a cluster of multiple FPGA devices. This research will result in an acceleration architecture for a single layer of a transformer network along with a performance comparison with CPU and GPU baselines. We propose the investigation of distributed transformer inference across a cluster of multiple field programmable gate arrays (FPGAs). This research will investigate the partitioning of a transformer layer across multiple FPGA devices along with networking between FPGAs in the cluster. Transformers have become a dominant machine learning architecture for many domains such as natural language processing, therefore high speed inference is desirable. However, networks sizes and limited FPGA resources often make inference on a single FPGA slow due to limited parallelism and pipeline depth or impossible due to limited resources. The purpose of this research is to explore methods to overcome these challenges by introducing parallelism through multi-FPGA clusters. Transformers are highly parallel neural network architectures which consist of stacks of encoder and decoder layers. These layers consist of many linear transformations on matrices which are represented by matrix-matrix multiplication. Within an encoder/decoder layer there is an opportunity to parallelize both between concurrent general matrix multiplies (GeMM) and within each GeMM. Attempting to serialize these operations on a CPU leads to high execution time and is a poor utilization of the CPU\'s general purpose architecture. GPUs can deliver high throughput inference for transformers, though they are power-hungry and do not achieve the low latency required by some applications. Both in the datacenter and at the edge, low-latency and efficient inference is desired. Optimally, there would be an architecture that could scale between these two extremes of computational demand. State-of-the-art transformers can contain upwards of 12 layers and multiply matrices on the order of 1024x1024 elements. In addition, the trend of increasing transformer size does not show signs of slowing. This large use of memory and FLOPs leads to difficulty mapping an entire transformer network to a '
text_128 = 'This project aims to implement a transformer layer on a cluster of FPGAs. In recent years transformers have outperformed traditional convolutional neural networks in many fields, but serial performance is dismal and parallel GPU performance is power-intensive. Specialized architectures have been studied little, especially using FPGA platforms. This research will improve transformer inference performance by offloading computationally intensive sections of the network to reconfigurable accelerators running on a cluster of multiple FPGA devices. This research will result in an acceleration architecture for a single layer of a transformer network along with a  '
text = text_128
encoded_input = tokenizer(text, return_tensors='pt')
embedding_output = model.embeddings(
    input_ids=encoded_input['input_ids'],
    position_ids=None,
    token_type_ids=encoded_input['token_type_ids'],
    inputs_embeds=None,
    past_key_values_length=0,
)

In [3]:
embedding_output.shape

torch.Size([1, 128, 768])

In [4]:
layer = model.encoder.layer[0]
attention_out = attention(layer, embedding_output)
output_gt = ffn(layer, attention_out)
print(output_gt)

attention_probs_scale 0.007628676470588235
attention_probs_int tensor([[[[  1.,   2.,   1.,  ...,   2.,   2.,   3.],
          [  4.,   1.,   2.,  ...,   1.,   0.,   1.],
          [  2.,   1.,   1.,  ...,   0.,   0.,   1.],
          ...,
          [  1.,   1.,   1.,  ...,   2.,   1.,   2.],
          [  1.,   1.,   1.,  ...,   2.,   2.,   2.],
          [  1.,   1.,   1.,  ...,   1.,   2.,   2.]],

         [[  7.,   0.,   0.,  ...,   3.,   0.,   0.],
          [  0.,   0.,   2.,  ...,   0.,   0.,   0.],
          [  0.,   0.,   0.,  ...,   0.,   0.,   1.],
          ...,
          [  0.,   0.,   2.,  ...,   1.,   3.,   0.],
          [  0.,   1.,   1.,  ...,   3.,   3.,   0.],
          [  0.,   1.,   0.,  ...,  12.,   4.,   0.]],

         [[ 48.,   2.,   1.,  ...,   1.,   1.,   2.],
          [ 70.,  12.,   1.,  ...,   0.,   1.,   1.],
          [ 13.,  65.,   4.,  ...,   1.,   0.,   1.],
          ...,
          [  4.,   0.,   0.,  ...,   1.,   2.,   6.],
          [ 31.,   4.,  

In [5]:
output_gt_int, output_scale = tensor_quant_scale(output_gt, bits=8)
output_gt_int = output_gt_int.type(torch.int8)
output_gt_int

tensor([[[  1,  -1,  -2,  ...,   1,   0,   0],
         [ -6,   6,   2,  ...,   1,   7,   2],
         [  4,   6,   2,  ...,   0,   5, -11],
         ...,
         [ -6,   9,   9,  ...,   1, -14,   9],
         [  5,   7,   9,  ...,   4,   0,   8],
         [ -2,   2,   2,  ...,  -1,   1,  -5]]], dtype=torch.int8)

In [6]:
output_test = quant_layer.pipeline(layer, embedding_output)
print(output_test)

TypeError: pipeline() missing 2 required positional arguments: 'stage3_args' and 'stage4_args'

In [7]:
assert torch.allclose(output_gt, output_test)

NameError: name 'output_test' is not defined

# Test stage by stage

## Stage 1
Quantize the ground truth output so we can compare it to the quantized output.

In [8]:
(stage1_query_gt, stage1_key_gt, stage1_value_gt) = quant_layer.stage1_dynamic(layer, embedding_output)

stage1_query_gt_int, _ = tensor_quant_scale(stage1_query_gt)
stage1_query_gt_int = stage1_query_gt_int.type(torch.int8)
stage1_key_gt_int, _ = tensor_quant_scale(stage1_key_gt)
stage1_key_gt_int = stage1_key_gt_int.type(torch.int8)
stage1_value_gt_int, _ = tensor_quant_scale(stage1_value_gt)
stage1_value_gt_int = stage1_value_gt_int.type(torch.int8)

In [9]:
stage1_args, stage2_args, stage3_args, stage4_args = layer_kernel_gt(layer, embedding_output)
stage1_query_uut, stage1_key_uut, stage1_value_uut = quant_layer.stage1(**stage1_args)

In [10]:
assert torch.allclose(stage1_query_gt_int, stage1_query_uut)
assert torch.allclose(stage1_key_gt_int, stage1_key_uut)
assert torch.allclose(stage1_value_gt_int, stage1_value_uut)

## Stage 2

In [11]:
stage2_gt = quant_layer.stage2_dynamic(layer, embedding_output, stage1_query_gt, stage1_key_gt, stage1_value_gt)
stage2_gt_int, _ = tensor_quant_scale(stage2_gt, bits=8)
stage2_gt_int = stage2_gt_int.type(torch.int8)
stage2_gt_int

attention_probs_scale 0.007628676470588235
attention_probs_int tensor([[[[  1.,   2.,   1.,  ...,   2.,   2.,   3.],
          [  4.,   1.,   2.,  ...,   1.,   0.,   1.],
          [  2.,   1.,   1.,  ...,   0.,   0.,   1.],
          ...,
          [  1.,   1.,   1.,  ...,   2.,   1.,   2.],
          [  1.,   1.,   1.,  ...,   2.,   2.,   2.],
          [  1.,   1.,   1.,  ...,   1.,   2.,   2.]],

         [[  7.,   0.,   0.,  ...,   3.,   0.,   0.],
          [  0.,   0.,   2.,  ...,   0.,   0.,   0.],
          [  0.,   0.,   0.,  ...,   0.,   0.,   1.],
          ...,
          [  0.,   0.,   2.,  ...,   1.,   3.,   0.],
          [  0.,   1.,   1.,  ...,   3.,   3.,   0.],
          [  0.,   1.,   0.,  ...,  12.,   4.,   0.]],

         [[ 48.,   2.,   1.,  ...,   1.,   1.,   2.],
          [ 70.,  12.,   1.,  ...,   0.,   1.,   1.],
          [ 13.,  65.,   4.,  ...,   1.,   0.,   1.],
          ...,
          [  4.,   0.,   0.,  ...,   1.,   2.,   6.],
          [ 31.,   4.,  

tensor([[[ 1, -1, -2,  ...,  1,  0,  1],
         [-2,  2, -1,  ...,  2,  2,  0],
         [ 1,  1, -1,  ...,  1,  0, -2],
         ...,
         [-2,  2,  3,  ...,  0, -4,  3],
         [ 1,  2,  1,  ...,  0,  0,  2],
         [-1,  2,  1,  ..., -2, -1, -1]]], dtype=torch.int8)

In [12]:
stage2_uut = quant_layer.stage2(stage1_query_uut, stage1_key_uut, stage1_value_uut, **stage2_args)
stage2_uut

attention_probs_int tensor([[[[  1,   1,   0,  ...,   1,   1,   2],
          [  3,   0,   1,  ...,   0,   0,   0],
          [  1,   0,   0,  ...,   0,   0,   0],
          ...,
          [  1,   0,   0,  ...,   2,   1,   2],
          [  0,   0,   0,  ...,   1,   2,   1],
          [  1,   1,   0,  ...,   1,   1,   2]],

         [[  6,   0,   0,  ...,   3,   0,   0],
          [  0,   0,   2,  ...,   0,   0,   0],
          [  0,   0,   0,  ...,   0,   0,   0],
          ...,
          [  0,   0,   1,  ...,   0,   2,   0],
          [  0,   0,   0,  ...,   2,   3,   0],
          [  0,   0,   0,  ...,  12,   4,   0]],

         [[ 47,   1,   0,  ...,   0,   0,   1],
          [ 68,  11,   1,  ...,   0,   0,   1],
          [ 13,  63,   3,  ...,   1,   0,   1],
          ...,
          [  4,   0,   0,  ...,   0,   2,   5],
          [ 30,   3,   0,  ...,  34,   8,  12],
          [ 33,   1,   0,  ...,   5,  45,  16]],

         ...,

         [[  1,   1,   0,  ...,   0,   0,   0],
  

tensor([[[  2,  -2,  -7,  ...,   1,   1,   2],
         [ -6,   7,   0,  ...,   4,   8,   2],
         [  4,   4,  -1,  ...,   1,   0,  -7],
         ...,
         [ -7,   8,   8,  ...,   1, -11,   8],
         [  3,   7,   4,  ...,   0,  -1,   4],
         [ -3,   5,   2,  ...,  -7,  -3,  -3]]], dtype=torch.int8)

In [14]:
(1/stage2_args['M_attention_probs'])

0.00784313725490196

In [16]:
stage1_args['M_query'], stage1_args['M_key'], stage1_args['M_value'] 

(0.0070562844355619315, 0.008036052309477855, 0.003417007858954613)

Testing C++ impl

In [24]:
def genmat(A, mod):
    for i in range(A.size()[0]):
        for j in range(A.size()[1]):
            A[i][j] = (i*dmodel+j) % mod

In [25]:
seqlen = 6
dmodel = 8
hidden_states = torch.zeros(seqlen,dmodel)
query_weight_t = torch.zeros(dmodel,dmodel)
key_weight_t = torch.zeros(dmodel,dmodel)
value_weight_t = torch.zeros(dmodel,dmodel)
query_bias = torch.zeros(1,dmodel)
key_bias = torch.zeros(1,dmodel)
value_bias = torch.zeros(1,dmodel)
genmat(hidden_states, 7)
genmat(query_weight_t, 9)
genmat(key_weight_t, 11)
genmat(value_weight_t, 13)
genmat(query_bias, 63)
genmat(key_bias, 65)
genmat(value_bias, 67)

In [28]:
query = torch.matmul(hidden_states, query_weight_t) + query_bias
query *= 0.5

In [29]:
query

tensor([[49.0000, 55.5000, 57.5000, 55.0000, 48.0000, 36.5000, 20.5000, 31.5000],
        [56.0000, 58.5000, 56.5000, 50.0000, 39.0000, 23.5000, 35.0000, 42.0000],
        [59.5000, 58.0000, 52.0000, 41.5000, 26.5000, 38.5000, 46.0000, 49.0000],
        [59.5000, 54.0000, 44.0000, 29.5000, 42.0000, 50.0000, 53.5000, 52.5000],
        [56.0000, 46.5000, 32.5000, 45.5000, 54.0000, 58.0000, 57.5000, 52.5000],
        [49.0000, 35.5000, 49.0000, 58.0000, 62.5000, 62.5000, 58.0000, 49.0000]])