# General testing notebook for qtransform and quantization
## Import stuff

In [47]:
import torch
import numpy as np
from typing import List, Tuple
from torch.utils.data import Dataset, DataLoader
from logging import getLogger
import os
from omegaconf import DictConfig

## Experiments with dataclasses and python classes

In [48]:
from abc import abstractclassmethod, ABC
from dataclasses import dataclass, replace

@dataclass
class Metadata():
    encoding: str

@dataclass
class BarMetadata(Metadata):
    other: str = ""

class Foo(ABC):
    def __init__(self, encoding: str):
        self.metadata: Metadata = Metadata(encoding)


    def load_metadata():
        pass

    @abstractclassmethod
    def test(self, file: str):
        file += "   padding"

class Bar(Foo):
    def __init__(self, encoding: str):
        super().__init__()
        self.metadata: BarMetadata

    def test(self, file: str):
        super().test(file)
        print(file)

In [49]:
from dataclasses import dataclass
@dataclass
class Metadata():
    encoding: str

@dataclass
class BarMetadata(Metadata):
    other: str = ""

In [50]:
test = BarMetadata(encoding="gpt2", other="ok")
import dataclasses
dataclasses.replace(test, **{"other": "Bruh"})

BarMetadata(encoding='gpt2', other='Bruh')

In [51]:
test: Metadata = Metadata("gpt2")
test: BarMetadata = BarMetadata(**test, other="other")

TypeError: __main__.BarMetadata() argument after ** must be a mapping, not Metadata

In [None]:
obj = test
params = set(inspect.signature(Metadata.__init__).parameters.keys()) - set(['self'])
{x:getattr(obj, x) for x in params}

{'encoding': 'gpt2'}

In [None]:
from dataclasses import asdict, 
asdict(test)

{'encoding': 'gpt2'}

In [None]:
#test if inner functions can access member attributes
class Foo():
    def __init__(self):
        self.a = 10
    def function(self):
        def other():
            print(self.a)
        other()

Foo().function()

10


In [None]:
# padding does not get appended to the parameter as it is a seperate function
Bar().test("test")

test


## Tests with torch framework to gain familiarity

In [None]:
b,c,e = 4, 5,6
tensor_3d = torch.arange(b*c*e).reshape(b,c,e)
tensor_3d

In [None]:
#batch has 5 rows, only want 3 
index = torch.tile(torch.arange(3).reshape(3,1), (b,1,e))
#you only consider the first batch
torch.gather(tensor_3d, dim=1, index=index)

tensor([[[  0,   1,   2,   3,   4,   5],
         [  6,   7,   8,   9,  10,  11],
         [ 12,  13,  14,  15,  16,  17]],

        [[ 30,  31,  32,  33,  34,  35],
         [ 36,  37,  38,  39,  40,  41],
         [ 42,  43,  44,  45,  46,  47]],

        [[ 60,  61,  62,  63,  64,  65],
         [ 66,  67,  68,  69,  70,  71],
         [ 72,  73,  74,  75,  76,  77]],

        [[ 90,  91,  92,  93,  94,  95],
         [ 96,  97,  98,  99, 100, 101],
         [102, 103, 104, 105, 106, 107]]])

In [None]:
#objective: retrieve first rows of tensor_3d -> if we specify dim=1, we collapse along the rows (we perform indexing for each row)
#b,c,e = 4,5,6
#i always want the first row -> specify by row, dim=1
#how do i reduce the amount of rows if the index tensor has to be of the same dimension?
#dimension has to be the same but not the shape
#torch.zeros(4,1,6) gets the first row of the tensor, but it is problematic if i want multiple rows as i 
#then use the same index (0) while having the output shape that i want
#solution: arange
#index=torch.zeros(4,1,6) -> if we use 5 instead of 6, each row has 5 columns
#meaning: we need a row containing the same index 
tensor_3d.gather(dim=1, index=torch.zeros(4,2,6, dtype=torch.int64))

tensor([[[ 0,  1,  2,  3,  4],
         [ 0,  1,  2,  3,  4]],

        [[30, 31, 32, 33, 34],
         [30, 31, 32, 33, 34]],

        [[60, 61, 62, 63, 64],
         [60, 61, 62, 63, 64]],

        [[90, 91, 92, 93, 94],
         [90, 91, 92, 93, 94]]])

In [None]:
torch.arange(2).reshape(2,1)

tensor([[0],
        [1]])

In [None]:
y = torch.tensor([
     [
       [1, 2, 3],
       [4, 5, 6],
       [0, 0, 0],
       [0, 0, 0]
     ],
     [
       [1, 2, 3],
       [4, 5, 6],
       [0, 0, 0],
       [0, 0, 0]
     ],
     [
       [1, 2, 3],
       [4, 5, 6],
       [0, 0, 0],
       [0, 0, 0]
     ]
   ])
#size is: 3, 4, 3. if you collapse in the first dimension (dim=0), the result tensor becomes of size 4,3. if you collapse it in the second dimension, you get a tensor of size 3,3

In [None]:
y.sum(dim=1)
#in transformers, we usually have tensors of shape b,c,e (batch_size, context, embedding_dimension).
#if we specify dim=0, we perform the operation along the entire batch, in dim=1 along the context and in dim=2 along the embedding dimension.
#if we were to sum the tensors together, sum(dim=1) will yield the sum of the embeddings of each word.
#think of it as squishing a dimension together so that it is of size 1, meaning that we have to squeeze in that dimension.

tensor([[5, 7, 9],
        [5, 7, 9],
        [5, 7, 9]])

In [None]:
#test if torch.tile and tensor.repeat are the same
c = 2 #simulate two words
a = torch.arange(c).reshape((c,1)).repeat((3,1,4))
b = torch.tile(torch.arange(c).reshape((c,1)), (3,1,4))
print(a.equal(b))
print(a)

True
tensor([[[0, 0, 0, 0],
         [1, 1, 1, 1]],

        [[0, 0, 0, 0],
         [1, 1, 1, 1]],

        [[0, 0, 0, 0],
         [1, 1, 1, 1]]])


In [57]:
"experiments with torch.gather"
M = torch.tensor([[1,2,3], [4,7,18], [19,9,23]])
#if there is more than one value inside of the last dimension, continue along current index
#meaning at dim=1:
#[1,1,1] -> 2,7,9
#[0,0,0] -> 1,4,19
#increments along the current dimension
#at new row, reset counter ->
#[1] -> 2
#[1] -> 2
indexes = torch.tensor([1,1,2]).view(-1,1) 

dimension = 0 #2d, meaning dim=0 along rows, dim=1 along columns
out = M.gather(dimension ,indexes) #dim=0: , dim=1: tensor([[ 2],[ 7],[23]])
M.gather(1, torch.Tensor([[1],[1],[2]]).to(dtype=torch.long)) #counter along the current dimension for the dimension of index
#M.gather(1, torch.tensor([[0,0,0],[0,1,0]]))

tensor([[ 2],
        [ 7],
        [23]])

## Test BatchNorm with Padding

In [None]:
from qtransform.model.modules import BatchNorm as BatchNormWithPadding
"test if padding does not lower values"
#first word of each batch -> gather by column
#result tensor: (3, 1, 64)
#retrieving an index from the dimension increases the counter along index of said dimension by one
#e.g. indexing 0 twice will retrieve two different values
FEATURES = 16
EMBEDDINGS = 64
BATCH_SIZE = 3
bn = torch.nn.BatchNorm1d(FEATURES)
#get first word embeddings of three batches
embedding_layer = torch.nn.Embedding(FEATURES, EMBEDDINGS)
batch = embedding_layer(torch.randint(16, (BATCH_SIZE, FEATURES)))
index = torch.arange(1).repeat(BATCH_SIZE,1,EMBEDDINGS).to(dtype=torch.long)
embd_first_word = torch.gather(batch, index=index, dim=1)
padding_bn = BatchNormWithPadding(FEATURES,bias=True)
norm_padding = padding_bn(embd_first_word)
norm = bn(batch)
#check if values are the same
print(f'Values are: {"same" if torch.gather(norm, index=index, dim=1).equal(norm_padding) else "different"}')

Values are: same


## Test huggingface dataset processing

In [None]:
import os
#test if huggingface datasets can be created from text files
import datasets

BASEDIR = '/home/mabot004/.qtransform/datasets/files/shakespeare/untokenized/'
#number of rows depends on the amount of files
files = [os.path.join(BASEDIR, 'shakespeare.txt'), os.path.join(BASEDIR, 'shakespeare_2.txt')]
#does the same as huggingface mapping but now with files
def gen_text():
    for filename in files:
        with open(filename, 'r') as file:
            yield {"text": file.read()}

#chunk size from config, default 100
def chunk_examples(examples):
                #splits the text of each row into chunks of length chunk_length. currently it is only used
                #for character tokenization to avoid feeding large samples to the tokenizer
    chunk_length = 100
                #perform tokenization on a handful of characters at a time
                #from: https://huggingface.co/docs/datasets/process#split-long-examples            
    chunks = []
    
    for sentence in examples["text"]:
        new_chunks = [sentence[i:i + chunk_length] for i in range(0, len(sentence), chunk_length)]
        chunks.extend(new_chunks)
    return {"chunks": chunks}
from tiktoken import get_encoding
tokenizer = get_encoding("gpt2")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
shakespeare = datasets.Dataset.from_generator(gen_text)
chunks = shakespeare.map(chunk_examples, batched=True, remove_columns = "text")
rotten_tomatoes = datasets.load_dataset('rotten_tomatoes')
rotten_tomatoes["train"].shard(num_shards=1000, index=0, contiguous = True)

Generating train split: 2 examples [00:00, 32.58 examples/s]


In [None]:
# status bar like huggingface dataset map process
from tqdm import tqdm
msg = 'ok'
for i in tqdm(range(100), desc=f'{msg}'):
    msg = str(i)
from tqdm import tqdm
import time
for i, data in tqdm(enumerate(range(10)), desc='test progress bar and other stdout stuff'):
    print(data)
    time.sleep(0.5)

ok: 100%|██████████| 100/100 [00:00<00:00, 842229.72it/s]


In [None]:
#error occurs because the splits have more than one feature and this function changes the amount of samples in each split of one feature without changint the other
#so: 5 samples, 2 features. after mapping: text has 10 samples, other feature still has 5 features
#from: https://github.com/huggingface/datasets/issues/1817#issuecomment-774066254
rt_chunks = datasets.concatenate_datasets(rotten_tomatoes.select_columns("text").map(chunk_examples, batched=True, remove_columns = "text").values())
print(rt_chunks)
#tokenize
rt_chunks = rt_chunks.map(
    #map function expects dictionary or dataset object, tokenize function returns list of tokens (integers)
    lambda batch: {"input_ids": [tokenizer.encode(x) for x in batch["chunks"]]}, 
    batched=True, 
    remove_columns = "chunks",
    #num_proc=os.cpu_count()//2 if cfg.encoding != 'character' else 1 
    desc="tokenizing the dataset from chunks")
rt_chunks.save_to_disk('/home/mabot004/custom_hf_datasets/')
"test if tokenizing is correct"
tokenizer.decode(rt_chunks["train"]["input_ids"][0])

In [None]:

#https://huggingface.co/docs/datasets/create_dataset#from-local-files
shakespeare = datasets.Dataset.from_generator(gen_text)
shakespeare = shakespeare.map(chunk_examples, batched=True, remove_columns = "text")
shakespeare = shakespeare.map(
    #map function expects dictionary or dataset object, tokenize function returns list of tokens (integers)
    lambda batch: {"input_ids": [tokenizer.encode(x) for x in batch["chunks"]]}, 
    batched=True, 
    remove_columns = "chunks",
    #num_proc=os.cpu_count()//2 if cfg.encoding != 'character' else 1 
    desc="tokenizing the dataset from chunks")

In [None]:
tokenizer.decode(np.concatenate(shakespeare[:3]["input_ids"]))


"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us"

In [None]:
def write_memmap(memmap, start, end, data):
    memmap[start:end] = data

### test generating huggingface datasets from files

In [None]:
def gen_text():
    for i in range(163):
        yield {"text": i}

test_threading = datasets.Dataset.from_generator(gen_text)

Generating train split: 163 examples [00:00, 31912.97 examples/s]


In [None]:
test_threading.rename_column("text", "chunks")

Dataset({
    features: ['chunks'],
    num_rows: 163
})

In [None]:
test_threading.shard(num_shards=30, index=17)

Dataset({
    features: ['text'],
    num_rows: 5
})

In [None]:
import threading
num_threads = 3 #os.cpu_count // 2
batch_size = 30
num_samples = len(test_threading)
# 163 // 30 shards
# -> 3 threads, each having a batch size of 30 samples
# dataset has 163 samples -> each thread should have around 50-60 samples max
# -> divide samples of dataset with num_threads
# -> each thread should have the entire dataset as an arg, but split differently
# range of splitting should be specified as an arg in thread -> index arg in parameter

In [None]:
#why should you use multithreading? the writing process is I/O based
#if anything, the amount of write requests increases with the amount of threads
memmap = np.memmap('test', mode='w+', shape=(163,), dtype=np.int64)

In [None]:
#playing around with error messages
try:
    int("abcd")
except Exception as e:
    print(str(e))

invalid literal for int() with base 10: 'abcd'


In [None]:
"""
test memory usage in worst case scenarios
"""

#no high memory usage as memmap values are lazily loaded, only overhead is the pages (around 5MB per memmap )
memmap = np.memmap('/home/mabot004/.qtransform/datasets/huggingface/openwebtext/tokenized/gpt2/openwebtext-float32.bin', dtype=np.float32, mode='r')
memmap2 = np.memmap('/home/mabot004/.qtransform/datasets/huggingface/openwebtext/tokenized/gpt2/openwebtext-float32.bin', dtype=np.float32, mode='r')
memmap3 = np.memmap('/home/mabot004/.qtransform/datasets/huggingface/openwebtext/tokenized/gpt2/openwebtext-float32.bin', dtype=np.float32, mode='r')
memmap4 = np.memmap('/home/mabot004/.qtransform/datasets/huggingface/openwebtext/tokenized/gpt2/openwebtext-float32.bin', dtype=np.float32, mode='r')
import psutil
# Process.memory_info is expressed in bytes, so convert to megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

In [None]:
from qtransform.dataset import MemmapDataset
#token_file: str, dtype: np.dtype, block_size: int, start: float=0.0, end: float = 1.0
memmap_ds = MemmapDataset(
    token_file='/home/mabot004/.qtransform/datasets/huggingface/openwebtext/tokenized/gpt2/openwebtext-float32.bin',
    dtype=np.float32,
    block_size=64,
    start=0.0,
    end=0.3
)
len(memmap_ds)

2709600997

### test torch Dataloader

In [None]:
dataloader = DataLoader(memmap_ds, batch_size=12, num_workers=8)
next(iter(dataloader))

In [None]:
for i, data in enumerate(dataloader):
    if i == 10:
        break
    input, labels = data
    print(f'{input.size()}, {labels.size()}')

torch.Size([12, 64]), torch.Size([12, 64])
torch.Size([12, 64]), torch.Size([12, 64])
torch.Size([12, 64]), torch.Size([12, 64])
torch.Size([12, 64]), torch.Size([12, 64])
torch.Size([12, 64]), torch.Size([12, 64])
torch.Size([12, 64]), torch.Size([12, 64])
torch.Size([12, 64]), torch.Size([12, 64])
torch.Size([12, 64]), torch.Size([12, 64])
torch.Size([12, 64]), torch.Size([12, 64])
torch.Size([12, 64]), torch.Size([12, 64])


## Testing quantization

In [54]:
#testing batchnorm quant
#https://github.com/Xilinx/brevitas/issues/542
#https://github.com/Xilinx/brevitas/issues/363
#test merge_bn from qtransform
from qtransform.model.modules import merge_bn_mha, CausalSelfAttention
from qtransform.model.modules import BatchNorm as BatchNormWithPadding, MLP
from qtransform.model.gpt import GPTConfig
import brevitas.nn as qnn
from brevitas.nn import utils as qutils
import torch
import torch.nn as nn
from brevitas.quant import scaled_int
#simulate values from embedding, skip positional encoding
wte = torch.nn.Embedding(16,64)
tokens = torch.randint(16, (3,16))
embeddings = wte(tokens)
embeddings.size()

torch.Size([3, 16, 64])

In [None]:
#test if quantized layers having return_quant_tensor set to True are compatible with torch operations 
quant_tensor_linear = qnn.QuantLinear(1,1,True,return_quant_tensor=True)
quant_tensor_linear(torch.Tensor(8,1)) #works

In [None]:
#debug loading quantized checkpoint
CHECKPOINT = '/home/mabot004/eki-transformer-dev/qtransform/outputs/models/GPT_2024-01-17_08:30:49__epoch:1'
#doesnt work since qtransform.dataset cannot be found
#but module info about tokenizers is not saved in checkpoint, only their names
checkpoint = torch.load(CHECKPOINT)
checkpoint.keys()

dict_keys(['model_state_dict', 'optimizer_state_dict', 'epoch', 'model_cfg', 'tokenizer_cfg', 'metrics'])

In [None]:
#check if info about quant params are even saved within checkpoint
import re
keys = checkpoint["model_state_dict"].keys()
#quant param that exists within checkpoint: 
#transformer.layer.0.mlp.active.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value 
weights_and_biases = list(filter(lambda x: re.search(r'.+\.(weight|bias)$', x), keys))
def find(x):
    if not re.search(r'.+\.(weight|bias)$', x):
        return x
other_keys = list(filter(find, keys))
len(keys) == len(weights_and_biases) # not only weights and biases in state dict
#only scaling_impl is saved in state dict
#no multiheadattention though
#in gpt quant config, every single layer has a quantizer (most commonly Int8WeightPerTensorFloat)
#that quantizer has ScalingImplType STATS
#the layers with scaling_impl had an activation quantizer named Int8ActPerTensorFloat
#it had the ScalingImplType PARAMETER_FROM_STATS
other_keys

['transformer.layer.0.attn.attn_mask',
 'transformer.layer.0.mlp.active.input_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value',
 'transformer.layer.0.mlp.active.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value',
 'transformer.layer.1.attn.attn_mask',
 'transformer.layer.1.mlp.active.input_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value',
 'transformer.layer.1.mlp.active.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value']

In [None]:
#check if qparam is not one 
checkpoint["model_state_dict"]["transformer.layer.0.mlp.active.input_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value"]

tensor(2.6414)

In [53]:
#test if scaling_impl params exist within model
test_mha = qnn.QuantMultiheadAttention(num_heads=2, embed_dim=256)
#simulate some learning steps for param
print(test_mha.v_quant.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value)
test_mha.v_quant.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value = torch.nn.Parameter(torch.tensor(3.1415))
test_mha.v_quant.act_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value

Parameter containing:
tensor(1., requires_grad=True)


Parameter containing:
tensor(3.1415, requires_grad=True)

In [None]:
torch.save(test_mha.state_dict(), 'mha.chpt')
#v_quant etc. not appearing within state_dict
test_mha.state_dict().keys()

In [None]:
#test if brevitas layers relevant for Transformers return qparams in state_dict
print(qnn.QuantLinear(1,1,True,input_quant=scaled_int.Int8ActPerTensorFloat).state_dict())
print(qnn.QuantIdentity(act_quant=scaled_int.Int8ActPerTensorFloat).state_dict())
print(qnn.QuantReLU(act_quant=scaled_int.Int8ActPerTensorFloat).state_dict())

OrderedDict([('weight', tensor([[0.9874]])), ('bias', tensor([-0.8623]))])

In [None]:
re.search(r'(?!hallo|welt).*$', "hallo")

<re.Match object; span=(1, 5), match='allo'>

In [None]:
#check if storing checkpoints of quantized models even is working
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.network = torch.nn.ModuleDict(dict(
            wte = qnn.QuantEmbedding(32, 128),
            pos = qnn.QuantEmbedding(16, 128),
            logic = nn.ModuleDict(dict(
                layer1 = qnn.QuantLinear(128, 16, True),
                layer2 = qnn.QuantLinear(16,1, True))
            )
        ))
    def forward(self, x):
        embd = self.network.wte(x)
        b,t = x.size()
        pos = torch.arange(0, t, dtype=torch.long).unsqueeze(0) # shape (1, t)
        pos = self.network.pos(pos)
        output = embd + pos
        for name, layer in self.network.logic.items():
            output = layer(output)
        return output

In [None]:
model = Model()
model(torch.randint(32, (1,16)))

  return super().rename(names)


tensor([[[-0.3944],
         [ 0.2287],
         [-0.5937],
         [-0.8445],
         [ 0.4049],
         [-0.1961],
         [ 0.2558],
         [ 0.5325],
         [-0.2270],
         [ 0.0485],
         [-0.5637],
         [ 0.1862],
         [ 0.7595],
         [-0.2511],
         [ 0.1841],
         [-0.3207]]], grad_fn=<ViewBackward0>)

In [None]:
#doesnt work, Quantizer cannot be found in brevitas.inject
#why are they being searched for in inject if they are in brevitas.quant.scaled_int
torch.save(model, 'quantized_test') 

PicklingError: Can't pickle <class 'brevitas.inject.Int8WeightPerTensorFloat'>: attribute lookup Int8WeightPerTensorFloat on brevitas.inject failed

In [56]:
from qtransform import DeviceSingleton
#check if value from class is set in object
DeviceSingleton.device = 'cuda'
singleton = DeviceSingleton()
singleton.device

'cuda'

### Testing Batchnorm and Conv merging

In [None]:
#from: 
def fuse_conv_and_bn(conv, bn):
	#
	# init
	fusedconv = torch.nn.Conv1d(
		conv.in_channels,
		conv.out_channels,
		kernel_size=conv.kernel_size,
		stride=conv.stride,
		padding=conv.padding,
		bias=True
	)
	#
	# prepare filters
	w_conv = conv.weight.clone().view(conv.out_channels, -1)
	w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps+bn.running_var)))
	fusedconv.weight.copy_( torch.mm(w_bn, w_conv).view(fusedconv.weight.size()) )
	#
	# prepare spatial bias
	if conv.bias is not None:
		b_conv = conv.bias
	else:
		b_conv = torch.zeros( conv.weight.size(0) )
	b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
	fusedconv.bias.copy_( torch.matmul(w_bn, b_conv) + b_bn )
	#
	# we're done
	return fusedconv


torch.set_grad_enabled(False)
batch_size = (16, 64, 256)
x = torch.randn(16, 64, 256)

net = torch.nn.Sequential(
    torch.nn.Conv1d(64, 64, kernel_size=(256,256)),
    torch.nn.BatchNorm1d(64)
)
y1 = net.forward(x)
fusedconv = fuse_conv_and_bn(net[0], net[1])
y2 = fusedconv.forward(x)
d = (y1 - y2).norm().div(y1.norm()).item()
print("error: %.8f" % d)

error: 0.49767026


In [None]:
cv1 = qnn.QuantLinear(5,5,bias=True)
cv1_copy = qnn.QuantLinear(5,5,bias=True)
cv1_copy.load_state_dict(cv1.state_dict())
bn1 = torch.nn.BatchNorm1d(5)
qnn.utils.merge_bn(cv1, bn1)
input = torch.Tensor(2,5)

In [None]:
cv1 is cv1_copy

False

In [None]:
cv1(input)

tensor([[ 3.0883e-01, -9.2026e-03,  3.9793e-01,  3.7391e-01,  4.2723e-01],
        [-3.9785e+20,  2.8513e+20, -5.6362e+19,  2.4866e+20,  3.8127e+20]],
       grad_fn=<AddmmBackward0>)

In [None]:
#output is the same without batchnorm, why?
output = cv1_copy(input)
output

tensor([[ 3.0883e-01, -9.2026e-03,  3.9794e-01,  3.7391e-01,  4.2723e-01],
        [-3.9785e+20,  2.8513e+20, -5.6362e+19,  2.4866e+20,  3.8127e+20]],
       grad_fn=<AddmmBackward0>)

In [None]:
output = bn1(input)
cv1_copy(output)

tensor([[ 0.3088, -0.0092,  0.3979,  0.3739,  0.4272],
        [ 0.3088, -0.0092,  0.3979,  0.3739,  0.4272]],
       grad_fn=<AddmmBackward0>)

In [None]:
tensor = torch.randint(30, (3,5,20)).to(dtype=torch.float32) / 10

In [None]:
torch.torch.nn.BatchNorm1d(5)(qnn.QuantConv1d(5,5,kernel_size=3)(tensor))

tensor([[[-1.5470e+00,  1.5270e-01, -2.5464e-01,  2.9082e-01,  1.2001e+00,
           6.7265e-01,  7.3291e-01, -1.8250e-01, -1.2105e+00, -1.0728e+00,
           3.1590e-01,  2.5334e+00, -1.3411e-01,  2.5901e+00,  6.9120e-02,
          -5.1725e-01,  1.9493e-01,  2.2703e+00],
         [-3.4492e-01, -9.6099e-01, -9.2788e-01, -5.6099e-01, -2.0823e+00,
           8.8297e-01,  4.6034e-01,  9.3609e-01,  1.8312e+00, -8.3214e-01,
          -1.0253e+00, -1.3361e+00, -1.3721e+00,  4.9575e-01, -6.1378e-01,
           3.7313e-01, -1.6607e+00, -7.7247e-01],
         [ 1.8919e+00,  8.4047e-01,  5.7258e-01,  3.1601e-01,  1.4367e-01,
          -4.8590e-01,  7.8809e-01,  5.6231e-01, -9.9302e-01, -5.6623e-01,
          -2.1978e-01, -8.2209e-01, -2.8324e-02,  9.5371e-01,  9.6952e-02,
          -6.7169e-01, -8.7423e-02, -4.8495e-02],
         [-2.3595e+00, -4.5535e-01,  1.1663e+00,  1.6639e+00,  5.8315e-01,
          -4.0086e-01, -4.8103e-01, -2.0247e+00,  4.6665e-01,  2.1141e-01,
           2.7884e-02,  1

In [None]:
cv1(tensor)

tensor([[[ 1.5805,  1.6874,  1.4232,  0.4953,  0.3392,  1.6921,  0.4289,
           0.6056,  1.4143, -0.3349,  0.7054,  1.2688,  0.7755,  0.8200,
           0.8781,  1.1979,  0.5057,  0.3852],
         [-0.6557,  0.1111,  0.1168,  0.1569, -0.1044, -0.7017, -0.4960,
          -0.0382, -0.2186,  0.1632,  0.3991, -0.2319, -0.1412, -1.0671,
          -0.2227, -0.4349,  0.0598, -0.0862],
         [-1.8093, -1.3581, -1.4218, -1.9729, -2.4638, -1.5891, -2.7596,
          -1.0707, -0.9489, -1.2964, -0.4576, -2.2601, -2.3817, -2.9702,
          -2.1696, -1.4183, -1.6867, -1.5476],
         [ 0.0087,  0.9669,  0.2861, -0.0195,  1.7805, -0.1940,  1.5801,
           0.4090,  0.1231, -0.3474,  0.4677,  0.5724,  0.8208,  1.0850,
           1.3689,  0.1611,  0.5700, -0.0545],
         [-1.7327, -0.0812, -0.3896, -1.2272, -0.7673, -1.1279, -1.3454,
          -0.1006,  0.1854, -0.1570, -0.0988, -0.2189, -0.5590, -1.7284,
          -0.0418, -0.0416,  0.0663, -0.0783]],

        [[ 1.6639,  0.7003,  1.20

### Debug QuantMultiheadAttention and merge_bn

In [None]:

small_attn = CausalSelfAttention(GPTConfig(block_size=16, n_embd=64, n_head=2))
#if batchnorm and mha are merged together, padding should not be necessary for inference
small_attn.mha = qnn.QuantMultiheadAttention(num_heads=2, embed_dim=64)
bn = torch.nn.BatchNorm1d(16)
bn_alt = torch.nn.BatchNorm1d(64) #along embedding dimension, if that works then merge_bn has to be changed
#bn_alt.load_state_dict(bn.state_dict())



torch.Size([3, 16, 64])

In [None]:
mul_factor, add_factor = qutils.mul_add_from_bn(
    bn_mean=bn.running_mean,
    bn_var=bn.running_var,
    bn_eps=bn.eps,
    bn_weight=bn.weight.data.clone(),
    bn_bias=bn.bias.data.clone())
output_channel_dim = 0
layer = small_attn.mha
#nan, why?
#mul_factor.view(...) returns a tensor of shape (context, 1)
out_ch_weight_shape = qutils.compute_channel_view_shape(layer.out_proj.weight, output_channel_dim)
print(f'{bn.running_mean}, {mul_factor.view(out_ch_weight_shape)}')
layer = small_attn.mha
output_channel_dim = 0
#out_proj_weight has shape of embedding length
print(f'out_ch_weight_shape: {out_ch_weight_shape}, out_proj_weight: {layer.out_proj.weight.size()}, output_channel_dim: {output_channel_dim}')
print(f'out_proj_weight_quant: {getattr(layer, "out_proj_weight_quant", None)}, out_proj_bias_quant: {getattr(layer, "out_proj_bias_quant", None)}')
print(f'mul_factor.view(out_ch_weight_shape): {mul_factor.view(out_ch_weight_shape)}')
#shape is relationship between context (context, context)
#apparently, out_proj is applied before the shape of the output is changed back to the input shape
print(f'layer.out_proj.weight.data: {layer.out_proj.weight.data.size()}')

tensor([-0.0003, -0.0022, -0.0028, -0.0017, -0.0015, -0.0014, -0.0017, -0.0016,
        -0.0016, -0.0014, -0.0015, -0.0016, -0.0017, -0.0016, -0.0019, -0.0017]), tensor([[1.0484],
        [1.0517],
        [1.0521],
        [1.0525],
        [1.0525],
        [1.0524],
        [1.0524],
        [1.0525],
        [1.0526],
        [1.0525],
        [1.0525],
        [1.0526],
        [1.0527],
        [1.0526],
        [1.0527],
        [1.0528]])
out_ch_weight_shape: (-1, 1), out_proj_weight: torch.Size([64, 64]), output_channel_dim: 0
out_proj_weight_quant: None, out_proj_bias_quant: None
mul_factor.view(out_ch_weight_shape): tensor([[1.0484],
        [1.0517],
        [1.0521],
        [1.0525],
        [1.0525],
        [1.0524],
        [1.0524],
        [1.0525],
        [1.0526],
        [1.0525],
        [1.0525],
        [1.0526],
        [1.0527],
        [1.0526],
        [1.0527],
        [1.0528]])
layer.out_proj.weight.data: torch.Size([64, 64])


In [None]:
small_attn.mha.out_proj.weight.size()

torch.Size([64, 64])

#### Test if forward pass even works

In [None]:
#size: 3,16,64
#number of heads: 2
#-> 64 / 2 = 32
attn_batchfirst = qnn.QuantMultiheadAttention(num_heads=2, embed_dim=64,batch_first=True)
attn_no_batchfirst = qnn.QuantMultiheadAttention(num_heads=2, embed_dim=64,batch_first=False)

In [None]:
embeddings.size()

torch.Size([3, 16, 64])

In [None]:
attn_no_batchfirst(embeddings, embeddings, embeddings)

  return super().rename_(names)
q_scaled: torch.Size([32, 3, 32]), k_transposed: torch.Size([32, 3, 32])


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [32, 32] but got: [32, 3].

In [None]:
#tensor = torch.Tensor(1,16,64)
q,k,v = [embeddings for _ in range(3)]
#it probably has something to do with the attention mask, maybe
#TODO: find out why attention mask is important
attn_no_batchfirst(q,k,v)

  return super().rename_(names)
q_scaled: torch.Size([32, 3, 32]), k_transposed: torch.Size([32, 3, 32])


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [32, 32] but got: [32, 3].

In [None]:
tensor = torch.Tensor(1,16,64)
attn_no_batchfirst.mha_shape_check(tensor,tensor,tensor, None, None, 2)

True

In [None]:
#code copied from forward pass of QuantMultiheadAttention
#function is called if in_proj quantization has been set
#TODO: find out what it does
def chunk(x, num=3, dim=-1):
    _len, _bsz, _dim = x.shape
    x = x.reshape(_len, _bsz, num, dim)
    return x[:, :, 0, :], x[:, :, 1, :], x[:, :, 2, :]
assert attn_no_batchfirst.in_proj is not None
from brevitas.nn.utils import check_tensors_same_ptr
#no idea what it does, it has to be True or else an Exception will be thrown
assert check_tensors_same_ptr([embeddings, embeddings, embeddings]) == True
torch._C._get_tracing_state()

query = embeddings
query.rename_('L', 'N', 'E')
#no idea why q,k,v are infered from the query and params key and value are still used
#this is an issue if no in_proj is specified i think
q,k,v = chunk(attn_no_batchfirst.in_proj(query))
print(f'{q.size()}, {k.size()}, {v.size()}')
#issue with wrong shapes could be that batch size is transposed instead of embedding dimension
#q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)

torch.Size([3, 16, 64]), torch.Size([3, 16, 64]), torch.Size([3, 16, 64])


In [None]:
tensor = torch.arange(9).reshape(3,3)
#columns become rows, rows become columns
print(f'{tensor}, \n{tensor.transpose(1,0)}')

tensor([[0, 1, 2],
        [3, 4, 5],
        [6, 7, 8]]), 
tensor([[0, 3, 6],
        [1, 4, 7],
        [2, 5, 8]])


In [None]:
small_attn_cpy = CausalSelfAttention(GPTConfig(block_size=16, n_embd=64, n_head=2))
#if batchnorm and mha are merged together, padding should not be necessary for inference
small_attn_cpy.mha = qnn.QuantMultiheadAttention(num_heads=2, embed_dim=64)
from brevitas import config
config.IGNORE_MISSING_KEYS = True #copy state dict does not return brevitas qparams
small_attn_cpy.load_state_dict(small_attn.state_dict())
#qparams from state dict are set to 1 at first
print(small_attn.mha.in_proj.input_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value)
print(small_attn_cpy.mha.in_proj.input_quant.fused_activation_quant_proxy.tensor_quant.scaling_impl.value)



Parameter containing:
tensor(1., requires_grad=True)
Parameter containing:
tensor(1., requires_grad=True)


In [None]:
"""
idea from: https://github.com/Xilinx/brevitas/issues/542#issuecomment-1446338490
merge_bn does not delete current batchnorm, meaning that one model has to be initialiized without bn and the parameters from the trained model
have to be copied to the model without bn
TODO: find more ressource efficient ways
"""
#at one step in merge_bn_mha, layer.out_proj.weight.data.mul_(mul_factor.view(out_ch_weight_shape)) is performed
#weight is of shape (embd_dim, embd_dim), mul_factor is of (shape features, 1)
#meaning that batchnorm probably normalizes along the embeddings instead of each sentence
"bn_alt feature length is 64 (embedding dimension)"


In [None]:
#merge_bn_mha appends batchnorm to mha, TODO: prepend it (maybe use input_quant_tensor or something)
#problem: merged and unmerged outputs are not the same, possibly since feature length is different
no_merge_attn_output = small_attn(embeddings)
no_merge_bn_output = bn(no_merge_attn_output)
try:
    merge_bn_mha(small_attn.mha, bn, output_channel_dim=0)
except Exception:
    merge_bn_mha(small_attn.mha, bn, output_channel_dim=1)
except Exception:
    merge_bn_mha(small_attn.mha, bn, output_channel_dim=2)
merge_attn_output = small_attn(embeddings)
assert torch.equal(no_merge_bn_output, merge_attn_output) == True

RuntimeError: The size of tensor a (64) must match the size of tensor b (16) at non-singleton dimension 1

In [None]:
help(layer.out_proj.weight.data.mul_)

Help on built-in function mul_:

mul_(...) method of torch.Tensor instance
    mul_(value) -> Tensor
    
    In-place version of :meth:`~Tensor.mul`.



In [None]:
#a.mul_(tensor) basically is a = a * tensor
a = torch.Tensor([1,2,3])
a.mul_(3)
a

tensor([3., 6., 9.])

In [None]:
small_attn(torch.Tensor(3,16,64)).size()

torch.Size([3, 16, 64])

In [None]:
m = torch.nn.Conv1d(16, 33, 3, stride=2)
input = torch.randn(20, 16, 50)
output = m(input)
output.size()

torch.Size([20, 33, 24])

In [None]:
#conv1d and batchnorm1d merge

qnn.quant_layer.merge_bn

<function brevitas.nn.utils.merge_bn(layer, bn, output_channel_dim=0)>

In [None]:
tensor = torch.rand((3,6,9))
tensor

tensor([[[0.9917, 0.4984, 0.6176, 0.5039, 0.8158, 0.8521, 0.0155, 0.1858,
          0.8048],
         [0.1621, 0.4298, 0.3947, 0.5427, 0.8238, 0.9419, 0.7478, 0.4333,
          0.0647],
         [0.0897, 0.2927, 0.9780, 0.6710, 0.0377, 0.8199, 0.1301, 0.8592,
          0.8216],
         [0.2074, 0.6790, 0.2042, 0.7838, 0.5414, 0.5088, 0.8481, 0.2490,
          0.1760],
         [0.0197, 0.6737, 0.1897, 0.2794, 0.4024, 0.3306, 0.8610, 0.8641,
          0.6871],
         [0.7651, 0.4413, 0.9831, 0.4328, 0.2344, 0.0799, 0.4901, 0.1151,
          0.9380]],

        [[0.4503, 0.5180, 0.3012, 0.7354, 0.2637, 0.9073, 0.9226, 0.7925,
          0.0674],
         [0.9067, 0.1654, 0.9186, 0.1072, 0.0438, 0.4049, 0.1374, 0.3990,
          0.6381],
         [0.3767, 0.8549, 0.5588, 0.2489, 0.2599, 0.6461, 0.5800, 0.1559,
          0.0832],
         [0.9381, 0.2192, 0.7259, 0.7615, 0.1411, 0.1472, 0.9268, 0.6733,
          0.9049],
         [0.1468, 0.8668, 0.3151, 0.5401, 0.4347, 0.5541, 0.0995, 0.

In [None]:
#normalized values along second dimension, meaning: along sentences
#are 
torch.nn.BatchNorm1d(6)(tensor)

tensor([[[ 1.5546, -0.3177,  0.1346, -0.2969,  0.8870,  1.0246, -2.1506,
          -1.5043,  0.8454],
         [-0.9784, -0.1143, -0.2279,  0.2499,  1.1573,  1.5384,  0.9118,
          -0.1030, -1.2930],
         [-1.5225, -0.7464,  1.8741,  0.7002, -1.7215,  1.2696, -1.3680,
           1.4198,  1.2762],
         [-1.0571,  0.6570, -1.0687,  1.0382,  0.1571,  0.0385,  1.2718,
          -0.9059, -1.1712],
         [-1.5085,  0.9969, -0.8575, -0.5138, -0.0426, -0.3174,  1.7148,
           1.7266,  1.0484],
         [ 1.0773, -0.1349,  1.8936, -0.1669, -0.9097, -1.4882,  0.0477,
          -1.3565,  1.7249]],

        [[-0.5003, -0.2434, -1.0663,  0.5820, -1.2088,  1.2342,  1.2923,
           0.7987, -1.9539],
         [ 1.4249, -0.9680,  1.4631, -1.1558, -1.3605, -0.1948, -1.0582,
          -0.2139,  0.5578],
         [-0.4253,  1.4034,  0.2712, -0.9140, -0.8717,  0.6049,  0.3522,
          -1.2693, -1.5475],
         [ 1.5987, -1.0141,  0.8276,  0.9569, -1.2980, -1.2758,  1.5576,
       

In [None]:
tensor[0][0].mean()

tensor(0.5873)

In [None]:
#tensor retains size, batchnorm essentially is a linear transformation to shift values to have a mean of 0 and a standard deviation of 1
torch.nn.BatchNorm1d(10)(torch.Tensor(3,10,16)).size()

torch.Size([3, 10, 16])

In [None]:
identity = qnn.QuantIdentity()
tensor = torch.Tensor(12,64,256)

In [None]:
tensor[0][0][0]

tensor(2.8026e-45)

In [None]:
output[0][0][0]

tensor(0.)

In [None]:
#test if quantidentity is a simple wrapper around a tensor that does nothing
#if so, it could be useful for merging with batchnorm
tensor = torch.Tensor(2,3,4)
print(tensor)
print("\n" + 30* "-" + "\n")
print(qnn.QuantIdentity()(tensor).isclose(tensor).all().item())

tensor([[[-1.1617e+35,  3.0907e-41, -1.5597e+37,  3.0907e-41],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  1.4013e-45,  0.0000e+00]],

        [[ 0.0000e+00,  0.0000e+00,  1.1351e-43,  0.0000e+00],
         [-1.5597e+37,  3.0907e-41, -3.0176e+34,  3.0907e-41],
         [ 0.0000e+00,  0.0000e+00,  1.4013e-45,  0.0000e+00]]])

------------------------------

False


In [None]:
output = identity(tensor)
output.size == tensor.size
output == tensor

tensor([[[False,  True, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]],

        [[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False]],

        [[False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         [False, False, False,  ..., False, False, False],
         ...,
         [False, False, False,  ..., False, False, False],
         [