<a href="https://colab.research.google.com/github/donghuna/AI-Expert/blob/main/%EC%B5%9C%EC%A0%95%EC%9A%B1/qtorch_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Getting Started: Install QPyTorch

In [1]:
# Mount your own Google Drive into the Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# Change into the Google Drive folder that holds the lab files.
%cd /content/drive/MyDrive/AI-Expert/최정욱/QPytorch
# List the files in the folder.
!ls

/content/drive/MyDrive/AI-Expert/최정욱/QPytorch
PreResNet_fp32.pth  qtorch_tutorial.ipynb  requirements.txt


In [18]:
!pip install -r requirements.txt

Collecting nbsphinx (from -r requirements.txt (line 3))
  Downloading nbsphinx-0.9.5-py3-none-any.whl.metadata (2.1 kB)
Collecting Ninja (from -r requirements.txt (line 4))
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)
Collecting transformers==4.37.0 (from -r requirements.txt (line 8))
  Downloading transformers-4.37.0-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting qtorch==0.3.0 (from -r requirements.txt (line 9))
  Downloading qtorch-0.3.0-py3-none-any.whl.metadata (455 bytes)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.0->-r requirements.txt (line 8))
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting jedi>=0.16 (from ipython>=5.0.0->ipykernel->-r requirements.txt (line 2))
  Using cached jedi-0.19.1-py2.py3-none-any.whl

# Lab 1. Quantizer Tutorial

In [19]:
import torch
from qtorch.quant import Quantizer, quantizer
from qtorch import FloatingPoint

If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


### Input Tensor

In [20]:
# 3x3 tensor of uniform samples from [0, 1), used as quantizer input below.
random_input = torch.rand(3, 3)
print(random_input)

tensor([[0.3564, 0.5577, 0.3888],
        [0.3668, 0.2829, 0.6164],
        [0.9219, 0.8011, 0.5533]])


In [21]:
# Hand-picked inputs: two large magnitudes and two values on either side of 1.5.
constant_input = torch.tensor([
    [255.0, -255.0],
    [1.4, 1.6],
])
print(constant_input)

tensor([[ 255.0000, -255.0000],
        [   1.4000,    1.6000]])


### Quantization

In [22]:
# Example: FP4 (E2M1) Quantization
# 2 exponent bits and 1 mantissa bit (presumably plus a sign bit, giving 4 bits).
bit = FloatingPoint(exp=2, man=1)
# Quantization function for that format, using round-to-nearest.
quant = quantizer(forward_number=bit, forward_rounding="nearest")

In [23]:
# Quantize the random tensor with `quant` and show the result.
random_output = quant(random_input)
print(random_output)

tensor([[0.5000, 0.5000, 0.5000],
        [0.5000, 0.5000, 0.5000],
        [1.0000, 1.0000, 0.5000]])


In [24]:
# Quantize the hand-picked constants with the same `quant` and show the result.
constant_output = quant(constant_input)
print(constant_output)

tensor([[ 6.0000, -6.0000],
        [ 1.5000,  1.5000]])


# Lab 2. Reduced-Precision Training

In [25]:
import argparse
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from qtorch.quant import Quantizer, quantizer
from qtorch.optim import OptimLP
from torch.optim import SGD
from qtorch import FloatingPoint
from tqdm import tqdm
import math

In [26]:
# Load CIFAR-10 and build the train / test DataLoaders.
ds = torchvision.datasets.CIFAR10
path = os.path.join("./data", "CIFAR10")

# Normalization arguments shared by both pipelines; only the training
# pipeline adds random-crop / horizontal-flip augmentation.
_normalize_args = ((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(*_normalize_args),
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(*_normalize_args),
])

train_set = ds(path, train=True, download=True, transform=transform_train)
test_set = ds(path, train=False, download=True, transform=transform_test)

# Settings common to both loaders; only the training loader shuffles.
_loader_kwargs = dict(batch_size=128, num_workers=4, pin_memory=True)
loaders = {
    'train': torch.utils.data.DataLoader(train_set, shuffle=True, **_loader_kwargs),
    'test': torch.utils.data.DataLoader(test_set, **_loader_kwargs),
}

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/CIFAR10/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:17<00:00, 9870881.15it/s]


Extracting ./data/CIFAR10/cifar-10-python.tar.gz to ./data/CIFAR10
Files already downloaded and verified


In [27]:
# Two low-precision floating point formats, named after their total width.
bit_8 = FloatingPoint(exp=5, man=2)    # 5 exponent bits, 2 mantissa bits
bit_16 = FloatingPoint(exp=6, man=9)   # 6 exponent bits, 9 mantissa bits

# Quantization functions for the optimizer: weights and gradients use the
# 8-bit format with nearest rounding; momentum and the accumulator use the
# 16-bit format with stochastic rounding.
weight_quant = quantizer(forward_number=bit_8, forward_rounding="nearest")
grad_quant = quantizer(forward_number=bit_8, forward_rounding="nearest")
momentum_quant = quantizer(forward_number=bit_16, forward_rounding="stochastic")
acc_quant = quantizer(forward_number=bit_16, forward_rounding="stochastic")


def act_error_quant():
    """Return a new Quantizer module using `bit_8` in both directions.

    Exposed as a factory so that every layer can own its own Quantizer
    instance; both forward and backward rounding are "nearest".
    """
    return Quantizer(
        forward_number=bit_8,
        backward_number=bit_8,
        forward_rounding="nearest",
        backward_rounding="nearest",
    )

In [29]:
# Define model
def conv3x3(in_planes, out_planes, stride=1):
    """Create a 3x3 convolution with padding 1 and no bias term.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: convolution stride (default 1).

    Returns:
        The configured ``nn.Conv2d`` module.
    """
    return nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )

class BasicBlock(nn.Module):
    """Residual block applying BN -> ReLU -> conv twice, with quantization.

    The activation is quantized after every ReLU and after every convolution
    using a single quantizer module created by calling ``quant()``.
    """

    expansion = 1

    def __init__(self, inplanes, planes, quant, stride=1, downsample=None):
        """Create the block's layers.

        Args:
            inplanes: number of input channels.
            planes: number of output channels of both convolutions.
            quant: zero-argument callable returning a quantizer module.
            stride: stride of the first convolution.
            downsample: optional module applied to the input to produce the
                residual branch; when ``None`` the input itself is used.
        """
        super().__init__()
        # Sub-modules are assigned in a fixed order: bn1, relu, conv1, bn2,
        # conv2, downsample, quant.
        self.bn1 = nn.BatchNorm2d(inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = conv3x3(planes, planes)
        self.downsample = downsample
        self.stride = stride
        self.quant = quant()

    def _bn_relu_conv(self, x, bn, conv):
        """Apply ``bn`` -> ReLU -> quantize -> ``conv`` -> quantize to ``x``."""
        activated = self.quant(self.relu(bn(x)))
        return self.quant(conv(activated))

    def forward(self, x):
        """Return the main-path output plus the (optionally downsampled) input."""
        out = self._bn_relu_conv(x, self.bn1, self.conv1)
        out = self._bn_relu_conv(out, self.bn2, self.conv2)

        # The shortcut is computed from the untouched block input.
        residual = x if self.downsample is None else self.downsample(x)

        out += residual
        return out

class PreResNet(nn.Module):
    """Residual network made of three stages of ``BasicBlock`` modules.

    ``depth`` must be of the form ``6n + 2``; each stage then holds ``n``
    blocks. The stages produce 16, 32 and 64 channels, and the second and
    third stages are built with stride 2. Activations are quantized with the
    module returned by ``quant()``; the network input and the final output
    are quantized with a separate ``FloatingPoint(exp=6, man=9)`` quantizer.
    """

    def __init__(self,quant, num_classes=10, depth=20):
        """Build all layers and initialize conv / batch-norm parameters.

        Args:
            quant: zero-argument callable returning a quantizer module. It is
                called once here and is also handed to every block.
            num_classes: output size of the final linear layer.
            depth: network depth; must satisfy ``(depth - 2) % 6 == 0``.
        """

        super(PreResNet, self).__init__()
        assert (depth - 2) % 6 == 0, 'depth should be 6n+2'
        # Number of blocks in each of the three stages.
        n = (depth - 2) // 6

        block = BasicBlock

        # Running input-channel count; _make_layer updates it per stage.
        self.inplanes = 16
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1,
                               bias=False)
        self.layer1 = self._make_layer(block, 16, n, quant)
        self.layer2 = self._make_layer(block, 32, n, quant, stride=2)
        self.layer3 = self._make_layer(block, 64, n, quant, stride=2)
        self.bn = nn.BatchNorm2d(64 * block.expansion)
        self.relu = nn.ReLU(inplace=True)
        # Pools over an 8x8 window (forward's size comments note an 8x8 map here).
        self.avgpool = nn.AvgPool2d(8)
        self.fc = nn.Linear(64 * block.expansion, num_classes)
        self.quant = quant()
        # Separate format for the network input and the final output.
        # Positional args are presumably (forward_number, backward_number,
        # forward_rounding, backward_rounding) - confirm against qtorch.
        IBM_half = FloatingPoint(exp=6, man=9)
        self.quant_half = Quantizer(IBM_half, IBM_half, "nearest", "nearest")
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # NOTE: `n` is reused here and no longer holds the block count.
                # Conv weights ~ N(0, sqrt(2 / (kh * kw * out_channels))).
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                # Batch norm starts with weight 1 and bias 0.
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, quant, stride=1):
        """Build one stage of ``blocks`` residual blocks as an ``nn.Sequential``.

        Only the first block receives ``stride`` and the optional downsample
        module; ``self.inplanes`` is updated to the stage's output channels
        before the remaining blocks are created.
        """
        downsample = None
        # A 1x1 conv shortcut is needed when the spatial stride or the channel
        # count changes between the stage input and output.
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
            )

        layers = list()
        layers.append(block(self.inplanes, planes, quant , stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, quant))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Run the network on ``x`` and return the quantized final output."""
        # Input is quantized with the exp=6 / man=9 format.
        x = self.quant_half(x)
        x = self.conv1(x)
        x = self.quant(x)

        x = self.layer1(x)  # 32x32
        x = self.layer2(x)  # 16x16
        x = self.layer3(x)  # 8x8
        x = self.bn(x)
        x = self.relu(x)
        x = self.quant(x)

        x = self.avgpool(x)
        # Flatten everything except the batch dimension for the linear layer.
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        # Output is quantized with the exp=6 / man=9 format as well.
        x = self.quant_half(x)

        return x

In [30]:
# Instantiate the network; `act_error_quant` is called to create its quantizer modules.
model = PreResNet(act_error_quant)

In [31]:
# Run on the GPU when one is available, otherwise fall back to the CPU so the
# notebook still executes on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device=device)

In [32]:
# Full-precision SGD, wrapped by OptimLP so that weights, gradients, momentum
# and the accumulator pass through the quantizers defined earlier.
base_optimizer = SGD(
    model.parameters(),
    lr=0.05,
    momentum=0.9,
    weight_decay=5e-4,
)
optimizer = OptimLP(
    base_optimizer,
    weight_quant=weight_quant,
    grad_quant=grad_quant,
    momentum_quant=momentum_quant,
    acc_quant=acc_quant,
    # Counterpart of the x1000 loss scaling done in run_epoch.
    grad_scaling=1/1000,
)

In [33]:
def run_epoch(loader, model, criterion, optimizer=None, phase="train", loss_scale=1000):
    """Run one full pass over ``loader`` in training or evaluation mode.

    Args:
        loader: iterable of ``(input, target)`` batches that supports ``len()``.
        model: network to train or evaluate. Batches are moved to the
            notebook-level ``device`` before the forward pass.
        criterion: callable ``criterion(output, target)`` returning a scalar
            loss (assumed to be the batch mean when computing the average).
        optimizer: optimizer stepped after every batch. Required when
            ``phase == "train"``; ignored when ``phase == "eval"``.
        phase: ``"train"`` (gradients enabled, weights updated) or ``"eval"``.
        loss_scale: factor the loss is multiplied by before ``backward()``.
            The default of 1000 pairs with ``grad_scaling=1/1000`` passed to
            the optimizer wrapper in this notebook.

    Returns:
        dict with ``'loss'`` (average over samples) and ``'accuracy'`` (percent).

    Raises:
        ValueError: if ``phase == "train"`` without an optimizer, or if the
            loader yields no samples.
    """
    assert phase in ["train", "eval"], "invalid running phase"
    training = phase == "train"
    if training and optimizer is None:
        raise ValueError("an optimizer is required when phase='train'")

    if training:
        model.train()
    else:
        model.eval()

    loss_sum = 0.0
    correct = 0  # plain Python int, so it stays valid even for an empty loader
    ttl = 0
    with torch.autograd.set_grad_enabled(training):
        for input, target in tqdm(loader, total=len(loader)):
            input = input.to(device=device)
            target = target.to(device=device)
            output = model(input)
            loss = criterion(output, target)

            batch_size = input.size(0)
            loss_sum += loss.item() * batch_size
            # Predicted class = index of the largest output along dim 1.
            pred = output.max(dim=1).indices
            correct += pred.eq(target.view_as(pred)).sum().item()
            ttl += batch_size

            if training:
                optimizer.zero_grad()
                # Scale the loss before backprop (loss scaling).
                (loss * loss_scale).backward()
                optimizer.step()

    if ttl == 0:
        raise ValueError("loader produced no samples")

    return {
        'loss': loss_sum / float(ttl),
        'accuracy': correct / float(ttl) * 100.0,
    }

### Accuracy Before Training

In [34]:
# Evaluate on the test split before any training has happened.
test_res = run_epoch(
    loaders['test'],
    model,
    F.cross_entropy,
    optimizer=optimizer,
    phase="eval",
)
print(test_res)

100%|██████████| 79/79 [00:02<00:00, 39.31it/s]

{'loss': 3.2637169151306153, 'accuracy': 9.8}





### 3-Epoch Training

In [35]:
# Train for a fixed number of epochs and report the training loss of each.
NUM_EPOCHS = 3
for epoch in range(NUM_EPOCHS):
    train_res = run_epoch(
        loaders['train'],
        model,
        F.cross_entropy,
        optimizer=optimizer,
        phase="train",
    )
    print(f'=====> Epoch {epoch}')
    print(f'Train loss    : {train_res["loss"]}')

100%|██████████| 391/391 [00:13<00:00, 28.16it/s]

=====> Epoch 0
Train loss    : 1.6568851916122436



100%|██████████| 391/391 [00:13<00:00, 28.62it/s]


=====> Epoch 1
Train loss    : 1.1912027248764039


100%|██████████| 391/391 [00:13<00:00, 29.17it/s]

=====> Epoch 2
Train loss    : 0.9807163693237305





### Accuracy After Training

In [None]:
# Evaluate on the test split again, now with the trained weights.
test_res = run_epoch(
    loaders['test'],
    model,
    F.cross_entropy,
    optimizer=optimizer,
    phase="eval",
)
print(test_res)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 79/79 [00:00<00:00, 99.80it/s]

{'loss': 1.0952622838974, 'accuracy': 61.1}





# Lab 3.1. Trans-Precision Inference on CNN Models

In [36]:
# Define model
def conv3x3(in_planes, out_planes, stride=1):
    """Create a 3x3 convolution with padding 1 and no bias term.

    NOTE(review): same definition as the ``conv3x3`` in the Lab 2 section.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        stride: convolution stride (default 1).

    Returns:
        The configured ``nn.Conv2d`` module.
    """
    return nn.Conv2d(
        in_channels=in_planes,
        out_channels=out_planes,
        kernel_size=3,
        stride=stride,
        padding=1,
        bias=False,
    )

class BasicBlock(nn.Module):
    """Residual block applying BN -> ReLU -> conv twice, with quantization.

    The activation is quantized after every ReLU and after every convolution
    using a single quantizer module created by calling ``quant()``.

    NOTE(review): same layers and forward pass as the ``BasicBlock`` in the
    Lab 2 section.
    """

    expansion = 1

    def __init__(self, inplanes, planes, quant, stride=1, downsample=None):
        """Create the block's layers.

        Args:
            inplanes: number of input channels.
            planes: number of output channels of both convolutions.
            quant: zero-argument callable returning a quantizer module.
            stride: stride of the first convolution.
            downsample: optional module applied to the input to produce the
                residual branch; when ``None`` the input itself is used.
        """
        super().__init__()
        # Sub-modules are assigned in a fixed order: bn1, relu, conv1, bn2,
        # conv2, downsample, quant.
        self.bn1 = nn.BatchNorm2d(inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv2 = conv3x3(planes, planes)
        self.downsample = downsample
        self.stride = stride
        self.quant = quant()

    def _bn_relu_conv(self, x, bn, conv):
        """Apply ``bn`` -> ReLU -> quantize -> ``conv`` -> quantize to ``x``."""
        activated = self.quant(self.relu(bn(x)))
        return self.quant(conv(activated))

    def forward(self, x):
        """Return the main-path output plus the (optionally downsampled) input."""
        out = self._bn_relu_conv(x, self.bn1, self.conv1)
        out = self._bn_relu_conv(out, self.bn2, self.conv2)

        # The shortcut is computed from the untouched block input.
        residual = x if self.downsample is None else self.downsample(x)

        out += residual
        return out

class PreResNet(nn.Module):
    """Residual network made of three stages of ``BasicBlock`` modules.

    ``depth`` must be of the form ``6n + 2``; each stage then holds ``n``
    blocks. The stages produce 16, 32 and 64 channels, and the second and
    third stages are built with stride 2. Activations are quantized with the
    module returned by ``quant()``; the network input and the final output
    are quantized with a separate ``FloatingPoint(exp=6, man=9)`` quantizer.

    NOTE(review): same definition as the ``PreResNet`` in the Lab 2 section.
    """

    def __init__(self,quant, num_classes=10, depth=20):
        """Build all layers and initialize conv / batch-norm parameters.

        Args:
            quant: zero-argument callable returning a quantizer module. It is
                called once here and is also handed to every block.
            num_classes: output size of the final linear layer.
            depth: network depth; must satisfy ``(depth - 2) % 6 == 0``.
        """

        super(PreResNet, self).__init__()
        assert (depth - 2) % 6 == 0, 'depth should be 6n+2'
        # Number of blocks in each of the three stages.
        n = (depth - 2) // 6

        block = BasicBlock

        # Running input-channel count; _make_layer updates it per stage.
        self.inplanes = 16
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1,
                               bias=False)
        self.layer1 = self._make_layer(block, 16, n, quant)
        self.layer2 = self._make_layer(block, 32, n, quant, stride=2)
        self.layer3 = self._make_layer(block, 64, n, quant, stride=2)
        self.bn = nn.BatchNorm2d(64 * block.expansion)
        self.relu = nn.ReLU(inplace=True)
        # Pools over an 8x8 window (forward's size comments note an 8x8 map here).
        self.avgpool = nn.AvgPool2d(8)
        self.fc = nn.Linear(64 * block.expansion, num_classes)
        self.quant = quant()
        # Separate format for the network input and the final output.
        # Positional args are presumably (forward_number, backward_number,
        # forward_rounding, backward_rounding) - confirm against qtorch.
        IBM_half = FloatingPoint(exp=6, man=9)
        self.quant_half = Quantizer(IBM_half, IBM_half, "nearest", "nearest")
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # NOTE: `n` is reused here and no longer holds the block count.
                # Conv weights ~ N(0, sqrt(2 / (kh * kw * out_channels))).
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                # Batch norm starts with weight 1 and bias 0.
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, quant, stride=1):
        """Build one stage of ``blocks`` residual blocks as an ``nn.Sequential``.

        Only the first block receives ``stride`` and the optional downsample
        module; ``self.inplanes`` is updated to the stage's output channels
        before the remaining blocks are created.
        """
        downsample = None
        # A 1x1 conv shortcut is needed when the spatial stride or the channel
        # count changes between the stage input and output.
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
            )

        layers = list()
        layers.append(block(self.inplanes, planes, quant , stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, quant))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Run the network on ``x`` and return the quantized final output."""
        # Input is quantized with the exp=6 / man=9 format.
        x = self.quant_half(x)
        x = self.conv1(x)
        x = self.quant(x)

        x = self.layer1(x)  # 32x32
        x = self.layer2(x)  # 16x16
        x = self.layer3(x)  # 8x8
        x = self.bn(x)
        x = self.relu(x)
        x = self.quant(x)

        x = self.avgpool(x)
        # Flatten everything except the batch dimension for the linear layer.
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        # Output is quantized with the exp=6 / man=9 format as well.
        x = self.quant_half(x)

        return x

In [37]:
# Three floating point formats, named after their total width.
bit_8 = FloatingPoint(exp=5, man=2)     # 5 exponent bits, 2 mantissa bits
bit_16 = FloatingPoint(exp=6, man=9)    # 6 exponent bits, 9 mantissa bits
bit_32 = FloatingPoint(exp=8, man=23)   # 8 exponent bits, 23 mantissa bits


def act_error_quant():
    """Return a new Quantizer module using `bit_8` in both directions.

    Exposed as a factory so that every layer can own its own Quantizer
    instance; both forward and backward rounding are "nearest".
    """
    return Quantizer(
        forward_number=bit_8,
        backward_number=bit_8,
        forward_rounding="nearest",
        backward_rounding="nearest",
    )

In [38]:
# Fresh network for inference; its weights are loaded from a checkpoint in the next cell.
model = PreResNet(act_error_quant)

In [39]:
# Load the pretrained weights. Tensors are mapped to the CPU first so loading
# does not depend on the device the checkpoint was saved from; the model is
# moved to `device` in a later cell.
# NOTE(review): consider `weights_only=True` if the checkpoint is known to
# contain only tensors and plain containers.
checkpoint = torch.load('./PreResNet_fp32.pth', map_location='cpu')
model.load_state_dict(checkpoint['model'])

  checkpoint = torch.load('./PreResNet_fp32.pth')


<All keys matched successfully>

In [40]:
# Run on the GPU when one is available, otherwise fall back to the CPU so the
# notebook still executes on a CPU-only runtime.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device=device)

In [41]:
def run_test(loader, model, criterion, optimizer=None):
    """Evaluate ``model`` on every batch of ``loader`` without gradients.

    Args:
        loader: iterable of ``(input, target)`` batches that supports ``len()``.
        model: network to evaluate. Batches are moved to the notebook-level
            ``device`` before the forward pass.
        criterion: callable ``criterion(output, target)`` returning a scalar
            loss (assumed to be the batch mean when computing the average).
        optimizer: unused; accepted only so existing call sites keep working.

    Returns:
        dict with ``'loss'`` (average over samples) and ``'accuracy'`` (percent).

    Raises:
        ValueError: if the loader yields no samples.
    """
    model.eval()

    loss_sum = 0.0
    correct = 0  # plain Python int, so it stays valid even for an empty loader
    ttl = 0
    with torch.no_grad():
        for input, target in tqdm(loader, total=len(loader)):
            input = input.to(device=device)
            target = target.to(device=device)
            output = model(input)
            loss = criterion(output, target)

            batch_size = input.size(0)
            loss_sum += loss.item() * batch_size
            # Predicted class = index of the largest output along dim 1.
            pred = output.max(dim=1).indices
            correct += pred.eq(target.view_as(pred)).sum().item()
            ttl += batch_size

    if ttl == 0:
        raise ValueError("loader produced no samples")

    return {
        'loss': loss_sum / float(ttl),
        'accuracy': correct / float(ttl) * 100.0,
    }

In [42]:
# Evaluate the checkpointed model on the test split.
# run_test never uses an optimizer, so none is passed; this avoids depending
# on the `optimizer` object created in the Lab 2 section.
test_res = run_test(loaders['test'], model, F.cross_entropy)
print(test_res)

100%|██████████| 79/79 [00:00<00:00, 99.22it/s] 

{'loss': 0.26722962763309477, 'accuracy': 92.12}





# Lab 3.2. Trans-Precision Inference on LLMs

In [56]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [57]:
# Hugging Face model id of the LLM used in this lab.
model_id = "Qwen/Qwen2-0.5B-Instruct"
device = "cuda" # the device to load the model onto

# Load the model in FP32 with the "eager" attention implementation.
# NOTE(review): eager attention is presumably needed so that the custom
# attn_forward defined later can replace Qwen2Attention.forward - confirm.
LLM = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    attn_implementation="eager",
    device_map=device
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [58]:
# User prompt sent to the model.
prompt = "Give me a short introduction of Samsung Electronics."

In [59]:
# Wrap the prompt in a system + user chat and render it as text with the
# tokenizer's chat template (generation prompt appended).
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# Deterministic decoding: a single beam and no sampling, up to 128 new tokens.
generated_ids = LLM.generate(
    model_inputs.input_ids,
    num_beams=1,
    do_sample=False,
    max_new_tokens=128,
)

# Drop the prompt tokens from the front of every generated sequence.
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)



Samsung Electronics is one of the world's largest and most successful electronics companies, with a presence in over 190 countries. The company was founded in 1938 by Lee Dong-hwan and has since grown to become one of the world's leading producers of consumer electronics, including smartphones, TVs, computers, and other devices.

Samsung Electronics operates several major divisions, including the Mobile Phone Division, which produces smartphones, tablets, and other mobile devices; the TV Division, which produces TVs, LCD monitors, and other home entertainment products; the Consumer Electronics Division, which produces a wide range of consumer electronics products, including televisions


In [60]:
# Display the module tree of the loaded model.
LLM

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Linear(in_features=8

## Define Quantized Modules

In [61]:
from typing import List, Optional, Tuple, Union
from transformers.cache_utils import Cache, DynamicCache

class MatMul(nn.Module):
    """Module wrapper around the matrix product of two tensors."""

    def forward(self, A, B):
        """Return the matrix product ``A @ B``."""
        return torch.matmul(A, B)

class QuantMatMul(nn.Module):
    """Matrix product whose two operands are quantized to ``qbit`` first.

    Both operands are read through ``.data``, so the result carries no
    autograd history back to them.
    """

    def __init__(self, qbit):
        """Create the shared quantizer.

        Args:
            qbit: number format used as both forward and backward number,
                with "nearest" rounding in both directions.
        """
        super().__init__()
        self.quantizer = Quantizer(
            forward_number=qbit,
            backward_number=qbit,
            forward_rounding="nearest",
            backward_rounding="nearest",
        )

    def forward(self, A, B):
        """Quantize ``A`` then ``B`` and return their matrix product."""
        quantize = self.quantizer.quantize
        quantized_a = quantize(A.data)
        quantized_b = quantize(B.data)
        return quantized_a @ quantized_b

from transformers.models.qwen2.modeling_qwen2 import rotate_half, apply_rotary_pos_emb, repeat_kv

def attn_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Cache] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    **kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Attention forward pass that routes both matmuls through sub-modules.

    Meant to be bound to an attention module as its ``forward``. The
    query-key product goes through ``self.matmul1`` and the
    weights-value product through ``self.matmul2``, so those two
    attributes must exist on the module before this is called.

    Args:
        hidden_states: input of size ``(bsz, q_len, hidden)``.
        attention_mask: optional additive mask of size
            ``(bsz, 1, q_len, kv_seq_len)``.
        position_ids: positions passed to the rotary embedding.
        past_key_value: optional key/value cache; updated in place when given.
        output_attentions: when False the returned attention weights are None.
        use_cache: accepted for interface compatibility; not read here.
        **kwargs: only ``padding_mask`` is inspected, to emit a warning.

    Returns:
        Tuple ``(attn_output, attn_weights or None, past_key_value)``.

    Raises:
        ValueError: if a cache is passed but ``self.layer_idx`` is None, or
            if the attention weights, mask or output have unexpected sizes.
    """
    # Imported locally: `warnings` is not imported anywhere else in this
    # notebook, so the branch below would otherwise raise NameError.
    import warnings

    if "padding_mask" in kwargs:
        warnings.warn(
            "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
        )
    bsz, q_len, _ = hidden_states.size()

    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    # Split the projections into heads: (bsz, heads, q_len, head_dim).
    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        if self.layer_idx is None:
            raise ValueError(
                f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                "with a layer index."
            )
        kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

    if past_key_value is not None:
        cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
        key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    # First matmul (Q @ K^T) goes through the pluggable matmul1 module.
    attn_weights = self.matmul1(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

    if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
        raise ValueError(
            f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
            f" {attn_weights.size()}"
        )

    if attention_mask is not None:
        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
            raise ValueError(
                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
            )

        attn_weights = attn_weights + attention_mask

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
    # Second matmul (weights @ V) goes through the pluggable matmul2 module.
    attn_output = self.matmul2(attn_weights, value_states)

    if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
            f" {attn_output.size()}"
        )

    # Merge the heads back: (bsz, q_len, hidden_size).
    attn_output = attn_output.transpose(1, 2).contiguous()
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

    attn_output = self.o_proj(attn_output)

    if not output_attentions:
        attn_weights = None

    return attn_output, attn_weights, past_key_value

In [62]:
class QuantLinear(nn.Linear):
    """``nn.Linear`` that quantizes its input and weight before the product.

    The bias is used as-is. Input and weight are read through ``.data``, so
    the output carries no autograd history back to them.
    """

    def __init__(self,in_features,out_features,bias,qbit):
        """Create the linear layer plus one quantizer each for input and weight.

        Args:
            in_features: size of each input sample.
            out_features: size of each output sample.
            bias: whether the layer has a bias term.
            qbit: number format used by both quantizers ("nearest" rounding).
        """
        super().__init__(in_features, out_features, bias)
        quantizer_kwargs = dict(
            forward_number=qbit,
            backward_number=qbit,
            forward_rounding="nearest",
            backward_rounding="nearest",
        )
        self.quantizer_x = Quantizer(**quantizer_kwargs)
        self.quantizer_w = Quantizer(**quantizer_kwargs)

    def forward(self,x):
        """Return ``linear(quantize(x), quantize(weight), bias)``."""
        # NOTE(review): the clone looks redundant if `quantize` returns a new
        # tensor rather than writing in place - confirm before removing.
        weight_copy = self.weight.clone()
        quantized_input = self.quantizer_x.quantize(x.data)
        quantized_weight = self.quantizer_w.quantize(weight_copy.data)
        return F.linear(quantized_input, quantized_weight, self.bias)

In [63]:
# NOTE(review): this only displays the generator's repr; iterate over it to see the module names.
LLM.named_modules()

<generator object Module.named_modules at 0x7ca9cc0b86d0>

In [64]:
# Display the module tree again (get_QLLM has not been applied yet at this point).
LLM

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
  )
  (lm_head): Linear(in_features=8

In [65]:
# Inspect the attention sub-module of the first decoder layer.
LLM.model.layers[0].self_attn

Qwen2Attention(
  (q_proj): Linear(in_features=896, out_features=896, bias=True)
  (k_proj): Linear(in_features=896, out_features=128, bias=True)
  (v_proj): Linear(in_features=896, out_features=128, bias=True)
  (o_proj): Linear(in_features=896, out_features=896, bias=False)
  (rotary_emb): Qwen2RotaryEmbedding()
)

In [66]:
from types import MethodType

def get_QLLM(LLM, qbit):
    """Patch ``LLM`` in place so its matmuls and linear layers are quantized.

    Two passes are made over the module tree:

    1. Every ``Qwen2Attention`` module gets ``matmul1`` / ``matmul2``
       (``QuantMatMul``) attributes and has its ``forward`` rebound to
       ``attn_forward``.
    2. Every ``nn.Linear`` whose qualified name does not contain ``'head'``
       is replaced by a ``QuantLinear`` that shares the original weight data
       and bias.

    The model is put in eval mode before returning.

    Args:
        LLM: the model to modify.
        qbit: number format passed to every ``QuantMatMul`` / ``QuantLinear``.

    Returns:
        dict mapping the qualified name of each replaced linear layer to its
        new ``QuantLinear`` module.

    Raises:
        RuntimeError: if a module's parent was not seen before the module.
    """
    from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention

    # Snapshot the modules first: attaching sub-modules while the live
    # generator is still being consumed mutates the tree under iteration.
    for module in list(LLM.modules()):
        if isinstance(module, Qwen2Attention):
            module.matmul1 = QuantMatMul(qbit)
            module.matmul2 = QuantMatMul(qbit)
            module.forward = MethodType(attn_forward, module)

    wrapped_modules = {}
    module_dict = {}
    # Snapshot again so that replacing children does not disturb the walk.
    for name, m in list(LLM.named_modules()):
        module_dict[name] = m
        # "a.b.c" -> parent "a.b", child "c"; a top-level name has parent "",
        # which is the name of the root module.
        father_name, _, child_name = name.rpartition('.')
        if father_name not in module_dict:
            raise RuntimeError(f"father module {father_name} not found")
        if isinstance(m, nn.Linear) and 'head' not in name:
            new_m = QuantLinear(m.in_features, m.out_features, m.bias is not None, qbit=qbit)
            # Reuse the existing parameters instead of the fresh random init.
            new_m.weight.data = m.weight.data
            new_m.bias = m.bias
            wrapped_modules[name] = new_m
            setattr(module_dict[father_name], child_name, new_m)
    LLM.eval()
    return wrapped_modules

In [67]:
# Format with 5 exponent bits and 2 mantissa bits (same parameters as `bit_8` above).
qbit = FloatingPoint(exp=5, man=2)
# Patch the model in place; the returned dict maps layer names to the new QuantLinear modules.
wrapped_modules = get_QLLM(LLM, qbit)

In [68]:
# Same prompt as in the earlier generation cell, so the two responses can be compared.
prompt = "Give me a short introduction of Samsung Electronics."

In [69]:
# Wrap the prompt in a system + user chat and render it as text with the
# tokenizer's chat template (generation prompt appended).
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# Deterministic decoding: a single beam and no sampling, up to 128 new tokens.
generated_ids = LLM.generate(
    model_inputs.input_ids,
    num_beams=1,
    do_sample=False,
    max_new_tokens=128,
)

# Drop the prompt tokens from the front of every generated sequence.
generated_ids = [
    full_sequence[len(prompt_ids):]
    for prompt_ids, full_sequence in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

Samsung Electronics is a South Korean multinational corporation headquartered in Seoul, Korea. The company was founded in 1978 and has since become one of the world's largest electronics manufacturers with operations in over 100 countries worldwide.

Samsung Electronics is known for its high-quality smartphones, tablets, smart TVs, and other consumer electronics products. It also produces various other consumer goods such as televisions, refrigerators, washing machines, and dishwashers.

In addition to these products, Samsung Electronics also operates several other businesses including automotive, home appliances, and medical devices. The company has been recognized globally for its innovative technology and commitment to quality


In [70]:
# Display the module tree after get_QLLM has swapped in the quantized modules.
LLM

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): QuantLinear(
            in_features=896, out_features=896, bias=True
            (quantizer_x): Quantizer()
            (quantizer_w): Quantizer()
          )
          (k_proj): QuantLinear(
            in_features=896, out_features=128, bias=True
            (quantizer_x): Quantizer()
            (quantizer_w): Quantizer()
          )
          (v_proj): QuantLinear(
            in_features=896, out_features=128, bias=True
            (quantizer_x): Quantizer()
            (quantizer_w): Quantizer()
          )
          (o_proj): QuantLinear(
            in_features=896, out_features=896, bias=False
            (quantizer_x): Quantizer()
            (quantizer_w): Quantizer()
          )
          (rotary_emb): Qwen2RotaryEmbedding()
          (matmul1): QuantMatMul(
            (