In [1]:
import os
import sys
from tqdm import tqdm
import torch
from transformers import BertTokenizerFast
import lightning as pl

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))  
pl.seed_everything(42, workers=True)
torch.set_float32_matmul_precision(precision="high")

Seed set to 42


In [3]:
args = {
    "pretrain": '/home/zhulin/pretrain/bert_pretrain_uncased/',
    "model": "./SingleChannelPredictor.pt",
    "dataset": "/home/zhulin/datasets/cdatasets.test.5.csv",
    "batch_size": 4
}


In [27]:
class Timer:
    def __init__(self, epochs):
        self.epochs = epochs
        # 设置用于测量时间的 cuda Event, 这是PyTorch 官方推荐的接口,理论上应该最靠谱
        self.starter = torch.cuda.Event(enable_timing=True)
        self.ender = torch.cuda.Event(enable_timing=True)

    def warmup(self, interface, *args, **kwargs):
        # 预热, GPU 平时可能为了节能而处于休眠状态, 因此需要预热
        with torch.no_grad():
            for _ in range(10):
                interface(*args, **kwargs)
        torch.cuda.synchronize()

    def measure(self, interface, *args, **kwargs):
        timeings = []
        for _ in tqdm(range(self.epochs)):
            self.starter.record()
            interface(*args, **kwargs)
            self.ender.record()
            torch.cuda.synchronize() # 等待GPU任务完成
            t = self.starter.elapsed_time(self.ender)
            
            timeings.append(t)
        return timeings

In [5]:
### load model
tokenizer = BertTokenizerFast.from_pretrained(args["pretrain"], use_fast=True)
predictor = torch.jit.load(args["model"])

In [6]:
### load datasets
import numpy as np
import datatable as dt

data = dt.fread(args["dataset"], fill=True, max_nrows=128 * args["batch_size"]).to_pandas()

In [20]:
@torch.no_grad()
def interface(tokenizer, predictor, data, batchsize):
    for l in range(0, len(data), batchsize):
        padded_sent_seq = tokenizer(data.iloc[l:l+batchsize]["channel"].to_list(), padding=True, truncation=True, max_length=2048, return_tensors="pt")
        predictor(padded_sent_seq["input_ids"].cuda(), padded_sent_seq["attention_mask"].cuda())

In [28]:
predictor.cuda().eval()
timer = Timer(epochs=16)
times = []

data = dt.fread(args["dataset"], fill=True, max_nrows=128 * 8).to_pandas()
timer.warmup(interface, tokenizer, predictor, data, 8)

for batch_size in range(8, 64, 8):
    data = dt.fread(args["dataset"], fill=True, max_nrows=128 * batch_size).to_pandas()
    t = timer.measure(interface, tokenizer, predictor, data, batch_size)
    times.append(sum(t)/len(t))
print(times)

100%|██████████| 16/16 [00:38<00:00,  2.41s/it]
  0%|          | 0/16 [00:00<?, ?it/s]


RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript, serialized code (most recent call last):
  File "code/__torch__/core/predictor.py", line 25, in forward
    add_positional_encoding: bool=True) -> Tensor:
    net = self.net
    _0 = (net).forward(x, mask, add_positional_encoding, )
          ~~~~~~~~~~~~ <--- HERE
    return _0
  File "code/__torch__/core/transformer.py", line 24, in forward
      x1 = x0
    transformer = self.transformer
    x3 = (transformer).forward(x1, mask, )
          ~~~~~~~~~~~~~~~~~~~~ <--- HERE
    pooling_net = self.pooling_net
    x4 = (pooling_net).forward(x3, mask, )
  File "code/__torch__/core/transformer.py", line 51, in forward
    layers = self.layers
    _0 = getattr(layers, "0")
    return (_0).forward(x, mask, )
            ~~~~~~~~~~~ <--- HERE
class EncoderBlock(Module):
  __parameters__ = []
  File "code/__torch__/core/transformer.py", line 66, in forward
    mask: Optional[Tensor]=None) -> Tensor:
    attn = self.attn
    attn_out, _1, = (attn).forward(x, mask, )
                     ~~~~~~~~~~~~~ <--- HERE
    dropout = self.dropout
    x5 = torch.add(x, (dropout).forward(attn_out, ))
  File "code/__torch__/core/transformer.py", line 100, in forward
    qkv1 = torch.permute(qkv0, [0, 2, 1, 3])
    q, k, v, = torch.chunk(qkv1, 3, -1)
    values, attention, = _2(q, k, v, mask, )
                         ~~ <--- HERE
    values0 = torch.permute(values, [0, 2, 1, 3])
    values1 = torch.reshape(values0, [batch_size, seq_length, embed_dim])
  File "code/__torch__/core/transformer.py", line 112, in scaled_dot_product
  d_k = (torch.size(q))[-1]
  attn_logits = torch.matmul(q, torch.transpose(k, -2, -1))
  attn_logits0 = torch.div(attn_logits, torch.sqrt(d_k))
                 ~~~~~~~~~ <--- HERE
  if torch.__isnot__(mask, None):
    mask0 = unchecked_cast(Tensor, mask)

Traceback of TorchScript, original code (most recent call last):
  File "/home/zhulin/workspace/Jack/core/predictor.py", line 23, in forward
    def forward(self, x, mask: Optional [torch.Tensor]=None, add_positional_encoding: bool=True):
         return self.net(x, mask=mask, add_positional_encoding=add_positional_encoding)
                ~~~~~~~~ <--- HERE
  File "/home/zhulin/workspace/Jack/core/transformer.py", line 169, in forward
        if add_positional_encoding:
            x = self.positional_encoding(x)
        x = self.transformer(x, mask=mask)              # [Batch, SeqLen, ModDim]
            ~~~~~~~~~~~~~~~~ <--- HERE
        x = self.pooling_net(x, mask=mask)              # GlobalAveragePooling
        x = self.output_net(x)
  File "/home/zhulin/workspace/Jack/core/transformer.py", line 104, in forward
    def forward(self, x, mask: Optional [torch.Tensor]=None):
        for layer in self.layers:
            x = layer(x, mask=mask)
                ~~~~~ <--- HERE
        return x
  File "/home/zhulin/workspace/Jack/core/transformer.py", line 87, in forward
    def forward(self, x, mask: Optional [torch.Tensor]=None):
        # Attention part
        attn_out, _ = self.attn(x, mask=mask)
                      ~~~~~~~~~ <--- HERE
        x = x + self.dropout(attn_out)
        x = self.norm1(x)
  File "/home/zhulin/workspace/Jack/core/transformer.py", line 51, in forward
        q, k, v = qkv.chunk(3, dim=-1)
    
        values, attention = scaled_dot_product(q, k, v, mask=mask)
                            ~~~~~~~~~~~~~~~~~~ <--- HERE
        values = values.permute(0, 2, 1, 3) # [Batch, SeqLen, Head, Dims]
        values = values.reshape(batch_size, seq_length, embed_dim)
  File "/home/zhulin/workspace/Jack/core/transformer.py", line 23, in scaled_dot_product
    d_k = q.size()[-1]
    attn_logits = torch.matmul(q, k.transpose(-2, -1))
    attn_logits = attn_logits / math.sqrt(d_k)
                  ~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
    if mask is not None:
        attn_mask = mask.unsqueeze(1).unsqueeze(2)
RuntimeError: CUDA out of memory. Tried to allocate 1.61 GiB. GPU 0 has a total capacty of 11.66 GiB of which 907.19 MiB is free. Process 19592 has 2.98 GiB memory in use. Process 23209 has 4.78 GiB memory in use. Including non-PyTorch memory, this process has 2.95 GiB memory in use. Of the allocated memory 1.65 GiB is allocated by PyTorch, and 1.00 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


In [12]:
from loguru import logger
from tqdm import tqdm

predictor.cuda().eval()
# 预热, GPU 平时可能为了节能而处于休眠状态, 因此需要预热
logger.info('[+] warm up ...\n')
with torch.no_grad():
    for _ in range(10):
        # _ = predictor(dummy_input)
        interface(tokenizer, predictor, data, 8)
torch.cuda.synchronize()

# 设置用于测量时间的 cuda Event, 这是PyTorch 官方推荐的接口,理论上应该最靠谱
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
# 初始化一个时间容器
timings = np.zeros((100, 1))

logger.info('testing ...\n')
with torch.no_grad():
    for rep in tqdm(range(100)):
        starter.record()
        interface(tokenizer, predictor, data, 8)
        ender.record()
        torch.cuda.synchronize() # 等待GPU任务完成
        curr_time = starter.elapsed_time(ender) # 从 starter 到 ender 之间用时,单位为毫秒
        timings[rep] = curr_time

avg = timings.sum()/100/1000
logger.info('\navg={}s\n'.format(avg))

[32m2024-09-12 16:24:55.718[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m6[0m - [1m[+] warm up ...
[0m
[32m2024-09-12 16:25:16.175[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mtesting ...
[0m
100%|██████████| 100/100 [03:16<00:00,  1.97s/it]
[32m2024-09-12 16:28:32.746[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m29[0m - [1m
avg=1964.0056689453124
[0m


In [14]:
from torch.profiler import profile, record_function, ProfilerActivity

logger.info('[+] warm up ...\n')
with torch.no_grad():
    for _ in range(10):
        # _ = predictor(dummy_input)
        interface(tokenizer, predictor, data, 8)
torch.cuda.synchronize()

with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    with record_function("model_inference"):
        interface(tokenizer, predictor, data, 8)
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

[32m2024-09-12 16:47:30.687[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1m[+] warm up ...
[0m
STAGE:2024-09-12 16:47:50 38310:38310 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
STAGE:2024-09-12 16:47:53 38310:38310 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-09-12 16:47:53 38310:38310 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference        81.66%        1.699s       100.00%        2.081s        2.081s             1  
                                                forward         1.24%      25.872ms        16.56%     344.648ms       2.693ms           128  
                                           aten::linear         0.64%      13.221ms         5.02%     104.495ms     116.624us           896  
                                            aten::addmm         2.98%      62.067ms         3.65%      75.898ms      84.708us           896  
      