In [1]:
from IPython.display import Image
import gc
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import os
os.environ['http_proxy'] = 'http://127.0.0.1:7890'
os.environ['https_proxy'] = 'http://127.0.0.1:7890'

$$
\mathbf {QK}^T\in \mathbb R^{N\times N}
$$

- 40 heads, bfloat16
    - multi head 可以并行地去算；

### Multi head

- $h$ 个 head, 每个头的维度为

    $$
    d_k=d_v=\frac{d_{model}}{h}
    $$


- 线性变换生成多头的 $Q_i, K_i, V_i$ (for $i=1,2,\cdots,h$)

    $$
    Q_i=XW_i^Q, K_i=XW_i^K, V_i=XW_i^V\quad \in \mathbb R^{N\times\frac{d_{model}}{h}}
    $$

- 计算每个头的注意力

    $$
    \text{head}_i=\text{Attention}(Q_i,K_i,V_i)=\text{softmax}\left(\frac{Q_iK_i^T}{\sqrt{d_k}}\right)V_i
    $$
  - 尤其需要注意的是 $Q_iK_i^T\in \mathbb R^{N\times N}$
- 拼接多头的输出: $\frac{d_{model}}{h}\times h$

    $$
    \text{MultiHead}(Q,K,V)=\text{Concat}(head_1, head_2,  \cdots, head_h)W^o
    $$
  - $W^o\in \mathbb R^{d_{model}\times d_{model}}$, learnable projection matrix

### 空间存储

- VRAM
- O(N^2)

In [5]:
# N = 1000
# MB
40 * 2 * (1000 ** 2) / (1024*1024)

76.2939453125

In [6]:
# N = 16000
# GB
40 * 2 * (16000 ** 2) / (1024*1024*1024)

19.073486328125

In [10]:
# N = 100,000
# TB
40 * 2 * (100_000 ** 2) / (1024*1024*1024*1024)

0.7275957614183426

In [8]:
def bytes_to_giga_bytes(bytes):
    return bytes / 1024 / 1024 / 1024
def flush():
    gc.collect()
    torch.cuda.empty_cache()
    # torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()

### performance benchmark

In [14]:
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", 
                                             torch_dtype=torch.bfloat16, 
                                             device_map="auto", 
                                             pad_token_id=0)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [15]:
model

GPTBigCodeForCausalLM(
  (transformer): GPTBigCodeModel(
    (wte): Embedding(49152, 6144)
    (wpe): Embedding(8192, 6144)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-39): 40 x GPTBigCodeBlock(
        (ln_1): LayerNorm((6144,), eps=1e-05, elementwise_affine=True)
        (attn): GPTBigCodeSdpaAttention(
          (c_attn): Linear(in_features=6144, out_features=6400, bias=True)
          (c_proj): Linear(in_features=6144, out_features=6144, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((6144,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTBigCodeMLP(
          (c_fc): Linear(in_features=6144, out_features=24576, bias=True)
          (c_proj): Linear(in_features=24576, out_features=6144, bias=True)
          (act): GELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((6144,), e

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")

In [21]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [7]:
prompt = "Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer:"

In [22]:
result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):]
result

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


' Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_gigabytes(bytes):\n    return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single'

In [23]:
print(result)

 Here is a Python function that transforms bytes to Giga bytes:

```python
def bytes_to_gigabytes(bytes):
    return bytes / 1024 / 1024 / 1024
```

This function takes a single


In [26]:
bytes_to_giga_bytes(torch.cuda.max_memory_allocated(device='cuda:0')) + bytes_to_giga_bytes(torch.cuda.max_memory_allocated(device='cuda:1'))

28.97451877593994

In [34]:
del pipe
del model

In [17]:
flush()

### quantization

- `bitsandbytes`
    - `load_in_8bit=True`

In [2]:
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_8bit=True, pad_token_id=0)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")

In [6]:
# model

In [3]:
prompt = "Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer:"

In [6]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):]
print(result)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


 Here is a Python function that transforms bytes to Giga bytes:

```python
def bytes_to_gigabytes(bytes):
    return bytes / 1024 / 1024 / 1024
```

This function takes a single


In [10]:
bytes_to_giga_bytes(torch.cuda.max_memory_allocated('cuda:0')) + bytes_to_giga_bytes(torch.cuda.max_memory_allocated('cuda:1')) 

15.230342864990234

In [11]:
bytes_to_giga_bytes(torch.cuda.max_memory_allocated('cuda:1'))

0.0

In [12]:
del model
del pipe

In [13]:
flush()

### flash attention

- outer loop: $\mathbf K_j, \mathbf V_j$
- inner loop: $\mathbf Q_i, \mathbf O_i$

In [14]:
# https://huggingface.co/docs/transformers/v4.35.0/llm_tutorial_optimization#2-flash-attention
Image(url='./imgs/hf_flash_attn.png', width=600)

In [2]:
system_prompt = """Below are a series of dialogues between various people and an AI technical assistant.
The assistant tries to be helpful, polite, honest, sophisticated, emotionally aware, and humble but knowledgeable.
The assistant is happy to help with code questions and will do their best to understand exactly what is needed.
It also tries to avoid giving false or misleading information, and it caveats when it isn't entirely sure about the right answer.
That said, the assistant is practical really does its best, and doesn't let caution get too much in the way of being useful.

The Starcoder models are a series of 15.5B parameter models trained on 80+ programming languages from The Stack (v1.2) (excluding opt-out requests).
The model uses Multi Query Attention, was trained using the Fill-in-the-Middle objective, and with 8,192 tokens context window for a trillion tokens of heavily deduplicated data.

-----

Question: Write a function that takes two lists and returns a list that has alternating elements from each input list.

Answer: Sure. Here is a function that does that.

def alternating(list1, list2):
   results = []
   for i in range(len(list1)):
       results.append(list1[i])
       results.append(list2[i])
   return results

Question: Can you write some test cases for this function?

Answer: Sure, here are some tests.

assert alternating([10, 20, 30], [1, 2, 3]) == [10, 1, 20, 2, 30, 3]
assert alternating([True, False], [4, 5]) == [True, 4, False, 5]
assert alternating([], []) == []

Question: Modify the function so that it returns all input elements when the lists have uneven length. The elements from the longer list should be at the end.

Answer: Here is the modified function.

def alternating(list1, list2):
   results = []
   for i in range(min(len(list1), len(list2))):
       results.append(list1[i])
       results.append(list2[i])
   if len(list1) > len(list2):
       results.extend(list1[i+1:])
   else:
       results.extend(list2[i+1:])
   return results

-----
"""

In [3]:
prompt = "Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer:"

In [4]:
long_prompt = 10 * system_prompt + prompt

In [5]:
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [6]:
import time

start_time = time.time()
result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):]

print(f"Generated in {time.time() - start_time} seconds.")
print(result)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Generated in 5.260891437530518 seconds.
 Sure, here is a function that does that.

def bytes_to_giga(bytes):
   return bytes / 1024 / 1024 / 1024

Answer: Sure, here is a function that does that.

def


In [9]:
bytes_to_giga_bytes(torch.cuda.max_memory_allocated('cuda:0')) + bytes_to_giga_bytes(torch.cuda.max_memory_allocated('cuda:1'))

33.42166042327881

In [10]:
flush()

In [13]:
# !pip install optimum

In [19]:
# Transformers now supports natively BetterTransformer optimizations (torch.nn.functional.scaled_dot_product_attention) for the model type gpt_bigcode. As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. Please upgrade to transformers>=4.36 and torch>=2.1.1 to use it. 
# model.to_bettertransformer()

In [14]:
os.environ['TORCH_CUDNN_SDPA_ENABLED'] = '1'

In [15]:
start_time = time.time()
from torch.nn.attention import SDPBackend, sdpa_kernel

with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):]

# with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
#     result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):]

print(f"Generated in {time.time() - start_time} seconds.")
print(result)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  sdpa_result = torch.nn.functional.scaled_dot_product_attention(
  sdpa_result = torch.nn.functional.scaled_dot_product_attention(
  sdpa_result = torch.nn.functional.scaled_dot_product_attention(
  sdpa_result = torch.nn.functional.scaled_dot_product_attention(
  sdpa_result = torch.nn.functional.scaled_dot_product_attention(
  sdpa_result = torch.nn.functional.scaled_dot_product_attention(


RuntimeError: No available kernel. Aborting execution.