In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Local directory holding the pretrained checkpoint. Written as a raw string so
# the Windows backslashes are kept literally instead of being parsed as
# (invalid) escape sequences such as "\p" and "\Q".
model_name = r"E:\pretrained_models\Qwen\Qwen2___5-0___5B-Instruct"

# Load the causal-LM weights and the matching tokenizer from the directory in
# `model_name`; dtype and device placement are both left to the library's
# "auto" setting.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Two-option probability question: option (a) is option (b) plus an additional
# conjunct. Built with implicit string concatenation; the text contains no
# placeholders, so no f-string prefix is needed.
prompt = (
    "Luna is 29 years old, married, deeply passionate about environmental conservation and transgender rights, and volunteers their weekends at local park clean-ups. They studied physics and applied math in college, and held several campaigns to reduce the campus’s carbon footprint. Which is more **probable**?\n"
    "(a) Luna is an assistant professor in aerospace engineering **and** is an active member of an environmental advocacy group.\n"
    "(b) Luna is an assistant professor in aerospace engineering.\n"
)
# Chat-format version of the prompt (system + user turns).
# NOTE(review): `messages` is not consumed in this cell — the chat-template
# call below is commented out and the raw prompt string is tokenized instead.
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
# text = tokenizer.apply_chat_template(
#     messages,
#     tokenize=False,
#     add_generation_prompt=True
# )
text = prompt
# Tokenize as a batch of one and move the resulting tensors to the model's device.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Generate at most 1024 new tokens for the tokenized batch.
generated_ids = model.generate(**model_inputs, max_new_tokens=1024)

# Keep only the continuation: slice the prompt-length prefix off every
# generated sequence.
generated_ids = [
    full_seq[len(prompt_seq):]
    for prompt_seq, full_seq in zip(model_inputs.input_ids, generated_ids)
]

# Decode the first (only) continuation to text, omitting special tokens.
decoded_batch = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
response = decoded_batch[0]

In [6]:
# Echo the current value of PROMPT through the notebook's repr display.
# NOTE(review): the saved output is a one-element tuple, whereas the assignment
# in the following cell is a plain str — presumably left over from an earlier
# definition; re-run on a fresh kernel to confirm.
PROMPT

('Beth places four whole ice cubes in a frying pan at the start of the first minute, then five at the start of the second minute and some more at the start of the third minute, but none in the fourth minute. If the average number of ice cubes per minute placed in the pan while it was frying a crispy egg was five, how many whole ice cubes can be found in the pan at the end of the third minute?\nA. 30\nB. 0\nC. 20\nD. 10\nE. 11\nF. 5\n',)

In [7]:
# Checkpoint directory (relative path). Raw string so the Windows backslashes
# are kept literally rather than parsed as (invalid) escape sequences.
NAME = r"saves\sft\checkpoint-2100"
# Multiple-choice question used as the user turn.
PROMPT = "Beth places four whole ice cubes in a frying pan at the start of the first minute, then five at the start of the second minute and some more at the start of the third minute, but none in the fourth minute. If the average number of ice cubes per minute placed in the pan while it was frying a crispy egg was five, how many whole ice cubes can be found in the pan at the end of the third minute?\nA. 30\nB. 0\nC. 20\nD. 10\nE. 11\nF. 5\n"
# Chat transcript (system + user) later passed to the tokenizer's chat template.
MES = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": PROMPT}
]
# Cap on newly generated tokens (used as max_new_tokens in the generate calls).
MAX = 1024

import os
import torch
# NOTE(review): presumably intended to turn on CUDA device-side assertions;
# confirm that setting this variable at runtime (after torch is imported) has
# any effect.
os.environ.update({"PYTORCH_USE_CUDA_DSA": "1"})
# Ask PyTorch's CUDA caching allocator to release unused cached memory.
torch.cuda.empty_cache()

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2ForCausalLM, Qwen2MoeForSequenceClassification, LlamaForCausalLM
import torch
from model.global_attn import get_ga_model

torch.cuda.empty_cache()
# get_ga_model(NAME) is indexed like a pair; element [1] is used as the class
# whose from_pretrained loads the checkpoint (the repr printed in a later cell
# shows a GAForCausalLM instance). The model is placed on CUDA, asked to return
# hidden states, and switched to eval mode.
net = get_ga_model(NAME)[1].from_pretrained(
    NAME, device_map="cuda", torch_dtype="auto", output_hidden_states=True
).eval()
# Tokenizer is loaded from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(NAME)




In [10]:
# print (rather than a bare expression) so the embedded "\n" characters render
# as real line breaks in the output.
print(PROMPT)

Beth places four whole ice cubes in a frying pan at the start of the first minute, then five at the start of the second minute and some more at the start of the third minute, but none in the fourth minute. If the average number of ice cubes per minute placed in the pan while it was frying a crispy egg was five, how many whole ice cubes can be found in the pan at the end of the third minute?
A. 30
B. 0
C. 20
D. 10
E. 11
F. 5



In [8]:
# Render the MES chat transcript into a single prompt string: tokenize=False
# returns text instead of ids, and add_generation_prompt=True requests the
# assistant-turn prefix so the model continues as the assistant.
text = tokenizer.apply_chat_template(
    MES,
    tokenize=False,
    add_generation_prompt=True, 
)

In [9]:
torch.cuda.empty_cache()
# Inference only: gradient tracking is disabled for tokenization and generation.
with torch.no_grad():
    # Batch of one prompt, moved to the same device as the model.
    model_inputs = tokenizer([text], return_tensors="pt").to(net.device)
    # Generate up to MAX new tokens.
    generated_ids = net.generate(
        **model_inputs,
        max_new_tokens=MAX, 
    )

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [11]:
# Drop the prompt-length prefix from each generated sequence so that only the
# newly produced tokens remain.
generated_ids_ = [
    full_seq[len(prompt_seq):]
    for prompt_seq, full_seq in zip(model_inputs.input_ids, generated_ids)
]
# Decode the first (only) continuation, omitting special tokens, and show it.
response = tokenizer.batch_decode(generated_ids_, skip_special_tokens=True)[0]

print(response)


Let's break down the problem step-by-step to find the total number of ice cubes in the frying pan at the end of the third minute.

### Step 1: Initial Ice Cubes in the Pan
At the start of the first minute, there are 4 ice cubes.
- At the start of the second minute, there are 5 ice cubes.
- At the start of the third minute, we have not placed any more ice cubes (so the number of ice cubes is 0).
- So, the initial total number of ice cubes in the pan is 4 + 5 = <<4+5=9>>9.
- Therefore, the total number of ice cubes in the pan is 9 - 5 = <<9-5=4>>4.
The fourth-minute number of ice cubes is 0.
So, the total number of ice cubes in the pan at the end of the fourth minute is 4 + 5 = <<4+5=9>>9.
#### Average Number of Ice Cubes in the Pan
We need to find the number of ice cubes in the pan for the fourth minute, which is 0.
So, the total number of ice cubes at the end of the fourth minute is 4 + 5 = <<4+5=9>>9.
#### Total Number of Ice Cubes
To find the total number of ice cubes, we add the tot

In [7]:
# Dump the module tree of the loaded model to inspect its architecture.
print(net)

GAForCausalLM(
  (model): GAModel(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,),

In [27]:
# with torch.no_grad():
#     sen = generated_ids["sequences"].detach().clone()[0]
#     has_token = len(model_inputs["input_ids"][0])
#     num_new_token = len(sen) - has_token
#     for i in range(1, num_new_token):
#         print(f"Token {tokenizer.decode(sen[i + has_token - 1])}")
#         for j in range(net.config.num_hidden_layers):
#             print(f"Layer {j}")
#             x = generated_ids["attentions"][i][j].cpu().detach()
#             a = torch.argsort(x[0][0][0])
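#             # Decode the tokens at the first two argsort positions.
#             # NOTE(review): the original comment said "top-2 largest values", but
#             # torch.argsort sorts ascending by default, so a[:2] selects the two
#             # SMALLEST attention weights — confirm intent (e.g. descending=True).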
#             print(
#                 tokenizer.decode(
#                     sen[a[:2].tolist()],
#                 )
#             )

Token Given
Layer 0
 passionate rights
Layer 1
 rights Cloud
Layer 2
 deeply<|im_start|>
Layer 3
 Luna member
Layer 4
 rights and
Layer 5
and**
Layer 6
<|im_start|> aerospace
Layer 7
 deeply reduce
Layer 8
 the reduce
Layer 9
 rights carbon
Layer 10
 are the
Layer 11
 by.
Layer 12
 the reduce
Layer 13
system helpful
Layer 14
<|im_start|> are
Layer 15
 the are
Layer 16
.
 footprint
Layer 17
<|im_start|><|im_start|>
Layer 18
 aerospace clean
Layer 19
 helpful campus
Layer 20
 helpful a
Layer 21
 reduce held
Layer 22
 group footprint
Layer 23
 footprint-ups
Token  that
Layer 0
 passionate conservation
Layer 1
 Cloud rights
Layer 2
 about<|im_end|>
Layer 3
 Luna Luna
Layer 4
 rights and
Layer 5
andsystem
Layer 6
 aerospace<|im_start|>
Layer 7
 reduce the
Layer 8
 the’s
Layer 9
 rights transgender
Layer 10
 aresystem
Layer 11
.user
Layer 12
 thesystem
Layer 13
<|im_start|>system
Layer 14
<|im_start|>user
Layer 15
 are by
Layer 16
 group.

Layer 17
<|im_start|><|im_start|>
Layer 18
<|im_star

In [48]:
# Scratch check of tuple syntax: (2) is just the int 2 (parentheses only group),
# while (2,) is a one-element tuple, so the concatenation is (2, 2, (2,)).
((2), (2)) + ((2,), )

(2, 2, (2,))

In [2]:
import torch
# Demonstration that batched matmul rejects these shapes. The error is caught
# and displayed instead of being left uncaught, so a top-to-bottom run of the
# notebook is not interrupted by this cell.
# Left operand is (2, 3, 4) and right operand is (2, 4, 5, 6): the leading
# batch dimensions (2,) and (2, 4) do not broadcast against each other.
try:
    matmul_shape = (torch.randn((2, 3, 4)) @ torch.randn((2, 4, 5, 6))).shape
    matmul_error = None
except RuntimeError as err:
    matmul_shape = None
    matmul_error = err  # kept for inspection in later cells
    print(f"{type(err).__name__}: {err}")

RuntimeError: The size of tensor a (2) must match the size of tensor b (4) at non-singleton dimension 1

In [1]:
# Absolute path of the SFT checkpoint directory (raw string; same value as the
# doubled-backslash spelling).
NAME = r"E:\nlp\toy\attention\saves\sft\checkpoint-280"
from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2ForCausalLM, Qwen2MoeForSequenceClassification
import torch

from model.global_attn import Qwen2GAForCausalLM

torch.cuda.empty_cache()
# Load the checkpoint through the Auto* factory, on CUDA, with hidden states
# enabled, and switch to eval mode.
# NOTE(review): the annotation names the factory class; the repr printed below
# shows the loaded object is a Qwen2ForCausalLM.
net:AutoModelForCausalLM = AutoModelForCausalLM.from_pretrained(
    NAME, device_map="cuda", torch_dtype="auto", output_hidden_states=True
).eval()
tokenizer = AutoTokenizer.from_pretrained(NAME)

# Dump the module tree to confirm which architecture was loaded.
print(net)



Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [2]:
from peft import get_peft_model, GAConfig

# Wrap the loaded base model with the GA adapter using its default config.
# NOTE(review): GAConfig is imported from `peft` here — presumably a locally
# modified peft build; confirm the environment before re-running.
ga_config = GAConfig()
model = get_peft_model(net, ga_config)

Instantiating CustomAttention without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.


In [3]:
# Dump the PEFT-wrapped module tree to see where the adapter was inserted.
print(model)

PeftModel(
  (base_model): GlobalAttnModel(
    (model): Qwen2ForCausalLM(
      (model): gloabl_attn.GAModel(
        (base_model): Qwen2Model(
          (embed_tokens): Embedding(151936, 896)
          (layers): ModuleList(
            (0-23): 24 x Qwen2DecoderLayer(
              (self_attn): Qwen2SdpaAttention(
                (q_proj): Linear(in_features=896, out_features=896, bias=True)
                (k_proj): Linear(in_features=896, out_features=128, bias=True)
                (v_proj): Linear(in_features=896, out_features=128, bias=True)
                (o_proj): Linear(in_features=896, out_features=896, bias=False)
                (rotary_emb): Qwen2RotaryEmbedding()
              )
              (mlp): Qwen2MLP(
                (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
                (up_proj): Linear(in_features=896, out_features=4864, bias=False)
                (down_proj): Linear(in_features=4864, out_features=896, bias=False)
                

In [4]:
# Report trainable vs. total parameter counts for the wrapped model.
model.print_trainable_parameters()

trainable params: 1,607,424 || all params: 495,640,192 || trainable%: 0.3243


In [10]:
torch.cuda.empty_cache()
# Inference only: no gradients are tracked inside this block.
with torch.no_grad():
    # Inputs are placed on the base model's device (`net`), while generation
    # runs through the PEFT-wrapped `model`.
    model_inputs = tokenizer([text], return_tensors="pt").to(net.device)
    # Request attentions and a dict-style result in addition to the sequences.
    # NOTE(review): per the saved output, this call raised
    # AttributeError: 'GAModel' object has no attribute 'config'.
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=MAX, 
        output_attentions=True, 
        return_dict_in_generate=True,
    )

AttributeError: 'GAModel' object has no attribute 'config'

In [6]:
from transformers import AutoConfig

In [7]:
# Local directory of the base pretrained model. Raw string so the Windows
# backslashes are kept literally instead of being parsed as (invalid) escape
# sequences such as "\p" and "\Q".
NAME = r"E:\pretrained_models\Qwen\Qwen2___5-0___5B-Instruct"
# Load only the model configuration (no weights) from the NAME directory.
config = AutoConfig.from_pretrained(NAME)

In [8]:
# Inspect the pad token id stored in the config (no output was saved for this cell).
config.pad_token_id

In [11]:
# Resolve the base model class registered for config.model_type; the saved
# output shows it is Qwen2Model.
# NOTE(review): MODEL_MAPPING is not imported in any visible cell, and both
# `_load_attr_from_module` and `_model_mapping` are underscore-private names —
# confirm they exist in the installed transformers version.
MODEL_MAPPING._load_attr_from_module(
        config.model_type, MODEL_MAPPING._model_mapping[config.model_type]
    )

transformers.models.qwen2.modeling_qwen2.Qwen2Model

In [18]:
# get_ga_model returns two values; only the second (bound to `lm`) is kept.
_, lm = get_ga_model(NAME)

In [19]:
# Instantiate the GA model from the base checkpoint. Per the saved output, the
# global_attn q_proj/k_proj weights are not in the checkpoint and are newly
# initialized. The result is only displayed, not assigned.
lm.from_pretrained(NAME)

Some weights of GAForCausalLM were not initialized from the model checkpoint at E:\pretrained_models\Qwen\Qwen2___5-0___5B-Instruct and are newly initialized: ['model.global_attn.k_proj.bias', 'model.global_attn.k_proj.weight', 'model.global_attn.q_proj.bias', 'model.global_attn.q_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GAForCausalLM(
  (model): GAModel(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,),

In [14]:
import torch
# Random scratch tensor of shape (1, 2) for the indexing experiment.
a = torch.randn((1, 2))

In [17]:
# Indexing with one list per dimension picks element (0, 0) but keeps a
# length-1 dimension, hence the saved output torch.Size([1]).
a[[0], [0]].shape

torch.Size([1])

In [13]:
# Display the tensor.
# NOTE(review): the saved output is a 2x2 tensor, which does not match the
# (1, 2) tensor assigned above — the execution counts (13 here, 14 for the
# assignment) suggest this output predates the reassignment.
a

tensor([[-0.1471,  1.7123],
        [ 1.4042, -1.8336]])

In [None]:
from datasets import load_dataset

# NOTE(review): unfinished cell — load_dataset is called with no arguments and
# the cell has never been executed (In [None]). TODO: supply the dataset to load.
load_dataset(
    
)