In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [36]:
checkpoint = "Qwen/Qwen1.5-1.8B-Chat"
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

In [37]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).to(device)

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

The idea is to run some inference for:

(1) a single input
(2) a batched input

For each case, try setting `num_return_sequences` as 1 and as > 1.

Explore the model outputs.

In [39]:
input_single = "What is the capital of France?"

input_batch = [
    "What is the capital of France?",
    "What is the largest mammal?",
    "Who wrote 'To Kill a Mockingbird'?"
]

In [40]:
tokenizer.padding_side = "left"

In [41]:
# Single input, num_return_sequences=1

# General pipeline is tokenize -> generate -> decode -> parse
tokens_single = tokenizer(input_single, return_tensors="pt", return_attention_mask=True).to(device)

In [42]:
type(tokens_single)

transformers.tokenization_utils_base.BatchEncoding

In [43]:
tokens_single

{'input_ids': tensor([[3838,  374,  279, 6722,  315, 9625,   30]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]], device='mps:0')}

In [44]:
op_tokens_single = model.generate(
    **tokens_single,
    max_new_tokens=50,
    do_sample=True,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
op_tokens_single

  test_elements = torch.tensor(test_elements)


tensor([[ 3838,   374,   279,  6722,   315,  9625,    30, 12095,    13,  1084,
           374,   264, 12752,    11, 31592,    11,   323,  4948,  4126,   304,
          9625,    11,  3881,   369,  1181, 26277, 59924,  1741,   438,   279,
           468,  3092,   301, 21938,    11, 43464,  9420,   373, 56729,    11,
           323,   279,  9729, 48506, 16328,    13, 12095,  1083,   702,   264,
         52314, 18560,  6109,    11,   448,  1657, 42554]], device='mps:0')

In [45]:
op_single = tokenizer.batch_decode(op_tokens_single, skip_special_tokens=True)
op_single

['What is the capital of France? Paris. It is a cultural, artistic, and political center in France, known for its iconic landmarks such as the Eiffel Tower, Notre-Dame Cathedral, and the Louvre Museum. Paris also has a thriving arts scene, with many galleries']

In [46]:
print(op_single[0][len(input_single):].strip())

Paris. It is a cultural, artistic, and political center in France, known for its iconic landmarks such as the Eiffel Tower, Notre-Dame Cathedral, and the Louvre Museum. Paris also has a thriving arts scene, with many galleries


In [47]:
# Single input, num_return_sequences > 1
op_tokens_single_seq = model.generate(
    **tokens_single,
    max_new_tokens=50,
    do_sample=True,
    num_return_sequences=5,
    eos_token_id=tokenizer.eos_token_id,
)
op_tokens_single_seq

tensor([[  3838,    374,    279,   6722,    315,   9625,     30,  12095, 151643,
         111632,   3837,  35946, 112914, 105012,   1773,    220,   1304,   3979,
            563,  13970,     11,    358,    614,   9498,    311,  15401,    382,
             16,     13,   3085,    271,     17,     13,   1634,    271,     18,
             13,   1752,    271,     19,     13,   1913,    271,     20,     13,
           3216,  51461,    117,  16038,  31838,  89012,   9370, 109949,    330,
         111632,   3837,  35946],
        [  3838,    374,    279,   6722,    315,   9625,     30,  12095, 151643,
         116379, 100405,  99486, 104053,   3837,  77288, 116379, 102181,  34187,
          99744,  99623, 104330, 104053,   1773,  99424, 101047,  97639, 101942,
         101199,  46944, 106888,  99279,  29490,   5122, 100078,  34204,  99257,
           5373, 114791,   5373, 101064, 103816,  29826,   3837, 104019, 104987,
          99657,   5373,  99614,  33108, 104271,   3837,  91572, 107919, 10

In [48]:
op_single_seq = tokenizer.batch_decode(op_tokens_single_seq, skip_special_tokens=True)
op_single_seq

['What is the capital of France? Paris慢慢地，我学会了欣赏。  ____________ slowly, I have learned to appreciate.\n\n1. With\n\n2. As\n\n3. For\n\n4. On\n\n5. By 根据所给的句子 "慢慢地，我',
 'What is the capital of France? Paris活得简单就是快乐，但活得复杂了也不一定会有快乐。生活中的我们经常处于一个复杂的境地：忙于工作、家务、家庭琐事，还要照顾孩子、朋友和家人，同时又要追求事业上的成功',
 'What is the capital of France? Paris.实地考察法是历史学习的重要方法，某同学在法国进行了实地考察，他首先参观了卢浮宫，欣赏了《蒙娜丽莎》和《胜利女神像》，然后去感受巴黎圣母院的',
 'What is the capital of France? Paris. \n巴黎是法国的首都吗？ \n\n是的，巴黎是法国的首都。以下是详细的解释：\n\n巴黎（Paris）位于法国北部，塞纳河畔，是该国的经济、文化和政治中心。它拥有着',
 'What is the capital of France? Paris. #Paris #France不断地变化，不断发展的城市\n\n巴黎是法国的首都和最大城市，位于塞纳河右岸，东临德国、西濒英吉利海峡，南界地中海。这里是世界著名的文化、艺术']

In [49]:
[x[len(input_single):].strip() for x in op_single_seq]

['Paris慢慢地，我学会了欣赏。  ____________ slowly, I have learned to appreciate.\n\n1. With\n\n2. As\n\n3. For\n\n4. On\n\n5. By 根据所给的句子 "慢慢地，我',
 'Paris活得简单就是快乐，但活得复杂了也不一定会有快乐。生活中的我们经常处于一个复杂的境地：忙于工作、家务、家庭琐事，还要照顾孩子、朋友和家人，同时又要追求事业上的成功',
 'Paris.实地考察法是历史学习的重要方法，某同学在法国进行了实地考察，他首先参观了卢浮宫，欣赏了《蒙娜丽莎》和《胜利女神像》，然后去感受巴黎圣母院的',
 'Paris. \n巴黎是法国的首都吗？ \n\n是的，巴黎是法国的首都。以下是详细的解释：\n\n巴黎（Paris）位于法国北部，塞纳河畔，是该国的经济、文化和政治中心。它拥有着',
 'Paris. #Paris #France不断地变化，不断发展的城市\n\n巴黎是法国的首都和最大城市，位于塞纳河右岸，东临德国、西濒英吉利海峡，南界地中海。这里是世界著名的文化、艺术']

In [50]:
# Batch input, num_return_sequences=1
tokens_batch = tokenizer(input_batch, return_tensors="pt", return_attention_mask=True, padding=True).to(device)
tokens_batch

{'input_ids': tensor([[151643, 151643, 151643,   3838,    374,    279,   6722,    315,   9625,
             30],
        [151643, 151643, 151643,   3838,    374,    279,   7772,  34941,    278,
             30],
        [ 15191,   6139,    364,   1249,  26835,    264,  14563,    287,  22592,
          69990]], device='mps:0'), 'attention_mask': tensor([[0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='mps:0')}

In [51]:
op_tokens_batch = model.generate(
    **tokens_batch,
    max_new_tokens=50,
    do_sample=True,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
op_tokens_batch

tensor([[151643, 151643, 151643,   3838,    374,    279,   6722,    315,   9625,
             30,  12095,     13,  12095,    374,    264,   3283,   7407,    304,
          18172,   9625,    323,    374,   3881,    369,   1181,   9080,   3840,
             11,  26277,  59924,     11,    323,  32976,   7674,     13,    576,
           3283,    702,   1012,    279,   6722,    315,   9625,   2474,    279,
          12592,  48993,    323,    572,  18562,   9555,    438,   1741,    389,
           5768,    220,     17,     16,     11,    220],
        [151643, 151643, 151643,   3838,    374,    279,   7772,  34941,    278,
             30,    576,   7772,  34941,    278,    304,    279,   1879,    553,
           8123,    374,    279,   6303,  50019,    320,     33,   6053,    268,
           2912,   2416,   3091,  41349,    701,    892,    646,   3063,    705,
            311,    220,     18,     15,  20044,   1293,    323,  17529,    916,
            220,     17,     15,     15,  19608,   

In [52]:
op_batch = tokenizer.batch_decode(op_tokens_batch, skip_special_tokens=True)
op_batch

['What is the capital of France? Paris. Paris is a city located in northern France and is known for its rich history, iconic landmarks, and vibrant culture. The city has been the capital of France since the Middle Ages and was officially established as such on July 21, ',
 "What is the largest mammal? The largest mammal in the world by volume is the blue whale (Balaenoptera musculus), which can grow up to 30 meters long and weigh over 200 tons. Blue whales are found in all of Earth's",
 "Who wrote 'To Kill a Mockingbird'? Harper Lee.很好地描述了美国南北战争时期的种族歧视和不公平待遇。她以一名叫Jem的男孩和他的家庭为主线，讲述了他们在南方小镇上生活的故事。Jem是一个聪明、勇敢、善良的孩子，他的父亲因为"]

In [53]:
[x[len(input_batch[i]):].strip() for i, x in enumerate(op_batch)]

['Paris. Paris is a city located in northern France and is known for its rich history, iconic landmarks, and vibrant culture. The city has been the capital of France since the Middle Ages and was officially established as such on July 21,',
 "The largest mammal in the world by volume is the blue whale (Balaenoptera musculus), which can grow up to 30 meters long and weigh over 200 tons. Blue whales are found in all of Earth's",
 'Harper Lee.很好地描述了美国南北战争时期的种族歧视和不公平待遇。她以一名叫Jem的男孩和他的家庭为主线，讲述了他们在南方小镇上生活的故事。Jem是一个聪明、勇敢、善良的孩子，他的父亲因为']

In [54]:
# Batch input, num_return_sequences > 1
op_tokens_batch_seq = model.generate(
    **tokens_batch,
    max_new_tokens=50,
    do_sample=True,
    num_return_sequences=5,
    eos_token_id=tokenizer.eos_token_id,
)
op_tokens_batch_seq

  test_elements = torch.tensor(test_elements)


tensor([[151643, 151643, 151643,   3838,    374,    279,   6722,    315,   9625,
             30,  12095,     13,  12095,    374,    264,   6233,   3283,    304,
           9625,     11,   3881,    369,   1181,  26277,  59924,   1741,    438,
            279,    468,   3092,    301,  21938,     11,  43464,   9420,    373,
          56729,     11,    323,    279,   9729,  48506,  16328,     13,   1084,
            374,   1083,   2114,    311,   1657,  34409,  32000,    323,  32976,
          12752,  16065,     11,   3259,    432,    264],
        [151643, 151643, 151643,   3838,    374,    279,   6722,    315,   9625,
             30,  12095,     13, 151643,   7681,  23967,    429,    330,  59604,
              1,    374,    264,   3283,    323,    537,    264,   3146,     11,
            773,    279,   4226,    374,    330,  59604,   3263,  12095,    374,
            279,   6722,   3283,    315,   9625,     11,    892,    374,    264,
           3146,   7407,    304,  10867,   4505,   

In [55]:
op_tokens_batch_seq.shape

torch.Size([15, 60])

In [56]:
op_batch_seq = tokenizer.batch_decode(op_tokens_batch_seq, skip_special_tokens=True)
op_batch_seq

['What is the capital of France? Paris. Paris is a beautiful city in France, known for its iconic landmarks such as the Eiffel Tower, Notre-Dame Cathedral, and the Louvre Museum. It is also home to many charming neighborhoods and vibrant cultural scenes, making it a',
 'What is the capital of France? Paris. deduced that "Paris" is a city and not a country, so the answer is "Paris". Paris is the capital city of France, which is a country located in Western Europe. It is known for its rich history, iconic',
 'What is the capital of France? Paris. Is there a place in France where you can learn more about the French language and culture? Yes, there are many places in France where you can learn more about the French language and culture. Here are some popular options:\n\n1. French Language',
 'What is the capital of France? Paris. capital of France Paris. capital of France Paris. capital of France Paris. capital of France Paris. capital of France Paris. capital of France Paris. capital of F

In [None]:
batch_seq_responses = []

for i in range(len(input_batch)):
    input_text = input_batch[i]
    input_length = len(input_text)
    
    sequences_for_this_input = op_batch_seq[i*5:(i+1)*5]
    
    cleaned_sequences = [seq[input_length:].strip() for seq in sequences_for_this_input]
    
    batch_seq_responses.append(cleaned_sequences)

In [70]:
batch_seq_responses

[['Paris. Paris is a beautiful city in France, known for its iconic landmarks such as the Eiffel Tower, Notre-Dame Cathedral, and the Louvre Museum. It is also home to many charming neighborhoods and vibrant cultural scenes, making it a',
  'Paris. deduced that "Paris" is a city and not a country, so the answer is "Paris". Paris is the capital city of France, which is a country located in Western Europe. It is known for its rich history, iconic',
  'Paris. Is there a place in France where you can learn more about the French language and culture? Yes, there are many places in France where you can learn more about the French language and culture. Here are some popular options:\n\n1. French Language',
  'Paris. capital of France Paris. capital of France Paris. capital of France Paris. capital of France Paris. capital of France Paris. capital of France Paris. capital of France Paris. capital of France Paris. capital of France Paris. capital of France',
  'Paris, France.'],
 ['The largest m

In [83]:
# Generation with chat template

def format_prompt(inputs: list) -> list[list[str]]:
    formatted = []
    for i in inputs:
        formatted.append([
            {
                "role": "system",
                "content": "You are brief and concise in your answers. If an answer can be a single word, it should be."
            },
            {
                "role": "user",
                "content": i
            }
        ])
    return formatted

In [92]:
input_batch_chat = tokenizer.apply_chat_template(
    format_prompt(input_batch),
    add_generation_prompt=True,
    tokenize=False
)
input_batch_chat

['<|im_start|>system\nYou are brief and concise in your answers. If an answer can be a single word, it should be.<|im_end|>\n<|im_start|>user\nWhat is the capital of France?<|im_end|>\n<|im_start|>assistant\n',
 '<|im_start|>system\nYou are brief and concise in your answers. If an answer can be a single word, it should be.<|im_end|>\n<|im_start|>user\nWhat is the largest mammal?<|im_end|>\n<|im_start|>assistant\n',
 "<|im_start|>system\nYou are brief and concise in your answers. If an answer can be a single word, it should be.<|im_end|>\n<|im_start|>user\nWho wrote 'To Kill a Mockingbird'?<|im_end|>\n<|im_start|>assistant\n"]

In [93]:
tokens_batch_chat = tokenizer(input_batch_chat, return_tensors="pt", return_attention_mask=True, padding=True).to(device)
tokens_batch_chat

{'input_ids': tensor([[151643, 151643, 151643, 151644,   8948,    198,   2610,    525,   9814,
            323,  63594,    304,    697,  11253,     13,   1416,    458,   4226,
            646,    387,    264,   3175,   3409,     11,    432,   1265,    387,
             13, 151645,    198, 151644,    872,    198,   3838,    374,    279,
           6722,    315,   9625,     30, 151645,    198, 151644,  77091,    198],
        [151643, 151643, 151643, 151644,   8948,    198,   2610,    525,   9814,
            323,  63594,    304,    697,  11253,     13,   1416,    458,   4226,
            646,    387,    264,   3175,   3409,     11,    432,   1265,    387,
             13, 151645,    198, 151644,    872,    198,   3838,    374,    279,
           7772,  34941,    278,     30, 151645,    198, 151644,  77091,    198],
        [151644,   8948,    198,   2610,    525,   9814,    323,  63594,    304,
            697,  11253,     13,   1416,    458,   4226,    646,    387,    264,
           3

In [96]:
op_tokens_batch_chat = model.generate(
    **tokens_batch_chat,
    max_new_tokens=50,
    do_sample=True,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
op_tokens_batch_chat

  test_elements = torch.tensor(test_elements)


tensor([[151643, 151643, 151643, 151644,   8948,    198,   2610,    525,   9814,
            323,  63594,    304,    697,  11253,     13,   1416,    458,   4226,
            646,    387,    264,   3175,   3409,     11,    432,   1265,    387,
             13, 151645,    198, 151644,    872,    198,   3838,    374,    279,
           6722,    315,   9625,     30, 151645,    198, 151644,  77091,    198,
          59604, 151645, 151643, 151643],
        [151643, 151643, 151643, 151644,   8948,    198,   2610,    525,   9814,
            323,  63594,    304,    697,  11253,     13,   1416,    458,   4226,
            646,    387,    264,   3175,   3409,     11,    432,   1265,    387,
             13, 151645,    198, 151644,    872,    198,   3838,    374,    279,
           7772,  34941,    278,     30, 151645,    198, 151644,  77091,    198,
           1639,   1574,     13, 151645],
        [151644,   8948,    198,   2610,    525,   9814,    323,  63594,    304,
            697,  11253, 

In [97]:
op_batch_chat = tokenizer.batch_decode(op_tokens_batch_chat, skip_special_tokens=True)
op_batch_chat

['system\nYou are brief and concise in your answers. If an answer can be a single word, it should be.\nuser\nWhat is the capital of France?\nassistant\nParis',
 'system\nYou are brief and concise in your answers. If an answer can be a single word, it should be.\nuser\nWhat is the largest mammal?\nassistant\nWhale.',
 "system\nYou are brief and concise in your answers. If an answer can be a single word, it should be.\nuser\nWho wrote 'To Kill a Mockingbird'?\nassistant\nHarper Lee"]

In [106]:
import re
for i in op_batch_chat:
    # Extract everything after the last assistant response
    match = re.search(r'assistant\n(.*)', i)
    if match:
        response = match.group(1).strip()
        print(response)
    else:
        print("No response found.")

Paris
Whale.
Harper Lee
