In [1]:
import constraintlm as clm
import constraintlm.models.transformers
import constraintlm.sampling.sequence_sampling.smc
import constraintlm.sampling.sequence_sampling.multinomial
import constraintlm.sampling.token_sampling.tokensampler
import constraintlm.constraints.lengthword


import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "Qwen/Qwen2.5-0.5B" checkpoint through the project's TransformersLM class.
# The resulting object is used by the following cells via .tokenizer, .model and .logits(...).
qwenllm = clm.models.transformers.TransformersLM("Qwen/Qwen2.5-0.5B")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Some parameters are on the meta device because they were offloaded to the disk and cpu.


### Small intro to HF's transformers library

In [3]:
# Round trip: text -> token strings -> vocabulary ids -> token strings.
# `tokenize` returns a plain Python list of token strings, so no
# `return_tensors` argument is passed: that option only makes sense for calls
# that return encoded ids.
tokens = qwenllm.tokenizer.tokenize("This is a test, right? Like this.<|endoftext|>")
print(tokens, type(tokens))
ids = qwenllm.tokenizer.convert_tokens_to_ids(tokens)
print(ids)
# Converting the ids back should reproduce the token strings printed first.
re_tokens = qwenllm.tokenizer.convert_ids_to_tokens(ids)
print(re_tokens)

['This', 'Ġis', 'Ġa', 'Ġtest', ',', 'Ġright', '?', 'ĠLike', 'Ġthis', '.', '<|endoftext|>'] <class 'list'>
[1986, 374, 264, 1273, 11, 1290, 30, 8909, 419, 13, 151643]
['This', 'Ġis', 'Ġa', 'Ġtest', ',', 'Ġright', '?', 'ĠLike', 'Ġthis', '.', '<|endoftext|>']


In [4]:
# List every special token next to its id, then show the EOS, PAD and BOS
# token / id pairs, one pair per printed line.
print(qwenllm.tokenizer.all_special_tokens, qwenllm.tokenizer.all_special_ids)
for role in ("eos", "pad", "bos"):
    print(
        getattr(qwenllm.tokenizer, f"{role}_token"),
        getattr(qwenllm.tokenizer, f"{role}_token_id"),
    )

['<|endoftext|>', '<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>'] [151643, 151644, 151645, 151646, 151647, 151648, 151649, 151650, 151651, 151652, 151653, 151654, 151655, 151656]
<|endoftext|> 151643
<|endoftext|> 151643
None None


In [5]:
# Three sentences of different lengths. The first one ends with a literal
# '<|endoftext|>', which the tokenizer converts into eos_token_id.
sentences = [
    "Here is the first sentence.<|endoftext|>",
    "And the second sentence is here, and it is much longer",
    "Finally we can see the third one",
]

# Tokenize the sentences as a single batch.
batch_options = {
    "padding": True,         # pad every sentence up to the longest one in the batch
    "truncation": True,      # optional: cut off anything beyond max_length
    "max_length": 32,        # optional: upper bound on the tokenized length
    "return_tensors": "pt",  # return PyTorch tensors
}
batch = qwenllm.tokenizer(sentences, **batch_options)

The result below is surprising: in the first sentence of the batch, the '<|endoftext|>' that was already in the text gets the id eos_token_id, yet the attention_mask does not mask it. Because every tokenized sentence in a batch must have the same length, the tokenizer then appends extra '<|endoftext|>' tokens as padding. In other words, building the mask and padding with '<|endoftext|>' are two different things: the attention mask only covers the '<|endoftext|>' tokens that were added as padding, not the ones that were already there.

In [6]:
# Display the tokenizer output for the sentence batch (input_ids and attention_mask).
batch

{'input_ids': tensor([[  8420,    374,    279,   1156,  11652,     13, 151643, 151643, 151643,
         151643, 151643, 151643],
        [  3036,    279,   2086,  11652,    374,   1588,     11,    323,    432,
            374,   1753,   5021],
        [ 23949,    582,    646,   1490,    279,   4843,    825, 151643, 151643,
         151643, 151643, 151643]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}

In [7]:
# Decode only the first row of the batch; the [:1] slice keeps the leading
# batch dimension, so batch_decode still receives a batch of one sequence.
qwenllm.tokenizer.batch_decode(batch.input_ids[:1])

['Here is the first sentence.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>']

In [8]:
# Map the ids of the first row back to token strings, one entry per position
# (the padding positions are included).
qwenllm.tokenizer.convert_ids_to_tokens(batch.input_ids[0])

['Here',
 'Ġis',
 'Ġthe',
 'Ġfirst',
 'Ġsentence',
 '.',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>']

### About the vocab size of a HF transformers model

The tokenizer gives us two ways to obtain a vocab_size, and neither of them matches the shape of the logits.

In [9]:
# Full tokenizer vocabulary: mapping token string -> integer id.
# Its length equals len(tokenizer), i.e.
# tokenizer.vocab_size + len(tokenizer.added_tokens_encoder).
vocab_dict = qwenllm.tokenizer.get_vocab()
vocab_size = len(vocab_dict)

# Size reported by the tokenizer itself (the added tokens are not counted).
vocab_size_bis = qwenllm.tokenizer.vocab_size

# Size declared in the model configuration.
vocab_size_ter = qwenllm.model.config.vocab_size

# Sample one next token for a short prompt so the width of the logits can be
# compared with the three sizes above.
inputs = qwenllm.tokenizer("The best football player in the world is Leo", return_tensors="pt")
logits, _ = qwenllm.logits(inputs.input_ids)
probs = torch.softmax(logits, dim=-1, dtype=torch.float)
new_tok = torch.multinomial(probs, num_samples=1)

# One item per line through sep="\n". Embedding "\n" at the end of each
# argument made print() insert its default separator (a space) at the start of
# every following line.
print(
    f'Vocab_size: {vocab_size}',
    f'Vocab_size bis: {vocab_size_bis}',
    f'Difference: {vocab_size - vocab_size_bis}',
    f'Number of added tokens: {len(qwenllm.tokenizer.added_tokens_encoder)}',
    f'Number of special tokens: {len(qwenllm.tokenizer.all_special_tokens)}',
    sep="\n",
)
print(f'Vocab_size ter: {vocab_size_ter}')
# The vocabulary axis is the last one, the same axis the softmax runs over.
print(
    f'Number of outputted logits: {logits.shape[-1]}',
    f'New token {qwenllm.tokenizer.convert_ids_to_tokens(new_tok)}',
    sep="\n",
)

Vocab_size: 151665
 Vocab_size bis: 151643
 Difference: 22
 Number of added tokens: 22
 Number of special tokens: 14
Vocab_size ter: 151936
Number of outputted logits: 151936
 New token ['ĠMessi']


In [10]:
# Probe the boundaries of the tokenizer's id space. The ids are derived from
# the tokenizer sizes instead of being hard-coded, so the cell keeps working
# with another checkpoint.
# NOTE(review): assumes ids are contiguous from 0, which holds for the
# checkpoint loaded here (151643 base tokens, 151665 in total).
base_size = qwenllm.tokenizer.vocab_size  # number of base tokens
full_size = len(qwenllm.tokenizer)        # base tokens + added tokens

print(qwenllm.tokenizer.convert_ids_to_tokens(base_size - 1))  # last base token
print(qwenllm.tokenizer.convert_ids_to_tokens(full_size - 1))  # last base + added token
print(qwenllm.tokenizer.convert_ids_to_tokens(full_size))      # first id past the tokenizer: None

â½Ĺ
<|file_sep|>
None


In [11]:
# Check that the model's vocabulary size (the width of the logits) divides
# evenly by 8. The size is read from the model configuration rather than
# typed in by hand.
qwenllm.model.config.vocab_size / 8

18992.0

There are three vocab sizes:
* the base size (obtained via tokenizer.vocab_size)
* the full size, which contains the base size plus all added tokens, special tokens included (obtained via len(tokenizer.get_vocab()) or len(tokenizer))
* the embedding size, i.e. the logits/probs size: the length of the last vector output by the model's forward pass (it matches model.config.vocab_size)

For optimization purposes, the last one must be a multiple of 8 (151936 = 8*18992). 

Theoretically, we could sample a token_id at or beyond the tokenizer length, in which case the id is converted into None. (This is very unlikely, and even more so when using top_k / top_p.)


# Sequence Sampling Test

In [12]:
# Prompts of different lengths, tokenized together as one padded batch of
# PyTorch tensors.
# NOTE(review): the tokenizer's default padding side is used. The decoded
# prompts in this notebook show the pad tokens sitting between the shorter
# prompts and the sampled tokens -- TODO confirm whether left padding is
# intended for generation.
tests_sentences = [
    "La révolution française",
    "Aujourd'hui; maman est morte. Ou peut-être",
    "The best basketball player of all time is Michael",
]
tests101 = qwenllm.tokenizer(text=tests_sentences, return_tensors="pt", padding=True)

In [20]:
# Token-level sampler built with top_k=5; the decoding cells call its .sample(logits).
# NOTE(review): presumably sampling is restricted to the 5 most likely tokens -- confirm in TokenSampler.
multin_sampler = clm.sampling.token_sampling.tokensampler.TokenSampler(top_k=5)

In [21]:
# Manual decoding, first step: run the padded prompt batch through the model
# and sample one token per prompt.
input_ids = tests101.input_ids
attn_mask_0 = tests101.attention_mask

# --- t=0 ---
# The second return value is passed back to qwenllm.logits on later steps
# (presumably the key/value cache -- confirm in TransformersLM.logits).
next_token_logits, past_key_values = qwenllm.logits(input_ids, attn_mask_0)
# Extend the mask by one attended position for the token sampled below.
# new_ones keeps the mask's dtype and device. A bare torch.ones(...) is a
# float32 tensor on the default device, so torch.cat would promote the whole
# integer mask to float.
attn_mask = torch.cat([attn_mask_0, attn_mask_0.new_ones((attn_mask_0.shape[0], 1))], dim=-1)
new_ids = multin_sampler.sample(next_token_logits)

# Prompt followed by the first sampled token, decoded for inspection.
prompt_gen_ids = torch.cat([input_ids, new_ids], dim=-1)
print(qwenllm.tokenizer.batch_decode(prompt_gen_ids))
# --- end of t=0 ---

# Generated tokens only (no prompt); extended by the following decoding steps.
gen_ids = new_ids.clone()

['La révolution française<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Vous', "Aujourd'hui; maman est morte. Ou peut-être qu", 'The best basketball player of all time is Michael<|endoftext|><|endoftext|><|endoftext|><|endoftext|>Human']


In [22]:
# Three more decoding steps: feed only the tokens sampled at the previous step
# together with the value returned as past_key_values, then sample again.
for _ in range(3):
    next_token_logits, past_key_values = qwenllm.logits(new_ids, attn_mask, past_key_values)
    # Grow the mask by one attended position. new_ones preserves the mask's
    # dtype and device, whereas torch.ones(...) would add a float32 column on
    # the default device.
    attn_mask = torch.cat([attn_mask, attn_mask.new_ones((attn_mask.shape[0], 1))], dim=-1)
    new_ids = multin_sampler.sample(next_token_logits)

    # Prompt + everything generated so far.
    prompt_gen_ids = torch.cat([prompt_gen_ids, new_ids], dim=-1)
    print(qwenllm.tokenizer.batch_decode(prompt_gen_ids))

    # Generated tokens only.
    gen_ids = torch.cat([gen_ids, new_ids], dim=-1)
    print(qwenllm.tokenizer.batch_decode(gen_ids))


['La révolution française<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Vous êtes', "Aujourd'hui; maman est morte. Ou peut-être qu'il", 'The best basketball player of all time is Michael<|endoftext|><|endoftext|><|endoftext|><|endoftext|>Human beings']
['Vous êtes', " qu'il", 'Human beings']
['La révolution française<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Vous êtes à', "Aujourd'hui; maman est morte. Ou peut-être qu'il est", 'The best basketball player of all time is Michael<|endoftext|><|endoftext|><|endoftext|><|endoftext|>Human beings\n']
['Vous êtes à', " qu'il est", 'Human beings\n']
['La révolution française<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Vous êtes à la', "Aujourd'hui; maman est morte. Ou peut-être qu'il est mort", 'The best basketball player of all tim

In [25]:
# Same experiment through the library's sequence sampler: it is built from the
# model wrapper and the token sampler, then called with the prompt ids and 30.
# NOTE(review): 30 is presumably the number of tokens to generate, and no
# attention mask is passed here -- confirm how MultinomialSeqSampler handles
# the padded prompts.
sequence_sampler = clm.sampling.sequence_sampling.multinomial.MultinomialSeqSampler(
    qwenllm, multin_sampler
)
gen_ids = sequence_sampler.sample(input_ids, 30)
prompt_gen_ids = torch.cat((input_ids, gen_ids), dim=-1)

# Print the generated part first, then the prompt followed by the generation.
for id_batch in (gen_ids, prompt_gen_ids):
    print(qwenllm.tokenizer.batch_decode(id_batch))

['A: 1800-1830 - 1830\nA la suite de la chute des Habsbourg en', " plus. Je suis désolé.\nAujourd'hui; mon père est mort. Ou peut-être plus. Je suis désolé.\n", 'Find the answer to the following question, calculate the number of digits in the answer. The answer is 5.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>']
['La révolution française<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>A: 1800-1830 - 1830\nA la suite de la chute des Habsbourg en', "Aujourd'hui; maman est morte. Ou peut-être plus. Je suis désolé.\nAujourd'hui; mon père est mort. Ou peut-être plus. Je suis désolé.\n", 'The best basketball player of all time is Michael<|endoftext|><|endoftext|><|endoftext|><|endoftext|>Find the answer to the following question, calculate the number of digits in the answer. The answer is 5.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext

# SMC Sampling

In [26]:
# Build a LengthWord constraint from the model wrapper and the value 5.
# NOTE(review): presumably 5 is the target word length -- confirm in constraintlm.constraints.lengthword.
word5 = clm.constraints.lengthword.LengthWord(qwenllm, 5)

In [27]:
# Evaluate the constraint's prefix score on the padded prompt batch and print
# it (one value per prompt).
score = word5.prefix(tests101["input_ids"])
print(score)

tensor([-inf, -inf, -inf])


: 