In [1]:
import constraintlm as clm

import torch

  from .autonotebook import tqdm as notebook_tqdm


# ConstraintLM tutorial

### Define a LM from your favorite package (`transformers`, `vllm`)

In [2]:
qwenllm = clm.TransformersLM("Qwen/Qwen2.5-0.5B")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


#### Problem about masking

In [3]:
print(len(qwenllm.tokenizer.get_vocab()), qwenllm.vocab_size)

151665 151665


In [4]:
print(qwenllm.tokenizer.eos_token_id, qwenllm.tokenizer.pad_token_id)
print(qwenllm.model.config.eos_token_id, qwenllm.model.config.pad_token_id)

151643 151643
151643 151643


In [5]:
prompt = "Ludwig Wittgenstein was a<|endoftext|>"
input_ids = qwenllm.tokenizer(prompt, return_tensors="pt").input_ids
print(input_ids)

tensor([[    43,    661,  36922,  82176,   4370,  12429,    572,    264, 151643]])


In [69]:
numo = 11891
print(qwenllm.tokenizer.decode(numo))
for c in qwenllm.tokenizer.decode(numo):
    print(c)

 nos
 
n
o
s


In [7]:
print(input_ids[0])

tensor([    43,    661,  36922,  82176,   4370,  12429,    572,    264, 151643])


In [8]:
type(qwenllm.tokenizer.batch_decode(torch.tensor([14])))

list

In [9]:
qwenllm.model.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [10]:
outputs = qwenllm.model.generate(input_ids, do_sample=True, max_new_tokens=5)
qwenllm.tokenizer.batch_decode(outputs, skip_special_tokens=False)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['Ludwig Wittgenstein was a<|endoftext|>A. a man of']

In [11]:
attn_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0]])
output_logits = qwenllm.model(input_ids, attention_mask=attn_mask)
output_logits.logits[...,-2,:]

tensor([[ 4.5312,  3.4844,  1.3984,  ..., -6.0938, -6.0938, -6.0938]],
       dtype=torch.bfloat16, grad_fn=<SliceBackward0>)

In [12]:
prompt2 = "Ludwig Wittgenstein was a"
input_ids2 = qwenllm.tokenizer(prompt2, return_tensors="pt").input_ids
print(input_ids2)
output_logits2 = qwenllm.model(input_ids2)
print(output_logits2.logits[...,-1,:])

tensor([[   43,   661, 36922, 82176,  4370, 12429,   572,   264]])
tensor([[ 4.4688,  3.4219,  1.4062,  ..., -6.0625, -6.0625, -6.0625]],
       dtype=torch.bfloat16, grad_fn=<SliceBackward0>)


In [13]:
diff = (output_logits.logits[:, -2,:] - output_logits2.logits[:, -1,:]).abs().max()
print("Max abs diff:", diff)  # should be extremely close to 0

Max abs diff: tensor(0.2500, dtype=torch.bfloat16, grad_fn=<MaxBackward1>)


In [14]:
print(output_logits.logits[:, -2,:])
print(qwenllm.logits(input_ids, attn_mask))
print(output_logits2.logits[:, -1,:])
print(qwenllm.logits(input_ids2))

tensor([[ 4.5312,  3.4844,  1.3984,  ..., -6.0938, -6.0938, -6.0938]],
       dtype=torch.bfloat16, grad_fn=<SliceBackward0>)
(tensor([[11.0000, 11.8750, 15.6875,  ..., -4.2812, -4.2812, -4.2812]],
       dtype=torch.bfloat16), <transformers.cache_utils.DynamicCache object at 0x000002B5CB867790>)
tensor([[ 4.4688,  3.4219,  1.4062,  ..., -6.0625, -6.0625, -6.0625]],
       dtype=torch.bfloat16, grad_fn=<SliceBackward0>)
(tensor([[ 4.4688,  3.4219,  1.4062,  ..., -6.0625, -6.0625, -6.0625]],
       dtype=torch.bfloat16), <transformers.cache_utils.DynamicCache object at 0x000002B5F6C14110>)


## Guide

In [3]:
prompts = [
    "In July 1789 the French", 
    "The best basketball player of all time is Michael",
    "Ludwig Wittgenstein was a"
]

In [4]:
batch = qwenllm.tokenizer(prompts, padding=True, return_tensors="pt")
print(batch.input_ids, batch.attention_mask)

tensor([[   641,   5768,    220,     16,     22,     23,     24,    279,   8585],
        [   785,   1850,  19240,   2781,    315,    678,    882,    374,   7937],
        [    43,    661,  36922,  82176,   4370,  12429,    572,    264, 151643]]) tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0]])


In [5]:
logits, past_key_values = qwenllm.logits(batch.input_ids, attention_mask=batch.attention_mask)
print(logits, logits.shape)

tensor([[ 4.9688,  3.9844,  0.9844,  ..., -4.0625, -4.0625, -4.0625],
        [ 5.6250,  1.3125,  0.1367,  ..., -8.3125, -8.3125, -8.3125],
        [11.0000, 11.8750, 15.6875,  ..., -4.2812, -4.2812, -4.2812]],
       dtype=torch.bfloat16) torch.Size([3, 151936])


#### It works as usual:

You can access the logits of the next token...

... and sample from the logits.

In [6]:
probs = torch.softmax(logits, dim=-1)
next_token_ids = qwenllm.sample(probs, top_k=5)
print(qwenllm.tokenizer.batch_decode(next_token_ids))
print(qwenllm.tokenizer.batch_decode(torch.cat([batch.input_ids, next_token_ids], dim=-1)))

[' Parliament', ' Jordan', 'Human']
['In July 1789 the French Parliament', 'The best basketball player of all time is Michael Jordan', 'Ludwig Wittgenstein was a<|endoftext|>Human']


The `sample()` function allows you to sample the next token with a vast variety of techniques, by specifying the desired ones in the args.

To sample whole sequences, you need to define a `SequenceSampler`.

In [7]:
multinomial = clm.MultinomialSeqSampler(qwenllm)
generated_token_ids = multinomial.sample(batch.input_ids, 10, top_k = 5)
print(qwenllm.tokenizer.batch_decode(torch.cat([batch.input_ids, generated_token_ids], dim=-1)))

1
2
3
4
5
6
7
8
9
['In July 1789 the French Parliament approved a bill that established a new constitution.', 'The best basketball player of all time is Michael Jordan.\nMichael Jordan is the best known basketball player', "Ludwig Wittgenstein was a<|endoftext|>Let's be clear: Wittgenstein's ideas"]


### The concept of Constraint
It allows you to constrain (control) the way the LM generate tokens.

First, instantiate a `Constraint`

In [8]:
word5 = clm.LengthWord(qwenllm, 5)

This allows you to apply a constraint on the logits outputted by your LM...

In [21]:
probs = torch.softmax(logits, dim=-1)
constrained_probs, _ = word5.apply(input_ids=torch.tensor([]), probs=probs) # for the 1st generated token, set input_ids as an empty tensor or None

In [22]:
print(probs)
print(constrained_probs) #as we can see it is not the same as 

tensor([[2.1905e-06, 8.1956e-07, 4.0745e-08,  ..., 2.6193e-10, 2.6193e-10,
         2.6193e-10],
        [4.1723e-06, 5.5647e-08, 1.7229e-08,  ..., 3.6948e-12, 3.6948e-12,
         3.6948e-12],
        [3.0708e-04, 7.4005e-04, 3.3447e-02,  ..., 7.0941e-11, 7.0941e-11,
         7.0941e-11]], dtype=torch.bfloat16)
tensor([[1.1504e-05, 4.2915e-06, 2.1420e-07,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [4.2915e-04, 5.7220e-06, 1.7732e-06,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [3.3569e-04, 8.0872e-04, 3.6621e-02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]], dtype=torch.bfloat16)


... and sample accordingly to this constraint (the generated tokens don't exceed 5 letters)... 

In [23]:
next_const_token_ids = qwenllm.sample(constrained_probs)
print(qwenllm.tokenizer.batch_decode(next_const_token_ids))
print(qwenllm.tokenizer.batch_decode(torch.cat([batch.input_ids, next_const_token_ids], dim=-1)))

[' colon', ' Al', 'a']
['In July 1789 the French colon', 'The best basketball player of all time is Michael Al', 'Ludwig Wittgenstein was a<|endoftext|>a']


... or to sample whole sequences according to this constraint:

In [24]:
cons_multinomial = clm.MultinomialSeqSampler(qwenllm, constraint=word5)
cons_generated_token_ids = cons_multinomial.sample(batch.input_ids, max_length=4, top_k=5)
print(qwenllm.tokenizer.batch_decode(torch.cat([batch.input_ids, cons_generated_token_ids], dim=-1)))

['In July 1789 the French army under the Duke', 'The best basketball player of all time is Michael Dwyer\n', 'Ludwig Wittgenstein was a<|endoftext|>Human Logic and Lingu']


Remark: The sentence w/ L. Wittgenstein is padded by one eos_token (same as pad_token), and it performs consistently worse than the others by printing weird tokens after the eos_token. See answer above.

This Sequence Sampling technique, which consists in sampling tokens that respect the constraint one by one, called Locally Constraint decoding (LCD), is a little bit naive. Indeed, the sequences we are sampling don't follow the target distribution (distribution of sequences generated by the LMs and that respect the constraint): $\frac{p(x) \Phi(x)}{\sum p(x) \Phi(x)}$, where $\Phi(x)$ is the non-negative value assigned by the constraint $\Phi$ to the  sentence $x$.

We can try to approximate this target distribution using Sequential Monte Carlo methods.

# The SMC Sampler

In [22]:
smc_sampler = clm.SMCSampler(qwenllm, word5)
smc_generated_token_ids = smc_sampler.sample(batch.input_ids, max_length=5, num_particles=2, ess_threshold=1/2)

In [25]:
smc_generated_token_ids[0]

tensor([[[ 1033,   911,   311,  3164,   320],
         [13390,    11, 62857,   553,  2799]],

        [[ 7801, 15640,    13,  1260,   572],
         [97138,   702,   264,  2244,  2487]],

        [[12895,   279,  3719,    71,  5235],
         [33975, 13962,   374,   279,  1852]]])

In [27]:
print(qwenllm.tokenizer.batch_decode(smc_generated_token_ids[0].reshape(6,5)))

[' were about to win (', ' army, aided by techn', ' James Jr. He was', ' Kidd has a great body', 'Could the Subh Black', 'Human evil is the same']


In [28]:
smc_generated_token_ids[1]

tensor([[-3.1885, -3.4590],
        [-5.6890, -6.9258],
        [-6.4526, -4.8926]])

# FSM Constraints

#### Regex
Definition

These 14 characters are special unless back-slashed: .^$*+?{}[]\|()

. : any character except newline

\* : 0 or more

\+ : 1 or more

? : 0 or 1

{4} : exactly 4

{4,6} : between 4 and 6 (inclusive)

{4,} : 4 or more

[abc] : “a”, “b”, or “c”

[a-zA-Z0-9_] : any of those

[^…] : negated set

\ : if it is \\. (or \\?, \\$, ...) it means a literal . (or ?, $, ...), but with non special characters, 

\d : digit [0-9] (Unicode digits if UNICODE)

\D : not a digit

\w : word char [a-zA-Z0-9_]

\W : not word

\s : any whitespace ([ \t\n\r\f\v], plus other Unicode spaces)

\S : not whitespace

# Constraint using FSM


In [6]:
class LenWord(clm.FSMConstraint):
    def __init__(self, llm, fsm):
        super().__init__(llm, fsm)

    def prefix():
        pass
    def complete():
        pass

    def score(self, input_ids):
        """
        Given the token IDs generated so far (input_ids), 
        return the score associated by the constraint.
        """
        if input_ids is None or input_ids.numel() == 0:
            return 1
        # Flatten batch dimensions
        batch_shape = input_ids.shape[:-1]
        seq_length = input_ids.shape[-1]
        flat_ids = input_ids.view(-1, seq_length)
        
        # Decode each sequence to text
        #Big issue here, a EOS_token such as <|endoftext|> will count as a long word
        texts = self.model.tokenizer.batch_decode(flat_ids, skip_special_tokens=True)
        
        # Determine penalty per sequence
        scores = []
        for text in texts:
            # Split on whitespace to get words
            words = text.split()
            # Check if any word is too long
            if any(len(word) > self.N for word in words):
                scores.append(0.0)
            else:
                scores.append(1.0)
        
        # Convert to tensor and reshape to original batch shape
        scores = torch.tensor(scores, dtype=torch.float, device=input_ids.device)
        return scores.view(batch_shape)

In [7]:
wordlen5 = LenWord(qwenllm, clm.FiniteStateMachine.from_regex("\s([A-Za-z0-9]{1,5}[.!?,]?\s)+").to_dfa())
wordlen5.create_hash_tables()

[(IN, [(CATEGORY, CATEGORY_SPACE)]), (MAX_REPEAT, (1, MAXREPEAT, [(SUBPATTERN, (1, 0, 0, [(MAX_REPEAT, (1, 5, [(IN, [(RANGE, (65, 90)), (RANGE, (97, 122)), (RANGE, (48, 57))])])), (MAX_REPEAT, (0, 1, [(IN, [(LITERAL, 46), (LITERAL, 33), (LITERAL, 63), (LITERAL, 44)])])), (IN, [(CATEGORY, CATEGORY_SPACE)])]))]))]
{(0, '\r'): {1}, (0, ' '): {1}, (0, '\x0c'): {1}, (0, '\t'): {1}, (0, '\n'): {1}, (0, '\x0b'): {1}, (1, ''): {34}, (34, ''): {4, 6, 10, 16, 24}, (4, 'j'): {5}, (4, '8'): {5}, (4, 'Q'): {5}, (4, 'E'): {5}, (4, 'i'): {5}, (4, 'x'): {5}, (4, 'X'): {5}, (4, 'e'): {5}, (4, 'D'): {5}, (4, 'G'): {5}, (4, 'U'): {5}, (4, 'A'): {5}, (4, 'r'): {5}, (4, 'a'): {5}, (4, '3'): {5}, (4, 'N'): {5}, (4, 's'): {5}, (4, 'h'): {5}, (4, 'c'): {5}, (4, 'L'): {5}, (4, 'y'): {5}, (4, 'l'): {5}, (4, '9'): {5}, (4, 'u'): {5}, (4, 'o'): {5}, (4, 'f'): {5}, (4, 'R'): {5}, (4, 'Z'): {5}, (4, '5'): {5}, (4, '2'): {5}, (4, 'm'): {5}, (4, 'P'): {5}, (4, 'k'): {5}, (4, 'S'): {5}, (4, 'K'): {5}, (4, 't'): {5}, (

This allows you to apply a constraint on the logits outputted by your LM...

In [8]:
probs = torch.softmax(logits, dim=-1)
constrained_probs, _ = wordlen5.apply(input_ids=torch.tensor([]), probs=probs) # for the 1st generated token, set input_ids as an empty tensor or None

... and sample accordingly to this constraint (the generated tokens don't exceed 5 letters)... 

In [9]:
next_const_token_ids = qwenllm.sample(constrained_probs)
print(qwenllm.tokenizer.batch_decode(next_const_token_ids))
print(qwenllm.tokenizer.batch_decode(torch.cat([batch.input_ids, next_const_token_ids], dim=-1)))

[' Prime', ' Snow', ' Find']
['In July 1789 the French Prime', 'The best basketball player of all time is Michael Snow', 'Ludwig Wittgenstein was a<|endoftext|> Find']


... or to sample whole sequences according to this constraint:

In [10]:
cons_multinomial = clm.MultinomialSeqSampler(qwenllm, constraint=wordlen5)
cons_generated_token_ids = cons_multinomial.sample(batch.input_ids, max_length=10, top_k=5)
print(qwenllm.tokenizer.batch_decode(torch.cat([batch.input_ids, cons_generated_token_ids], dim=-1)))

1
2
3
4
5
6
7
8
9
['In July 1789 the French and the other West Asian ports of the world were', 'The best basketball player of all time is Michael Redd.\nHe was born in 19', 'Ludwig Wittgenstein was a<|endoftext|> Given the above text, we can infer that the']


In [11]:
wordlen5.current_state = wordlen5.fsm.start_state       # We need to re-initialize the current state after a generation
smc_sampler_fsm = clm.SMCSampler(qwenllm, wordlen5)
smc_generated_token_ids_fsm = smc_sampler_fsm.sample(batch.input_ids, max_length=10, num_particles=3, ess_threshold=1/2)

ESS always is >= 1, particles won't be resampled.
1
2
3
4
5
6
7
8
9


In [12]:
print([a+b for a,b in zip(qwenllm.tokenizer.batch_decode(batch.input_ids.repeat_interleave(3, dim=0)), qwenllm.tokenizer.batch_decode(smc_generated_token_ids_fsm[0].reshape(3*3,10)))])

['In July 1789 the French were on the verge of rela Iraq of the', 'In July 1789 the French were fired upon by the Hugue Borde', 'In July 1789 the French clerk Mr. Ferry, a Spitz von', 'The best basketball player of all time is Michael J. Midk flex, and one of them', 'The best basketball player of all time is Michael Scott of the NBA.\nA. Both B.', 'The best basketball player of all time is Michael Cox. Cox is the man who has made most', 'Ludwig Wittgenstein was a<|endoftext|> Parts of Pi. He had one of the great', 'Ludwig Wittgenstein was a<|endoftext|>\tDream of a While\nA. Lucky\n', 'Ludwig Wittgenstein was a<|endoftext|> Let. He was quite uncon. Can you give']


add validate() function ------------------- !!!!!!!!!!!!!!!

# Chaining constraints

In [13]:
class Digits(clm.FSMConstraint):
    def __init__(self, llm, fsm):
        super().__init__(llm, fsm)

    def prefix():
        pass
    def complete():
        pass

    def score(self, input_ids):
        """
        Given the token IDs generated so far (input_ids), 
        return the score associated by the constraint.
        """
        if input_ids is None or input_ids.numel() == 0:
            return 1
        # Flatten batch dimensions
        batch_shape = input_ids.shape[:-1]
        seq_length = input_ids.shape[-1]
        flat_ids = input_ids.view(-1, seq_length)

        texts = self.model.tokenizer.batch_decode(input_ids, skip_special_tokens=True)

        allowed = set("0123456789+-*/ ")

        scores = []
        for text in texts:
            if all(c in allowed for c in text):
                scores.append(1.0)
            else:
                scores.append(0.0)
        scores = torch.tensor(scores, dtype=torch.float, device=input_ids.device)

        return scores.view(batch_shape)

In [14]:
digits = Digits(qwenllm, clm.FiniteStateMachine.from_regex("(\d+|[ +\-*/])+").to_dfa())
digits.create_hash_tables()

[(MAX_REPEAT, (1, MAXREPEAT, [(SUBPATTERN, (1, 0, 0, [(BRANCH, (None, [[(MAX_REPEAT, (1, MAXREPEAT, [(IN, [(CATEGORY, CATEGORY_DIGIT)])]))], [(IN, [(LITERAL, 32), (LITERAL, 43), (LITERAL, 45), (LITERAL, 42), (LITERAL, 47)])]]))]))]))]
{(6, ''): {0, 4}, (0, '8'): {1}, (0, '3'): {1}, (0, '5'): {1}, (0, '7'): {1}, (0, '2'): {1}, (0, '4'): {1}, (0, '1'): {1}, (0, '6'): {1}, (0, '0'): {1}, (0, '9'): {1}, (1, ''): {0, 3}, (2, ''): {0, 3}, (3, ''): {7}, (4, '*'): {5}, (4, ' '): {5}, (4, '+'): {5}, (4, '-'): {5}, (4, '/'): {5}, (5, ''): {7}, (7, ''): {9, 6}, (8, ''): {9, 6}}
defaultdict(<class 'set'>, {(0, '8'): {1}, (0, '3'): {1}, (0, '*'): {2}, (0, ' '): {2}, (0, '5'): {1}, (0, '7'): {1}, (0, '2'): {1}, (0, '+'): {2}, (0, '4'): {1}, (0, '-'): {2}, (0, '1'): {1}, (0, '6'): {1}, (0, '0'): {1}, (0, '/'): {2}, (0, '9'): {1}, (1, '8'): {1}, (1, '3'): {1}, (1, '*'): {2}, (1, ' '): {2}, (1, '5'): {1}, (1, '7'): {1}, (1, '2'): {1}, (1, '+'): {2}, (1, '4'): {1}, (1, '-'): {2}, (1, '1'): {1}, (1, '6')

In [15]:
rpnc = clm.RPNConstraint(qwenllm)

In [16]:
prompts_rpn = [
    "Given an arithmetic expression in standard infix notation, it is possible to convert it to Reverse Polish Notation (RPN).\n\nExample 1:\nInput: (3 + 4) * 5\nOutput: 3 4 + 5 *\n\nExample 2:\nInput: 7 - (2 + 3) * 4\nOutput: 7 2 3 + 4 * -\n\nExample 3:\nInput: (8 / 2) + (3 * (4 - 1))\nOutput:", 
    "Given an arithmetic expression in standard infix notation, it is possible to convert it to Reverse Polish Notation (RPN).\n\nExample 1:\nInput: (3 + 4) * 5\nOutput: 3 4 + 5 *\n\nExample 2:\nInput: 7 - (2 + 3) * 4\nOutput: 7 2 3 + 4 * -\n\nExample 3:\nInput: (3 + 4) * 5 − 6 / (1 + 2)\nOutput:", 
    "Given an arithmetic expression in standard infix notation, it is possible to convert it to Reverse Polish Notation (RPN).\n\nExample 1:\nInput: (3 + 4) * 5\nOutput: 3 4 + 5 *\n\nExample 2:\nInput: 7 - (2 + 3) * 4\nOutput: 7 2 3 + 4 * -\n\nExample 3:\nInput: (6 + 2) * 3 − 4\nOutput:", 
]
batch_rpn = qwenllm.tokenizer(prompts_rpn, padding=True, return_tensors="pt")

In [17]:
smc_sampler_chain = clm.SMCSampler(qwenllm, digits, rpnc)
smc_generated_token_ids_chain = smc_sampler_chain.sample(batch_rpn.input_ids, max_length=10, num_particles=3, ess_threshold=2)

None
1
tensor([[21],
        [17],
        [16],
        [18],
        [18],
        [18],
        [19],
        [16],
        [16]])
2
tensor([[ 21, 220],
        [ 17, 220],
        [ 16, 220],
        [ 18, 220],
        [ 18, 220],
        [ 18, 220],
        [ 19,  21],
        [ 16, 220],
        [ 16, 220]])
3
tensor([[ 21, 220,  23],
        [ 17, 220,  18],
        [ 16, 220,  17],
        [ 18, 220,  19],
        [ 18, 220,  19],
        [ 18, 220,  19],
        [ 19,  21,  15],
        [ 16, 220,  18],
        [ 16, 220,  20]])
4
tensor([[ 21, 220,  23, 481],
        [ 17, 220,  18, 220],
        [ 16, 220,  17, 353],
        [ 18, 220,  19, 488],
        [ 18, 220,  19, 488],
        [ 18, 220,  19, 488],
        [ 19,  21,  15,  20],
        [ 16, 220,  18, 220],
        [ 16, 220,  20, 488]])
5
tensor([[ 21, 220,  23, 481, 220],
        [ 17, 220,  18, 220,  19],
        [ 16, 220,  17, 353, 220],
        [ 18, 220,  19, 488, 220],
        [ 18, 220,  19, 488, 220],
     

In [18]:
for output in [a+b for a,b in zip(qwenllm.tokenizer.batch_decode(batch_rpn.input_ids.repeat_interleave(3, dim=0)), qwenllm.tokenizer.batch_decode(smc_generated_token_ids_chain[0].reshape(3*3,10)))]:
    print("------------------------")
    print(output)

------------------------
Given an arithmetic expression in standard infix notation, it is possible to convert it to Reverse Polish Notation (RPN).

Example 1:
Input: (3 + 4) * 5
Output: 3 4 + 5 *

Example 2:
Input: 7 - (2 + 3) * 4
Output: 7 2 3 + 4 * -

Example 3:
Input: (8 / 2) + (3 * (4 - 1))
Output:<|endoftext|><|endoftext|><|endoftext|>6 8 - 2 * 3 *
------------------------
Given an arithmetic expression in standard infix notation, it is possible to convert it to Reverse Polish Notation (RPN).

Example 1:
Input: (3 + 4) * 5
Output: 3 4 + 5 *

Example 2:
Input: 7 - (2 + 3) * 4
Output: 7 2 3 + 4 * -

Example 3:
Input: (8 / 2) + (3 * (4 - 1))
Output:<|endoftext|><|endoftext|><|endoftext|>2 3 4 - 1 + -
------------------------
Given an arithmetic expression in standard infix notation, it is possible to convert it to Reverse Polish Notation (RPN).

Example 1:
Input: (3 + 4) * 5
Output: 3 4 + 5 *

Example 2:
Input: 7 - (2 + 3) * 4
Output: 7 2 3 + 4 * -

Example 3:
Input: (8 / 2) + (3 * (

#### Next steps

GPT-2 \\
Model from scratch \\
Generateur aléatoire \\

Few shot \\
SFT \\


(SFT might be enough for translation of RPN expressions, but if the task is: given a number int, find an RPN expression whose result is this number, we need RL)

not(e) -> len<20 : problem

Comparer proba des phrases générées pour LenWord5 avec et sans SMC (greedy).

regarder temps exec smc w/ num_particles

RPN : input: an int. We want the model to create a RPN expression that is equal to input.

Des C et des L : input: int, list(int). output: expression using list(int) equal to int.

b ? e : if b then e \
b: bool
e: int

RPN: \d+-*/ T F et ou ?    ->    stack     -> 

0th step:
\d+-*/ =  (with fixed name of variable)  ->     stack (we need to modify the FSM)      ->     typing     -> (end of SMC)    eval



then same, But I am not allowed to use a variable if it was not initialized before 

1st step:
\d+-*/ =    ->     stack (we need to modify the FSM)      ->     typing     -> (end of SMC)    eval

# RPN Typed

"(\d+|[ +\-\*/]|[a-zA-Z][a-zA-Z0-9]*)+" (lexical constraint)

 -------> 
 
 number of digit/variables >= 2 (syntactic constraint),    
 type-checking in the stack (semantic constraint: type checking, declaration checking)    
 
 -------->     end




In [19]:
class DigitsVars(clm.FSMConstraint):
    def __init__(self, llm, fsm):
        super().__init__(llm, fsm)

    def prefix():
        pass
    def complete():
        pass

    def score(self, input_ids):
        """
        Given the token IDs generated so far (input_ids), 
        return the score associated by the constraint.
        """
        if input_ids is None or input_ids.numel() == 0:
            return 1
        # Flatten batch dimensions
        batch_shape = input_ids.shape[:-1]
        seq_length = input_ids.shape[-1]
        flat_ids = input_ids.view(-1, seq_length)

        texts = self.model.tokenizer.batch_decode(input_ids, skip_special_tokens=True)

        allowed = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+-*/ ")

        scores = []
        for text in texts:
            if all(c in allowed for c in text):
                scores.append(1.0)
            else:
                scores.append(0.0)
        scores = torch.tensor(scores, dtype=torch.float, device=input_ids.device)

        return scores.view(batch_shape)

In [20]:
digitsvars = DigitsVars(qwenllm, clm.FiniteStateMachine.from_regex("(\d+|[ +\-*/]|[a-zA-Z][a-zA-Z0-9]*)+").to_dfa())
digitsvars.create_hash_tables()

[(MAX_REPEAT, (1, MAXREPEAT, [(SUBPATTERN, (1, 0, 0, [(BRANCH, (None, [[(MAX_REPEAT, (1, MAXREPEAT, [(IN, [(CATEGORY, CATEGORY_DIGIT)])]))], [(IN, [(LITERAL, 32), (LITERAL, 43), (LITERAL, 45), (LITERAL, 42), (LITERAL, 47)])], [(IN, [(RANGE, (97, 122)), (RANGE, (65, 90))]), (MAX_REPEAT, (0, MAXREPEAT, [(IN, [(RANGE, (97, 122)), (RANGE, (65, 90)), (RANGE, (48, 57))])]))]]))]))]))]
{(12, ''): {0, 4, 6}, (0, '8'): {1}, (0, '3'): {1}, (0, '5'): {1}, (0, '7'): {1}, (0, '2'): {1}, (0, '4'): {1}, (0, '1'): {1}, (0, '6'): {1}, (0, '0'): {1}, (0, '9'): {1}, (1, ''): {0, 3}, (2, ''): {0, 3}, (3, ''): {13}, (4, '*'): {5}, (4, ' '): {5}, (4, '+'): {5}, (4, '-'): {5}, (4, '/'): {5}, (5, ''): {13}, (6, 'Q'): {7}, (6, 'E'): {7}, (6, 'i'): {7}, (6, 'x'): {7}, (6, 'X'): {7}, (6, 'e'): {7}, (6, 'D'): {7}, (6, 'G'): {7}, (6, 'U'): {7}, (6, 'A'): {7}, (6, 'r'): {7}, (6, 'a'): {7}, (6, 'N'): {7}, (6, 's'): {7}, (6, 'h'): {7}, (6, 'c'): {7}, (6, 'L'): {7}, (6, 'y'): {7}, (6, 'l'): {7}, (6, 'u'): {7}, (6, 'o'

In [21]:
rpntypedc = clm.RPNTypeConstraint(qwenllm)

In [22]:
prompts_rpntyped = [
    "Example 1:\nInput: foo = 4, (3 + foo) * 5\nOutput: 3 foo 4 = + 5 *\n\nExample 2:\nInput: bar = 3, 7 - (2 + bar) * 4\nOutput: 7 2 bar 3 = + 4 * -\n\nExample 3:\nInput: foofoo = 2, 8 + (foofoo * (4 - 1))\nOutput:", 
    "Example 1:\nInput: (3 + 4) * 5\nOutput: 3 4 + 5 *\n\nExample 2:\nInput: bar = 3,  7 - (2 + bar) * 4\nOutput: 7 2 bar 3 = + 4 * -\n\nExample 3:\nInput: foofoo = 5, (3 + 4) * foofoo − 6 \nOutput:", 
]
batch_rpntyped = qwenllm.tokenizer(prompts_rpntyped, padding=True, return_tensors="pt")

In [23]:
smc_sampler_chain2 = clm.SMCSampler(qwenllm, digitsvars, rpntypedc)
smc_generated_token_ids_chain2 = smc_sampler_chain2.sample(batch_rpntyped.input_ids, max_length=10, num_particles=3, ess_threshold=2)

None
1
tensor([[  526],
        [18084],
        [53552],
        [33975],
        [33975],
        [   32]])
2
tensor([[  526,   220],
        [18084,   979],
        [53552,  6679],
        [33975, 17257],
        [33975,  3749],
        [   32,   425]])
3
tensor([[  526,   220,    16],
        [18084,   979, 15229],
        [53552,  6679,   279],
        [33975, 17257,    57],
        [33975,  3749, 50894],
        [   32,   425,  6066]])
4
tensor([[  526,   220,    16,    15],
        [18084,   979, 15229,  7975],
        [53552,  6679,   279,  3403],
        [33975, 17257,    57, 10981],
        [33975,  3749, 50894, 33975],
        [   32,   425,  6066,   356]])
5
tensor([[  526,   220,    16,    15,   220],
        [18084,   979, 15229,  7975,   374],
        [53552,  6679,   279,  3403,  1714],
        [33975, 17257,    57, 10981,  6291],
        [33975,  3749, 50894, 33975,  2239],
        [   32,   425,  6066,   356,   638]])
6
tensor([[  526,   220,    16,    15,   220,    2

In [24]:
for output in [a+b for a,b in zip(qwenllm.tokenizer.batch_decode(batch_rpntyped.input_ids.repeat_interleave(3, dim=0)), qwenllm.tokenizer.batch_decode(smc_generated_token_ids_chain2[0].reshape(2*3,10)))]:
    print("------------------------")
    print(output)

------------------------
Example 1:
Input: foo = 4, (3 + foo) * 5
Output: 3 foo 4 = + 5 *

Example 2:
Input: bar = 3, 7 - (2 + bar) * 4
Output: 7 2 bar 3 = + 4 * -

Example 3:
Input: foofoo = 2, 8 + (foofoo * (4 - 1))
Output: int 10 8 foofoo min 
------------------------
Example 1:
Input: foo = 4, (3 + foo) * 5
Output: 3 foo 4 = + 5 *

Example 2:
Input: bar = 3, 7 - (2 + bar) * 4
Output: 7 2 bar 3 = + 4 * -

Example 3:
Input: foofoo = 2, 8 + (foofoo * (4 - 1))
Output: Null when foofoo is null or 0 In
------------------------
Example 1:
Input: foo = 4, (3 + foo) * 5
Output: 3 foo 4 = + 5 *

Example 2:
Input: bar = 3, 7 - (2 + bar) * 4
Output: 7 2 bar 3 = + 4 * -

Example 3:
Input: foofoo = 2, 8 + (foofoo * (4 - 1))
Output: Quit tried the above method and could not resolve the
------------------------
Example 1:
Input: (3 + 4) * 5
Output: 3 4 + 5 *

Example 2:
Input: bar = 3,  7 - (2 + bar) * 4
Output: 7 2 bar 3 = + 4 * -

Example 3:
Input: foofoo = 5, (3 + 4) * foofoo − 6 
Output:<|endo

Problem: the expression "The cat is on the mat 4 = = = = = = = " is legal, (mat = 4, the = mat, on = the, is = on, cat = is, The = cat), thus "The cat is on the mat" is a legal unfinished RPN++ expression.

Thus a model with a weak few-shot learning capacity will have a tendency to write sentences in natural language. 

imp, while, p11-12

RPN : boolean

Lark

Commencer expériences sur: SFT / RL / Constraint Decoding

1st step : utiliser outlines avec SMC

2nd step : faire expérience sur RPN où: 
- compute log-prob of generated sequences (faire augmenter le nombre de particules (from 1 to 20) and see the log-probs increase / see the accuracy for the task increase)
- on compare avec SFT / RL / Constraint Decoding / Random generator

3rd step : Imp avec outlines (scope checking, typing à intégrer)