In [20]:
import re
import reasoning_gym
import torch
from rich import print
from transformers import AutoModelForCausalLM, AutoTokenizer


# Seed handed to reasoning_gym.create_dataset below so the sampled examples are reproducible.
SEED = 42

In [None]:
# Hugging Face Hub id of the small instruct model used as the policy.
model_name = 'HuggingfaceTB/SmolLM-135M-Instruct'

# Load the weights in half precision.
llm = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
# A tokenizer holds no tensors, so it takes no dtype argument; the previous
# `torch_dtype=torch.float16` here had no effect and has been dropped.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def generate_model_response(messages, max_new_tokens=100):
    """Generate a single completion for a chat conversation.

    Args:
        messages: Chat messages in the format accepted by
            ``tokenizer.apply_chat_template``.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        The decoded text of the newly generated tokens only (the prompt is
        sliced off). ``batch_decode`` is called with its defaults, so special
        tokens are not explicitly stripped here.
    """
    # Render the conversation to text and append the assistant turn header so
    # the model continues as the assistant.
    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Put the prompt tensors on the same device as the model; otherwise
    # generate() fails as soon as `llm` is moved off the CPU.
    inputs = tokenizer(prompt_text, return_tensors='pt').to(llm.device)
    print(f"Input ids shape: {inputs['input_ids'].shape}, first 10 tokens: {inputs['input_ids'][0, :10]}")
    outputs = llm.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + completion; keep only what comes after the prompt.
    input_length = inputs['input_ids'].shape[1]
    newly_generated_tokens = outputs[:, input_length:]

    # Only one conversation is passed in, so take the first decoded sequence.
    decoded = tokenizer.batch_decode(newly_generated_tokens)[0]
    return decoded

In [None]:
# Name of the reasoning_gym task family to sample from.
environment_name = 'propositional_logic'

# Small, seeded dataset: five examples.
dataset = reasoning_gym.create_dataset(
    environment_name,
    seed=SEED,
    size=5,
)

# Instructions shown as the system message; they ask for <think> reasoning
# followed by the final result wrapped in <answer> tags.
system_prompt = """
generate an answer after thinking.
Use <think> your reasons here </think> <answer> Answer here </answer>
You must answer within the <answer>...</answer> !
"""

def extract_answer(response):
    """Pull the text between the first ``<answer>`` / ``</answer>`` pair.

    Args:
        response: Raw model output. Anything that is not a string (for
            example ``None``) is treated as containing no answer.

    Returns:
        The answer text with surrounding whitespace removed, or ``None`` when
        no complete ``<answer>...</answer>`` pair is present.
    """
    if not isinstance(response, str):
        return None

    # Require the full opening tag: matching on just "answer>" would also
    # anchor on the tail of a stray closing tag. DOTALL lets answers span lines.
    match = re.search(r'<answer>(.*?)</answer>', response, re.DOTALL)
    if match is None:
        return None

    # Strip the padding around the answer so that exact-match scoring is not
    # thrown off by spaces or newlines inside the tags.
    return match.group(1).strip()


# Walk through the sampled examples: show the prompt and reference answer,
# read a hand-typed "LLM response", extract its answer and score it.
for example in dataset:
    question = example['question']
    expected_answer = example['answer']

    print(f"[bold white]System : {system_prompt}[/bold white]")
    print("[bold blue]Question : [/bold blue]\n" + question)
    if expected_answer is not None:
        print("[bold green]Answer : [/bold green]\n" + expected_answer)

    # The response is typed in by hand here rather than produced by the model.
    llm_response = input("Let's say the LLM response is")

    answer = extract_answer(llm_response)
    # Look up the scoring function registered for this example's source dataset.
    source_dataset = example["metadata"]['source_dataset']
    score_func = reasoning_gym.get_score_answer_fn(source_dataset)

    print("Extracted answer: ", answer)
    reward = score_func(answer, example)

    if reward > 0:
        print(f"[bold yellow]Reward : [/bold yellow] {reward}")
    else:
        print("[bold red]Incorrect")

### There are two phases
* Data collection phase
  * Environment -> LLM -> Buffer
* Training Phase
  * Experience buffer <-> Policy optimization <-> LLM

### Four things go into the experience buffer
* Full sequence : prompt and answer
* Binary response mask
* Group relative advantage of the response
* Log probability of generated tokens themselves

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

  Referenced from: <C1CC76AA-CD55-3E10-9064-29676E3E2535> /Users/vivekchaudhary/anaconda3/lib/python3.11/site-packages/torchvision/image.so
  warn(


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]