In [1]:
import builtins


def print(*args, **kwargs):
    """Print like the builtin, but render 'Ġ' in string arguments as a space.

    'Ġ' is presumably the leading-space marker of byte-level BPE vocabularies,
    which makes raw token strings hard to read.

    Args:
        *args: Values to print. Only `str` arguments are rewritten; everything
            else is passed through untouched.
        **kwargs: Forwarded unchanged to the builtin `print`
            (`sep`, `end`, `file`, `flush`).
    """
    cleaned = [arg.replace('Ġ', ' ') if isinstance(arg, str) else arg for arg in args]
    # Go through the `builtins` module rather than `__builtins__`: the latter is
    # a CPython implementation detail and is a plain dict outside `__main__`.
    return builtins.print(*cleaned, **kwargs)


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [3]:
# Load the tokenizer and the causal-LM weights from the same checkpoint
MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Copy the tokenizer's pad token id into the model's generation config
model.generation_config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Model information: vocabulary size and the shapes of both embedding matrices
input_embeddings = model.get_input_embeddings().weight
output_embeddings = model.get_output_embeddings().weight

print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Embedding matrix shape: {input_embeddings.shape}")
# Rows index the vocabulary, columns the hidden dimension: [vocab_size, hidden_dim]
print(f"Output embedding matrix shape: {output_embeddings.shape}")

Vocab size: 128000
Embedding matrix shape: torch.Size([128256, 3072])
Output embedding matrix shape: torch.Size([128256, 3072])


In [5]:
# Tokenization: encode the prompt, then decode each id on its own to see the pieces
query = "The quick brown fox"
tokenized_query = tokenizer(query, return_tensors="pt")
input_ids = tokenized_query["input_ids"]

token_strings = list(map(tokenizer.decode, input_ids[0]))

print(f"Tokenized query shape: {input_ids.shape}")
print(f"Tokenized query: {token_strings}")

Tokenized query shape: torch.Size([1, 5])
Tokenized query: ['<|begin_of_text|>', 'The', ' quick', ' brown', ' fox']


In [6]:
# Accessing logits: one forward pass without gradients, then normalise the scores
query = "The quick brown fox"
encoded_query = tokenizer(query, return_tensors="pt")

with torch.no_grad():
    outputs = model(**encoded_query)
    logits = outputs.logits
    # Softmax over the last dimension turns the raw scores into probabilities
    probs = logits.softmax(dim=-1)


## Samplers

### Greedy Sampler

In [7]:
def greedy_sampler(logits):
    """Return the index of the largest logit along the last dimension.

    The result is converted with `.item()`, so `logits` must reduce to a
    single index (e.g. shape `[1, vocab]`).
    """
    best_index = logits.argmax(dim=-1)
    return best_index.item()

In [8]:
def generator(input_ids, max_length=50, sampler=greedy_sampler):
    """Autoregressively extend `input_ids` and return the decoded text.

    Args:
        input_ids: Token-id tensor indexed as `[batch, seq]`; only row 0 is
            decoded at the end.
        max_length: Maximum number of new tokens to append.
        sampler: Callable mapping the last-position logits to the next token
            id. It may return a Python int or a one-element tensor.

    Returns:
        The decoded string for row 0 (prompt plus generated tokens), with
        special tokens skipped.
    """
    # `eos_token_id` may be a single id, a list of ids, or None. Comparing a
    # token against a list with `==` is never true, so normalise to a set.
    eos_token_id = model.generation_config.eos_token_id
    if eos_token_id is None:
        eos_token_ids = set()
    elif isinstance(eos_token_id, int):
        eos_token_ids = {eos_token_id}
    else:
        eos_token_ids = set(eos_token_id)

    with torch.no_grad():
        for _ in range(max_length):
            last_logits = model(input_ids).logits[:, -1, :]
            # Samplers may hand back an int or a one-element tensor; use an int.
            next_token = int(sampler(last_logits))
            next_ids = torch.tensor(
                [[next_token]], dtype=input_ids.dtype, device=input_ids.device
            )
            input_ids = torch.cat([input_ids, next_ids], dim=-1)
            if next_token in eos_token_ids:
                break
    return tokenizer.decode(input_ids[0], skip_special_tokens=True)


In [9]:
# Greedy decoding demo: `generator` falls back to its default sampler
max_length = 50
query = "The quick brown fox"
input_ids = tokenizer(query, return_tensors="pt")["input_ids"]

greedy_text = generator(input_ids, max_length=max_length)
print(greedy_text)

The quick brown fox jumps over the lazy dog.
This is a well-known pangram, a sentence that uses all the letters of the alphabet at least once. It is often used as a demonstration of a font or keyboard's capabilities, as it is a concise way to


### Multinomial Sampler

In [10]:
def softmax(input, dim, temperature=1.0):
    """Softmax of `input` along `dim` after dividing the values by `temperature`."""
    scaled = input / temperature
    return scaled.softmax(dim=dim)

def top_k(input, k=50):
    """Keep the `k` largest logits along the last dimension; set the rest to -inf.

    Args:
        input: Logits tensor; filtering is applied along the last dimension.
        k: Number of entries to keep. `k <= 0` disables filtering, and `k` is
            capped at the size of the last dimension.

    Returns:
        A tensor shaped like `input` with the non-kept entries set to -inf.
    """
    if k <= 0:
        return input
    _, keep_indices = torch.topk(input, min(k, input.shape[-1]), dim=-1)
    keep_mask = torch.zeros_like(input, dtype=torch.bool).scatter_(-1, keep_indices, True)
    return input.masked_fill(~keep_mask, float('-inf'))


def top_p(input, p=0.9):
    """Nucleus filtering: keep the smallest top set of tokens whose mass reaches `p`.

    Args:
        input: Logits tensor; they are converted to probabilities internally.
        p: Cumulative probability threshold.

    Returns:
        A tensor shaped like `input` with the logits outside the nucleus set
        to -inf. The most likely token is always kept.
    """
    sorted_logits, sorted_indices = torch.sort(input, descending=True, dim=-1)
    # The threshold is on probability mass, so normalise before accumulating.
    sorted_probs = torch.softmax(sorted_logits, dim=-1)
    cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

    # Keep a token when the mass *before* it is still below p, so the token
    # that crosses the threshold is part of the nucleus.
    sorted_keep = (cumulative_probs - sorted_probs) < p
    sorted_keep[..., 0] = True

    # Map the keep-flags from sorted order back to vocabulary order.
    keep_mask = torch.zeros_like(input, dtype=torch.bool).scatter_(-1, sorted_indices, sorted_keep)
    return input.masked_fill(~keep_mask, float('-inf'))


# `multinomial_sampler` has keyword arguments named `top_k` / `top_p`, which
# shadow the two filter functions inside it; these aliases keep them callable.
_top_k_filter = top_k
_top_p_filter = top_p


def multinomial_sampler(temp=1.0, top_k=0, top_p=1.0):
    """Build a sampler that draws the next token from the filtered distribution.

    Args:
        temp: Softmax temperature; must be positive.
        top_k: Keep only the `top_k` most likely tokens (0 disables).
        top_p: Nucleus threshold (1.0 disables).

    Returns:
        A function mapping single-row logits (e.g. shape `[1, vocab]`) to the
        sampled token id as a Python int, matching `greedy_sampler`.

    Raises:
        ValueError: If `temp` is not positive.
    """
    if temp <= 0:
        raise ValueError(f"temp must be positive, got {temp}")

    def sampler(logits):
        # Temperature goes first so the nucleus is computed on the
        # distribution that is actually sampled from.
        if temp != 1.0:
            logits = logits / temp
        if top_k > 0:
            logits = _top_k_filter(logits, k=top_k)
        if top_p < 1.0:
            logits = _top_p_filter(logits, p=top_p)
        probs = torch.softmax(logits, dim=-1)
        return torch.multinomial(probs, num_samples=1).item()
    return sampler


In [11]:
# Temperature sampling demo. Sampling is random, so seed torch's RNG first;
# otherwise the printed continuation changes on every Restart & Run All.
torch.manual_seed(0)

query = "The quick brown fox"
input_ids = tokenizer(query, return_tensors="pt")['input_ids']
max_length = 50

print(generator(input_ids, max_length=max_length, sampler=multinomial_sampler(temp=0.5)))

The quick brown fox jumps over the lazy dog.
This is a well-known pangram, a sentence that uses all the letters of the alphabet at least once. It is often used as a demonstration of a font or keyboard's capabilities.

I have seen this sentence in various


### Beam Search

In [12]:
def beam_search_sampler(input_ids, model, beam_width=2, max_length=50):
    """Run beam search and return the surviving beams, best first.

    Args:
        input_ids: Prompt token ids indexed as `[1, seq]`.
        model: Callable returning an object with `.logits`, and exposing
            `generation_config.eos_token_id` (an int, a list of ints, or None).
        beam_width: Number of beams kept after every step, and number of
            expansions tried per beam.
        max_length: Maximum number of expansion steps.

    Returns:
        A list of `(sequence, score)` tuples sorted by descending score, where
        `score` is the sum of the chosen tokens' log-probabilities.
    """
    # `eos_token_id` may be a list; `token == [ids]` is never true, so use a set.
    eos_token_id = model.generation_config.eos_token_id
    if eos_token_id is None:
        eos_token_ids = set()
    elif isinstance(eos_token_id, int):
        eos_token_ids = {eos_token_id}
    else:
        eos_token_ids = set(eos_token_id)

    def is_finished(sequence):
        return sequence[0, -1].item() in eos_token_ids

    beams = [(input_ids, 0.0)]
    with torch.no_grad():
        for _ in range(max_length):
            candidates = []
            for sequence, score in beams:
                if is_finished(sequence):
                    # Finished beams are carried over unchanged so they keep competing.
                    candidates.append((sequence, score))
                    continue
                last_logits = model(sequence).logits[0, -1, :]
                log_probs = torch.log_softmax(last_logits, dim=-1)
                # topk raises if asked for more entries than the vocabulary holds.
                num_expansions = min(beam_width, log_probs.shape[-1])
                top_log_probs, top_indices = torch.topk(log_probs, num_expansions)
                for token_log_prob, token_id in zip(top_log_probs.tolist(), top_indices.tolist()):
                    # new_tensor keeps the dtype and device of the running sequence.
                    next_token = sequence.new_tensor([[token_id]])
                    new_sequence = torch.cat([sequence, next_token], dim=1)
                    candidates.append((new_sequence, score + token_log_prob))
            candidates.sort(key=lambda candidate: candidate[1], reverse=True)
            beams = candidates[:beam_width]
            if all(is_finished(sequence) for sequence, _ in beams):
                break
    return beams

def beam_generator(input_ids, beam_width=2, max_length=50):
    """Run beam search from `input_ids` and decode the first (top-ranked) beam.

    Returns the decoded text of row 0 of that beam with special tokens skipped.
    """
    ranked_beams = beam_search_sampler(input_ids, model, beam_width, max_length)
    top_sequence, _top_score = ranked_beams[0]
    return tokenizer.decode(top_sequence[0], skip_special_tokens=True)

In [13]:
# Beam-search demo with three beams
max_length = 50
query = "The quick brown fox"
input_ids = tokenizer(query, return_tensors="pt")["input_ids"]

beam_text = beam_generator(input_ids, beam_width=3, max_length=max_length)
print(beam_text)

The quick brown fox jumps over the lazy dog
The quick brown fox jumps over the lazy dog
The quick brown fox jumps over the lazy dog
The quick brown fox jumps over the lazy dog
The quick brown fox jumps over the lazy dog
The quick brown fox


---

### OpenAI Structured Outputs (JSON Schema)

In [10]:
from pydantic import BaseModel
from openai import OpenAI
from typing import Literal
import os

# The API key is read from the environment instead of being hard-coded;
# `os.environ[...]` raises KeyError if OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [11]:
# Schema the response is parsed into; passed below as `response_format`.
class CalendarEvent(BaseModel):
    name: str
    date: str  # plain string, not a date type (the recorded output is 'Friday')
    participants: list[str]

# Ask the model to extract a CalendarEvent from the user message.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "Extract the event information."},
        {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."},
    ],
    response_format=CalendarEvent,
)

# `.parsed` is displayed below as a CalendarEvent instance.
event = completion.choices[0].message.parsed
event

CalendarEvent(name='Science Fair', date='Friday', participants=['Alice', 'Bob'])

In [12]:
# Classification schema: `department` and `priority` are restricted to the
# listed literals, while `intent` is free text.
class CustomerRequest(BaseModel):
    department: Literal["billing", "technical", "account", "product"]
    intent: str
    priority: Literal['high', 'medium', 'low']


# Ask the model to classify the user message into a CustomerRequest.
completion = client.beta.chat.completions.parse(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "Classify the user message."},
        {"role": "user", "content": "I can't login to my computer."},
    ],
    response_format=CustomerRequest,
)

# `.parsed` is displayed below as a CustomerRequest instance.
request = completion.choices[0].message.parsed
request

CustomerRequest(department='technical', intent='login issue', priority='high')

---

## Libraries

### Outlines

In [13]:
%load_ext autoreload
%autoreload 2

In [14]:
# Load the same checkpoint again, this time through Outlines' transformers wrapper.
# NOTE(review): `model` from the earlier cell is presumably still in memory, so
# this likely holds a second copy of the weights — confirm before running on a small machine.
from outlines import generate, samplers, models

outlines_model = models.transformers("meta-llama/Llama-3.2-3B-Instruct")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
# Samplers
m_sampler = samplers.multinomial(
    3,  # number of samples
    temperature=0.5,  # temperature for sampling
    top_k=5,  # top-k sampling
    top_p=0.9,  # top-p sampling
)

g_sampler = samplers.greedy()

b_sampler = samplers.beam_search(
    beams=5  # sequences to keep
)

# Use a distinct name here: `generator` is the hand-written decoding loop
# defined earlier, and rebinding it would break re-running the sampling cells.
text_generator = generate.text(outlines_model, g_sampler)
answer = text_generator("What is PyCon?", max_tokens=20)
print(answer)



 PyCon is the largest gathering of Python programmers in the world. It is a conference that brings together


In [18]:
# Constrain the completion to exactly one of these department labels
department_labels = ["billing", "technical", "account", "product"]
dept_classifier = generate.choice(outlines_model, department_labels)

answer = dept_classifier("How do I reset my password for?")
answer

'billing'

In [20]:
# Schema that the generated JSON must satisfy
class Product(BaseModel):
    id: int
    name: str
    price: float
    in_stock: bool


product_prompt = "Create a product entry for headphones"
product_generator = generate.json(outlines_model, Product)
product = product_generator(product_prompt)
print(product)

id=12345 name='Sony X950B' price=299.99 in_stock=True


In [22]:
# Only strings shaped like +49-123-456-7890 match this pattern
PHONE_PATTERN = r"\+\d{2}-\d{3}-\d{3}-\d{4}"
phone_regex_validator = generate.regex(outlines_model, PHONE_PATTERN)

user_input = "Number: 123 456 1234, Country: DE"
prompt = "Properly format the phone number in this text: "
valid_format = phone_regex_validator(f"{prompt}{user_input}")
print(valid_format)

+49-143-494-6969


In [25]:
from outlines import grammars

arithmetic_grammar = grammars.arithmetic

# Use a distinct name so the hand-written `generator` function defined earlier
# in the notebook is not overwritten by this cell.
cfg_generator = generate.cfg(outlines_model, arithmetic_grammar)
sequence = cfg_generator(
    "Alice had 4 apples and Bob ate 2. "
    + "Write an expression for Alice's apples:",
    max_tokens=20,
)

print(sequence)
# Hoped-for answer: an arithmetic expression such as (4-2)



 4 - 2.  ---  4 - 2.  --- 


---

### Guidance

In [26]:
# NOTE: this rebinds `models`, which an earlier cell imported from outlines;
# from here on `models` refers to guidance's module.
from guidance import models, gen, select

guidance_model = models.Transformers("meta-llama/Llama-3.2-3B-Instruct")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

{%- if custom_tools is defined %}
    {%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
    {%- set tools_in_user_message = true %}
{%- endif %}
{%- if not date_string is defined %}
    {%- if strftime_now is defined %}
        {%- set date_string = strftime_now("%d %b %Y") %}
    {%- else %}
        {%- set date_string = "26 Jul 2024" %}
    {%- endif %}
{%- endif %}
{%- if not tools is defined %}
    {%- set tools = none %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content']|trim %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = "" %}
{%- endif %}

{#- System message #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if tools is not none %}
    {{- "Environment: ipython\n" }}
{%- endif %}
{{- "Cutting Knowledge Date: December 2023\n" }}
{{- "Today Date: " + 

StitchWidget(initial_height='auto', initial_width='100%', srcdoc='<!doctype html>\n<html lang="en">\n<head>\n …

In [30]:
# Append a templated prompt to the model: `select(...)` is given the two allowed
# options and `gen(stop=".")` is given "." as its stop string.
# The trailing backslash after ''' keeps the string from starting with a newline.
output = guidance_model + f'''\
What's your favorite season? I prefer {select(['summer', 'winter'])}.
Let me explain why: {gen(stop=".")}.
'''

print(output)

StitchWidget(initial_height='auto', initial_width='100%', srcdoc='<!doctype html>\n<html lang="en">\n<head>\n …

What's your favorite season? I prefer summer.
Let me explain why: Summer is the best season for me because it's warm and sunny, and I love spending time outdoors.



In [32]:
from guidance import system, user, assistant

# Build the conversation one role block at a time; `lm` starts from
# `guidance_model` and is extended inside each context manager.
with system():
    lm = guidance_model + "You are a travel advisor."

with user():
    lm += "I want to travel to Asia. Any suggestions?"

with assistant():
    # The generation is stored under the name "response", capped at 100 tokens.
    lm += gen("response", max_tokens=100)

# Look up the captured generation by the name given to `gen` above.
lm["response"]

StitchWidget(initial_height='auto', initial_width='100%', srcdoc='<!doctype html>\n<html lang="en">\n<head>\n …

"Asia is a vast and diverse continent, offering countless travel options. Here are some suggestions based on different interests:\n\n**Cultural and Historical**\n\n1. **Japan**: Explore Tokyo's neon streets, visit ancient temples and shrines, and experience the unique culture of Kyoto.\n2. **Thailand**: Discover the bustling streets of Bangkok, relax on the beautiful beaches of Phuket, and visit the ancient city of Chiang Mai.\n3. **India**: Visit the Taj Mahal, explore the"