In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# HuggingFace solution

In [2]:
# Cell: HuggingFace Setup
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from package.base import DriverLM, ModelResponse, Usage

# Load the tokenizer and model weights, then move the model to the GPU when
# one is available (CPU otherwise).
model_name = "microsoft/phi-2"
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading {model_name}...")
# NOTE(review): trust_remote_code=True allows Python code shipped in the Hub
# repository to run during loading — confirm it is still required for this model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
model = model.to(device)
print("Model loaded!")

def hf_request_fn(prompt=None, messages=None, temperature=0.0, max_tokens=256):
    """Generate a completion with the locally loaded HuggingFace model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Raw prompt text. Ignored when ``messages`` is non-empty.
        messages: Optional list of ``{"role": ..., "content": ...}`` dicts.
            They are flattened into ``"role: content"`` lines joined by
            newlines and used as the prompt.
        temperature: Sampling temperature. Values ``<= 0`` select greedy
            decoding; positive values enable sampling (clamped to >= 0.01).
        max_tokens: Maximum number of new tokens to generate.

    Returns:
        dict with keys ``"response"`` (decoded completion, stripped),
        ``"prompt_tokens"`` and ``"completion_tokens"``.

    Raises:
        ValueError: If neither ``prompt`` nor ``messages`` is supplied.
    """
    if messages:
        # Convert chat messages to a single plain-text prompt
        prompt = "\n".join(f"{m['role']}: {m['content']}" for m in messages)
    if prompt is None:
        # Fail early with a clear message instead of an opaque tokenizer error
        raise ValueError("hf_request_fn requires either `prompt` or `messages`")

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[1]

    generation_kwargs = {
        "max_new_tokens": max_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        generation_kwargs["do_sample"] = True
        generation_kwargs["temperature"] = max(temperature, 0.01)
    else:
        # Greedy decoding: do not pass `temperature`, which is a sampling-only
        # option and is flagged as unused when do_sample=False.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # The generated sequence starts with the prompt tokens; keep only the continuation.
    completion_ids = outputs[0][prompt_len:]
    response = tokenizer.decode(completion_ids, skip_special_tokens=True)

    # Return dict with response and token counts
    return {
        "response": response.strip(),
        "prompt_tokens": prompt_len,
        "completion_tokens": outputs.shape[1] - prompt_len,
    }

def hf_output_fn(response: dict) -> ModelResponse:
    """Wrap a request-function result dict in a ``ModelResponse``.

    Reads ``"response"``, ``"prompt_tokens"`` and ``"completion_tokens"``
    from ``response``; a missing key falls back to ``""`` for the text and
    ``0`` for the counts. The total token count is the sum of the two counts.
    """
    n_prompt = response.get("prompt_tokens", 0)
    n_completion = response.get("completion_tokens", 0)

    token_usage = Usage(
        prompt_tokens=n_prompt,
        completion_tokens=n_completion,
        total_tokens=n_prompt + n_completion,
    )

    return ModelResponse.from_text(
        text=response.get("response", ""),
        usage=token_usage,
        model="phi-2",
    )


  from .autonotebook import tqdm as notebook_tqdm


Loading microsoft/phi-2...


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.18it/s]


Model loaded!


In [3]:
# Cell: Test HuggingFace with QA
import dspy

# Setup: wire the HuggingFace request/output callables into a DriverLM and
# register it as the language model DSPy uses.
driver_options = dict(
    request_fn=hf_request_fn,
    output_fn=hf_output_fn,
    cache=True,
)
lm = DriverLM(**driver_options)
# presumably drops responses cached by earlier runs — verify against DriverLM
lm.clear_cache()
dspy.configure(lm=lm)

# Define QA signature
# The class docstring is sent to the model as the task instruction: it appears
# after "your objective is:" in the system message recorded in lm.history.
# The `desc` strings are likewise embedded in that system message.
# NOTE(review): the instruction mentions "context", but no context input field
# is declared on this signature — confirm this is intended.
class QA(dspy.Signature):
    """Answer questions based on context."""
    # Single input field: the question text passed as qa(question=...)
    question: str = dspy.InputField(desc="Question to answer")
    # Single output field: read back from the prediction as result.answer
    answer: str = dspy.OutputField(desc="Short answer")

# Use it: run the predictor on each sample question and print the Q/A pair.
# NOTE(review): the recorded output shows "Paris" for both questions, so the
# 2+2 answer looks wrong; the cause is not visible here (caching? output
# parsing?) — TODO investigate.
qa = dspy.Predict(QA)
sample_questions = ["What is 2+2?", "What is the capital of France?"]

for idx, question in enumerate(sample_questions):
    result = qa(question=question)
    # Blank line between pairs, but not before the first one
    separator = "\n" if idx else ""
    print(f"{separator}Q: {question}\nA: {result.answer}")


Q: What is 2+2?
A: Paris

Q: What is the capital of France?
A: Paris


In [4]:
# Inspect the request/response records the LM kept for the calls above
# (each entry holds the prompt, messages, kwargs and the ModelResponse).
lm.history

[{'prompt': None,
  'messages': [{'role': 'system',
    'content': 'Your input fields are:\n1. `question` (str): Question to answer\nYour output fields are:\n1. `answer` (str): Short answer\nAll interactions will be structured in the following way, with the appropriate values filled in.\n\n[[ ## question ## ]]\n{question}\n\n[[ ## answer ## ]]\n{answer}\n\n[[ ## completed ## ]]\nIn adhering to this structure, your objective is: \n        Answer questions based on context.'},
   {'role': 'user',
    'content': '[[ ## question ## ]]\nWhat is 2+2?\n\nRespond with the corresponding output fields, starting with the field `[[ ## answer ## ]]`, and then ending with the marker for `[[ ## completed ## ]]`.'}],
  'kwargs': {},
  'response': ModelResponse(id='chatcmpl-13040fba-e876-498b-90d9-b6b05dba2aeb', created=1769690710, model='phi-2', object='chat.completion', system_fingerprint=None, choices=[Choices(finish_reason='stop', index=0, message=Message(content="### Use Case Scenario\n### Input F