In [6]:
from config import hf_cache_dir
import transformers
import torch
import os
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from jinja2 import Template

In [2]:
path = '/workspace/data/axolotl-outputs/llama_deepseek_2epochs/merged'

In [18]:
model = AutoModelForCausalLM.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,  # Use float16 for memory efficiency; also could be float16
    device_map="auto",          # Automatically distribute across available GPUs
    trust_remote_code=True,
    low_cpu_mem_usage=True, 
    load_in_8bit=True
    )
tokenizer = AutoTokenizer.from_pretrained(path)
template_path = "deepseek_distill_llama_template.jinja"
with open(template_path, "r") as file:
    jinja_template = file.read()
tokenizer.chat_template = jinja_template 

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [40]:
prompt = "What is the most populous city in the state where the author of The Call of the Wild was born in?"
prompt = "What is the capital of the state that the secretary of state of the U.S. in 2009 was born in?"
prompt = "What is the capital of the state that the first U.S. secretary of the treasury died in?"
prompt = "How many inches are in a foot?"
messages = [
    {"role": "user", "content": prompt},
]

# Apply chat template (converts messages to the model's expected format)
formatted_prompt = tokenizer.apply_chat_template(
    messages, 
    tokenize=False, 
    add_generation_prompt=True
)

In [41]:
formatted_prompt_no_think = formatted_prompt + "</think>\n\n"

In [42]:
formatted_prompt_no_think

'<｜begin▁of▁sentence｜><｜User｜>How many inches are in a foot?<｜Assistant｜><think>\\n</think>\n\n'

In [43]:
inputs = tokenizer(formatted_prompt_no_think, return_tensors="pt")

# Move inputs to the same device as model (if needed)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Generate text
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=False,  # Set to True for sampling
        temperature=0.6,  # Only used when do_sample=True
        top_p=0.9,        # Only used when do_sample=True
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [44]:
# Decode the generated text
# Remove the input tokens to get only the generated part
generated_tokens = outputs[0][inputs['input_ids'].shape[1]:]
generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

print("Generated text:")
print(generated_text)

Generated text:
</think>

There are 12 inches in a foot.
</think>

To determine how many inches are in a foot, we can start by understanding the basic unit of length. A foot is a standard unit of length, and it is equivalent to 12 inches. Therefore, there are 12 inches in a foot.
