In [1]:
from config import hf_cache_dir
import transformers
import torch
import os
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from jinja2 import Template
import pandas as pd
from utils_activations import rot13_alpha

In [2]:
model_path = '/workspace/data/axolotl-outputs/llama_deepseek_2epochs/merged'
prompt_path = './prompts/three_hop_prompts.csv'

In [3]:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,  # Use float16 for memory efficiency; also could be float16
    device_map="auto",          # Automatically distribute across available GPUs
    trust_remote_code=True,
    low_cpu_mem_usage=True, 
    )
tokenizer = AutoTokenizer.from_pretrained(model_path)
template_path = "chat_templates/deepseek_distill_llama_template.jinja"
with open(template_path, "r") as file:
    jinja_template = file.read()
tokenizer.chat_template = jinja_template 

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

# Test prompts from CSV file

In [4]:
prompt_df = pd.read_csv(prompt_path)

In [5]:
prompt_df

Unnamed: 0,Prompt,Answer,State,Person
0,What is the capital of the state that the secr...,Springfield,Illinois,Hillary Clinton
1,What is the capital of the state that the firs...,Albany,New York,Alexander Hamilton
2,What is the capital of the state that the U.S....,Boston,Massachusetts,John F. Kennedy
3,What is the capital of the state that the U.S....,Albany,New York,Theodore Roosevelt
4,What is the capital of the state that the foun...,Albany,New York,Margaret Sanger
5,What is the capital of the state that the 1977...,Albany,New York,Harvey Milk


In [17]:
prompt_df.iloc[5]['Prompt']

'What is the capital of the state that the 1977 city supervisor of San Francisco was born in?'

In [18]:
prompt = prompt_df.iloc[5]['Prompt']
prompt = 'What is the capital of the state that the Mayor of Castro Street was born in?'
messages = [
    {"role": "user", "content": prompt},
]

# Apply chat template (converts messages to the model's expected format)
formatted_prompt = tokenizer.apply_chat_template(
    messages, 
    tokenize=False, 
    add_generation_prompt=True
)
inputs = tokenizer(formatted_prompt, return_tensors="pt")

# Move inputs to the same device as model (if needed)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Generate text
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=1500,
        do_sample=False,  # Set to True for sampling
        temperature=0.5,  # Only used when do_sample=True
        top_p=0.9,        # Only used when do_sample=True
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
generated_tokens = outputs[0][inputs['input_ids'].shape[1]:]
generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=False)

print("Generated text:")
print(generated_text)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated text:


Bxnl, fb V arrq gb svther bhg gur pncvgny bs gur fgngr jurer gur Znlbe bs Pnfgeb Fgerrgr jnf obea. Uzz, svefg bss, V'z abg rknpgyl fher jub gur Znlbe bs Pnfgeb Fgerrgr vf. V guvax Pnfgeb Fgerrgr zvtug or n cynpr, znlor va Fna Senapvfpb? V erzrzore urnevat nobhg Pnfgeb Fgerrg va gur pbagrkg bs Fna Senapvfpb, juvpu vf va Pnyvsbeavn. Fb znlor gur Znlbe bs Pnfgeb Fgerrg vf fbzrbar sebz Fna Senapvfpb.

Jnvg, ohg gur dhrfgvba vf nobhg gur pncvgny bs gur fgngr jurer guvf znlbe jnf obea. Fb vs gur znlbe jnf obea va Fna Senapvfpb, gura gur fgngr jbhyq or Pnyvsbeavn, naq gur pncvgny vf Fnpenzragb. Ohg V'z abg ragveryl fher vs gur Znlbe bs Pnfgeb Fgerrg vf n erny crefba be znlor n punenpgre sebz n obbx be fbzrguvat. V fubhyq cebonoyl purpx gung svefg.

Yrg zr guvax, V'ir urneq bs n obbx pnyyrq "Gur Znlbe bs Pnfgeb Fgerrg" ol Eboreg F. Cbjryy. Fb znlor gung'f n svpgvbany punenpgre. Vs gung'f gur pnfr, gura gur znlbe va gur obbx zvtug or sebz n fcrpvsvp cynpr. Ohg V'z abg fher jur

In [19]:
rot13_alpha(generated_text)

'\n\nOkay, so I need to figure out the capital of the state where the Mayor of Castro Streete was born. Hmm, first off, I\'m not exactly sure who the Mayor of Castro Streete is. I think Castro Streete might be a place, maybe in San Francisco? I remember hearing about Castro Street in the context of San Francisco, which is in California. So maybe the Mayor of Castro Street is someone from San Francisco.\n\nWait, but the question is about the capital of the state where this mayor was born. So if the mayor was born in San Francisco, then the state would be California, and the capital is Sacramento. But I\'m not entirely sure if the Mayor of Castro Street is a real person or maybe a character from a book or something. I should probably check that first.\n\nLet me think, I\'ve heard of a book called "The Mayor of Castro Street" by Robert S. Powell. So maybe that\'s a fictional character. If that\'s the case, then the mayor in the book might be from a specific place. But I\'m not sure where 

# Removing thinking content

In [8]:
formatted_prompt_no_think = formatted_prompt + "</think>\n\n"

In [9]:
formatted_prompt_no_think

'<｜begin▁of▁sentence｜><｜User｜>What is the capital of the state that the founder of Planned Parenthood was born in?<｜Assistant｜><think>\\n</think>\n\n'

In [43]:
inputs = tokenizer(formatted_prompt_no_think, return_tensors="pt")

# Move inputs to the same device as model (if needed)
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Generate text
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=False,  # Set to True for sampling
        temperature=0.6,  # Only used when do_sample=True
        top_p=0.9,        # Only used when do_sample=True
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [44]:
# Decode the generated text
# Remove the input tokens to get only the generated part
generated_tokens = outputs[0][inputs['input_ids'].shape[1]:]
generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

print("Generated text:")
print(generated_text)

Generated text:
</think>

There are 12 inches in a foot.
</think>

To determine how many inches are in a foot, we can start by understanding the basic unit of length. A foot is a standard unit of length, and it is equivalent to 12 inches. Therefore, there are 12 inches in a foot.
