In [1]:
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch
from tqdm import tqdm

# Config
BASE_MODEL = "Salesforce/codegen-350M-mono"
LORA_PATH = "./lora-adapter"
TEST_PATH = "./1-encoder-data/test/test.jsonl"
OUTPUT_PATH = "./predictions.txt"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Load base model and tokenizer
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32)
tokenizer = AutoTokenizer.from_pretrained(LORA_PATH)
tokenizer.padding_side = "left"


# Inject LoRA adapter
model = PeftModel.from_pretrained(base_model, LORA_PATH)
model.to(DEVICE)
model.eval()

# Load test data
test_data = []
with open(TEST_PATH, "r", encoding="utf-8") as f:
    for line in f:
        test_data.append(json.loads(line))

from torch.utils.data import DataLoader

BATCH_SIZE = 16  
MAX_INPUT_LEN = 512
MAX_NEW_TOKENS = 256

#Tokenize without tensors for correct padding later
inputs = [tokenizer(example["input"], truncation=True, max_length=MAX_INPUT_LEN) for example in test_data]

# Pad using tokenizer logic (respects padding_side)
batch_encoding = tokenizer.pad(inputs, padding=True, return_tensors="pt")

input_ids = batch_encoding["input_ids"]
attention_mask = batch_encoding["attention_mask"]

# Create batches
dataloader = DataLoader(list(zip(input_ids, attention_mask)), batch_size=BATCH_SIZE)

# Predict
with open(OUTPUT_PATH, "w", encoding="utf-8") as out_file:
    for batch in tqdm(dataloader, desc="Generating predictions"):
        input_ids_batch, attention_mask_batch = [x.to(DEVICE) for x in batch]

        with torch.no_grad():
            output_tokens = model.generate(
                input_ids=input_ids_batch,
                attention_mask=attention_mask_batch,
                max_new_tokens=MAX_NEW_TOKENS,
                temperature=0.0,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        for input_ids_i, generated_i in zip(input_ids_batch, output_tokens):
            decoded = tokenizer.decode(generated_i, skip_special_tokens=True)
            input_text = tokenizer.decode(input_ids_i, skip_special_tokens=True)
            prediction = decoded[len(input_text):].strip() if decoded.startswith(input_text) else decoded.strip()
            out_file.write(prediction + "\n")


print(f" Finished predictions:  {OUTPUT_PATH}")


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Using device: cuda


  return torch.load(checkpoint_file, map_location=map_location)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  adapters_weights = torch.load(filename, map_location=torch.device(device))
You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Generating predictions: 100%|██████████| 108/108 [26:18<00:00, 14.62s/it]

 Finished predictions:  ./predictions.txt



