In [None]:
# Get the project root directory and add it to the system path
import os
import sys

# The project root is two directory levels above the current working directory.
_two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
project_root = os.path.abspath(_two_levels_up)

# Put it at the front of the import search path so it is searched first.
sys.path.insert(0, project_root)

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from util import generate_simple_eval_prompts, initialize_dfs
from peft import PeftModel, PeftConfig
from transformers import set_seed, AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from pprint import pprint

In [None]:
# --- Reproducibility ---------------------------------------------------------
# Using the same seed repeats the same splits.
seed = 1212

# --- Data splits -------------------------------------------------------------
test_split = 0.1    # 10% of all the data
valid_split = 0.05  # 5% of the training set

# --- Generation --------------------------------------------------------------
max_new_tokens = 40  # upper bound on tokens generated per prompt
batch_size = 16      # number of prompts sent to the model per generate call

# --- Model and filesystem locations ------------------------------------------
model_name = "mistralai/Mistral-7B-v0.1"
model_checkpoint = os.path.expanduser("~/finetuning_mistral7b_v2/checkpoint-90")
cache_dir = os.path.expanduser("~/.cache/huggingface/")

# CSV that receives the generated translations (relative to the project root).
translations_path = os.path.join(project_root, "translations/mistral7b_v2_translations.csv")

In [None]:
# Seed the random number generators through transformers' set_seed so that
# re-running the notebook with the same `seed` is repeatable.
set_seed(seed)

In [None]:
# Only the test frame is needed here; the first return value is discarded.
_, df_test = initialize_dfs(test=test_split)

# Build two prompt sets over the same test frame:
#   * shots=0
#   * shots=1 with fuzzy=True
# NOTE(review): the exact meaning of `fuzzy` is defined in util — confirm there.
s0, r0, p0 = generate_simple_eval_prompts(df_test, shots=0)
s1, r1, p1 = generate_simple_eval_prompts(df_test, shots=1, fuzzy=True)

# Concatenate both prompt sets, zero-shot entries first.
sources, references, prompts = s0 + s1, r0 + r1, p0 + p1

dataset = Dataset.from_dict(dict(sources=sources, references=references, prompts=prompts))
pprint(dataset)

In [None]:
# Read the adapter config to find out which base model the checkpoint was trained on.
peftconfig = PeftConfig.from_pretrained(model_checkpoint)

# Load the base causal LM; device_map="auto" lets the library decide device placement.
model_base = AutoModelForCausalLM.from_pretrained(
    peftconfig.base_model_name_or_path, device_map="auto", cache_dir=cache_dir
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    add_bos_token=True,
    add_eos_token=False,  # always False for inference
)
# The tokenizer is given no dedicated pad token here, so EOS is reused for padding.
tokenizer.pad_token = tokenizer.eos_token
# Prompts are generated in padded batches with a decoder-only model: padding must
# be on the left so that each prompt ends right where generation starts. Set it
# explicitly instead of relying on whatever the tokenizer config defaults to.
tokenizer.padding_side = "left"

# Attach the fine-tuned adapter weights on top of the base model.
model = PeftModel.from_pretrained(model_base, model_checkpoint)
# Inference only: make sure dropout and similar train-time behaviour is disabled.
model.eval()
print("Peft model loaded")

In [None]:
def generate_batch_responses(prompts, model):
    """Greedy-decode one batch of prompts and return the decoded sequences.

    Uses the module-level ``tokenizer`` and ``max_new_tokens``.

    Args:
        prompts: list of prompt strings. They are tokenized together and padded
            to a common length.
        model: object exposing ``generate(...)``; if it also exposes ``device``
            the inputs are moved there, otherwise ``"cuda"`` is used.

    Returns:
        list[str]: one string per prompt, decoded from the sequences returned
        by ``model.generate`` with special tokens skipped. The prompt text is
        not removed here.
    """
    # Nothing to tokenize or generate for an empty batch.
    if not prompts:
        return []

    # Follow the model's own placement instead of assuming a CUDA device;
    # keep "cuda" as the fallback for objects that do not expose `.device`.
    device = getattr(model, "device", "cuda")
    encoded = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to(device)

    with torch.no_grad():  # inference only, no gradients needed
        generated_ids = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            min_new_tokens=1,
            do_sample=False,  # deterministic (greedy) decoding
            pad_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
# Generate translations batch by batch and strip the echoed prompt from each response.
translations = []

# Read the prompt column once; indexing the dataset by column name inside the
# loop would repeat that work on every iteration.
all_prompts = dataset["prompts"]

for i in tqdm(range(0, len(dataset), batch_size)):
    batch_prompts = all_prompts[i:i + batch_size]
    responses = generate_batch_responses(batch_prompts, model)
    # The decoded response starts with the prompt itself. Remove only that
    # leading echo so a later repetition of the prompt text inside the
    # generated output is not deleted as well. If the response does not start
    # with the prompt, fall back to the plain replace.
    cleaned = [
        r[len(p):] if r.startswith(p) else r.replace(p, "")
        for p, r in zip(batch_prompts, responses)
    ]
    translations.extend(cleaned)

In [None]:
# Ensure the output folder exists (no error if it is already there).
folder = os.path.dirname(translations_path)
os.makedirs(folder, exist_ok=True)

# Assemble one row per prompt (source, reference, generated translation)
# and write the table to CSV without the index column.
translations_df = pd.DataFrame(
    dict(sources=sources, references=references, translations=translations)
)
translations_df.to_csv(translations_path, index=False)