In [None]:
import os
import random
import torch
import pandas as pd
from tqdm import tqdm
from util import generate_eval_prompts, initialize_dfs
from peft import PeftModel, PeftConfig
from transformers import set_seed, AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from pprint import pprint
from torch.utils.data import DataLoader
from transformers import TextStreamer



In [2]:
# --- Reproducibility -------------------------------------------------------
seed = 1212

# --- Data splits -----------------------------------------------------------
# Kept fixed (together with the seed) so every run sees the same partition.
test_split = 0.1    # share of the whole corpus held out for testing (10%)
valid_split = 0.05  # share of the training portion used for validation (5%)

# --- Batching --------------------------------------------------------------
batch_size = 16

# --- Model and file locations ----------------------------------------------
model_name = "mistralai/Mistral-7B-v0.1"
cache_dir = os.path.expanduser("~/.cache/huggingface/")
model_checkpoint = os.path.expanduser("~/finetuning_mistral7b_v1/checkpoint-89")
translations_path = "translations/mistral7b_translations.csv"

In [3]:
# Apply the configured `seed` up front, before any data is split or any text is
# generated, so that repeated executions of the notebook are comparable.
set_seed(seed)

In [None]:
# Only the second frame returned by initialize_dfs (bound to df_test) is used
# below; the first one is discarded.
_, df_test = initialize_dfs(test=test_split)

# Two prompt flavours over the same frame: zero-shot, and one-shot with fuzzy=True.
zero_shot_refs, zero_shot_prompts = generate_eval_prompts(df_test, shots=0)
one_shot_refs, one_shot_prompts = generate_eval_prompts(df_test, shots=1, fuzzy=True)

# Zero-shot rows come first and one-shot rows second, in both sequences, so
# prompts[k] and references[k] stay aligned.
prompts = zero_shot_prompts + one_shot_prompts
references = zero_shot_refs + one_shot_refs

dataset = Dataset.from_dict(dict(prompts=prompts, references=references))
pprint(dataset)

Loading existing dataframe...
Dataframe loaded.
Split at index 10119.
['Chapter 34',
 'His fingers tweak one of my nipples, and I moan into yet another kiss.',
 'God, that felt so fucking good.',
 "But he wasn't good at that kind of thing.",
 "I don't only mean the millions of songs in the digital archive, someone in "
 "the station's history had hoarded vinyl like it was going out of style."]
['English: Chapter 34\nFrench: ',
 'English: His fingers tweak one of my nipples, and I moan into yet another '
 'kiss.\n'
 'French: ',
 'English: God, that felt so fucking good.\nFrench: ',
 "English: But he wasn't good at that kind of thing.\nFrench: ",
 "English: I don't only mean the millions of songs in the digital archive, "
 "someone in the station's history had hoarded vinyl like it was going out of "
 'style.\n'
 'French: ']
Dataset({
    features: ['prompts', 'references'],
    num_rows: 2250
})


In [5]:
# The adapter's own config records which base model it was trained on; load
# that base model first and attach the adapter on top of it afterwards.
peftconfig = PeftConfig.from_pretrained(model_checkpoint)

model_base = AutoModelForCausalLM.from_pretrained(
    peftconfig.base_model_name_or_path, device_map="auto", cache_dir=cache_dir
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    add_bos_token=True,
    add_eos_token=False,  # always False for inference
    # Prompts are generated in padded batches further down. A decoder-only
    # model continues from the last position of every row, so the padding has
    # to sit on the left to keep the real prompt tokens adjacent to the output.
    padding_side="left",
)

# Batched tokenization with padding=True raises if the tokenizer has no pad
# token; fall back to EOS, which is also what generation uses as pad_token_id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = PeftModel.from_pretrained(model_base, model_checkpoint)
print("Peft model loaded")

ValueError: Can't find 'adapter_config.json' at '/Users/grisha/finetuning_mistral7b_v1/checkpoint-89'

In [None]:
def generate_batch_responses(prompts, model, max_new_tokens=20):
    """Greedy-decode one continuation per prompt in a single padded batch.

    Uses the module-level ``tokenizer`` for both encoding and decoding.

    Args:
        prompts: List of prompt strings to complete.
        model: Causal language model exposing ``generate`` and ``device``.
        max_new_tokens: Upper bound on newly generated tokens per prompt.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        A list with one decoded string per prompt, special tokens removed.
        Each string is the full decoded sequence, i.e. the prompt text is still
        included in front of the generated continuation.
    """
    # padding=True raises when the tokenizer has no pad token; fall back to
    # EOS, which is also what generate() is told to use as pad_token_id below.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # A decoder-only model continues from the last position of each row, so
    # the padding must be on the left. The previous setting is restored so the
    # shared tokenizer is not left in a different state for other callers.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        encoded = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    finally:
        tokenizer.padding_side = previous_padding_side

    # Follow the model's own device instead of assuming CUDA is available.
    encoded = encoded.to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            min_new_tokens=1,
            do_sample=False,  # greedy decoding: deterministic output
            pad_token_id=tokenizer.eos_token_id
        )
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [None]:
translations = []
for i in tqdm(range(0, len(dataset), batch_size)):
    # Slice the rows first and pick the column afterwards, so only the current
    # batch is materialised rather than the whole "prompts" column each time.
    batch_prompts = dataset[i:i + batch_size]["prompts"]
    responses = generate_batch_responses(batch_prompts, model)
    # Each decoded response echoes its prompt before the continuation. Remove
    # it as a leading prefix so a later repetition of the same text inside the
    # continuation is left intact; keep the old replace-everywhere behaviour
    # only for responses that do not start with the prompt verbatim.
    cleaned = [
        r[len(p):] if r.startswith(p) else r.replace(p, "")
        for p, r in zip(batch_prompts, responses)
    ]
    translations.extend(cleaned)

# zip() silently truncates on a length mismatch, so verify that every prompt
# produced exactly one translation before the results are paired with references.
assert len(translations) == len(dataset), (
    f"expected {len(dataset)} translations, got {len(translations)}"
)

In [None]:
# to_csv does not create missing folders, so make sure the target directory
# exists first (skipped when the path has no directory component).
output_dir = os.path.dirname(translations_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# One row per evaluated prompt: the reference text next to the model's output.
translations_df = pd.DataFrame({
    "reference": references,
    "response": translations
})
translations_df.to_csv(translations_path, index=False)