In [None]:
import os
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Local Hugging Face download cache, with "~" expanded to the current user's home.
cache_dir = os.path.expanduser("~/.cache/huggingface/")

# Directory holding the fine-tuned PEFT adapter checkpoint (relative to the notebook).
model_checkpoint = "./finetuning_mistral7b_v2/checkpoint-89"

# Hub identifier used when loading the tokenizer.
model_name = "mistralai/Mistral-7B-v0.1"

In [None]:
# Tokenizer: prepend BOS to every prompt, never append EOS.
# NOTE(review): the tokenizer is loaded from `model_name`, while the weights below
# come from `peftconfig.base_model_name_or_path` -- confirm both name the same model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_bos_token=True,
    add_eos_token=False,  # always False for inference
    cache_dir=cache_dir,
)

# The adapter config stored in the checkpoint records which base model it was trained on.
peftconfig = PeftConfig.from_pretrained(model_checkpoint)

# Load that base model, letting `device_map="auto"` decide weight placement.
model_base = AutoModelForCausalLM.from_pretrained(
    peftconfig.base_model_name_or_path,
    cache_dir=cache_dir,
    device_map="auto",
)

# Wrap the base model with the adapter weights from the checkpoint.
new_model = PeftModel.from_pretrained(model_base, model_checkpoint)
print("Peft model loaded")

In [None]:
def generate_response(prompt, model, max_new_tokens=20):
  """Greedily generate a continuation of `prompt` and return only the new text.

  Uses the module-level `tokenizer` for encoding and decoding.

  Args:
    prompt: Text the model should continue.
    model: Object exposing `.device` and `.generate(...)`; in this notebook,
      the PEFT-wrapped causal LM.
    max_new_tokens: Upper bound on the number of generated tokens (default 20,
      the value that was previously hard-coded).

  Returns:
    The decoded continuation as a string, without the prompt and without
    special tokens such as BOS/EOS.
  """
  encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
  # Follow the model's own device instead of hard-coding 'cuda', so the same
  # function works wherever the model was actually placed.
  model_inputs = encoded_input.to(model.device)
  prompt_length = len(model_inputs["input_ids"][0])

  generated_ids = model.generate(**model_inputs,
                                 max_new_tokens=max_new_tokens,
                                 min_new_tokens=1,
                                 do_sample=False,  # greedy decoding
                                 pad_token_id=tokenizer.eos_token_id)

  # For a causal LM the returned sequence is prompt ids followed by new ids.
  # Cutting at the token level is more reliable than string-replacing the
  # prompt, which breaks when decode(encode(prompt)) != prompt and leaves
  # special-token text in the result.
  new_token_ids = generated_ids[0][prompt_length:]
  return tokenizer.decode(new_token_ids, skip_special_tokens=True)

In [None]:
# Prompt format: an "English:" line followed by an open "French: " line for the
# model to complete.
prompt = "English: Images raced through her mind as her daydreams took over\nFrench: "

# Run generation with the adapter-wrapped model and show the continuation.
response = generate_response(prompt=prompt, model=new_model)
print(f"Response: {response}")