In [1]:
%%capture
!pip install transformers[torch]
!pip install peft
!pip install datasets
!pip install bitsandbytes
!pip install tqdm
!pip install einops
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import PeftModel
from tqdm.notebook import tqdm

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# Later cells load models with Hub authentication enabled, so this must run first.
from huggingface_hub import notebook_login
notebook_login()

In [3]:
# Quantization settings for loading the base model through bitsandbytes.
# Only the 8-bit flag is set: the `bnb_4bit_*` options apply solely to
# 4-bit loading and have no effect when `load_in_8bit=True`.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
base_model_name = "microsoft/phi-2"

# Load the base checkpoint using the quantization settings in `bnb_config`.
# device_map="auto" leaves weight placement to the library.
# NOTE(review): trust_remote_code=True permits running model code shipped in
# the Hub repository — keep only if the repository is trusted.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True,
    token=True,  # current spelling of the deprecated `use_auth_token=True`
)

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Wrap the quantized base model with the fine-tuned PEFT adapter weights.
model = PeftModel.from_pretrained(
    base_model,
    "USERNAME/MODEL",  # looks like a placeholder repo id — replace with the real adapter
    token=True,  # current spelling of the deprecated `use_auth_token=True`
)

In [11]:
def make_prompt(entry):
    """Wrap `entry` in the instruction/response template used for inference.

    Args:
        entry: The instruction text (any value that can be formatted into a string).

    Returns:
        The string "Instruct:<entry>" followed by a newline and "Output:".
    """
    prompt = f"Instruct:{entry}\nOutput:"
    return prompt

In [12]:
def run_model(entry, max_new_tokens=100):
    """Generate the model's reply to `entry` and return it as text.

    Args:
        entry: Instruction text; wrapped by `make_prompt` before tokenizing.
        max_new_tokens: Upper bound on the number of newly generated tokens.
            Defaults to 100.

    Returns:
        The decoded continuation only: the tokens produced after the prompt,
        with special tokens stripped.
    """
    # Send the inputs to wherever the model lives instead of assuming "cuda".
    model_input = tokenizer(
        make_prompt(entry),
        return_tensors="pt",
    ).to(model.device)

    # Prompt length in tokens, used to slice the prompt off the output.
    input_length = model_input["input_ids"].shape[1]

    model.eval()
    with torch.no_grad():
        full_tokens = model.generate(
            **model_input,
            max_new_tokens=max_new_tokens,
            # Explicit pad id avoids generate() falling back to EOS with a warning.
            pad_token_id=tokenizer.eos_token_id,
        )[0]

    return tokenizer.decode(full_tokens[input_length:], skip_special_tokens=True)

In [None]:
# Smoke-test the loaded adapter with a short greeting prompt.
run_model("Hello! How are you doing?")