## Link to Google Colab

[Open in Google Colab](https://colab.research.google.com/drive/1JG6L7CsB081Zf9e6AQ5zim7F6vEJflbO?usp=sharing)

---

**Note:** Loading the gated Meta Llama models failed for me even though my Hugging Face account had been granted access. Passing the token explicitly through the `token=` argument of `from_pretrained` made it work:

```python
tokenizer = AutoTokenizer.from_pretrained(LLAMA, token=hf_token)
response = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
```


In [None]:
!pip install -q --upgrade bitsandbytes accelerate transformers==4.57.6

In [None]:
# imports

import gradio as gr
from IPython.display import  display, update_display
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch

In [None]:
# Constants

# Hugging Face Hub repo id of the checkpoint; the tokenizer and the model are
# both loaded from this id further down, each with an explicit access token.
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"

In [None]:
# Authenticate with the Hugging Face Hub.
# The token is read from Colab's `userdata` under the key HF_TOKEN and is kept
# in `hf_token` so later `from_pretrained` calls can pass it explicitly.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)



In [None]:
# System prompt placed at the start of every conversation: it sets the assistant's
# role (decision support for community pharmacists) and the checks it must perform.
# Spelling is corrected here because this text is sent verbatim to the model.
system_prompt = """You are a helpful assistant for pharmacists in a community pharmacy.
 You are able to review prescriptions and recommend to the pharmacist whether to dispense or not.
 You recommend to the pharmacist some safety measures like "contact prescriber" if there is an issue with the prescription.
 Always consider drug-drug interactions and call them out.
  """


In [None]:
# Quantization settings handed to the model loader:
#   - weights loaded in 4-bit, using the "nf4" quantization type
#   - double quantization enabled
#   - bfloat16 as the compute dtype
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Tokenizer for the checkpoint; its EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(LLAMA, token=hf_token)
tokenizer.pad_token = tokenizer.eos_token

# Model weights, loaded with the quantization config defined earlier and
# device_map="auto" for device placement.
llm = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    token=hf_token,
    quantization_config=quant_config,
    device_map="auto",
)

In [None]:
def chat(message, history):
    """Generate the assistant's reply to ``message`` for the chat UI.

    Args:
        message: The latest user message.
        history: Earlier turns as a list of ``{"role": ..., "content": ...}``
            dicts; they are placed between the system prompt and ``message``.

    Returns:
        The newly generated text only: prompt tokens are sliced off and
        special tokens are skipped while decoding. Generation is capped at
        300 new tokens. The ``TextStreamer`` additionally writes the text to
        stdout while it is being produced.
    """
    messages = (
        [{"role": "system", "content": system_prompt}]
        + history
        + [{"role": "user", "content": message}]
    )
    # add_generation_prompt=True ends the prompt with the assistant header so the
    # model answers as the assistant instead of continuing the user's turn.
    # return_dict=True also yields the attention mask, which generate() needs to
    # be unambiguous here because the pad token is the same as the EOS token.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(llm.device)  # follow the model's device rather than hard-coding "cuda"
    streamer = TextStreamer(tokenizer)
    outputs = llm.generate(**inputs, max_new_tokens=300, streamer=streamer)
    # Decode only the newly generated tokens and skip special tokens
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [None]:

# Build a Gradio chat interface around `chat` (configured with type="messages")
# and start it.
gr.ChatInterface(
    type="messages",
    fn=chat,
).launch()

