In [None]:
from utils import *
import whisper
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import BitsAndBytesConfig

import os

from huggingface_hub.hf_api import HfFolder

# Take the Hugging Face access token from the HF_TOKEN environment variable
# instead of hard-coding it in the notebook, so the secret is never committed.
hf_token = os.environ.get("HF_TOKEN", "").strip()
if hf_token:
    HfFolder.save_token(hf_token)
# When HF_TOKEN is unset nothing is written: saving an empty string would
# overwrite a previously cached login and break access to the model repo.

In [3]:
# Speech-to-text model: the "tiny.en" Whisper checkpoint.
audio_model = whisper.load_model(name="tiny.en")

In [None]:
# Hugging Face Hub repository id of the language model; the same id is used
# below to load both the tokenizer and the causal-LM weights.
llm_model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

In [None]:
# Quantisation settings: 4-bit weights using the "nf4" quant type with double
# quantisation enabled, and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# The tokenizer and the model weights come from the same Hub repository.
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)

# device_map="auto" leaves device placement of the weights to the library.
llm_model = AutoModelForCausalLM.from_pretrained(
    llm_model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

' Book an appointment with Dr. John on Monday at 10am.'

In [None]:
# Bundle the loaded model and its tokenizer into a text-generation pipeline.
llm_pipeline = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=llm_model,
)

In [None]:
from datetime import date

# The prompt asks for an absolute YYYY-MM-DD date, but spoken requests usually
# contain relative ones ("on Monday", "tomorrow"). The model can only resolve
# those if it is told the current date, so it is injected here.
# Note: the date is captured when this cell runs; re-run the cell to refresh it.
today = date.today()

system_prompt = f"""
You are an intelligent assistant that extracts appointment details from natural language.

Today's date is {today:%Y-%m-%d} ({today:%A}). Resolve relative dates such as "tomorrow" or "on Monday" against it.

Given the user's message, extract and return a JSON object with the following fields:
- intent: (schedule, cancel, reschedule)
- person: (name of the person involved)
- date: (in YYYY-MM-DD format)
- time: (in 24-hour HH:MM format)
- purpose: (meeting, call, appointment, etc.)

Only return the JSON. Do not include any explanations. Give the answer only once.
"""

In [None]:
# Relative path of the sample recording to transcribe.
audio_file = "test/test1.wav"
# transcribe_audio is not defined in this notebook; presumably it is provided
# by the `from utils import *` star import (TODO confirm). It is given the
# Whisper model and the audio path.
transcribed_audio = transcribe_audio(audio_model, audio_file)
# Prefix the transcript so it is clearly marked as the user's message.
user_prompt = f"User message: {transcribed_audio}"

In [None]:
# Show the user prompt so the transcription can be checked before generation.
user_prompt

In [None]:
# The configured model is a Llama 3 Instruct checkpoint, which does not use the
# Llama 2 style "[INST] <<SYS>>" markup. Render the conversation with the
# tokenizer's own chat template so the prompt matches the format the loaded
# model expects (and keeps matching if llm_model_name is changed).
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # keep `prompt` a plain string, as before
    add_generation_prompt=True,  # end with the assistant header so the model answers
)

# A rendered chat template already starts with the BOS token, and tokenizing a
# plain-string prompt with special tokens enabled would add a second one.
# Drop the templated BOS so the model sees exactly one.
bos_token = tokenizer.bos_token
if bos_token and prompt.startswith(bos_token):
    prompt = prompt[len(bos_token):]

In [None]:
# do_sample=False disables sampling (greedy decoding); max_new_tokens caps the
# length of the answer. return_full_text=False makes "generated_text" contain
# only the newly generated continuation. By default the pipeline echoes the
# entire prompt in front of it, so `response` would not be just the JSON.
response = llm_pipeline(
    prompt,
    max_new_tokens=256,
    do_sample=False,
    return_full_text=False,
)[0]["generated_text"]

In [None]:
# Display the text produced by the generation step above.
response