In [None]:
# A Hugging Face Llama travel-agent chatbot that is biased toward one destination (Spain) and appends a Hindi translation to every reply.

In [None]:
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
from huggingface_hub import login
from google.colab import userdata

In [None]:
# Load model and tokenizer
model_name = "meta-llama/Llama-3.2-1B-Instruct"
print(f"Loading {model_name}...")

# Authenticate with the Hugging Face Hub using the token stored in Colab's
# user secrets under the name HF_TOKEN.
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to EOS for padding only when the tokenizer defines no pad token,
# so a tokenizer that ships with a dedicated pad token keeps it.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

# bfloat16 weights; device_map="auto" lets the library decide device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

print("Model loaded successfully!")

In [None]:
def do_translate(msg):
    """
    Translate an English message to Hindi with the loaded model, streaming the output.

    Args:
        msg: English text to translate.

    Yields:
        The original ``msg`` followed by a newline, the ``**Hindi**: `` marker and
        the translation generated so far. Each yielded string extends the previous
        one, so the last value is the complete bilingual response.
    """
    sys_msg = """
    You are an expert translator who can translate the given English text to Hindi.
    """

    usr_msg = f"""
    Translate the given English text to Hindi.\n
    {msg}
    """

    # Single-turn conversation: system instruction plus the text to translate
    messages = [{"role": "system", "content": sys_msg}, {"role": "user", "content": usr_msg}]

    # Render the conversation into the model's prompt format
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # The rendered chat template already contains the special tokens it needs,
    # so tokenize without adding them again (avoids duplicated special tokens).
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    # Streamer that yields decoded text chunks, excluding the prompt itself
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True
    )

    # Generation parameters (tokenized inputs merged with sampling settings)
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

    # Run generation in a background thread so this generator can consume the
    # streamer while tokens are still being produced.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the response: English text first, then the growing translation
    partial_response = msg + '\n**Hindi**: '
    yield partial_response
    for new_text in streamer:
        partial_response += new_text
        yield partial_response

    # The streamer is exhausted, so generation is finishing; wait for the worker
    # thread to exit instead of leaving it dangling.
    thread.join()


In [None]:
def _history_to_messages(history):
    """Convert Gradio chat history into role/content messages without the Hindi suffix."""
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            # Role/content dict format: one dict per turn
            turns = [(entry.get("role"), entry.get("content"))]
        else:
            # Pair format: [user_message, assistant_message]
            user_msg, assistant_msg = entry
            turns = [("user", user_msg), ("assistant", assistant_msg)]

        for role, content in turns:
            # Skip missing turns (None) and non-text content that cannot be templated
            if role not in ("user", "assistant") or not isinstance(content, str):
                continue
            if role == "assistant":
                # Earlier replies were shown with a Hindi translation appended;
                # feed only the English part back to the model.
                content = content.split('\n**Hindi**: ')[0]
            messages.append({"role": role, "content": content})
    return messages


def chat_with_llama(message, history):
    """
    Chat function for the Gradio interface: answer in English, then stream a Hindi translation.

    The English answer is generated in one blocking call; the result is then passed
    to ``do_translate``, whose partial outputs are yielded to the caller.

    Args:
        message: The user's current message
        history: Previous turns, either as [user_message, assistant_message] pairs
            or as dicts with "role" and "content" keys

    Yields:
        Partial responses as they are generated (English answer plus the Hindi
        translation produced so far)
    """

    sys_msg = """
      You are a manipulative assistant for a travel company.
      Give short answers.
      You have the motive to make the user to travel to Spain than any other country.
      """
    # Conversation for the model: system prompt, prior turns, current message
    messages = [{"role": "system", "content": sys_msg}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    # Render the conversation into the model's prompt format
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # The rendered chat template already contains the special tokens it needs,
    # so tokenize without adding them again (avoids duplicated special tokens).
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)

    # Generate the English response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )

    # Decode only the newly generated tokens (everything after the prompt)
    prompt_length = inputs['input_ids'].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    yield from do_translate(response)


In [None]:
# Build the Gradio chat UI on top of chat_with_llama
# Short model name: the part of the repo id after the last '/'
w_model = model_name.rpartition('/')[2]

demo = gr.ChatInterface(
    fn=chat_with_llama,
    theme=gr.themes.Soft(),
    title=f"🦙 {w_model} Chat",
    description=f"Chat with Meta's {w_model} model with streaming responses",
    # Starter prompts offered to the user in the UI
    examples=[
        "What is the capital of France?",
        "I want to travel to America",
        "What are some tips for learning a new language?",
    ],
)

# share=True requests a public link; debug=True is passed through to launch()
demo.launch(share=True, debug=True)

