In [None]:
!pip install gradio
!pip install peft
!pip install bitsandbytes
!pip install trl



In [None]:
import gradio as gr
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
import torch

# Hub repository id handed to AutoModelForCausalLM.from_pretrained.
MODEL_NAME = "Oillim/Mistral-7b-vnpara"

# Prefer the first CUDA GPU; fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"

In [None]:
def create_model_and_tokenizer():
    """Load the quantized causal LM and the tokenizer used with it.

    The model weights come from ``MODEL_NAME`` and are loaded in 4-bit NF4
    with float16 compute; ``device_map="auto"`` lets the loader decide where
    the weights are placed. The tokenizer is taken from the
    ``mistralai/Mistral-7B-Instruct-v0.2`` repository, with the EOS token
    reused as the padding token and padding applied on the right.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    quant_cfg = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        load_in_4bit=True,
    )

    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run locally -- confirm this checkpoint actually needs it.
    load_kwargs = {
        "use_safetensors": True,
        "quantization_config": quant_cfg,
        "trust_remote_code": True,
        "device_map": "auto",
    }
    llm = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

    tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
    # Reuse EOS as the padding token and pad on the right-hand side.
    tok.padding_side = "right"
    tok.pad_token = tok.eos_token

    return llm, tok

model, tokenizer = create_model_and_tokenizer()
# This notebook only runs inference, so keep the key/value cache enabled:
# with it disabled, every generated token recomputes attention over the whole
# prefix. (Turning the cache off is a training-time setting.)
model.config.use_cache = True

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def demo(sentence_1, sentence_2):
    """Ask the model whether two sentences have the same meaning.

    Formats both sentences into the "### Question / ### Answer" prompt,
    generates up to 10 new tokens with the module-level ``model`` and
    ``tokenizer``, and maps the generated text to a verdict.

    Args:
        sentence_1: First sentence to compare.
        sentence_2: Second sentence to compare.

    Returns:
        ``"No"`` if the generated answer contains the word "no" or "not",
        otherwise ``"Yes"``.
    """
    PROMPT_TEMPLATE = "### Question:\n{instruction}\n\n### Answer:"
    instruction = f"""Do these two sentences have the same meaning?
Sentence 1: {sentence_1}
Sentence 2: {sentence_2}
Answer with "Yes" or "No"."""
    input_prompt = PROMPT_TEMPLATE.format_map({"instruction": instruction})

    encoded = tokenizer(input_prompt, return_tensors="pt")
    # Send the inputs to wherever the model actually lives instead of
    # assuming a CUDA device is present.
    target_device = model.device
    prompt_ids = encoded["input_ids"].to(target_device)

    # NOTE(review): sampling is enabled, so repeated calls with the same
    # sentences may return different verdicts.
    outputs = model.generate(
        inputs=prompt_ids,
        attention_mask=encoded["attention_mask"].to(target_device),
        do_sample=True,
        temperature=1.0,
        top_k=50,
        top_p=0.9,
        max_new_tokens=10,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on "Answer:" picks the wrong span when a user sentence
    # itself contains that marker.
    generated_ids = outputs[0][prompt_ids.shape[-1]:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).lower()

    # Match whole words: a plain substring test would also fire on words
    # such as "know" or "another".
    words = (word.strip(".,!?;:\"'()[]#*") for word in answer.split())
    if any(word in ("no", "not") for word in words):
        return "No"
    return "Yes"

In [None]:
# Two free-text inputs (one per sentence) feed `demo`; its verdict is shown
# as plain text.
iface = gr.Interface(
    fn=demo,
    inputs=["text", "text"],
    outputs="text",
)
# Start the web UI for the interface.
iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://fc92c7b88979e45456.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


