<a href="https://colab.research.google.com/github/ekaterinatao/house_md_tg_bot/blob/main/generative/NLP_2_HW2_house_md_bot_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Код запускается корректно только на `GPU`

In [None]:
!pip install trl transformers ftfy gradio accelerate > 0.20.1 git+https://github.com/huggingface/peft.git -Uqqq
!pip install bitsandbytes einops datasets wandb -Uqqq
!pip install intel-extension-for-transformers -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m195.4/195.4 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.8/258.8 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.2/44.2 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m78.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import random
from tqdm.auto import tqdm, trange
from dataclasses import dataclass
import datasets
import numpy as np
import pandas as pd
import torch
from transformers import (AutoTokenizer, AutoModel,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          GenerationConfig)
from peft import get_peft_model, PeftConfig, PeftModel, LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer
import gradio as gr
import warnings
warnings.filterwarnings("ignore")

In [None]:
@dataclass
class TrainingConfig:
    """Settings shared by the notebook cells.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field: values can be overridden through the constructor
    (``TrainingConfig(seed=1)``) and show up in ``repr`` and equality checks.
    Without annotations the decorator sees no fields at all.

    The cells visible in this notebook read only ``seed``, ``model_id``,
    ``checkpoint`` and ``device``; the remaining values are presumably
    consumed by the training notebook -- confirm before changing them.
    """

    # Seed applied to the Python, NumPy and torch random number generators.
    seed: int = 64
    # Hugging Face Hub identifiers.
    dataset: str = 'ekaterinatao/house_md_context3'
    model_id: str = "PY007/TinyLlama-1.1B-step-50K-105b"
    checkpoint: str = "ekaterinatao/house-md-tynyLlama"
    batch_size: int = 8
    # Resolved once, when the class body is executed.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    # Optimisation hyperparameters.
    learning_rate: float = 2e-3
    num_epochs: int = 1
    weight_decay: float = 0.001
    gradient_accumulation_steps: int = 2
    optim: str = "paged_adamw_32bit"
    max_grad_norm: float = 0.3
    max_steps: int = 1000
    warmup_ratio: float = 0.03
    lr_scheduler_type: str = "constant"
    # LoRA adapter hyperparameters.
    lora_alpha: int = 32
    lora_dropout: float = 0.05
    lora_rank: int = 32


config = TrainingConfig()

In [None]:
# Reproducibility: push one seed into every random number generator in use.
seed = config.seed

# Same call order as before: Python `random`, NumPy, torch, torch CUDA.
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed)

# cuDNN flags: ask for deterministic kernels and switch off benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Show which device the notebook selected and, when CUDA is present,
# the name of the GPU.
cuda_ready = torch.cuda.is_available()
print(f'device is {config.device}')
if cuda_ready:
    gpu_name = torch.cuda.get_device_name()
    print(gpu_name)

device is cuda
Tesla T4


# Inference

In [None]:
# Adapter metadata from the Hub; it records which base model the adapter
# checkpoint was trained on.
peft_config_eval = PeftConfig.from_pretrained(config.checkpoint)

# 4-bit NF4 quantisation with double quantisation for the base weights.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    peft_config_eval.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Attach the fine-tuned adapter weights. Loading only the base model (as the
# previous version did) silently runs inference without any fine-tuning.
trained_model = PeftModel.from_pretrained(base_model, config.checkpoint)
trained_model.eval()

tokenizer = AutoTokenizer.from_pretrained(
    config.model_id,
    # Inference prompts must not end with EOS: otherwise the model is asked to
    # continue *after* end-of-sequence, and because pad == eos below,
    # `generate` reports "right-padding was detected".
    add_eos_token=False,
    trust_remote_code=True,
    padding_side='left'
)
# Reuse EOS as the padding token. No separate '[PAD]' token is added: it
# would enlarge the vocabulary beyond the embedding matrix of the loaded
# model while never being used as the pad token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def get_test_prompt(question, context=None):
    """Build the text prompt sent to the model for one user question.

    Args:
        question: The user's message; inserted after ``question: ``.
        context: Text inserted after ``context: ``. ``None`` is rendered as an
            empty string; any other value is formatted as-is.

    Returns:
        The prompt string, ending with ``answer:`` so that the model's
        continuation is the answer.
    """
    context_text = '' if context is None else context
    return (
        "[INST]"
        'Use the given context to guide your about answering the question\n'
        f"question: {question}\n"
        f"context: {context_text}\n"
        "answer:"
    )


def generate_answer(model):
    """Run an interactive console chat with the model.

    Messages are read with ``input()``. Each answer is printed, and the
    question is appended to the running context used in the next prompt.
    Type ``stop`` (case-insensitive, on its own) to end the conversation.

    Args:
        model: Object exposing ``generate(input_ids=..., attention_mask=...,
            generation_config=...)``, e.g. the loaded causal LM.

    Returns:
        The last generated answer, or an empty string if the chat was stopped
        before any answer was generated.
    """
    generation_config = GenerationConfig(
        max_new_tokens=50,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=2.0,
        eos_token_id=tokenizer.eos_token_id,
        # `temperature` only takes effect in sampling mode; without
        # `do_sample=True` decoding is greedy and the value is ignored.
        do_sample=True,
        temperature=0.3
    )
    question = input("Write your question to House MD:\n")
    context = ''
    result = ''

    # Exact match instead of a substring test, so that a message such as
    # "how do I stop coughing?" does not end the chat. The first message is
    # checked as well.
    while question.strip().lower() != "stop":
        prompt = get_test_prompt(question, context=context)
        encoding = tokenizer(prompt, return_tensors="pt").to(config.device)
        prompt_length = encoding.input_ids.shape[1]
        outputs = model.generate(
            input_ids=encoding.input_ids,
            attention_mask=encoding.attention_mask,
            generation_config=generation_config
        )
        # Decode only the newly generated tokens. Re-splitting the full text
        # on "answer:" and dropping two characters cut off the start of real
        # answers and broke when the answer itself contained a newline.
        new_tokens = outputs[0][prompt_length:]
        result = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        print(f"House MD: {result}\n")
        context += f"\n{question}"
        question = input("You:\n")

    return result

### Пример диалогов с сохранением контекста.  
Подбирались гиперпараметры (repetition_penalty, температура) и конструкции промптов. На некоторые вопросы отвечает на других языках (чаще китайский и испанский).  
Качество генерации среднее, на парамедицинские темы дает более осмысленные ответы, чем на общие вопросы.  
**Вывод:** требуется чистка датасета и добавление диалогов на разные тематики, что на данном этапе намеренно не было реализовано.  

In [None]:
# Start an interactive console dialogue; the recorded session follows below.
result = generate_answer(trained_model)

Write your question to House MD:
Hi How are you?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


House MD: aqui. I am very happy and excited for this new year! It is my first time in China so it will be an adventure of mine as well but also exciting because there's lots more things that can happen here than just

You:
I think new year was two months ago. But why are you in China? 


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


House MD: but i don't know how many years it is since last time we met! :) <issue_start><jupyter\_code>import pandas as pdfrom sklearn import preprocessing<p>df =pd['data']

You:
I do not understand you


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


House MD: but i don't know how many years it is since last time we met and now that my family has grown up a bit more than before... so maybe this will be an interesting topic for me! :) <issue_start><jup

You:
If you were House MD what would you advice me against fever and cold?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


House MD: I am very sorry for my bad English!   <reponame>john-mccormick/practical_python<filename>.github/.workflows--test/_default157896240

You:
stop


In [None]:
# Second recorded dialogue: a fresh call, so the context starts empty again.
result = generate_answer(trained_model)

Write your question to House MD:
If you were House MD what would you advice me against fever and cold?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


House MD: a warm bath or hot shower.  2) I'm not sure if this is appropriate for my situation but it seems like an acceptable answer in general since we are talking hypotheticals here (I don’t know how much

You:
I think hot shower is good idea


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


House MD: ot water. It's a very simple thing but it can be done in less than an hour if we have enough money for that kind of things like this! (I don`t know how much time he has)  201

You:
Are we going to take shower together?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


House MD: I'm not sure. But if it was a very warm day then maybe just stay in bed for an hour or two beforehand so that when they come out of their bathroom there are no bacteria around them (and hopefully also

You:
stop


### Для тестирования чат-бота с сохранением контекста запустить ячейку ниже
Для прекращения генерации напечатать `stop` в stdin

In [None]:
# Run this cell to chat with the bot; `result` holds the value returned by
# generate_answer once the chat ends.
result = generate_answer(trained_model)

# GRADIO DEMO
Не удалось реализовать код без использования GPU, поэтому демо с постоянным хостингом сделать не удалось (т.к. это возможно только платно).  
  
### Тестирование чат-бота
* При запуске следующей ячейки будет запущена временная демоверсия в `gradio` для тестирования чат-бота.
* Тест с сохранением контекста можно провести в ноутбуке, запустив ячейку выше с функцией `generate_answer`

In [None]:
def get_answer(
    question,
    model=trained_model
):
    """Generate a single answer for one message, without dialogue context.

    Used as the Gradio callback.

    Args:
        question: The user's message.
        model: Object exposing ``generate(input_ids=..., attention_mask=...,
            generation_config=...)``. Defaults to the model that is loaded
            when this function is defined.

    Returns:
        The generated answer text with surrounding whitespace removed.
    """
    generation_config = GenerationConfig(
        max_new_tokens=50,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=2.0,
        eos_token_id=tokenizer.eos_token_id,
        # `temperature` only takes effect in sampling mode; without
        # `do_sample=True` decoding is greedy and the value is ignored.
        do_sample=True,
        temperature=0.3
    )
    prompt = get_test_prompt(question)
    encoding = tokenizer(prompt, return_tensors="pt").to(config.device)
    prompt_length = encoding.input_ids.shape[1]
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config
    )
    # Decode only the newly generated tokens. Re-splitting the full text on
    # "answer:" and dropping two characters cut off the start of real answers
    # and broke when the answer itself contained a newline.
    new_tokens = outputs[0][prompt_length:]
    result = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return result


# Texts shown at the top of the demo page.
title = "HouseMD bot"
description = "Gradio Demo for bot. To use it, simply add your text message."

# One text box for the user's message, one for the model's reply.
message_box = gr.Textbox(label="Input message to House MD", lines=2)
answer_box = gr.Textbox(label="House MD's answer")

interface = gr.Interface(
    fn=get_answer,
    inputs=message_box,
    outputs=answer_box,
    title=title,
    description=description,
)

# debug=True keeps the cell running so that errors and logs stay visible;
# interrupt the cell to shut the server down.
interface.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Running on public URL: https://05ea13b9a9def4763d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7861 <> https://05ea13b9a9def4763d.gradio.live


