In [18]:
!pip install transformers -q
!pip install accelerate -q
!pip install bitsandbytes -q
!pip install peft -q

In [19]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

In [20]:
import os
import json
import sys
import io
import random
import itertools
from typing import Any, Dict, List, Optional, Tuple, Union
import shutil
import logging
from dataclasses import dataclass, field

import torch
import torch.optim
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer
from transformers import TrainingArguments, Trainer, TrainerCallback
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config
from transformers import HfArgumentParser

import accelerate, bitsandbytes

from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType,PeftModel, PeftConfig

In [21]:
import numpy as np

# Seed shared by every random number generator used in the notebook.
random_state = 42

# JSONL file with the instruction samples used for fine-tuning.
dataset_path = "instructions_fred.jsonl"

def set_seed(seed=random_state):
    """Seed Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator. Defaults to ``random_state``.

    Side effects:
        When CUDA is available, also seeds all CUDA devices and switches cuDNN
        to deterministic mode with benchmarking disabled. Writes the seed to the
        ``PYTHONHASHSEED`` environment variable.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # TensorFlow is never imported in this notebook, so the former
    # `tf.random.set_seed` call (a guaranteed NameError) has been removed.
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    # Hash randomization is fixed at interpreter start-up, so this only
    # influences Python processes launched after this point.
    os.environ["PYTHONHASHSEED"] = str(seed)

In [22]:
def load_samples(dataset_path, pre_prompt, tokenizer,
                 input_token_limit=712, output_token_limit=712):
    """Read a JSONL dataset and turn every record into a tokenized training sample.

    Each non-empty line must be a JSON object with the string fields
    ``'context'`` and ``'response'``. Lines that are blank are skipped silently;
    lines that cannot be parsed or processed are reported and skipped, so a
    single bad record no longer aborts the whole load.

    Args:
        dataset_path: Path to the UTF-8 encoded JSONL file.
        pre_prompt: Text inserted right after the ``<SC1>`` prefix of every prompt.
        tokenizer: Object exposing ``encode(text, add_special_tokens=..., ...)``
            that returns a list of token ids.
        input_token_limit: Exclusive upper bound on the number of prompt tokens.
        output_token_limit: Exclusive upper bound on the number of reply tokens.

    Returns:
        A list of dicts with the keys ``'input_tokens'``, ``'output_tokens'``,
        ``'seed'`` (prompt text) and ``'reply'`` (target text). Samples whose
        prompt or reply reaches its limit are left out.
    """
    samples = []
    with open(dataset_path, 'r', encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            try:
                sample = json.loads(line)
                seed = '<SC1>' + pre_prompt + 'Ваш прошлый диалог: ' + sample['context'] + ' Твой ответ: <extra_id_0>'
                reply = '<extra_id_0>' + sample['response']
                input_tokens = tokenizer.encode(seed, add_special_tokens=False, truncation=True, max_length=1024)
                output_tokens = tokenizer.encode(reply, add_special_tokens=False)
                if len(input_tokens) < input_token_limit and len(output_tokens) < output_token_limit:
                    samples.append({'input_tokens': input_tokens,
                                    'output_tokens': output_tokens,
                                    'seed': seed,
                                    'reply': reply})
            except Exception as ex:
                # Best-effort loading: report the offending line and keep going.
                print(f'Skipping line {line_no} of {dataset_path}: {ex!r}')

    return samples


class FinetuneDataset(Dataset):
    """Map-style dataset of pre-tokenized prompt/reply pairs.

    Every item is padded to the longest prompt and the longest reply seen in
    the whole dataset, so all items share the same shapes.
    """

    def __init__(self, samples, tokenizer):
        """Store the samples and record the padding lengths.

        Args:
            samples: Iterable of dicts with the keys ``'input_tokens'`` and
                ``'output_tokens'`` (lists of token ids).
            tokenizer: Object whose ``encode`` maps ``'<s>'``, ``'</s>'`` and
                ``'<pad>'`` to their token ids.
        """
        self.tokenizer = tokenizer
        self.max_input_len = 0
        self.max_output_len = 0
        self.samples = []

        self.bos_token_id = tokenizer.encode('<s>', add_special_tokens=False)[0]
        self.eos_token_id = tokenizer.encode('</s>', add_special_tokens=False)[0]
        self.pad_token_id = tokenizer.encode('<pad>', add_special_tokens=False)[0]

        for sample in samples:
            input_ids = sample['input_tokens']
            # Every target sequence is terminated with the end-of-sequence token.
            output_ids = sample['output_tokens'] + [self.eos_token_id]
            self.samples.append((input_ids, output_ids))
            self.max_input_len = max(self.max_input_len, len(input_ids))
            self.max_output_len = max(self.max_output_len, len(output_ids))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, index: int):
        """Return the padded tensors for one sample.

        Returns:
            A dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``,
            each a ``torch.LongTensor``. Prompt padding uses the pad token id
            with a zero attention mask; label padding uses ``-100``.
        """
        input_ids, output_ids = self.samples[index]

        input_npad = self.max_input_len - len(input_ids)
        attention_mask = [1] * len(input_ids) + [0] * input_npad
        input_ids = input_ids + input_npad * [self.pad_token_id]

        output_npad = self.max_output_len - len(output_ids)
        labels = output_ids + output_npad * [-100]

        # The mask is returned as a tensor like the other fields: a plain list
        # would be collated element-wise by PyTorch's default DataLoader collate
        # instead of being stacked into a (batch, length) tensor.
        return {'input_ids': torch.LongTensor(input_ids),
                'attention_mask': torch.LongTensor(attention_mask),
                'labels': torch.LongTensor(labels),
                }


In [23]:
import torch, gc

def empty_cache():
    """Run the garbage collector, empty PyTorch's CUDA cache and print a confirmation."""
    # Collect unreachable Python objects first so the memory they hold can be released.
    gc.collect()
    # Then hand the unused cached CUDA memory back.
    torch.cuda.empty_cache()
    print('Cache cleared!')

In [24]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Directory the tokenizer and the model weights are loaded from.
model_path = '/home/ubuntu/checkpoint_models/checkpoint-400'

# The tokenizer is read from the same directory as the model, using the GPT-2 tokenizer class.
tokenizer = transformers.GPT2Tokenizer.from_pretrained(model_path)
# Load the T5 model with 8-bit loading enabled and automatic device placement.
model = transformers.T5ForConditionalGeneration.from_pretrained(model_path, load_in_8bit=True, device_map="auto")

# Register the special tokens used by FinetuneDataset and by generation.
# The call is the cell's last expression, so its return value is displayed (recorded output: 0).
tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<pad>'})

0

In [25]:
# Instruction text that load_samples inserts right after the '<SC1>' prefix of every training prompt.
pre_prompt = 'Расскажи что-нибудь об альфа банке'

In [26]:
# Read and tokenize the JSONL records, keeping only those that pass the length filter.
train_samples = load_samples(dataset_path, tokenizer = tokenizer, pre_prompt = pre_prompt)
# Wrap the tokenized samples in the padded dataset consumed by the Trainer.
train_dataset = FinetuneDataset(train_samples, tokenizer)

In [27]:
# Number of samples that survived the length filter in load_samples.
len(train_dataset)

270

In [None]:
# Directory the Trainer writes its checkpoints to.
# NOTE(review): `checkpoint_path` was not defined anywhere in the notebook, so this
# cell failed with a NameError on a fresh kernel. The value below is inferred from
# the '.../checkpoint_models/checkpoint-400' directory used elsewhere in the
# notebook - confirm it is the intended output location.
checkpoint_path = '/home/ubuntu/checkpoint_models'

# LoRA adapters of rank 16 on the modules named "q" and "v".
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# Prepare the 8-bit model for training, then wrap it with the LoRA adapters.
peft_model = get_peft_model(prepare_model_for_int8_training(model), lora_config)
peft_model.print_trainable_parameters()

peft_training_args = TrainingArguments(
    fp16=False,
    output_dir=checkpoint_path,
    overwrite_output_dir=False,
    optim="adafactor",
    learning_rate=1e-3,
    weight_decay=0,
    lr_scheduler_type='constant',
    num_train_epochs=100,
    # 8 samples per step x 8 accumulation steps = 64 samples per optimizer update per device.
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    gradient_checkpointing=False,
    logging_strategy='steps',
    logging_steps=1,
    # A checkpoint is written every 50 steps and none are deleted.
    save_steps=50,
    save_total_limit=None,
    seed=42,
)

In [None]:
# Build the Trainer for the LoRA-wrapped model.
# data_collator=None leaves the choice of collator to the Trainer.
trainer = Trainer(
        model=peft_model,
        args=peft_training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=None,
    )

In [25]:
# Run the fine-tuning loop; the per-step training loss is logged below the cell.
train_result = trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,6.1075
2,5.1753
3,5.2758
4,4.832
5,4.5296
6,4.4373
7,4.2789
8,5.1698
9,4.0978
10,3.5957




In [28]:
# Directory the trained adapter is loaded from.
peft_model_id = '/home/ubuntu/checkpoint_models/checkpoint-400'

# Attach the adapter stored in peft_model_id to `model` and switch to evaluation mode.
# NOTE(review): `model` is the same object the training cell passes to get_peft_model;
# on a full top-to-bottom run it may already carry LoRA layers - confirm that a
# freshly loaded base model is what should be wrapped here.
peft_trained_model = PeftModel.from_pretrained(model, peft_model_id, device_map="auto", torch_dtype=torch.float16)
peft_trained_model.eval()


print("Peft model loaded")

Peft model loaded


In [None]:
# Interactive chat with the fine-tuned model.
# The outer loop starts a new dialog; the inner loop handles one turn at a time.
# An empty line ends the current dialog and starts a fresh one; interrupt the
# kernel to leave the chat.
while True:
    print('-'*80)
    dialog = []
    while True:
        # Check for empty input BEFORE adding the speaker prefix: with the prefix
        # already prepended the message was never empty and the break was unreachable.
        user_text = input('Собеседник: ').strip()
        if len(user_text) == 0:
            break

        msg = 'Собеседник: ' + user_text
        dialog.append(msg)
        # The whole dialog so far is sent as context on every turn.
        prompt = '<SC1>'+ 'Привет.' +'Ваш прошлый диалог:' + '\n'.join(dialog) + ' Твой ответ:<extra_id_0>'
        input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
        out_ids = peft_trained_model.generate(input_ids=input_ids,
                                    max_length=250,
                                    eos_token_id=tokenizer.eos_token_id,
                                    early_stopping=True,
                                    do_sample=True,
                                    temperature=0.8,
                                    top_k=50,
                                    top_p=0.85)
        # Skip the first generated id, then remove the sentinel token from the text.
        t5_output = tokenizer.decode(out_ids[0][1:]).replace('<extra_id_0>','')
        # Keep only the text before the end-of-sequence marker.
        if '</s>' in t5_output:
            t5_output = t5_output[:t5_output.find('</s>')].strip()

        print('Ответ: {}'.format(t5_output))
        dialog.append('Ответ: '+t5_output)

--------------------------------------------------------------------------------


Собеседник:  какие пакеты решений есть в альфа банке?


Ответ: В любом отделении Альфа Банка вы можете выбрать Пакет решений Smart Smart Gold или Smart Platinum. В зависимости от выбранного Пакета Вам будут доступны: Smart Classic, Smart Gold, Smart Platinum, Smart Black.


Собеседник:  Скинь ссылку на пакет Classic 


Ответ: Ссылка - https://www.alfabank.by/besmart/alfa-classic/
