In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import io
from peft import PeftModel, PeftConfig


from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [4]:
model = GPT2LMHeadModel.from_pretrained('ai-forever/rugpt3large_based_on_gpt2')

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

In [5]:
for name, param in model.named_parameters():
    print(name)

transformer.wte.weight
transformer.wpe.weight
transformer.h.0.ln_1.weight
transformer.h.0.ln_1.bias
transformer.h.0.attn.c_attn.weight
transformer.h.0.attn.c_attn.bias
transformer.h.0.attn.c_proj.weight
transformer.h.0.attn.c_proj.bias
transformer.h.0.ln_2.weight
transformer.h.0.ln_2.bias
transformer.h.0.mlp.c_fc.weight
transformer.h.0.mlp.c_fc.bias
transformer.h.0.mlp.c_proj.weight
transformer.h.0.mlp.c_proj.bias
transformer.h.1.ln_1.weight
transformer.h.1.ln_1.bias
transformer.h.1.attn.c_attn.weight
transformer.h.1.attn.c_attn.bias
transformer.h.1.attn.c_proj.weight
transformer.h.1.attn.c_proj.bias
transformer.h.1.ln_2.weight
transformer.h.1.ln_2.bias
transformer.h.1.mlp.c_fc.weight
transformer.h.1.mlp.c_fc.bias
transformer.h.1.mlp.c_proj.weight
transformer.h.1.mlp.c_proj.bias
transformer.h.2.ln_1.weight
transformer.h.2.ln_1.bias
transformer.h.2.attn.c_attn.weight
transformer.h.2.attn.c_attn.bias
transformer.h.2.attn.c_proj.weight
transformer.h.2.attn.c_proj.bias
transformer.h.2.ln_2

In [6]:
for name, param in model.named_parameters():
    param.requires_grad = False
    for template in ['.ln_f.', '.23.']:
        if template in name: # choose whatever you like here
            param.requires_grad = True
        
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

760,300,032 total parameters.
28,334,592 training parameters.


In [2]:
tokenizer = AutoTokenizer.from_pretrained('rccmsu/ruadapt_llama2_7b_v0.1')



tokenizer_config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/990k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.27M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/145 [00:00<?, ?B/s]

In [2]:
class Config:
    def __init__(self):
        self.actuals = []
        self.actuals_f = False

In [3]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        ctext = str(self.ctext[index])
        ctext = ' '.join(ctext.split())

        text = str(self.text[index])
        text = ' '.join(text.split())

        source = self.tokenizer.batch_encode_plus([ctext], max_length=self.source_len,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([text], max_length=self.summ_len, return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [4]:
from time import sleep
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        labels = y[:, 1:].clone().detach()
        labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)
        #outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=labels)
        outputs = model(input_ids = ids, attention_mask = mask, labels=labels)
        loss = outputs[0]
        
        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [5]:
import evaluate
rouge = evaluate.load("rouge")
 
def compute_metrics(decoded_preds,decoded_labels):
    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )
 
    return {k: round(v, 4) for k, v in result.items()}

In [6]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length = config.SUMMARY_LEN
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            predictions.extend(preds)
            if not config.actuals_f:
                target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]  
                actuals.extend(target)
        if not config.actuals_f:
            config.actuals_f = True
            config.actuals = actuals
        scores = compute_metrics(predictions, config.actuals)
        print(predictions[0])
    return predictions, config.actuals, scores

In [7]:
from tqdm.notebook import tqdm

device='cuda:0'
config = Config()         # Initialize config
config.MODEL_NAME = 'ai-forever/rugpt3large_based_on_gpt2'
config.TRAIN_BATCH_SIZE = 9    # input batch size for training (default: 64)
config.VALID_BATCH_SIZE = 1    # input batch size for testing (default: 1000)
config.TRAIN_EPOCHS = 100       # number of epochs to train (default: 10)
config.VAL_EPOCHS = 1 
config.LEARNING_RATE = 5e-5    # learning rate (default: 0.01)
config.SEED = 42               # random seed (default: 42)
config.MAX_LEN = 1024
config.SUMMARY_LEN = 256 

best = 0

# Set random seeds and deterministic pytorch for reproducibility
torch.manual_seed(config.SEED) # pytorch random seed
np.random.seed(config.SEED) # numpy random seed
torch.backends.cudnn.deterministic = True

# tokenzier for encoding the text
tokenizer = GPT2Tokenizer.from_pretrained(config.MODEL_NAME)

df = pd.read_excel('conv_dataset.xlsx')
df.columns = ['ctext', 'text']
#df = df[['text','ctext']]
df.ctext = '<s>user\n' + df.ctext + '</s>\n'
print(df.head())
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state = config.SEED)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, config.MAX_LEN, config.SUMMARY_LEN)

train_params = {
    'batch_size': config.TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': config.VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

model = GPT2LMHeadModel.from_pretrained(config.MODEL_NAME, device_map=device)
#model = model.to(device)
'''
for name, param in model.named_parameters():
    param.requires_grad = False
    for template in ['lm_head']:
        if template in name: # choose whatever you like here
            param.requires_grad = True
            
'''
        
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

optimizer = torch.optim.Adam(params =  model.parameters(), lr=config.LEARNING_RATE)

#wandb.watch(model, log="all")

print('Initiating Fine-Tuning for the model on our dataset')
try:
    for epoch in tqdm(range(config.TRAIN_EPOCHS)):
        train(epoch, tokenizer, model, device, training_loader, optimizer)
        #if epoch % 10 == 0:
        _, __, scores = validate(epoch, tokenizer, model, device, val_loader)
        print(scores)
        if scores['rouge2'] > best:
            best = scores['rouge2']
            model.save_pretrained("checkpoints//conv_rouge2_"+str(best).split('.')[-1])
            
except KeyboardInterrupt:
    pass
except Exception as e:
    print(e)
print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(config.VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    print('Output Files generated for review')
    



                                               ctext  \
0  <s>user\nДепозит: 6300рПолная стоимость: 19700...   
1  <s>user\nОплата - 16495р2202 2063 4522 7990Сбе...   
2  <s>user\nэто может заблокировать и нашу и вашу...   
3                              <s>user\nждем:)</s>\n   
4                <s>user\nПришлите чек оплаты)</s>\n   

                                                text  
0              угу\nздесь интересует вариант с двумя  
1  у меня банк с приколом, там нельзя отправить б...  
2  тогда пока что отмена, карты другого банка нем...  
3  вроде нормально, на карту можно без сообщения ...  
4  это подойдет или нужно где подробнее? (сумма с...  
FULL Dataset: (17989, 2)
TRAIN Dataset: (14391, 2)
TEST Dataset: (3598, 2)




760,300,032 total parameters.
760,300,032 training parameters.
Initiating Fine-Tuning for the model on our dataset


  0%|          | 0/100 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


stack expects each tensor to be equal size, but got [10] at entry 0 and [7] at entry 1
Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe


KeyboardInterrupt: 

In [1]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

MODEL_NAME = "rccmsu/ruadapt_llama2_7b_v0.1"
DEFAULT_MESSAGE_TEMPLATE = "<s>{role}\n{content}</s>\n"
DEFAULT_SYSTEM_PROMPT = "ты — миша, русскоязычный парень 24 лет живущий в москве. ты разговариваешь с людьми и подшучиваешь над ними."
DEFAULT_RESPONSE_TEMPLATE = "<s>bot\n"

class Conversation:
    def __init__(
        self,
        message_template=DEFAULT_MESSAGE_TEMPLATE,
        system_prompt=DEFAULT_SYSTEM_PROMPT,
        response_template=DEFAULT_RESPONSE_TEMPLATE
    ):
        self.message_template = message_template
        self.response_template = response_template
        self.messages = [{
            "role": "system",
            "content": system_prompt
        }]

    def add_user_message(self, message):
        self.messages.append({
            "role": "user",
            "content": message
        })

    def add_bot_message(self, message):
        self.messages.append({
            "role": "bot",
            "content": message
        })

    def get_prompt(self, tokenizer):
        final_text = ""
        for message in self.messages:
            message_text = self.message_template.format(**message)
            final_text += message_text
        final_text += DEFAULT_RESPONSE_TEMPLATE
        return final_text.strip()

def generate(model, tokenizer, prompt, generation_config):
    data = tokenizer(prompt, return_tensors="pt")
    data = {k: v.to(model.device) for k, v in data.items()}
    output_ids = model.generate(
        **data,
        max_length = 256,
        generation_config=generation_config
    )[0]
    output_ids = output_ids[len(data["input_ids"][0]):]
    output = tokenizer.decode(output_ids, skip_special_tokens=True)
    return output.strip()

In [2]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cuda:0",
    torch_dtype=torch.float16
)
model.to('cuda:0')
model.eval()

generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
print(generation_config)

conversation = Conversation()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2
}



In [3]:
from flask import Flask, jsonify, request
from flask_cors import CORS
app = Flask(__name__)


# Cross Origin Resource Sharing (CORS) handling
CORS(app, resources={'/captcha': {"origins": "http://localhost:8080"}})

<flask_cors.extension.CORS at 0x2181cb1cd50>

In [4]:
@app.route('/ai_bot', methods=['POST'])
def post_request():
    conversation.add_user_message(request.json['user_input'])
    prompt = conversation.get_prompt(tokenizer)

    output = generate(model, tokenizer, prompt, generation_config)
    return jsonify({'output':output})

In [None]:
app.run(host='0.0.0.0', port=8080)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8080
 * Running on http://192.168.1.34:8080
Press CTRL+C to quit
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
192.168.1.144 - - [28/Aug/2024 23:13:09] "POST /ai_bot HTTP/1.1" 200 -
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
192.168.1.144 - - [28/Aug/2024 23:14:01] "POST /ai_bot HTTP/1.1" 200 -


In [3]:


import warnings
import os

os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
warnings.filterwarnings("ignore")



In [4]:
import torch

# the if-condition is adapted to devices on M1/M2
device = 'cuda:0' if torch.cuda.is_available() else ('mps' if torch.backends.mps.is_available() else 'cpu')
device

'cuda:0'

In [5]:


from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling, TextDataset
from transformers import AdamW, get_cosine_schedule_with_warmup



In [6]:
model_name_or_path = 'ai-forever/rugpt3large_based_on_gpt2'

# tokenizer based on GPT2 for text preprocessing
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# loading a pre-trained model based on GPT2
model = GPT2LMHeadModel.from_pretrained('finetuned_model/checkpoint-5000').to(device)



NameError: name 'device' is not defined

In [40]:
def generate(prompt, do_sample=True, num_beams=2, temperature=1, top_p=0.9, max_length=256):
    
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    model.eval()
    with torch.no_grad():
        out = model.generate(input_ids, 
                            do_sample=do_sample,
                            num_beams=num_beams,
                            temperature=temperature,
                            top_p=top_p,
                            max_length=max_length,
                            )

    print(list(map(tokenizer.decode, out))[0])

In [44]:
# generation example before fine-tuning
generate('откуда у тебя такой рейтинг?')

откуда у тебя такой рейтинг?<s>user
ну типо того да
но вообще отец был не очень доволен тем что я занялся этим как раз в то время
ему не нравилось что я все время был на математике и почти ничего не делал дома
он считал что это плохо и ничего не делает дома
поэтому он хотел чтобы я занялся чем то типо фриланса
ну типо там сидел бы дома и разбирался в проге
а я в это время хотел шизить в свое хобби и рисовать аниме блядот
поэтому нам приходилось совмещать приятное с полезным</s>


In [1]:
from datasets import Dataset
import pandas as pd

In [2]:
df = pd.read_excel('conv_dataset.xlsx')
df.columns = ['ctext', 'text']
#df = df[['text','ctext']]
df.ctext = '<s>user\n' + df.ctext + '</s>\n'
print(df.head())
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state = 42)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(val_dataset.shape))

                                               ctext  \
0  <s>user\nДепозит: 6300рПолная стоимость: 19700...   
1  <s>user\nОплата - 16495р2202 2063 4522 7990Сбе...   
2  <s>user\nэто может заблокировать и нашу и вашу...   
3                              <s>user\nждем:)</s>\n   
4                <s>user\nПришлите чек оплаты)</s>\n   

                                                text  
0              угу\nздесь интересует вариант с двумя  
1  у меня банк с приколом, там нельзя отправить б...  
2  тогда пока что отмена, карты другого банка нем...  
3  вроде нормально, на карту можно без сообщения ...  
4  это подойдет или нужно где подробнее? (сумма с...  
FULL Dataset: (17989, 2)
TRAIN Dataset: (14391, 2)
TEST Dataset: (3598, 2)


In [3]:
lines = []
for i in range(train_dataset.shape[0]):
    lines.append(train_dataset.loc[i, 'ctext'])
    lines.append(train_dataset.loc[i, 'text'])
with open('txt_dataset.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines)

In [4]:
train_dataset = TextDataset(tokenizer=tokenizer,file_path='txt_dataset.txt', 
                            block_size=64)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, 
                                                mlm=False)

NameError: name 'TextDataset' is not defined

In [46]:
train_dataset

NameError: name 'train_dataset' is not defined

In [12]:
training_args = TrainingArguments(
    output_dir = "./finetuned_model",
    overwrite_output_dir = True,
    num_train_epochs = 7,
    gradient_accumulation_steps = 2,
    fp16 = True,
    per_device_train_batch_size = 8,
    learning_rate = 0.0002,
    optim = 'adafactor',
    lr_scheduler_type = 'cosine'
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

In [14]:
trainer.train()

Step,Training Loss
500,4.1468
1000,3.5669
1500,3.1368
2000,2.1443
2500,1.7768
3000,1.357
3500,0.8884
4000,0.7446
4500,0.5957
5000,0.4961


TrainOutput(global_step=5313, training_loss=1.8030999713354183, metrics={'train_runtime': 927.1167, 'train_samples_per_second': 91.683, 'train_steps_per_second': 5.731, 'total_flos': 2.219414088174797e+16, 'train_loss': 1.8030999713354183, 'epoch': 7.0})