In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import RobertaTokenizerFast, RobertaModel, get_linear_schedule_with_warmup
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
import os

Функция для создания DataFrame

In [2]:
def make_df(path):
    data_dict = dict(text = [],
                     score = [])
    for rootdir, dirs, files in os.walk(path):
        for file in files:
            with open(os.path.join(rootdir, file)) as f:
                try:
                    text = " ".join(f.readlines())
                    text = text.replace(r"<br />", "")
                    score = float(file[:-4].split("_")[-1])
                    data_dict["text"].append(text)
                    data_dict["score"].append(score)
                except:
                    print("Ошибка чтения файла", file)
    frame = pd.DataFrame(data_dict)
    frame["score"] = (frame["score"]-1)/9
    return frame

DataSet класс

In [3]:
class IMDBDataset(Dataset):
    def __init__(
            self,
            data: pd.DataFrame,
            tokenizer: RobertaTokenizerFast,
            max_token_len: int = 512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        text = data_row["text"]
        score = data_row["score"]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return dict(
            comment_text=text,
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            token_type_ids = encoding["token_type_ids"].flatten(),
            score=torch.FloatTensor(np.array(score).reshape(-1))
        )

DataModule класс

In [4]:
class IMDBDataModule(pl.LightningDataModule):
    def __init__(self, train_df, val_df, tokenizer, batch_size=8, max_token_len=512):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.val_df = val_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
    def setup(self, stage=None):
        self.train_dataset = IMDBDataset(
            self.train_df,
            self.tokenizer,
            self.max_token_len
        )
        self.val_dataset = IMDBDataset(
            self.val_df,
            self.tokenizer,
            self.max_token_len
        )
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
        )
    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
        )

Класс модели

In [5]:
class IMDBScoreRegressor(pl.LightningModule):
    def __init__(self, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = RobertaModel.from_pretrained('roberta-base', return_dict=True)
        self.regressor = nn.Linear(self.bert.config.hidden_size, 1)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.MSELoss()
    def forward(self, input_ids, attention_mask, token_type_ids,  score=None):
        output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        output = self.regressor(output.pooler_output)
        loss = 0
        if score is not None:
            loss = self.criterion(output, score)
        return loss, output
    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]
        score = batch["score"]
        loss, outputs = self(input_ids, attention_mask, token_type_ids, score)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "score": score}
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]
        score = batch["score"]
        loss, outputs = self(input_ids, attention_mask, token_type_ids, score)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )

Гиперпараметры

In [6]:
N_EPOCHS = 24
BATCH_SIZE = 12

Чтение данных

In [8]:
train_df, val_df = train_test_split(make_df(r"""aclImdb\train"""), test_size=1000)

Ошибка чтения файла 3832_4.txt
Ошибка чтения файла 4526_4.txt
Ошибка чтения файла 6929_1.txt
Ошибка чтения файла 10327_7.txt
Ошибка чтения файла 11668_7.txt
Ошибка чтения файла 8712_8.txt


Токенизация

In [9]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
data_module = IMDBDataModule(
    train_df,
    val_df,
    tokenizer,
    batch_size=BATCH_SIZE,
    max_token_len=512
)

Создание модели

In [10]:
steps_per_epoch=len(train_df) // BATCH_SIZE
total_training_steps = steps_per_epoch * N_EPOCHS
warmup_steps = total_training_steps // 5

model = IMDBScoreRegressor(
    n_warmup_steps=warmup_steps,
    n_training_steps=total_training_steps
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Создание трейнера

In [11]:
logger = TensorBoardLogger("lightning_logs", name="IMDB-rewiews")
checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="val_loss", save_last=True, every_n_epochs=1)
trainer = pl.Trainer(
    accelerator='gpu',
    devices=1,
    logger=logger,
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Обучение модели

In [12]:
trainer.fit(model, data_module)

Missing logger folder: lightning_logs\IMDB-rewiews
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type         | Params
-------------------------------------------
0 | roberta   | RobertaModel | 124 M 
1 | regressor | Linear       | 769   
2 | criterion | MSELoss      | 0     
-------------------------------------------
124 M     Trainable params
0         Non-trainable params
124 M     Total params
498.586   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


Компиляция в ONNX

In [46]:
class DeployModel(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.trained_model = model

    def forward(self, input_ids, attention_mask, token_type_ids):
        # _, output = self.trained_model(input_ids, attention_mask, token_type_ids)
        _, output = self.trained_model(input_ids, attention_mask, token_type_ids)
        return output.view(-1)*9 + 1

In [None]:
deploy_model = model

deploy_model = DeployModel(IMDBScoreRegressor.load_from_checkpoint(r"lightning_logs\IMDB-rewiews\version_1\checkpoints\epoch=13-step=28000.ckpt"))

In [51]:
input_sample_dict = tokenizer.encode_plus(
    "input_sample",
    add_special_tokens=True,
    max_length=512,
    return_token_type_ids=True,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

deploy_model.example_input_array = input_sample_dict["input_ids"], input_sample_dict["attention_mask"], input_sample_dict["token_type_ids"]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [52]:
deploy_model.to_onnx("ROBERTA_model.onnx", export_params=True)