In [1]:
import numpy as np
import pandas as pd
import os
from torch.utils.data import DataLoader,Dataset
import torch, torchtext
from torchtext.functional import to_tensor
from torchtext.models import (RobertaClassificationHead, 
                              ROBERTA_BASE_ENCODER, 
                              XLMR_LARGE_ENCODER)
from sklearn.model_selection import train_test_split

NUM_CLASSES = 11

device = 'cuda' if torch.cuda.is_available else 'cpu'
device

'cuda'

In [2]:
# !pip install torchtext

In [3]:
data = pd.read_csv('/kaggle/input/textdata/sample.csv')
data

Unnamed: 0,class,text
0,arrangement,СОГЛАШЕНИЕ N 8\nо расторжении трудового догово...
1,arrangement,Соглашение о предоставлении опциона на заключе...
2,arrangement,Соглашение\nо реструктуризации задолженности\n...
3,arrangement,Дополнительное соглашение\r\nк договору купли-...
4,arrangement,Соглашение\nо расторжении договора об оказании...
...,...,...
496,bill,Счет № 5 от 01 октября 2020 г.\r\n\r\nПоставщи...
497,bill,Счет на оплату № от 14 октября 2020 года\r\n\r...
498,bill,Счет №23 от 12.09.2024 г.\t\t...
499,bill,"""Огурец!"" (ИП Микрюков В.В.)\t\t\t\t\t\t\r\n\t..."


In [4]:
mapping = {'act': 0,
         'application': 1,
         'arrangement': 2,
         'bill': 3,
         'contract': 4,
         'contract offer': 5,
         'determination': 6,
         'invoice': 7,
         'order': 8,
         'proxy': 9,
         'statute': 10}
unmapping = {v:k for k,v in mapping.items()}
unmapping

{0: 'act',
 1: 'application',
 2: 'arrangement',
 3: 'bill',
 4: 'contract',
 5: 'contract offer',
 6: 'determination',
 7: 'invoice',
 8: 'order',
 9: 'proxy',
 10: 'statute'}

In [5]:
data['class'].map(mapping).value_counts()

class
9     71
4     70
0     69
1     61
8     50
7     43
3     41
2     40
5     25
10    21
6     10
Name: count, dtype: int64

In [6]:
class DocDataset(torch.utils.data.Dataset):
    # overload the key dunder methods
    def __init__(self, df, num_rows=None):
        self.dataset =  df
        self.dataset['class'] = self.dataset['class'].map(mapping)
        # numpy
        self.x = self.dataset.values
        self.x_tmp = self.x[:, 0]
        # cannot convert to tensor - still text
        self.y_tmp = self.x[:, 1]

    def __getitem__(self, idx):
        text = self.x_tmp[idx]
        targ = self.y_tmp[idx]
        # Return as a tuple with targ/label and then text.
        # This matters later when implementing def collate_data(batch)
        sample = (targ,text)
        return sample
            
    def __len__(self):
        return len(self.x_tmp)

In [7]:
train_df, val_df = train_test_split(data, test_size=0.15, 
                                    random_state=60, stratify=data['class'])

BATCH_SIZE = 4
train_dataset = DocDataset(train_df)
valid_dataset = DocDataset(val_df)

train_loader = DataLoader(train_dataset, BATCH_SIZE, num_workers=4, shuffle=True)
valid_loader = DataLoader(valid_dataset, BATCH_SIZE, num_workers=4, shuffle=False)

print(f"Length of train: {len(train_dataset)}, length of valid: {len(valid_dataset)}")

Length of train: 425, length of valid: 76


In [8]:
classifier_head = RobertaClassificationHead(num_classes=NUM_CLASSES, input_dim=1024)

xlmr = XLMR_LARGE_ENCODER
transformer = xlmr.transform()
input_batch = ["Привет", "Hello"]
model_input = to_tensor(transformer(input_batch), padding_value=1).to(device)
model = xlmr.get_model(head=classifier_head).to(device)

with torch.no_grad():
    print(model_input, model_input.size())
    print(model(model_input), model(model_input).size())

100%|██████████| 5.07M/5.07M [00:00<00:00, 54.3MB/s]
Downloading: "https://download.pytorch.org/models/text/xlmr.vocab.pt" to /root/.cache/torch/hub/checkpoints/xlmr.vocab.pt
100%|██████████| 4.85M/4.85M [00:00<00:00, 66.7MB/s]
Downloading: "https://download.pytorch.org/models/text/xlmr.large.encoder.pt" to /root/.cache/torch/hub/checkpoints/xlmr.large.encoder.pt
100%|██████████| 2.08G/2.08G [00:28<00:00, 79.3MB/s]


tensor([[    0,  1813, 18454,     2],
        [    0, 35378,     2,     1]], device='cuda:0') torch.Size([2, 4])
tensor([[ 0.4749,  0.2213, -0.1027, -0.0729, -0.1334, -0.0614, -0.0048,  0.3618,
         -0.0352,  0.0778, -0.3171],
        [ 0.3213,  0.0744, -0.1230, -0.1659, -0.3443, -0.0170,  0.2645,  0.2863,
          0.0728,  0.1417, -0.1826]], device='cuda:0') torch.Size([2, 11])


In [9]:
def train_epoch(transformer, loader, model, 
                loss_fn, optimizer, scheduler, device):
    model = model.to(device)
    model.train()
    avg_loss = 0.
    for text, target in loader:
        optimizer.zero_grad()  # Обнуляем градиенты
        x = text
        y = target.to(device)
        x = to_tensor(transformer(list(x)), padding_value=1).to(device)
        
        pred_cls = model(x)
        # print(prediction_seg)
        loss = loss_fn(pred_cls, y)
        loss.backward()
        optimizer.step()
        avg_loss += loss.item()
    avg_loss /= len(loader)
    scheduler.step(avg_loss)  # Обновляем скорость обучения
    return avg_loss


def valid_epoch(transformer, loader, model, device, score):
    model = model.to(device)
    model.eval()
    scores = []
    with torch.no_grad():
        for text, target in loader:
            x = text
            y = target.to(device)
            x = to_tensor(transformer(list(x)), padding_value=1).to(device)
            
            probs = torch.sigmoid(model(x)).to(device)
            scores.append(score(probs, y))
    return torch.stack(scores).mean().item()

In [10]:
model

RobertaModel(
  (encoder): RobertaEncoder(
    (transformer): TransformerEncoder(
      (token_embedding): Embedding(250002, 1024, padding_idx=1)
      (layers): TransformerEncoder(
        (layers): ModuleList(
          (0-23): 24 x TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
            )
            (linear1): Linear(in_features=1024, out_features=4096, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (linear2): Linear(in_features=4096, out_features=1024, bias=True)
            (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (dropout1): Dropout(p=0.1, inplace=False)
            (dropout2): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (positional_embedding): PositionalEmbedding(
        (embedding): Embe

In [11]:
mapping

{'act': 0,
 'application': 1,
 'arrangement': 2,
 'bill': 3,
 'contract': 4,
 'contract offer': 5,
 'determination': 6,
 'invoice': 7,
 'order': 8,
 'proxy': 9,
 'statute': 10}

In [12]:
data['class'].value_counts().sort_index()

class
act               69
application       61
arrangement       40
bill              41
contract          70
contract offer    25
determination     10
invoice           43
order             50
proxy             71
statute           21
Name: count, dtype: int64

In [13]:
target = data['class'].map(mapping)
class_sample_count = np.unique(target, return_counts=True)[1]
weight = 1. / class_sample_count
samples_weight = weight[target]
samples_weight = torch.from_numpy(samples_weight)

In [14]:
model.head.activation_fn = torch.nn.Softmax()

In [None]:
from torchmetrics import Accuracy

lossfn = torch.nn.CrossEntropyLoss(weight=torch.from_numpy(weight.astype(np.float32)).to(device))
optimizer = torch.optim.Adam(model.parameters(),  lr=1e-4)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1, 
                                              end_factor=0.2, total_iters=30, 
                                              last_epoch=-1)

accuracy = Accuracy(task="multiclass", num_classes=NUM_CLASSES, top_k=1).to(device)

transformer = transformer
train_loader, valid_loader
score = accuracy
loss_fn = lossfn
optimizer, scheduler, device

best_score = 0

for epoch in range(100):
    train_loss = train_epoch(transformer=transformer,
                             loader=train_loader, 
                             model=model, 
                             loss_fn=loss_fn, 
                             optimizer=optimizer, 
                             scheduler=scheduler, 
                             device=device)
    valid_score = valid_epoch(transformer=transformer,
                              loader=valid_loader, 
                              model=model, 
                              device=device,
                              score=accuracy
                             )
    
    if valid_score > best_score:
        torch.save(model.state_dict(), 'best_model.pth')
        best_score = valid_score
        
    print(f'Epoch: {epoch}, train_loss: {train_loss}, valid_score: {valid_score}\n')


  return self._call_impl(*args, **kwargs)
  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)


Epoch: 0, train_loss: 2.3978748900868068, valid_score: 0.09210526198148727

Epoch: 1, train_loss: 2.39730208833641, valid_score: 0.09210526198148727

