**Выводы**: все модели обучались стабильно и дают близкое друг к другу качество:
- `BERT accuracy = 0.9391`
- `ALBERT accuracy = 0.9448`
- `DeBERTa accuracy = 0.9416`

Это может быть связано с одинаковыми параметрами обучения и одинаковыми конфигурациями моделей. Предполагаю, что улучшить качество можно, подобрав эти параметры.

Обратил внимание, что модели, несмотря на отличающиеся размеры, обучаются примерно одинаковое время — около 20 минут на одну эпоху обучения и валидации. Буду рад, если подскажете, как ускорить обучение и инференс.

In [None]:
!git clone https://github.com/dariush-bahrami/character-tokenizer.git

In [None]:
%%capture
!pip install torch transformers

In [None]:
import string
import sys
import pandas as pd
import torch

from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

sys.path.append("./character-tokenizer")
from charactertokenizer import CharacterTokenizer

In [None]:
# Alphabet handed to the character-level tokenizer: upper- and lower-case
# Cyrillic letters only.
chars = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
# NOTE(review): the tokenizer is built with a limit of 64, while the datasets
# and `visualize` pad to MODEL_MAX_LENGTH = 100 -- confirm which is intended.
model_max_length = 64
tokenizer = CharacterTokenizer(chars, model_max_length)
# Sanity check: tokenize a single word and print the resulting encoding.
example = "Привет"
tokens = tokenizer(example)
print(tokens)

{'input_ids': [0, 39, 42, 26, 12, 18, 46, 1], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Load the accent dictionary: tab-separated, no header row. `word` is the plain
# spelling, `gt` is the same word with a '^' marking the stress position.
df = pd.read_csv('all_accents.tsv', sep='\t', names=['word', 'gt'])
print(f"{df.shape = }")
# Last expression of the cell, so the notebook renders the first rows.
df.head()

df.shape = (1680535, 2)


Unnamed: 0,word,gt
0,-де,-д^е
1,-ка,-к^а
2,-либо,-л^ибо
3,-нибудь,-ниб^удь
4,-с,-с


In [None]:
class StressDataset(Dataset):
    """Per-character stress labels for token classification.

    Every item is one row of `data`: the `word` column is tokenized and the
    position of '^' in the `gt` column decides which character gets label 1.
    All other characters get 0; special and padding positions get -100 so
    they do not contribute to the loss.

    Args:
        data: pandas DataFrame with string columns `word` and `gt`.
        tokenizer: tokenizer exposing `encode_plus`; assumed to emit one token
            per character wrapped in one leading and one trailing special
            token (this matches the printed encoding of "Привет" in this
            notebook).
        max_length: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length):
        super().__init__()
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoding of row `idx` with an added `labels` tensor."""
        # Fetch the row once; each `.iloc[idx]` builds a new Series.
        row = self.data.iloc[idx]
        word = row['word']
        # Index of the stressed character in `word`, or -1 if `gt` has no '^'.
        stress_idx = row['gt'].find('^')

        encoded = self.tokenizer.encode_plus(
            word,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_special_tokens_mask=True,
        )

        del encoded['token_type_ids']

        # Drop the leading batch dimension added by return_tensors='pt'.
        for key in ('input_ids', 'attention_mask', 'special_tokens_mask'):
            encoded[key] = encoded[key].squeeze(0)

        seq_len = encoded['input_ids'].shape[-1]
        # Two slots are reserved for the leading and trailing special tokens;
        # clamping keeps `labels` the same length as `input_ids` when a word
        # is truncated.
        n_chars = min(len(word), max(seq_len - 2, 0))

        labels = [-100] + [0] * n_chars + [-100] * (seq_len - 1 - n_chars)
        # Only mark a stress that exists and survived truncation. Without this
        # guard a word with no '^' (find() == -1) would write label 1 into the
        # leading special-token slot.
        if 0 <= stress_idx < n_chars:
            labels[stress_idx + 1] = 1

        encoded['labels'] = torch.tensor(labels).long()

        return encoded

In [None]:
# Length every encoded word is padded/truncated to inside StressDataset.
MODEL_MAX_LENGTH = 100
BATCH_SIZE = 512

# Half of the dictionary is held out for evaluation; fixed seed for a
# reproducible split.
train_data, test_data = train_test_split(df, test_size=0.5, random_state=42)

train_dataset = StressDataset(train_data, tokenizer, MODEL_MAX_LENGTH)
test_dataset = StressDataset(test_data, tokenizer, MODEL_MAX_LENGTH)

# Only the training loader shuffles; evaluation keeps the split order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=2)

In [None]:
def train_one_epoch(model, train_loader, epoch, optimizer, scheduler, device, dir_name):
    """Train `model` for one pass over `train_loader` and save a checkpoint.

    Prints the mean batch loss and the word-level accuracy (a word counts as
    correct only if every scored character position is predicted correctly),
    then saves the model to `./{dir_name}/epoch_NN`.

    Args:
        model: model called with `input_ids`, `attention_mask` and `labels`,
            returning an object with `.loss` and `.logits`, and providing
            `save_pretrained`.
        train_loader: iterable of batches with keys `input_ids`,
            `attention_mask`, `labels` and `special_tokens_mask`.
        epoch: zero-based epoch index (reported and saved as `epoch + 1`).
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        device: device the batch tensors are moved to.
        dir_name: directory (relative to the working dir) for checkpoints.
    """
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        mask = batch['special_tokens_mask'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss, logits = outputs.loss, outputs.logits
        total_loss += loss.item()
        loss.backward()

        optimizer.step()
        scheduler.step()

        # Word-level accuracy for the whole batch at once instead of a Python
        # loop calling masked_select/torch.equal for every sample.
        preds = logits.argmax(dim=-1)
        scored = labels != -100
        # The scored label positions must coincide with the non-special token
        # positions; a sample where they do not is counted as wrong.
        aligned = ((mask == 0) == scored).all(dim=-1)
        # Ignored positions (-100) never count against a sample.
        all_match = ((preds == labels) | ~scored).all(dim=-1)
        correct += int((aligned & all_match).sum())
        total += labels.shape[0]

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_loss = total_loss / max(len(train_loader), 1)
    avg_acc = correct / max(total, 1)
    print(f'Epoch: {epoch + 1}, train loss: {avg_loss:.4f}, train accuracy: {avg_acc:.4f}')

    # checkpoint; :02d gives epoch_01 ... epoch_10 (the previous hard-coded
    # "0" prefix produced epoch_010 for the tenth epoch)
    model.save_pretrained(f"./{dir_name}/epoch_{epoch + 1:02d}")

In [None]:
def eval_one_epoch(model, test_loader, epoch, device):
    """Evaluate `model` on `test_loader` without gradient tracking.

    Prints the mean batch loss and the word-level accuracy (a word counts as
    correct only if every scored character position is predicted correctly).

    Args:
        model: model called with `input_ids`, `attention_mask` and `labels`,
            returning an object with `.loss` and `.logits`.
        test_loader: iterable of batches with keys `input_ids`,
            `attention_mask`, `labels` and `special_tokens_mask`.
        epoch: zero-based epoch index (reported as `epoch + 1`).
        device: device the batch tensors are moved to.
    """
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            mask = batch['special_tokens_mask'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss, logits = outputs.loss, outputs.logits
            total_loss += loss.item()

            # Word-level accuracy for the whole batch at once instead of a
            # Python loop calling masked_select/torch.equal for every sample.
            preds = logits.argmax(dim=-1)
            scored = labels != -100
            # The scored label positions must coincide with the non-special
            # token positions; a sample where they do not is counted as wrong.
            aligned = ((mask == 0) == scored).all(dim=-1)
            # Ignored positions (-100) never count against a sample.
            all_match = ((preds == labels) | ~scored).all(dim=-1)
            correct += int((aligned & all_match).sum())
            total += labels.shape[0]

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    avg_loss = total_loss / max(len(test_loader), 1)
    avg_acc = correct / max(total, 1)
    print(f'Epoch: {epoch + 1}, eval loss: {avg_loss:.4f}, eval accuracy: {avg_acc:.4f}')

In [None]:
def visualize(model, word, device):
    """Return `word` with '^' inserted before each character predicted as label 1.

    The word is encoded with the notebook-level `tokenizer`, padded to
    `MODEL_MAX_LENGTH`, run through `model` in eval mode, and the per-token
    argmax is kept only for positions whose special-tokens mask is 0.
    """
    model.eval()
    with torch.no_grad():
        batch = tokenizer.encode_plus(
            word,
            max_length=MODEL_MAX_LENGTH,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_special_tokens_mask=True,
        )

        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits

        # Keep predictions for real characters only (mask value 0).
        is_char = batch['special_tokens_mask'] == 0
        char_labels = torch.masked_select(
            logits.argmax(dim=-1).cpu(),
            is_char
        ).tolist()

    # zip stops at the shorter sequence, so a truncated word loses its tail.
    return ''.join(
        '^' + ch if lab == 1 else ch
        for ch, lab in zip(word, char_labels)
    )

### BERT

In [None]:
from transformers import BertForTokenClassification, BertConfig

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Small BERT configuration sized to the character vocabulary; every setting
# not listed here keeps the library default.
bert_config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=256,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=512,
)

# Built from the config only, i.e. no pretrained weights are loaded here.
model = BertForTokenClassification(bert_config)
model = model.to(device)

In [None]:
from transformers import get_cosine_schedule_with_warmup

In [None]:
NUM_EPOCHS = 10
num_warmup_steps = 1000
# train_one_epoch steps the scheduler once per batch, so the schedule horizon
# is the total number of batches over all epochs.
num_training_steps = NUM_EPOCHS * len(train_loader)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
# Directory train_one_epoch writes its per-epoch checkpoints into.
DIR_NAME = 'bert_checkpoints'

# Alternate one training pass and one evaluation pass per epoch. Interrupting
# the cell by hand stops training with a message instead of a traceback.
try:
    for epoch in range(NUM_EPOCHS):
        train_one_epoch(model, train_loader, epoch, optimizer, scheduler, device, DIR_NAME)
        eval_one_epoch(model, test_loader, epoch, device)
except KeyboardInterrupt:
    print('Interrupted')

  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 1, train loss: 0.1325, train accuracy: 0.6010


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 1, eval loss: 0.0884, eval accuracy: 0.7638


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 2, train loss: 0.0847, train accuracy: 0.7648


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 2, eval loss: 0.0693, eval accuracy: 0.8007


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 3, train loss: 0.0700, train accuracy: 0.8109


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 3, eval loss: 0.0568, eval accuracy: 0.8513


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 4, train loss: 0.0589, train accuracy: 0.8439


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 4, eval loss: 0.0474, eval accuracy: 0.8779


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 5, train loss: 0.0503, train accuracy: 0.8680


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 5, eval loss: 0.0405, eval accuracy: 0.8972


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 6, train loss: 0.0433, train accuracy: 0.8865


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 6, eval loss: 0.0344, eval accuracy: 0.9138


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 7, train loss: 0.0372, train accuracy: 0.9032


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 7, eval loss: 0.0304, eval accuracy: 0.9243


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 8, train loss: 0.0325, train accuracy: 0.9155


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 8, eval loss: 0.0271, eval accuracy: 0.9335


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 9, train loss: 0.0292, train accuracy: 0.9240


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 9, eval loss: 0.0258, eval accuracy: 0.9384


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 10, train loss: 0.0277, train accuracy: 0.9281


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 10, eval loss: 0.0256, eval accuracy: 0.9391


In [None]:
# Qualitative check on 20 random words.
# NOTE(review): the sample is drawn from the full `df`, so it can contain
# words from the training half of the split, not only held-out ones.
data_sample = df.sample(20)
data_sample['pred'] = data_sample['word'].apply(lambda x: visualize(model, x, device))
# Last expression of the cell, so the notebook renders the table.
data_sample

Unnamed: 0,word,gt,pred
549026,клоунскому,кл^оунскому,кл^оунскому
1030947,погружаемо,погруж^аемо,погруж^аемо
27805,амбициознейшем,амбици^ознейшем,амбици^ознейшем
1512486,трюкаческий,трюк^аческий,трюк^аческий
433819,засереетесь,засер^еетесь,засер^еетесь
281231,горюющее,гор^юющее,гор^юющее
901549,откладывавшем,откл^адывавшем,откл^адывавшем
1611109,цезурная,цез^урная,цез^урная
104673,бокастеньких,бок^астеньких,бок^астеньких
618786,либреттистом,либретт^истом,либретт^истом


### ALBERT

In [None]:
from transformers import AlbertForTokenClassification, AlbertConfig

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ALBERT configuration with the same explicit sizes as the BERT run; every
# setting not listed here keeps the library default.
albert_config = AlbertConfig(
    vocab_size=tokenizer.vocab_size,
    hidden_size=256,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=512,
)

# Rebinds the notebook-level `model`; the previously trained model object is
# no longer reachable through this name.
model = AlbertForTokenClassification(albert_config)
model = model.to(device)

In [None]:
from transformers import get_cosine_schedule_with_warmup

In [None]:
NUM_EPOCHS = 10
num_warmup_steps = 1000
# train_one_epoch steps the scheduler once per batch, so the schedule horizon
# is the total number of batches over all epochs.
num_training_steps = NUM_EPOCHS * len(train_loader)
# Fresh optimizer bound to the parameters of the current `model`.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
# Directory train_one_epoch writes its per-epoch checkpoints into.
DIR_NAME = 'albert_checkpoints'

# Alternate one training pass and one evaluation pass per epoch. Interrupting
# the cell by hand stops training with a message instead of a traceback.
try:
    for epoch in range(NUM_EPOCHS):
        train_one_epoch(model, train_loader, epoch, optimizer, scheduler, device, DIR_NAME)
        eval_one_epoch(model, test_loader, epoch, device)
except KeyboardInterrupt:
    print('Interrupted')

  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 1, train loss: 0.1327, train accuracy: 0.5937


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 1, eval loss: 0.0925, eval accuracy: 0.7395


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 2, train loss: 0.0833, train accuracy: 0.7677


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 2, eval loss: 0.0756, eval accuracy: 0.7892


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 3, train loss: 0.0669, train accuracy: 0.8221


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 3, eval loss: 0.0609, eval accuracy: 0.8339


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 4, train loss: 0.0541, train accuracy: 0.8596


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 4, eval loss: 0.0505, eval accuracy: 0.8679


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 5, train loss: 0.0444, train accuracy: 0.8865


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 5, eval loss: 0.0436, eval accuracy: 0.8916


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 6, train loss: 0.0364, train accuracy: 0.9086


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 6, eval loss: 0.0356, eval accuracy: 0.9125


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 7, train loss: 0.0292, train accuracy: 0.9282


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 7, eval loss: 0.0302, eval accuracy: 0.9266


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 8, train loss: 0.0230, train accuracy: 0.9447


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 8, eval loss: 0.0269, eval accuracy: 0.9370


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 9, train loss: 0.0183, train accuracy: 0.9567


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 9, eval loss: 0.0255, eval accuracy: 0.9441


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 10, train loss: 0.0156, train accuracy: 0.9636


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 10, eval loss: 0.0257, eval accuracy: 0.9448


In [None]:
# Qualitative check on 20 random words.
# NOTE(review): the sample is drawn from the full `df`, so it can contain
# words from the training half of the split, not only held-out ones.
data_sample = df.sample(20)
data_sample['pred'] = data_sample['word'].apply(lambda x: visualize(model, x, device))
# Last expression of the cell, so the notebook renders the table.
data_sample

Unnamed: 0,word,gt,pred
634372,люнеты,люн^еты,люн^еты
1087274,полукруглых,полукр^углых,полукр^углых
1031474,погуляйте,погул^яйте,погул^яйте
454579,зашпунтуй,зашпунт^уй,зашпунт^уй
752652,неблагоприятнейшею,неблагопри^ятнейшею,неблагопри^ятнейшею
1226912,противостоявших,противосто^явших,противосто^явших
1102553,попугавшею,попуг^авшею,попуг^авшею
1058558,подседать,подсед^ать,подсед^ать
1235856,прыгучим,прыг^учим,прыг^учим
1658703,экспромту,экспр^омту,экспр^омту


### DeBERTa

In [None]:
from transformers import DebertaV2ForTokenClassification, DebertaV2Config

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# DeBERTa-v2 configuration with the same explicit sizes as the other runs;
# every setting not listed here keeps the library default.
deberta_config = DebertaV2Config(
    vocab_size=tokenizer.vocab_size,
    hidden_size=256,
    num_hidden_layers=4,
    num_attention_heads=4,
    intermediate_size=512,
)

# Rebinds the notebook-level `model`; the previously trained model object is
# no longer reachable through this name.
model = DebertaV2ForTokenClassification(deberta_config)
model = model.to(device)

In [None]:
from transformers import get_cosine_schedule_with_warmup

In [None]:
NUM_EPOCHS = 10
num_warmup_steps = 1000
# train_one_epoch steps the scheduler once per batch, so the schedule horizon
# is the total number of batches over all epochs.
num_training_steps = NUM_EPOCHS * len(train_loader)
# Fresh optimizer bound to the parameters of the current `model`.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
# Directory train_one_epoch writes its per-epoch checkpoints into.
DIR_NAME = 'deberta_checkpoints'

# Alternate one training pass and one evaluation pass per epoch. Interrupting
# the cell by hand stops training with a message instead of a traceback.
try:
    for epoch in range(NUM_EPOCHS):
        train_one_epoch(model, train_loader, epoch, optimizer, scheduler, device, DIR_NAME)
        eval_one_epoch(model, test_loader, epoch, device)
except KeyboardInterrupt:
    print('Interrupted')

  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 1, train loss: 0.1328, train accuracy: 0.6097


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 1, eval loss: 0.0869, eval accuracy: 0.7488


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 2, train loss: 0.0833, train accuracy: 0.7677


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 2, eval loss: 0.0698, eval accuracy: 0.8125


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 3, train loss: 0.0686, train accuracy: 0.8139


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 3, eval loss: 0.0555, eval accuracy: 0.8579


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 4, train loss: 0.0573, train accuracy: 0.8474


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 4, eval loss: 0.0466, eval accuracy: 0.8817


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 5, train loss: 0.0490, train accuracy: 0.8710


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 5, eval loss: 0.0400, eval accuracy: 0.9021


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 6, train loss: 0.0418, train accuracy: 0.8905


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 6, eval loss: 0.0336, eval accuracy: 0.9161


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 7, train loss: 0.0358, train accuracy: 0.9072


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 7, eval loss: 0.0293, eval accuracy: 0.9294


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 8, train loss: 0.0311, train accuracy: 0.9189


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 8, eval loss: 0.0263, eval accuracy: 0.9369


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 9, train loss: 0.0279, train accuracy: 0.9277


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 9, eval loss: 0.0250, eval accuracy: 0.9411


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 10, train loss: 0.0264, train accuracy: 0.9318


  0%|          | 0/1642 [00:00<?, ?it/s]

Epoch: 10, eval loss: 0.0248, eval accuracy: 0.9416


In [None]:
# Qualitative check on 20 random words.
# NOTE(review): the sample is drawn from the full `df`, so it can contain
# words from the training half of the split, not only held-out ones.
data_sample = df.sample(20)
data_sample['pred'] = data_sample['word'].apply(lambda x: visualize(model, x, device))
# Last expression of the cell, so the notebook renders the table.
data_sample

Unnamed: 0,word,gt,pred
525111,капитальные,капит^альные,капит^альные
406447,закрапаешь,закр^апаешь,закр^апаешь
169277,вогнанных,в^огнанных,в^огнанных
350511,доукомплектовывал,доукомплект^овывал,доукомплект^овывал
1150472,привертывали,прив^ертывали,прив^ертывали
1034040,подвергнувшуюся,подв^ергнувшуюся,подв^ергнувшуюся
1548453,уплативших,уплат^ивших,уплат^ивших
1645797,шпажная,шп^ажная,шп^ажная
1144378,препозитивны,препозит^ивны,препозит^ивны
1025331,повоешь,пов^оешь,пов^оешь
