# Краткое описание решения:
1. создание target для тренировочных данных (последовательность из 0 и 1 длины 40, где 1 стоит на позиции ударной гласной)
2. применение лемматизации
3. токенизация
4. создание датасета, состоящего из слова (без ударения) и его леммы; для train — дополнительно с target
5. инициализация и обучение модели
    StressModel(
        (lstm_layer_1): Sequential(
                  (0): Embedding(658, 70)
                  (1): LSTM(70, 110, num_layers=2, batch_first=True, bidirectional=True)
                  )
        (lstm_layer_2): Sequential(
                  (0): Embedding(658, 70)
                  (1): LSTM(70, 110, num_layers=2, batch_first=True, bidirectional=True)
                  )
        (fc): Linear(in_features=440, out_features=1, bias=True)
        (dropout): Dropout(p=0.05, inplace=False)
    )
6. количество параметров - 997201 < 1000000
7. сохранение модели - best_model(70-110).pth

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers pymystem3 git+https://github.com/Koziev/character-tokenizer -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m89.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for charactertokenizer (setup.py) ... [?25l[?25hdone


In [None]:
import charactertokenizer
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import re
from sklearn.model_selection import train_test_split
from pymystem3 import Mystem

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
vowels = 'аеёиоуыэюя'  # lowercase Russian vowel letters
BATCH_SIZE = 128  # mini-batch size handed to the DataLoader
epochs = 9  # number of iterations of the training loop

In [None]:
def without_stress(text):
    """Return ``text`` with every ``^`` stress marker stripped out.

    Args:
        text: a string that may contain ``^`` characters.

    Returns:
        The same string with all ``^`` characters removed.
    """
    caret = re.compile(r'\^')
    return caret.sub('', text)


In [None]:
def word_to_target(word, max_len=40):
    """Encode the stress position of ``word`` as a fixed-length string of 0/1 digits.

    Every character other than ``^`` contributes one digit. A ``^`` turns the
    digit of the character immediately before it into ``'1'``.

    Args:
        word: the word with ``^`` placed right after the stressed character.
        max_len: exact length of the returned string.

    Returns:
        A string of exactly ``max_len`` characters made of ``'0'`` and ``'1'``.
        Shorter encodings are right-padded with ``'0'``; longer ones are cut to
        ``max_len`` so all targets can be stacked into one array (a stress mark
        located beyond ``max_len`` is therefore dropped).
    """
    target = []
    for char in word:
        if char != '^':
            target.append('0')
        elif target:
            # A marker with no preceding character has nothing to mark, so it
            # is ignored instead of raising IndexError.
            target[-1] = '1'

    return ''.join(target[:max_len]).ljust(max_len, '0')

In [None]:
# Read the training file without a header row; its single column receives the name 'Stressed' below.
data_train = pd.read_csv('/content/drive/MyDrive/rucode final/train_stresses_labels.txt', header=None)
# data_test = pd.read_csv('/content/drive/MyDrive/rucode final/private_test_stresses.txt', header=None)

data_train.columns = ['Stressed']
# data_test.columns = ['without_stress']

# Derive the model input (word with '^' removed) and the per-character 0/1 target string.
data_train['without_stress'] = data_train['Stressed'].progress_apply(without_stress)
data_train['target'] = data_train['Stressed'].progress_apply(word_to_target)

100%|██████████| 588490/588490 [00:01<00:00, 363363.48it/s]
100%|██████████| 588490/588490 [00:04<00:00, 138292.81it/s]


# Лемматизация

In [None]:
# Single MyStem instance shared by every preprocess_text call.
mystem = Mystem()


def preprocess_text(text):
    """Return the first element of MyStem's lemmatization of ``text``."""
    return mystem.lemmatize(text)[0]

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


In [None]:
# Lemmatize every stress-free word; the result is the second model input.
data_train['lemma'] = data_train['without_stress'].progress_apply(preprocess_text)
# data_test['lemma'] = data_test['without_stress'].progress_apply(preprocess_text)

100%|██████████| 588490/588490 [01:00<00:00, 9763.35it/s] 
100%|██████████| 294252/294252 [00:28<00:00, 10336.46it/s]


# Dataset Dataloader

In [None]:
class CustomDataset(Dataset):
    """Character-tokenized (word, lemma) pairs with optional per-position targets.

    Args:
        without_stress: iterable of words without stress markers; each item is
            converted with ``str`` before tokenization.
        lemma: iterable of lemmas, one per word; also converted with ``str``.
        targets: optional iterable of digit strings, one per word. All strings
            must have the same length because they are stacked into one array.
            When ``None`` the dataset yields inputs only.
        max_length: length every token-id sequence is padded / truncated to.
    """

    def __init__(self, without_stress, lemma, targets=None, max_length=40):
        self.tokenizer = charactertokenizer.CharacterTokenizer.from_pretrained('inkoziev/charllama-35M')
        self.max_length = max_length

        def encode(items):
            # Tokenize the whole column at once and keep only the id matrix.
            texts = [str(item) for item in items]
            batch = self.tokenizer(texts, return_tensors='pt', padding='max_length', max_length=max_length, truncation=True)
            return batch["input_ids"]

        self.without_stress = encode(without_stress)
        self.lemmas = encode(lemma)

        if targets is None:
            self.targets = None
        else:
            # One integer tensor per digit string, stacked and stored as a NumPy array.
            rows = [torch.tensor(list(map(int, target))) for target in targets]
            self.targets = torch.stack(rows).numpy()

    def __len__(self):
        """Number of samples (rows of the tokenized word matrix)."""
        return len(self.without_stress)

    def __getitem__(self, idx):
        """Return ``((word_ids, lemma_ids), target)``, or only the input pair when unlabeled."""
        if self.targets is None:
            return (self.without_stress[idx], self.lemmas[idx])
        return (self.without_stress[idx], self.lemmas[idx]), self.targets[idx]

In [None]:
# Shuffle all rows (no random_state is passed, so the order differs between runs) and train on everything.
shuffled_df = data_train.sample(frac=1).reset_index(drop=True)
train = shuffled_df.copy()
# train, valid = train_test_split(shuffled_df, random_state=42, shuffle=True, train_size=0.7) # uncomment to evaluate on a validation split

In [None]:
# Build the training dataset from the stress-free words, their lemmas and the 0/1 target strings.
train_dataset = CustomDataset(train['without_stress'].values, train['lemma'].values, train['target'].values)
# valid_dataset = CustomDataset(valid['without_stress'].values, valid['lemma'].values, valid['target'].values) # uncomment to evaluate on a validation split
# test_dataset = CustomDataset(data_test['without_stress'].values, data_test['lemma'].values)

Downloading (…)okenizer_config.json:   0%|          | 0.00/8.90k [00:00<?, ?B/s]

In [None]:
# Training batches are reshuffled every epoch; the commented-out validation/test loaders keep their order.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False) # uncomment to evaluate on a validation split
# test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class StressModel(nn.Module):
    """Two-branch bidirectional LSTM tagger producing one logit per input position.

    One embedding + 2-layer BiLSTM branch reads the token ids of the word, a
    second, independently parameterized branch reads the token ids of its
    lemma. The per-position outputs of both branches are concatenated and
    projected to a single logit by a linear layer.

    Args:
        vocab_size: number of rows in each embedding table.
        embedding_dim: embedding size, also the LSTM input size.
        hidden_dim: LSTM hidden size per direction.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()

        # Word branch. Increasing num_layers did not improve results.
        self.lstm_layer_1 = nn.Sequential(
            nn.Embedding(vocab_size, embedding_dim),
            nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=2),
        )

        # Lemma branch. Increasing num_layers did not improve results.
        self.lstm_layer_2 = nn.Sequential(
            nn.Embedding(vocab_size, embedding_dim),
            nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=2),
        )

        # Two bidirectional branches -> 2 * (2 * hidden_dim) features per position.
        self.fc = nn.Linear(hidden_dim * 4, 1)
        self.dropout = nn.Dropout(0.05)  # dropout may need to be increased

    def forward(self, batch):
        """Compute per-position logits.

        Args:
            batch: pair ``(word_ids, lemma_ids)`` of integer tensors shaped
                ``(batch, seq_len)`` with equal ``seq_len``.

        Returns:
            Tensor of shape ``(batch, seq_len, 1)`` with raw (pre-sigmoid) logits.
        """
        stress, lemma = batch
        # Use the device the weights live on rather than a module-level
        # ``device`` global, so the model also works when loaded elsewhere or
        # moved to another device.
        model_device = self.fc.weight.device
        stress, lemma = stress.to(model_device), lemma.to(model_device)

        # nn.Sequential returns the LSTM's (output, state) tuple; the state is unused.
        stress_batch, _ = self.lstm_layer_1(stress)
        stress_out = self.dropout(stress_batch)

        lemma_batch, _ = self.lstm_layer_2(lemma)
        lemma_out = self.dropout(lemma_batch)

        combined_tensor = torch.cat([stress_out, lemma_out], dim=2)
        output = self.fc(combined_tensor)
        return output

In [None]:
# Prefer the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [None]:
# Instantiate the network and move its weights to the selected device.
model = StressModel(vocab_size=658, embedding_dim=70, hidden_dim=110).to(device) # private_best = 70, 110

# Binary cross-entropy on raw logits; AdamW is created with its default hyperparameters.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters())
# Multiply the learning rate by 0.7 after every 2 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.7) # gamma=0.7

In [None]:
# Count only the parameters that receive gradients.
params_count = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'Number of trainable parameters: {params_count}')

Number of trainable parameters: 997201


In [None]:
# ---  Post process  ---
def post_process_output(outputs):
    """Turn each row of scores into a one-hot row marking its largest entry.

    Args:
        outputs: 2-D tensor of scores, one row per sample.

    Returns:
        A float tensor shaped like ``outputs`` holding 1.0 at each row's argmax
        and 0.0 everywhere else.
    """
    winners = torch.argmax(outputs, dim=1)
    rows = torch.arange(outputs.size(0))
    mask = torch.zeros_like(outputs)
    mask[rows, winners] = 1.0
    return mask.float()

def train_model(model, dataloader, optimizer, criterion, device, scheduler=None):
    """Train ``model`` for one epoch.

    Args:
        model: network called as ``model(inputs)``; its output is squeezed on
            dimension 2 before the loss is computed.
        dataloader: iterable yielding ``(inputs, targets)`` batches.
        optimizer: optimizer updating ``model``'s parameters.
        criterion: loss called as ``criterion(outputs, targets.float())``.
        device: device the targets are moved to. Inputs are passed to the model
            unchanged; the model is expected to place them on its own device.
        scheduler: optional LR scheduler stepped once at the end of the epoch.
            When omitted, a module-level ``scheduler`` is used if one exists,
            which keeps existing five-argument calls working. With neither
            available the step is skipped.

    Returns:
        ``(mean_loss, accuracy_ones)``: the average batch loss and the
        percentage of target positions equal to 1 that the arg-max prediction
        also marks (0 when there are no such positions or no batches).
    """
    model.train()
    total_loss = 0
    correct_ones = 0
    total_ones = 0

    for inputs, targets in tqdm(dataloader):
        targets = targets.to(device)
        optimizer.zero_grad()

        outputs = model(inputs).squeeze(2)
        loss = criterion(outputs, targets.float())
        loss.backward()

        optimizer.step()
        total_loss += loss.item()

        # Metric bookkeeping only; nothing here needs the autograd graph.
        with torch.no_grad():
            predicted = post_process_output(outputs)
            is_one = targets == 1
            correct_ones += ((predicted == targets) & is_one).sum().item()
            total_ones += is_one.sum().item()

    if scheduler is None:
        # Backward compatibility: older callers rely on a notebook-level global.
        scheduler = globals().get('scheduler')
    if scheduler is not None:
        scheduler.step()

    num_batches = len(dataloader)
    mean_loss = total_loss / num_batches if num_batches != 0 else 0
    accuracy_ones = 100 * correct_ones / total_ones if total_ones != 0 else 0
    return mean_loss, accuracy_ones

def test_model(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: network called as ``model(inputs)``; its output is squeezed on
            dimension 2 before the loss is computed.
        dataloader: iterable yielding ``(inputs, targets)`` batches.
        criterion: loss called as ``criterion(outputs, targets.float())``.
        device: device the targets are moved to. Inputs are passed to the model
            unchanged; the model is expected to place them on its own device.

    Returns:
        ``(mean_loss, accuracy_ones)``: the average batch loss and the
        percentage of target positions equal to 1 that the arg-max prediction
        also marks (0 when there are no such positions or no batches).
    """
    model.eval()
    total_loss = 0
    correct_ones = 0
    total_ones = 0

    with torch.no_grad():
        for inputs, targets in dataloader:
            targets = targets.to(device)

            outputs = model(inputs).squeeze(2)
            loss = criterion(outputs, targets.float())
            total_loss += loss.item()

            predicted = post_process_output(outputs)

            is_one = targets == 1
            correct_ones += ((predicted == targets) & is_one).sum().item()
            total_ones += is_one.sum().item()

    num_batches = len(dataloader)
    mean_loss = total_loss / num_batches if num_batches != 0 else 0
    accuracy_ones = 100 * correct_ones / total_ones if total_ones != 0 else 0
    return mean_loss, accuracy_ones


In [None]:
# One pass over the training data per epoch; the validation pass is commented out.
for epoch in range(epochs):
    print(f'EPOCH: {epoch+1}')
    train_loss, train_acc = train_model(model, train_dataloader, optimizer, criterion, device) #criterion
    print(f'TRAIN-- Loss: {train_loss:.4f}, Accuracy: {train_acc:.2f}%')
    # valid_loss, valid_acc = test_model(model, valid_dataloader, criterion, device) #criterion
    # print(f'VALID-- Loss: {valid_loss:.4f}, Accuracy: {valid_acc:.2f}%')

EPOCH: 1


100%|██████████| 4598/4598 [01:49<00:00, 42.05it/s]


TRAIN-- Loss: 0.0163, Accuracy: 87.35%
EPOCH: 2


  4%|▎         | 165/4598 [00:03<01:45, 41.95it/s]


KeyboardInterrupt: ignored

In [None]:
# Serializes the whole module object rather than only its state_dict.
# NOTE(review): loading this file presumably needs the StressModel class to be importable — confirm with the consumer.
torch.save(model, '/content/sample_data/best_model(70-110).pth')

# embedding_dim=70, hidden_dim=110, epochs=9

EPOCH: 1
100%|██████████| 4598/4598 [01:47<00:00, 42.95it/s]
TRAIN-- Loss: 0.0166, Accuracy: 87.14%, F1: 0.8714, Precision: 0.8714, Recall: 0.8714
EPOCH: 2
100%|██████████| 4598/4598 [01:50<00:00, 41.43it/s]
TRAIN-- Loss: 0.0074, Accuracy: 94.74%, F1: 0.9474, Precision: 0.9474, Recall: 0.9474
EPOCH: 3
100%|██████████| 4598/4598 [01:46<00:00, 43.09it/s]
TRAIN-- Loss: 0.0049, Accuracy: 96.75%, F1: 0.9675, Precision: 0.9675, Recall: 0.9675
EPOCH: 4
100%|██████████| 4598/4598 [01:45<00:00, 43.45it/s]
TRAIN-- Loss: 0.0038, Accuracy: 97.58%, F1: 0.9758, Precision: 0.9758, Recall: 0.9758
EPOCH: 5
100%|██████████| 4598/4598 [01:46<00:00, 43.35it/s]
TRAIN-- Loss: 0.0027, Accuracy: 98.36%, F1: 0.9836, Precision: 0.9836, Recall: 0.9836
EPOCH: 6
100%|██████████| 4598/4598 [01:46<00:00, 43.20it/s]
TRAIN-- Loss: 0.0022, Accuracy: 98.72%, F1: 0.9872, Precision: 0.9872, Recall: 0.9872
EPOCH: 7
100%|██████████| 4598/4598 [01:46<00:00, 43.34it/s]
TRAIN-- Loss: 0.0016, Accuracy: 99.12%, F1: 0.9912, Precision: 0.9912, Recall: 0.9912
EPOCH: 8
100%|██████████| 4598/4598 [01:46<00:00, 43.24it/s]
TRAIN-- Loss: 0.0013, Accuracy: 99.30%, F1: 0.9930, Precision: 0.9930, Recall: 0.9930
EPOCH: 9
100%|██████████| 4598/4598 [01:46<00:00, 43.30it/s]
TRAIN-- Loss: 0.0009, Accuracy: 99.54%, F1: 0.9954, Precision: 0.9954, Recall: 0.9954
