# Общие шаги по предсказанию моделью:
1. загрузка приватного тестового набора данных
2. проведение лемматизации
3. создание датасета и токенизация — каждый батч будет содержать токенизированные исходные слова и токенизированные леммы
4. загрузка модели
5. создание предсказаний (map_location=torch.device('cpu') — для данной модели впервые попробовал делать предсказания на CPU; до этого предсказания делались на GPU)
6. вывод нескольких получившихся слов
7. сохранение предсказания

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers pymystem3 git+https://github.com/Koziev/character-tokenizer -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for charactertokenizer (setup.py) ... [?25l[?25hdone


In [3]:
import charactertokenizer
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import re
from pymystem3 import Mystem

import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler

In [4]:
vowels = 'аеёиоуыэюя'  # lowercase Russian vowels; not referenced in the visible cells
BATCH_SIZE = 128  # batch size passed to the test DataLoader
epochs = 9  # not referenced in the visible cells (presumably left over from training — confirm)

In [6]:
# Load the private test set; header=None means the first line is treated as data, not as a header row.
data_test = pd.read_csv('/content/drive/MyDrive/rucode final/private_test_stresses.txt', header=None)

# Name the single column; per its name it holds the words without stress marks.
data_test.columns = ['without_stress']

# Лемматизация

In [7]:
mystem = Mystem()

def preprocess_text(text):
    """Return the first item of Mystem's lemmatization of ``text``.

    Uses the module-level ``mystem`` instance; only element ``[0]`` of the
    list returned by ``mystem.lemmatize`` is kept.
    """
    return mystem.lemmatize(text)[0]

data_test['lemma'] = data_test['without_stress'].progress_apply(preprocess_text)

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz
100%|██████████| 294252/294252 [00:27<00:00, 10757.77it/s]


In [8]:
class CustomDataset(Dataset):
    """Dataset of character-tokenized words paired with their tokenized lemmas.

    Args:
        without_stress: iterable of words; each item is converted with ``str``.
        lemma: iterable of lemmas, one per word; each item is converted with ``str``.
        targets: optional iterable of digit strings, one per word. Each string
            is turned into a list of ints; all strings must have the same
            length because the rows are stacked into one array.
        max_length: length every token sequence is padded or truncated to.

    Items are ``(word_ids, lemma_ids)`` when no targets were given, otherwise
    ``((word_ids, lemma_ids), target_row)``.
    """

    def __init__(self, without_stress, lemma, targets=None, max_length=40):
        self.tokenizer = charactertokenizer.CharacterTokenizer.from_pretrained('inkoziev/charllama-35M')
        self.max_length = max_length

        # Words first, then lemmas: same tokenizer settings for both.
        self.without_stress = self._encode(without_stress)
        self.lemmas = self._encode(lemma)

        if targets is None:
            self.targets = None
        else:
            per_word = [
                torch.tensor([int(digit) for digit in target])
                for target in targets
            ]
            self.targets = torch.stack(per_word).numpy()

    def _encode(self, items):
        """Tokenize ``items`` to a fixed-length tensor of input ids."""
        encoded = self.tokenizer(
            [str(item) for item in items],
            return_tensors='pt',
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
        )
        return encoded["input_ids"]

    def __len__(self):
        return len(self.without_stress)

    def __getitem__(self, idx):
        features = (self.without_stress[idx], self.lemmas[idx])
        if self.targets is None:
            return features
        return features, self.targets[idx]

In [9]:
# Tokenize the test words and their lemmas; no targets are passed, so each item is a (word_ids, lemma_ids) pair.
test_dataset = CustomDataset(data_test['without_stress'].values, data_test['lemma'].values)

# shuffle=False keeps the batch order equal to the row order of data_test.
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

Downloading (…)okenizer_config.json:   0%|          | 0.00/8.90k [00:00<?, ?B/s]

In [11]:
class StressModel(nn.Module):
    """Two-branch bidirectional LSTM that scores every character position.

    One branch encodes the token ids of the word, the other the token ids of
    its lemma. Each branch is an embedding followed by a 2-layer bidirectional
    LSTM, so each produces ``2 * hidden_dim`` features per position. The two
    outputs are concatenated along the feature axis and projected to a single
    score per position.

    Args:
        vocab_size: number of rows in each embedding table.
        embedding_dim: size of the token embeddings.
        hidden_dim: hidden size of each LSTM direction.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()

        # nn.Sequential ending in an LSTM returns the LSTM's (output, state) tuple.
        self.lstm_layer_1 = nn.Sequential(
            nn.Embedding(vocab_size, embedding_dim),
            nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=2),  # increasing num_layers gave no improvement
        )

        self.lstm_layer_2 = nn.Sequential(
            nn.Embedding(vocab_size, embedding_dim),
            nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True, num_layers=2),  # increasing num_layers gave no improvement
        )

        # Two bidirectional branches -> 2 * (2 * hidden_dim) input features.
        self.fc = nn.Linear(hidden_dim * 4, 1)
        self.dropout = nn.Dropout(0.05)  # Dropout may need to be increased

    def forward(self, batch):
        """Score each position of a batch.

        Args:
            batch: pair ``(stress, lemma)`` of integer id tensors with the
                same batch size and sequence length.

        Returns:
            Tensor with one score per position and a trailing dimension of
            size 1 (the output of the final linear layer).
        """
        stress, lemma = batch

        # Move inputs to wherever this module's weights live instead of
        # relying on a notebook-level ``device`` global, which may be
        # undefined or out of sync with the model.
        target_device = self.fc.weight.device
        stress, lemma = stress.to(target_device), lemma.to(target_device)

        stress_batch, _ = self.lstm_layer_1(stress)
        stress_out = self.dropout(stress_batch)

        lemma_batch, _ = self.lstm_layer_2(lemma)
        lemma_out = self.dropout(lemma_batch)

        # Concatenate the branches along the feature axis.
        combined_tensor = torch.cat([stress_out, lemma_out], dim=2)
        output = self.fc(combined_tensor)
        return output

In [13]:
# Device string; also passed to get_predictions when predictions are generated.
device = 'cpu'
# torch.load returns the whole pickled model object here (not just a state dict);
# map_location remaps the stored tensors onto the CPU.
# NOTE(review): unpickling can execute arbitrary code — only load checkpoints from a trusted source.
# NOTE(review): newer PyTorch releases reportedly default to weights_only=True, which would reject
# a full-model pickle — confirm against the installed version.
model = torch.load('/content/drive/MyDrive/rucode final/best_model(70-110).pth', map_location=torch.device('cpu'))
# model.to(device)

In [16]:
model.parameters

<bound method Module.parameters of StressModel(
  (lstm_layer_1): Sequential(
    (0): Embedding(658, 70)
    (1): LSTM(70, 110, num_layers=2, batch_first=True, bidirectional=True)
  )
  (lstm_layer_2): Sequential(
    (0): Embedding(658, 70)
    (1): LSTM(70, 110, num_layers=2, batch_first=True, bidirectional=True)
  )
  (fc): Linear(in_features=440, out_features=1, bias=True)
  (dropout): Dropout(p=0.05, inplace=False)
)>

In [17]:
params_count = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'Number of trainable parameters: {params_count}')

Number of trainable parameters: 997201


In [18]:
def post_process_output(outputs):
    """Turn each row of scores into a one-hot row marking its maximum.

    Args:
        outputs: tensor of scores; the maximum is taken along dimension 1
            (assumed to be 2-D, one row per sample — confirm against caller).

    Returns:
        Float tensor shaped like ``outputs`` with 1.0 at each row's argmax
        and 0.0 everywhere else.
    """
    winners = outputs.argmax(dim=1)
    row_ids = torch.arange(outputs.size(0))

    mask = torch.zeros_like(outputs)
    mask[row_ids, winners] = 1.0
    return mask.float()

In [19]:
# ---  with post process  ---
def get_predictions(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect one-hot predictions.

    Args:
        model: module called on each batch; its output is squeezed on
            dimension 2 before post-processing.
        dataloader: iterable of batches, wrapped in a tqdm progress bar.
        device: accepted for interface compatibility; not used in this
            function body.

    Returns:
        A list with one entry per sample, each a list of floats produced by
        ``post_process_output``.
    """
    model.eval()
    collected = []

    # Inference only: no gradients needed.
    with torch.no_grad():
        for batch in tqdm(dataloader):
            scores = model(batch).squeeze(2)
            rows = post_process_output(scores).cpu().numpy().tolist()
            collected += rows

    return collected

In [20]:
predictions = get_predictions(model, test_dataloader, device) # time to get predictions on CPU: ~17 minutes, on GPU: ~1-2 minutes; previously this was always run on GPU

100%|██████████| 2299/2299 [17:00<00:00,  2.25it/s]


In [21]:
data_test['predicted'] = predictions

In [22]:
def add_stress(arr, word):
    """Insert a ``'^'`` after every character of ``word`` flagged in ``arr``.

    Args:
        arr: sequence of per-position flags; a position is stressed when its
            value equals ``1.0``. Entries beyond ``len(word)`` are ignored.
        word: the word to annotate.

    Returns:
        ``word`` with ``'^'`` appended after each flagged character.
        Characters that have no corresponding entry in ``arr`` (``arr``
        shorter than ``word``) are kept unchanged instead of raising
        ``IndexError``.
    """
    marks = arr[:len(word)]
    covered = len(marks)
    return ''.join(
        char + '^' if i < covered and marks[i] == 1.0 else char
        for i, char in enumerate(word)
    )

In [23]:
# Render each prediction row as its word with the stress marks inserted by add_stress.
result_series = pd.Series([add_stress(arr, word) for arr, word in zip(data_test['predicted'], data_test['without_stress'])])
# Overwrite the raw prediction rows with the rendered strings.
data_test['predicted'] = result_series

In [24]:
full_test = data_test['predicted'].values.tolist()

In [25]:
# Look at a random sample of 20 of the resulting stressed words
import random
random_items = random.sample(full_test, 20)
print(random_items)

['прикипи^м', 'выпора^жнивавши', 'бронхоадени^ты', 'опосты^лев', 'заморо^сивши', 'хле^бные', 'продыми^ла', 'помолоде^в', 'вы^говорами', 'сверне^м', 'переклю^ют', 'пыря^ет', 'ге^ную', 'зали^занный', 'ме^ртвенными', 'проковырну^ла', 'абзе^тцер', 'потли^востью', 'по^шлину', 'нагримиро^вывалось']


In [None]:
# If needed, write the submission file (one predicted word per line)

# with open('/content/sample_data/privateTest+model(70-110).txt', 'w', encoding='utf-8') as f:
#     for item in full_test:
#         f.write("%s\n" % item)