In [None]:
import warnings
for warn in [UserWarning, FutureWarning]: warnings.filterwarnings("ignore", category = warn)

# Импорт необходимых библиотек
import os
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from tqdm import tqdm

from transformers import AutoTokenizer, AutoModel,AutoModelForMaskedLM
from sklearn.model_selection import train_test_split
import torch
import torch.nn.functional as F
from torch import Tensor
from einops import rearrange
from typing import Tuple, Callable
from torch.autograd import Function
import gc
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import root_mean_squared_error

from torch.utils.data import Dataset, DataLoader 
pd.set_option('display.max_columns', None)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
train = pd.read_csv('../data/train.csv')

In [None]:
def preprocess(df):
    df['activation_date'] = pd.to_datetime(df['activation_date'])

    df['day'] = df['activation_date'].dt.day
    df['month'] = df["activation_date"].dt.month
    df['year'] = df["activation_date"].dt.year
    df['weekday'] = df['activation_date'].dt.weekday
    df["dayofyear"] = df['activation_date'].dt.dayofyear
    df.drop(columns=['activation_date', 'item_id'], inplace=True)
    df['param_1'] = df['param_1'].fillna('')
    df['param_2'] = df['param_2'].fillna('')
    df['param_3'] = df['param_3'].fillna('')
    df['description'] = df['description'].fillna('')
    return df

In [3]:
class Dataset_avito(): 
    def __init__(self, part='train', len_1=15034, len_2=15034): 
        train = pd.read_csv('../data/train.csv')
        train_1 = train[train.deal_probability != 0.0].iloc[0:len_1]
        train_2 = train[train.deal_probability == 0.0].iloc[0:len_2]
        #train = train.iloc[0:15034]
        train = pd.concat([train_1, train_2])
        train = preprocess(train)
        X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=['deal_probability', 'image']), train['deal_probability'], test_size=0.2, random_state=42)
        self.x = X_train if part == 'train' else X_val
        self.y = y_train if part == 'train' else y_val
        self.n_samples = X_train.shape[0] if part == 'train' else X_val.shape[0]
        self.text = list(self.x.apply(lambda item: '\n'.join([ item["title"], str(item["description"]), item["region"], item["city"], item["parent_category_name"], item["category_name"], ('' if item["param_1"] is None else str(item["param_1"])), ('' if item["param_2"] is None else str(item["param_2"])), ('' if item["param_3"] is None else str(item["param_3"]))]), axis=1).values)
        user_type_dict = {'Private': 0, 'Company': 1, 'Shop': 2}
        self.tabular = list(self.x.apply(lambda item: torch.tensor([item["item_seq_number"], item["day"], item["month"], item["year"], item["weekday"], item["dayofyear"], user_type_dict[item["user_type"]], 0.0 if item["price"] is None else item["price"]]), axis=1).values)

    def __getitem__(self, index): 
        return self.tabular[index], self.text[index], np.array(self.y)[index] 
        
    def __len__(self): 
        return self.n_samples

In [4]:
BATCH_SIZE = 32
train_dataloader = DataLoader(dataset=Dataset_avito('train'), batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(dataset=Dataset_avito('val'), batch_size=BATCH_SIZE, shuffle=False)

In [4]:
BATCH_SIZE = 32
train_dataloader = DataLoader(dataset=Dataset_avito('train', len_1=7517, len_2=7517), batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(dataset=Dataset_avito('val', len_1=7517, len_2=7517), batch_size=BATCH_SIZE, shuffle=False)

In [6]:
from dataclasses import dataclass
from typing import ClassVar
from typing import List, Dict, Any, Tuple, Optional
@dataclass
class ModelTrainer:
    model: 'typing.Any'
    train_dataloader: DataLoader
    val_dataloader: DataLoader
    device: torch.device
    epochs: int
    round_loss: int
    round_rmse: int

    optimizer: torch.optim
    loss_fn: 'typing.Any'
    
    patience: int = 10 # Ранняя остановка обучения

    def __post_init__(self):        
        # История обучения и тестирования
        self.__history = pd.DataFrame({
            "train_avg": [], # Средние метрики на тренировочной выборке
            "val_avg": [], # Средние метрики на валидационной выборке
            "train_loss": [], # Loss на тренировочной выборке
            "val_loss": [], # Loss на валидационной выборке
        })

        # Количество шагов в одной эпохе
        self.__train_steps = len(self.train_dataloader)
        self.__val_steps = len(self.val_dataloader)

        self.__best_val_avg = 0
        self.__no_improvement_count = 0
        
        self.loss_fn = self.loss_fn

    @property
    def history(self) -> pd.DataFrame:
        """Получение DataFrame историей обучения и тестирования

        Returns:
            pd.DataFrame: **DataFrame** c историей обучения и тестирования
        """

        return self.__history

    @classmethod
    def _is_best_model(self, dev_avg: float) -> bool:
        """Проверка, является ли текущая модель лучшей на основе метрик валидации

        Args:
            test_accuracy (float): Текущая точность тестирования

        Returns:
            bool: True, если текущая модель лучшая, иначе False
        """

        try:
            min_val_avg = min(self.__history["val_avg"])
        except ValueError:
            min_val_avg = 10**10
        return dev_avg < min_val_avg

    def _save_model(self, epoch: int, path_to_model: str, test_rmse: float, loss: torch.Tensor) -> None:
        """Сохранение модели

        Args:
            epoch (int): Текущая эпоха
            path_to_model (str): Путь для сохранения модели
            test_rmse (float): rmse на тестовой выборке
            loss (torch.Tensor): Значение потерь
        """
        
        os.makedirs(path_to_model, exist_ok = True)
        self._best_model_name = f"{self.model.__class__.__name__}_{epoch}_{test_rmse}_checkpoint.pth"

        torch.save({
            "epoch": epoch,
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "test_loss": loss,
        }, os.path.join(path_to_model, f"{self.model.__class__.__name__}_{epoch}_{test_rmse}_checkpoint.pth"))
    
    # Процесс обучения
    def train(self, path_to_model: str) -> None:
        """Процесс обучения

        Args:
            path_to_model (str): Путь для сохранения моделей

        Returns:
            None
        """
        
        losses_train_list = []
        losses_val_list = []
        rmse_train_list = []
        rmse_val_list = []
        min_val_rmse = 10**10

        for epoch in range(1, self.epochs + 1):
            with torch.no_grad():
                torch.cuda.empty_cache()
            self.model.train() # Установка модели в режим обучения
            # Сумма Loss
            total_train_loss = 0
            total_val_loss = 0
            # Сумма rmse
            train_rmse = 0
            val_rmse = 0

            # Проход по всем тренировочным пакетам
            with tqdm(total = self.__train_steps, desc = f"Эпоха {epoch}", unit = "batch") as pbar_train:
                for batch, (tabular, text, targets) in enumerate(self.train_dataloader, 1):
                    tabular = tabular.unsqueeze(2).expand(-1, -1, 1024).to(device)
                    tabular = torch.nan_to_num(tabular,nan=0.0)
                    text_embedding = []
                    for i in range(len(text)):
                        encoded_input = feature_extractor_tokenizer(text[i], padding=True, truncation=True, return_tensors='pt').to(device)
                        with torch.no_grad():
                            features = feature_extractor_model(**encoded_input)[0][0]
                        text_embedding.append(features)
                    text_embedding = torch.nn.utils.rnn.pad_sequence(text_embedding, batch_first=True)
                    emb_concat = torch.concat((tabular, text_embedding), 1)
                    emb_concat = emb_concat.to(device)
                    targets = targets.to(device)
                    logits = self.model(emb_concat)
                    logits = torch.nan_to_num(logits, nan=0.0)
                    loss = self.loss_fn(logits, targets.float()) # Ошибка предсказаний

                    # Обратное распространение для обновления весов
                    self.optimizer.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                    self.optimizer.step()
        
                    total_train_loss += loss.item() # Потеря
                    # RMSE
                    train_rmse += root_mean_squared_error(targets.cpu().detach().numpy(), logits.cpu().detach().numpy())
        
                    pbar_train.update(1)
                    with torch.no_grad():
                        torch.cuda.empty_cache()

                # Средняя потеря
                avg_train_loss = round(total_train_loss / batch, self.round_loss)
                losses_train_list.append(avg_train_loss)
        
                # RMSE
                train_rmse = round(train_rmse / len(self.train_dataloader.dataset) * 100, self.round_rmse)
                rmse_train_list.append(train_rmse)
        
                pbar_train.set_postfix({
                    "rmse": train_rmse,
                    "Средняя потеря": avg_train_loss
                })
            
            
            # Установка модели в режим предсказаний
            self.model.eval()
        
            # Предсказания на валидационной выборке
            with torch.no_grad():
                with tqdm(total = self.__val_steps, desc = f"Тестирование {epoch}", unit = "batch") as pbar_val:
                    for batch, (tabular, text, targets) in enumerate(self.val_dataloader, 1):
                        text_embedding = []
                        tabular = tabular.unsqueeze(2).expand(-1, -1, 1024).to(device)
                        tabular = torch.nan_to_num(tabular,nan=0.0)
                        for i in range(len(text)):
                            encoded_input = feature_extractor_tokenizer(text[i], padding=True, truncation=True, return_tensors='pt').to(device)
                            with torch.no_grad():
                                features = feature_extractor_model(**encoded_input)[0][0]
                            text_embedding.append(features)
                        text_embedding = torch.nn.utils.rnn.pad_sequence(text_embedding, batch_first=True)
                        emb_concat = torch.concat((tabular, text_embedding), 1)
                        emb_concat = emb_concat.to(device)
                        targets = targets.to(device)
                        logits = self.model(emb_concat)
                        logits = torch.nan_to_num(logits, nan=0.0)
                        loss = self.loss_fn(logits, targets.float()) # Ошибка предсказаний
                        
                        total_val_loss += loss.item() # Потеря
                        # RMSE
                        val_rmse += root_mean_squared_error(targets.cpu().detach().numpy(), logits.cpu().detach().numpy())
        
                        pbar_val.update(1)
                        with torch.no_grad():
                            torch.cuda.empty_cache()
                    # Средняя потеря
                    avg_val_loss = round(total_val_loss / batch, self.round_loss)
                    losses_val_list.append(avg_val_loss)
        
                    # RMSE
                    val_rmse = round(val_rmse / len(self.val_dataloader.dataset) * 100, self.round_rmse)
                    rmse_val_list.append(val_rmse)
                    
                    pbar_val.set_postfix({
                        "rmse": val_rmse,
                        "Средняя потеря": avg_val_loss
                    })
            
            if val_rmse < min_val_rmse:
                min_val_rmse = val_rmse
                self._save_model(epoch, path_to_model, round(val_rmse, self.round_rmse), avg_val_loss)
                self.__best_dev_avg = val_rmse
                self.__no_improvement_count = 0
            else:
                self.__no_improvement_count += 1

            if self.__no_improvement_count >= self.patience:
                print(f"Ранняя остановка на эпохе {epoch} из-за отсутствия улучшения точности на тестовой выборке")
                break

    # Получение хэш-значения
    def __hash__(self):
        return id(self)

In [7]:
EPOCHS = 20 # Количество эпох
BATCH_SIZE = 32 # Размер выборки (пакета)
LEARNING_RATE = 1e-4 # Скорость обучения
ROUND_RMSE = 2 # Знаков Accuracy после запятой
ROUND_LOSS = 7 # Знаков Loss после запятой
ROOT_DIR = os.path.join(".")
PATH_TO_MODEL = os.path.join(ROOT_DIR, "Models_transformer")

In [8]:
feature_extractor_tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v3", code_revision='da863dd04a4e5dce6814c6625adfba87b83838aa', trust_remote_code=True)
feature_extractor_model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", code_revision='da863dd04a4e5dce6814c6625adfba87b83838aa', trust_remote_code=True).to(device)

flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn is not installed. Using PyTorch native attention implementation.
flash_attn i

In [9]:
class TransformerModelWithAttention(nn.Module):
    def __init__(self, input_dim = 1024, hidden_dim=128, num_heads = 4, num_layers = 8, dropout = 0.1):
        super(TransformerModelWithAttention, self).__init__()
        self.in_layer = nn.Linear(input_dim, hidden_dim)
        self.positional_encoding = nn.Parameter(torch.zeros(1, 10000, hidden_dim))
        encoder_layer = nn.TransformerEncoderLayer(d_model = hidden_dim, nhead = num_heads, dim_feedforward = hidden_dim, dropout = dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers = num_layers)
        self.fc_out = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = x.to(device)
        x = self.in_layer(x)
        batch_size, seq_len, _ = x.size()
        x = x + self.positional_encoding[:, :seq_len, :]
        encoder_output = self.transformer_encoder(x)
        x = encoder_output.mean(dim = 1)
        return torch.clamp(self.fc_out(x), 0.0, 1.0).flatten()

In [151]:
model_transformer = TransformerModelWithAttention(pooling=None,  num_layers=2, input_dim=1024, hidden_dim=128, num_heads=2).to(device)
optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
loss_fn = nn.MSELoss()
trainer = ModelTrainer(model_transformer, train_dataloader, val_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_RMSE, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)

Эпоха 1: 100%|██████████| 376/376 [06:31<00:00,  1.04s/batch, rmse=0.91, Средняя потеря=0.0893]
Тестирование 1: 100%|██████████| 94/94 [01:35<00:00,  1.02s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 2: 100%|██████████| 376/376 [06:27<00:00,  1.03s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 2: 100%|██████████| 94/94 [01:35<00:00,  1.01s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 3: 100%|██████████| 376/376 [06:26<00:00,  1.03s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 3: 100%|██████████| 94/94 [01:35<00:00,  1.01s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 4: 100%|██████████| 376/376 [06:23<00:00,  1.02s/batch, rmse=0.91, Средняя потеря=0.089]
Тестирование 4: 100%|██████████| 94/94 [01:35<00:00,  1.01s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 5: 100%|██████████| 376/376 [06:24<00:00,  1.02s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 5: 100%|██████████| 94/94 [01:35<00:00,  1.01s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 6: 100%|██

Ранняя остановка на эпохе 11 из-за отсутствия улучшения точности на тестовой выборке





In [None]:
model_transformer = TransformerModelWithAttention(num_layers=2, input_dim=1024, hidden_dim=128, num_heads=2).to(device)
optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
loss_fn = nn.MSELoss()
trainer = ModelTrainer(model_transformer, train_dataloader, val_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_RMSE, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)

Эпоха 1: 100%|██████████| 376/376 [06:42<00:00,  1.07s/batch, rmse=0.91, Средняя потеря=0.0896]
Тестирование 1: 100%|██████████| 94/94 [02:02<00:00,  1.30s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 2: 100%|██████████| 376/376 [06:28<00:00,  1.03s/batch, rmse=0.91, Средняя потеря=0.089]
Тестирование 2: 100%|██████████| 94/94 [01:49<00:00,  1.16s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 3: 100%|██████████| 376/376 [11:18<00:00,  1.81s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 3: 100%|██████████| 94/94 [02:16<00:00,  1.45s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 4: 100%|██████████| 376/376 [06:25<00:00,  1.02s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 4: 100%|██████████| 94/94 [01:36<00:00,  1.02s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 5: 100%|██████████| 376/376 [10:25<00:00,  1.66s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 5: 100%|██████████| 94/94 [03:15<00:00,  2.08s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 6: 100%|██

In [13]:
model_transformer = TransformerModelWithAttention(num_layers=2, input_dim=1024, hidden_dim=128, num_heads=2).to(device)
optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
loss_fn = nn.MSELoss()
trainer = ModelTrainer(model_transformer, train_dataloader, val_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_RMSE, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)

Эпоха 1: 100%|██████████| 376/376 [08:31<00:00,  1.36s/batch, rmse=0.91, Средняя потеря=0.0891]
Тестирование 1: 100%|██████████| 94/94 [03:20<00:00,  2.13s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 2: 100%|██████████| 376/376 [13:14<00:00,  2.11s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 2: 100%|██████████| 94/94 [03:14<00:00,  2.07s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 3: 100%|██████████| 376/376 [13:02<00:00,  2.08s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 3: 100%|██████████| 94/94 [03:15<00:00,  2.08s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 4: 100%|██████████| 376/376 [13:05<00:00,  2.09s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 4: 100%|██████████| 94/94 [03:14<00:00,  2.07s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 5: 100%|██████████| 376/376 [13:26<00:00,  2.14s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 5: 100%|██████████| 94/94 [03:20<00:00,  2.14s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 6: 100%|█

Ранняя остановка на эпохе 11 из-за отсутствия улучшения точности на тестовой выборке





In [None]:
model_transformer = TransformerModelWithAttention(num_layers=2, input_dim=1024, hidden_dim=128, num_heads=2).to(device)
optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
loss_fn = nn.MSELoss()
trainer = ModelTrainer(model_transformer, train_dataloader, val_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_RMSE, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)

Эпоха 1: 100%|██████████| 376/376 [13:13<00:00,  2.11s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 1: 100%|██████████| 94/94 [03:16<00:00,  2.09s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 2: 100%|██████████| 376/376 [13:12<00:00,  2.11s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 2: 100%|██████████| 94/94 [03:16<00:00,  2.10s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 3: 100%|██████████| 376/376 [13:12<00:00,  2.11s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 3: 100%|██████████| 94/94 [03:16<00:00,  2.09s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 4: 100%|██████████| 376/376 [13:12<00:00,  2.11s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 4: 100%|██████████| 94/94 [03:16<00:00,  2.09s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 5: 100%|██████████| 376/376 [13:13<00:00,  2.11s/batch, rmse=0.91, Средняя потеря=0.0889]
Тестирование 5: 100%|██████████| 94/94 [03:16<00:00,  2.09s/batch, rmse=0.89, Средняя потеря=0.0849]
Эпоха 6: 100%|█

Ранняя остановка на эпохе 11 из-за отсутствия улучшения точности на тестовой выборке





новый dataset

BATCH_SIZE = 32
train_dataloader = DataLoader(dataset=Dataset_avito('train', len_1=7517, len_2=7517), batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(dataset=Dataset_avito('val', len_1=7517, len_2=7517), batch_size=BATCH_SIZE, shuffle=False)

In [None]:
model_transformer = TransformerModelWithAttention(num_layers=2, input_dim=1024, hidden_dim=128, num_heads=2).to(device)
optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
loss_fn = nn.MSELoss()
trainer = ModelTrainer(model_transformer, train_dataloader, val_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_RMSE, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)

Эпоха 1: 100%|██████████| 376/376 [10:22<00:00,  1.66s/batch, rmse=1.09, Средняя потеря=0.125]
Тестирование 1: 100%|██████████| 94/94 [02:33<00:00,  1.63s/batch, rmse=1.06, Средняя потеря=0.12]
Эпоха 2: 100%|██████████| 376/376 [10:14<00:00,  1.64s/batch, rmse=1.09, Средняя потеря=0.125]
Тестирование 2: 100%|██████████| 94/94 [02:31<00:00,  1.61s/batch, rmse=1.06, Средняя потеря=0.12]
Эпоха 3: 100%|██████████| 376/376 [10:09<00:00,  1.62s/batch, rmse=1.09, Средняя потеря=0.125]
Тестирование 3: 100%|██████████| 94/94 [02:31<00:00,  1.61s/batch, rmse=1.06, Средняя потеря=0.12]
Эпоха 4:  91%|█████████ | 343/376 [09:07<00:53,  1.62s/batch]

In [None]:
model_transformer = TransformerModelWithAttention(num_layers=2, input_dim=1024, hidden_dim=128, num_heads=2).to(device)
optimizer = optim.Adam(params = model_transformer.parameters(), lr = LEARNING_RATE)
loss_fn = nn.MSELoss()
trainer = ModelTrainer(model_transformer, train_dataloader, val_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_RMSE, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)

Эпоха 1: 100%|██████████| 752/752 [12:55<00:00,  1.03s/batch, rmse=1.09, Средняя потеря=0.124]
Тестирование 1: 100%|██████████| 188/188 [03:12<00:00,  1.02s/batch, rmse=1.09, Средняя потеря=0.126]
Эпоха 2: 100%|██████████| 752/752 [13:03<00:00,  1.04s/batch, rmse=1.08, Средняя потеря=0.124]
Тестирование 2: 100%|██████████| 188/188 [03:13<00:00,  1.03s/batch, rmse=1.09, Средняя потеря=0.126]
Эпоха 3: 100%|██████████| 752/752 [12:57<00:00,  1.03s/batch, rmse=1.08, Средняя потеря=0.124]
Тестирование 3: 100%|██████████| 188/188 [03:12<00:00,  1.02s/batch, rmse=1.09, Средняя потеря=0.126]
Эпоха 4:  57%|█████▋    | 429/752 [07:22<05:36,  1.04s/batch]

## Mamba

In [10]:
BATCH_SIZE = 32
train_dataloader = DataLoader(dataset=Dataset_avito('train', len_1=7517, len_2=7517), batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(dataset=Dataset_avito('val', len_1=7517, len_2=7517), batch_size=BATCH_SIZE, shuffle=False)

In [9]:
PATH_TO_MODEL = os.path.join(ROOT_DIR, "Models_mamba")

In [11]:
from torch.nn.functional import silu
from torch.nn.functional import softplus
from einops import rearrange, repeat, einsum
class RMSNorm(nn.Module):
    def __init__(self, d_model: int, eps: float = 1e-8) -> None:
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(d_model))

    def forward(self, x: Tensor) -> Tensor:        
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim = True) + self.eps) * self.weight

class Mamba(nn.Module):
    def __init__(self, num_layers, d_input, d_model, d_state=16, d_discr=None, ker_size=4):
        super().__init__()
        mamba_par = {
            'd_input' : d_input,
            'd_model' : d_model,
            'd_state' : d_state,
            'd_discr' : d_discr,
            'ker_size': ker_size
        }
        self.layers = nn.ModuleList([nn.ModuleList([MambaBlock(**mamba_par), RMSNorm(d_input)]) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_input, 1)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
    def forward(self, seq, cache=None):
        seq = seq.to(self.device)
        for mamba, norm in self.layers:
            out, cache = mamba(norm(seq), cache)
            seq = out + seq
        return self.fc_out(seq.mean(dim = 1))
        
class MambaBlock(nn.Module):
    def __init__(self, d_input, d_model, d_state=16, d_discr=None, ker_size=4):
        super().__init__()
        d_discr = d_discr if d_discr is not None else d_model // 16
        self.in_proj  = nn.Linear(d_input, 2 * d_model, bias=False)
        self.out_proj = nn.Linear(d_model, d_input, bias=False)
        self.s_B = nn.Linear(d_model, d_state, bias=False)
        self.s_C = nn.Linear(d_model, d_state, bias=False)
        self.s_D = nn.Sequential(nn.Linear(d_model, d_discr, bias=False), nn.Linear(d_discr, d_model, bias=False),)
        self.conv = nn.Conv1d(
            in_channels=d_model,
            out_channels=d_model,
            kernel_size=ker_size,
            padding=ker_size - 1,
            groups=d_model,
            bias=True,
        )
        self.A = nn.Parameter(torch.arange(1, d_state + 1, dtype=torch.float).repeat(d_model, 1))
        self.D = nn.Parameter(torch.ones(d_model, dtype=torch.float))
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
    def forward(self, seq, cache=None):
        b, l, d = seq.shape
        (prev_hid, prev_inp) = cache if cache is not None else (None, None)
        a, b = self.in_proj(seq).chunk(2, dim=-1)
        x = rearrange(a, 'b l d -> b d l')
        x = x if prev_inp is None else torch.cat((prev_inp, x), dim=-1)
        a = self.conv(x)[..., :l]
        a = rearrange(a, 'b d l -> b l d')
        a = silu(a)
        a, hid = self.ssm(a, prev_hid=prev_hid) 
        b = silu(b)
        out = a * b
        out =  self.out_proj(out)
        if cache:
            cache = (hid.squeeze(), x[..., 1:])   
        return out, cache
    
    def ssm(self, seq, prev_hid):
        A = -self.A
        D = +self.D
        B = self.s_B(seq)
        C = self.s_C(seq)
        s = softplus(D + self.s_D(seq))
        A_bar = einsum(torch.exp(A), s, 'd s,   b l d -> b l d s')
        B_bar = einsum(          B,  s, 'b l s, b l d -> b l d s')
        X_bar = einsum(B_bar, seq, 'b l d s, b l d -> b l d s')
        hid = self._hid_states(A_bar, X_bar, prev_hid=prev_hid)
        out = einsum(hid, C, 'b l d s, b l s -> b l d')
        out = out + D * seq
        return out, hid
    
    def _hid_states(self, A, X, prev_hid=None):
        b, l, d, s = A.shape
        A = rearrange(A, 'b l d s -> l b d s')
        X = rearrange(X, 'b l d s -> l b d s')
        if prev_hid is not None:
            return rearrange(A * prev_hid + X, 'l b d s -> b l d s')
        h = torch.zeros(b, d, s, device=self.device)
        return torch.stack([h := A_t * h + X_t for A_t, X_t in zip(A, X)], dim=1)

In [None]:
model_mamba = Mamba(num_layers=2, d_input=1024, d_model=128).to(device)
optimizer = optim.Adam(params = model_mamba.parameters(), lr = LEARNING_RATE)
loss_fn = nn.MSELoss()
trainer = ModelTrainer(model_mamba, train_dataloader, val_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_RMSE, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)

Эпоха 1: 100%|██████████| 376/376 [13:47<00:00,  2.20s/batch, rmse=114, Средняя потеря=9.14e+3]
Тестирование 1: 100%|██████████| 94/94 [03:14<00:00,  2.07s/batch, rmse=158, Средняя потеря=2.69e+4]
Эпоха 2: 100%|██████████| 376/376 [13:38<00:00,  2.18s/batch, rmse=70.6, Средняя потеря=2.31e+3]
Тестирование 2: 100%|██████████| 94/94 [03:16<00:00,  2.09s/batch, rmse=170, Средняя потеря=3.12e+4]
Эпоха 3: 100%|██████████| 376/376 [13:59<00:00,  2.23s/batch, rmse=61.3, Средняя потеря=1.28e+3]
Тестирование 3: 100%|██████████| 94/94 [03:06<00:00,  1.99s/batch, rmse=66.1, Средняя потеря=4.7e+3]
Эпоха 4: 100%|██████████| 376/376 [14:21<00:00,  2.29s/batch, rmse=73.5, Средняя потеря=2e+3]
Тестирование 4: 100%|██████████| 94/94 [03:25<00:00,  2.19s/batch, rmse=61.3, Средняя потеря=4.03e+3]
Эпоха 5: 100%|██████████| 376/376 [15:17<00:00,  2.44s/batch, rmse=61.9, Средняя потеря=1.99e+3]
Тестирование 5: 100%|██████████| 94/94 [03:23<00:00,  2.17s/batch, rmse=7.94, Средняя потеря=67.2]
Эпоха 6: 100%|█

## LSTM

In [11]:
class LSTM(nn.Module):
    def __init__(self, input_size = 1024, hidden_size = 64, num_layers = 2, dropout = 0.1, bidirectional=True):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.lstm = nn.LSTM(
            input_size = input_size,
            hidden_size = hidden_size,
            num_layers = num_layers,
            batch_first = True,
            dropout = dropout,
            bidirectional=bidirectional
        )
        if bidirectional:
            self.fc = nn.Linear(2 * hidden_size, 1)
        else:
            self.fc = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        if self.lstm.bidirectional:
            h0, c0 = torch.zeros(2 * self.num_layers, len(x), self.hidden_size).to(device), torch.zeros(2 * self.num_layers, len(x), self.hidden_size).to(device)
        else:
            h0, c0 = torch.zeros(self.num_layers, len(x), self.hidden_size).to(device), torch.zeros(self.num_layers, len(x), self.hidden_size).to(device)
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        if self.lstm.bidirectional:
            out = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=1)
        else:
            out = out[:, -1, :]
        out = self.fc(out)
        return out

In [12]:
PATH_TO_MODEL = os.path.join(ROOT_DIR, "Models_lstm")

In [None]:
model_lstm = LSTM().to(device)
optimizer = optim.Adam(params = model_lstm.parameters(), lr = LEARNING_RATE)
loss_fn = nn.MSELoss()
trainer = ModelTrainer(model_lstm, train_dataloader, val_dataloader, device, EPOCHS, ROUND_LOSS, ROUND_RMSE, optimizer, loss_fn)
trainer.train(PATH_TO_MODEL)

Эпоха 1: 100%|██████████| 376/376 [25:15<00:00,  4.03s/batch, rmse=0.91, Средняя потеря=0.0858]
Тестирование 1: 100%|██████████| 94/94 [07:48<00:00,  4.98s/batch, rmse=0.89, Средняя потеря=0.0836]
Эпоха 2: 100%|██████████| 376/376 [28:05<00:00,  4.48s/batch, rmse=0.9, Средняя потеря=0.0854]
Тестирование 2: 100%|██████████| 94/94 [06:45<00:00,  4.31s/batch, rmse=0.89, Средняя потеря=0.0832]
Эпоха 3: 100%|██████████| 376/376 [27:07<00:00,  4.33s/batch, rmse=0.9, Средняя потеря=0.0852]
Тестирование 3: 100%|██████████| 94/94 [06:33<00:00,  4.19s/batch, rmse=0.89, Средняя потеря=0.083]
Эпоха 4: 100%|██████████| 376/376 [26:43<00:00,  4.27s/batch, rmse=0.9, Средняя потеря=0.0852]
Тестирование 4: 100%|██████████| 94/94 [06:47<00:00,  4.34s/batch, rmse=0.89, Средняя потеря=0.0835]
Эпоха 5: 100%|██████████| 376/376 [27:22<00:00,  4.37s/batch, rmse=0.9, Средняя потеря=0.0852]
Тестирование 5: 100%|██████████| 94/94 [06:53<00:00,  4.40s/batch, rmse=0.89, Средняя потеря=0.0831]
Эпоха 6: 100%|██████