# SOLUTION

Это второй ноутбук
Здесь реализация модельки DSMM

In [196]:
import pandas as pd
import numpy as np

import pytorch_lightning as pl

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as td

import shutil

seed = 69
np.random.seed(seed)

# PREPROC
Возьмем подготовленные данные с прошлого ноутбука

In [197]:
preproc = pd.read_csv('data/preproc.csv', index_col=0)
preproc.head()

Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
0,0,10,1,0,1,3,3,0.333333,0.0,0.333333,...,0,0.0,0.454545,0.890238,8.655534,1.0,0.077778,0.002222,1,0.333333
1,1,10,3,0,3,0,3,1.0,0.0,1.0,...,0,0.0,0.0,0.773976,23.130514,0.0,0.027826,0.00043,44,14.666667
2,0,10,3,0,2,0,3,1.0,0.0,0.666667,...,0,0.0,0.0,0.918308,13.351339,0.0,0.014925,0.000104,22,7.333333
3,1,10,3,0,3,0,3,1.0,0.0,1.0,...,0,0.0,0.0,0.975355,18.240926,0.0,0.05314,0.000255,8,2.666667
4,2,10,3,0,3,1,3,1.0,0.0,1.0,...,273,79.670665,0.2,0.990119,31.786048,0.333333,0.046512,0.000307,24,8.0


# SCALING

Для нейронок уже надо бы сделать скейлинг данных

Есть столбцы с только целыми значениями, они потенциально могут быть категориальными, по-хорошему с категориальные величины лучше не скейлить, а добавить в модельку их эмбединги, но идея просто наудачу добавлять в модель десятки слоев эмбедингов мне не очень нравится, так что буду относиться к ним как просто числовым значениям
и заскейлим их

In [198]:
from sklearn.preprocessing import MaxAbsScaler

scaled = preproc.copy()
scaler = MaxAbsScaler()

to_scale = scaled.columns.tolist()
to_scale.remove('query_id')
to_scale.remove('rank')

scaled[to_scale] = scaler.fit_transform(scaled[to_scale])

In [199]:
scaled.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rank,235258.0,0.677869,0.830033,0.0,0.000000,0.000000,1.000000,4.0
query_id,235258.0,14828.413401,8193.945170,10.0,8215.000000,14935.000000,21580.000000,29995.0
feature_0,235258.0,0.061676,0.039915,0.0,0.032258,0.064516,0.096774,1.0
feature_1,235258.0,0.011457,0.032172,0.0,0.000000,0.000000,0.000000,1.0
feature_2,235258.0,0.044068,0.038416,0.0,0.000000,0.037037,0.074074,1.0
...,...,...,...,...,...,...,...,...
feature_139,235258.0,0.281747,0.392089,0.0,0.000000,0.000000,0.500000,1.0
feature_140,235258.0,0.038007,0.046890,0.0,0.009425,0.024972,0.049144,1.0
feature_141,235258.0,0.001074,0.008352,0.0,0.000000,0.000032,0.000436,1.0
feature_142,235258.0,0.001490,0.004226,0.0,0.000265,0.000796,0.001858,1.0


# SPLIT

разобьем данные

In [200]:
from sklearn.model_selection import train_test_split

scaled_train, scaled_test = train_test_split(scaled, test_size=0.2, shuffle=True, random_state=seed)


# MODEL

Идея такая: мы попытаемся обучить модель так, чтобы она создала такое векторное пространство, где вокруг представления конкретного запроса будут вектора релевантных документов
Поэтому будем обучать с triplet loss, чтобы отталкивать негативные интеракции и приближать позитивные

Выделяем позитивные и негативные примеры, берем фичи документов

In [201]:
positives = scaled_train[scaled_train['rank'] > 2].copy()
negatives = scaled_train[scaled_train['rank'] < 2].copy()
rank3 = scaled_train[scaled_train['rank'] == 3].copy()
doc_features = scaled.loc[:, ~scaled_train.columns.isin(['query_id', 'rank'])]
pos_ind = positives.index.to_series()


тут формируем тройки, где для каждого запроса выбирается позитивный пример и негативный, для попытки задания разницы между высокими рангами добавляю несколько троек, где для документа с рангом 4 негативным будет документ с рангом 3

In [None]:
from random import random

triplets = positives.loc[:, positives.columns.isin(['query_id'])]
triplets['pos'] = pos_ind
NUM_NEGATIVE_SAMPLES = 10
triplets = pd.concat([triplets] * NUM_NEGATIVE_SAMPLES).sort_index().reset_index(drop=True)


def fill_neg(row):
    print()
    if scaled_train.loc[row['pos'], 'rank'] == 4 and random() < 0.2:
        if len(rank3[rank3['query_id'] == row['query_id']]) != 0:
            return np.random.choice(rank3[rank3['query_id'] == row['query_id']].index)

    return np.random.choice(negatives[negatives['query_id'] == row['query_id']].index)


triplets['neg'] = triplets.apply(fill_neg, axis=1)

In [203]:
triplets.head()

Unnamed: 0,query_id,pos,neg
0,10,22,10
1,10,22,15
2,10,22,12
3,10,22,61
4,10,22,5


In [204]:
doc_features_n = doc_features.shape[1]

разбиваем на обучение и валидацию для понимания как проходит обучение

In [205]:
rdm = np.random.random(len(triplets))
train_data = triplets[rdm < 0.8]
val_data = triplets[(rdm >= 0.8)]

len(train_data), len(val_data)

(38559, 9621)

Тут мы создаем фичи запроса, формируя вектора из 15 айдишников релевантных запросов

In [None]:
from random import shuffle


def pad_with_specific_value(lst, size, val):
    lst = list(set(lst))
    shuffle(lst)
    lst = lst[:size]
    print(lst)
    return np.pad(lst, (0, size - len(lst)), 'constant', constant_values=(val))


padded_query = triplets.groupby("query_id").apply(lambda x: (
    pad_with_specific_value(x['pos'].tolist(), 15, 300000).tolist()
))

padded_query = padded_query.reindex(range(30000), fill_value=[300000] * 15)
padded_query = np.stack(padded_query.values)

In [207]:
class DSSMData(pl.LightningDataModule):
    def __init__(self, train_triplets, val_triplets, doc_features, padded_queries):
        super().__init__()
        self.train_triplets = train_triplets
        self.val_triplets = val_triplets
        self.doc_features = doc_features
        self.padded_queries = padded_queries

    def _collect_data(self, triplets):
        queries = triplets["query_id"].values
        positives = triplets["pos"].values
        negatives = triplets["neg"].values

        # Удаляем позитивные интеракции с документами, чтобы не было даталика
        relevant_docs = self.padded_queries[queries]
        relevant_docs[relevant_docs == positives.reshape(-1, 1)] = 50000
        relevant_docs[relevant_docs == negatives.reshape(-1, 1)] = 50000

        return td.TensorDataset(
            torch.from_numpy(relevant_docs).long(),
            torch.from_numpy(self.doc_features.loc[positives].values).double(),
            torch.from_numpy(self.doc_features.loc[negatives].values).double()
        )

    def prepare_data(self):
        self.train_dataset = self._collect_data(self.train_triplets)
        self.val_dataset = self._collect_data(self.val_triplets)

    def train_dataloader(self):
        return td.DataLoader(self.train_dataset, batch_size=2048, shuffle=True, num_workers=0)

    def val_dataloader(self):
        return td.DataLoader(self.val_dataset, batch_size=2048, num_workers=0)


Архитектура сеток для перевода документов и запросов в латентное представление

In [208]:
class DocNet(nn.Module):
    def __init__(self, n_factors,
                 activation = F.relu):
        super().__init__()
        self.dense_layer = nn.Linear(doc_features_n, n_factors, bias=False)
        self.output_layer = nn.Linear(n_factors, n_factors, bias=False)
        self.activation = activation

    def forward(self, doc_features):
        features = self.activation(self.dense_layer(doc_features))
        output = self.output_layer(features)
        return output


class QueryNet(nn.Module):
    def __init__(self, n_factors, num_embeddings,
                 activation = F.relu):
        super().__init__()
        self.query_embeddings = nn.EmbeddingBag(num_embeddings + 1, n_factors, padding_idx=num_embeddings)
        self.dense_layer = nn.Linear(n_factors, n_factors, bias=False)
        self.output_layer = nn.Linear(n_factors + n_factors, n_factors, bias=False)
        self.activation = activation

    def forward(self, query_docs):
        interactions_emb = self.query_embeddings(query_docs)
        features = self.activation(self.dense_layer(interactions_emb))
        x = torch.concat([interactions_emb, features], axis=1)
        output = self.output_layer(x)
        return output

Сама модель

In [229]:
class DSSM(pl.LightningModule):
    def __init__(
            self,
            doc_number,
            embedding_dim = 100,
            activation = F.relu,
            lr = 1e-3,
            triplet_loss_margin = 0.4,
            weight_decay= 1e-6,
            log_to_prog_bar = True,
    ):
        super().__init__()
        self.lr = lr
        self.triplet_loss_margin = triplet_loss_margin
        self.weight_decay = weight_decay
        self.log_to_prog_bar = log_to_prog_bar
        self.doc_net = DocNet(embedding_dim, activation)
        self.query_net = QueryNet(embedding_dim, doc_number)

    def forward(
            self,
            query_ids,
            doc_features_pos,
            doc_features_neg,
    ) :
        anchor = self.query_net(query_ids)
        pos = self.doc_net(doc_features_pos)
        neg = self.doc_net(doc_features_neg)

        return anchor, pos, neg

    def _step(self, batch, batch_idx, metric, prog_bar=False):
        query_ids, pos, neg = batch
        anchor, positive, negative = self(query_ids, pos, neg)
        loss = F.triplet_margin_loss(anchor, positive, negative, margin=self.triplet_loss_margin)
        self.log(metric, loss, prog_bar=prog_bar)
        return loss

    def training_step(self, batch, batch_idx):
        return self._step(batch, batch_idx, "train_loss")

    def validation_step(self, batch, batch_idx):
        return self._step(batch, batch_idx, "val_loss", self.log_to_prog_bar)

    def inference(self, dataloader, mode = "doc"):
        batches = []
        query_ids = []
        if (mode == "query"):
            model = self.query_net
        elif (mode == "doc"):
            model = self.doc_net
        else:
            raise ValueError(f"Unsupported model {mode}!")

        self.eval()
        for batch in dataloader:
            ids, features = batch
            with torch.no_grad():
                v_batch = model(features.to(self.device))
            batches.append(v_batch)
            query_ids.append(ids)
        vectors = torch.cat(batches, dim=0).cpu().numpy()
        vectors_ids = torch.cat(query_ids, dim=0).cpu().numpy()
        return vectors_ids, vectors

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)
        scheduler = {
            'scheduler': lr_scheduler,
            'monitor': 'val_loss'
        }
        return [optimizer], [scheduler]

# FIT

Собираем модель и начинаем обучать

In [230]:
data_module = DSSMData(train_data, val_data, doc_features, padded_query)
net = DSSM(doc_number=300000, embedding_dim=64).double()

checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_loss")

trainer = pl.Trainer(
    max_epochs=100,
    accelerator='gpu',
    devices=1,
    callbacks=[
        pl.callbacks.early_stopping.EarlyStopping(monitor="val_loss", patience=5),
        pl.callbacks.LearningRateMonitor(logging_interval="step"),
        checkpoint_callback,
    ])

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [231]:
trainer.fit(
    net,
    data_module
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type     | Params
---------------------------------------
0 | doc_net   | DocNet   | 12.9 K
1 | query_net | QueryNet | 19.2 M
---------------------------------------
19.2 M    Trainable params
0         Non-trainable params
19.2 M    Total params
76.901    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00082: reducing learning rate of group 0 to 1.0000e-04.


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 00092: reducing learning rate of group 0 to 1.0000e-05.


Validation: 0it [00:00, ?it/s]

Сохраняем

In [232]:
shutil.move(checkpoint_callback.best_model_path, "models/dssmFinal.ckpt")

'models/dssmFinal.ckpt'

# PREDICT
Загружаем модельку
Достаем из неё векторные представления запросов и документов

In [233]:
best = DSSM.load_from_checkpoint("models/dssmFinal.ckpt", doc_number=300000, embedding_dim=64)

In [234]:
doc_features = doc_features.sort_index()
docs = torch.from_numpy(doc_features.index.values)
inf_docs = torch.from_numpy(doc_features.values).double()
docs_ds = td.TensorDataset(docs, inf_docs)

In [235]:
inf_dl_docs = td.DataLoader(docs_ds, batch_size=128, shuffle=False, num_workers=1)
doc_ids, doc_embeddings = best.double().inference(inf_dl_docs)

In [236]:
scaled_test['index'] = scaled_test.index.to_series()
positives = scaled_test
interactions = pd.pivot_table(scaled_test, values="rank", index="query_id", columns="index").fillna(0)

In [237]:
interactions = interactions.sort_index()
query_ids_inf = torch.from_numpy(interactions.index.values)
query_inf_feat = torch.from_numpy(padded_query[interactions.index.values]).long()
query_ds = td.TensorDataset(query_ids_inf, query_inf_feat)

In [238]:
inf_dl_queries = td.DataLoader(query_ds, batch_size=128, shuffle=False, num_workers=1)
query_ids, query_embeddings = best.double().inference(inf_dl_queries, "query")

In [239]:
doc_embeddings.shape, query_embeddings.shape

((235258, 64), (1987, 64))

In [249]:
query_id_to_emb = dict()
for i in range(len(query_ids)):
    query_id_to_emb[query_ids[i]] = i

Т.к. модель напрямую не выдает скор для документов, мы возьмем за скор документа расстояние от вектора запроса до вектора этого документа (ну и преобразуем, чтобы скор до ближайшего был как можно больше)

In [242]:
def predict(doc_embeddings, query_embeddings, query_id_to_emb, df):
    distances = []

    query_embedding = query_embeddings[query_id_to_emb[df['query_id'].iloc[0]]]
    for index, _ in df.iterrows():
        doc_emb = doc_embeddings[index]
        distances.append(np.linalg.norm(doc_emb - query_embedding))
    distances = np.array(distances)
    distances = distances.max() - distances
    return distances


In [243]:
predictions = scaled_test.groupby('query_id').apply(
    lambda x: predict(doc_embeddings, query_embeddings, query_id_to_emb, x))
predictions

query_id
10       [0.09616305751808518, 0.3560776439293001, 1.08...
25       [1.9165818353036834, 1.904982917081322, 0.9706...
40       [2.840935658087939, 0.34478129959519066, 2.744...
55       [0.0, 2.645722919893572, 2.101561207774747, 2....
70       [1.5151593127372252, 3.0510110351294664, 1.100...
                               ...                        
29935    [5.426363305170114, 4.855221033556867, 3.29529...
29950    [3.2299465247145065, 1.3555846269336085, 2.358...
29965    [2.5011508517245833, 4.664832656723473, 2.5435...
29980    [1.2815959738161773, 0.6006971066722882, 2.324...
29995    [2.345748551133193, 1.3516571457445572, 0.0, 0...
Length: 1987, dtype: object

# RESULT

In [244]:
y_true = scaled_test.groupby('query_id').apply(lambda x: [x.loc[i]['rank'] for i in x.index])

In [245]:
from sklearn.metrics import ndcg_score


def ndcg(k=None):
    return lambda y_true, y_pred: ndcg_score([y_true], [y_pred], k=k)


from sklearn.metrics import precision_score


def precision_at_k(k=None):
    return lambda y_true, y_pred: precision_at_k_func(y_true, y_pred, k=k)


def precision_at_k_func(y_true, y_pred, k=None):
    if k and len(y_pred) > k:
        y_pred = y_pred[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(y_pred):
        if p in y_true and p not in y_pred[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not y_true:
        return 0.0
    if k:
        return score / min(len(y_true), k)
    return score / len(y_true)


def score(y_true, y_predictions, metric=precision_score):
    res = []
    for i, y_pred in enumerate(y_predictions):
        if len(y_pred) > 1:
            res.append(metric(y_true.iloc[i], y_pred))
    return sum(res) / len(res)


Как и для XGBRanker оценки надо перевести в видо от 0 до 4

In [246]:
def scale_and_transform(x, num_classes=5):
    scaled_x = ((x - np.min(x)) / (np.max(x) - np.min(x))) * (num_classes - 1)

    int_x = np.round(scaled_x).astype(int)

    return int_x


predictionsScaled = predictions.copy()
predictionsScaled = predictionsScaled.apply(scale_and_transform)
predictionsScaled

query_id
10        [0, 1, 2, 3, 0, 3, 2, 1, 2, 2, 1, 2, 4, 1, 0, 2]
25                          [3, 3, 2, 3, 2, 3, 4, 4, 0, 3]
40       [4, 0, 4, 3, 3, 0, 1, 2, 1, 2, 2, 4, 3, 1, 2, ...
55                                [0, 3, 2, 3, 3, 3, 4, 1]
70       [2, 3, 1, 1, 3, 2, 3, 1, 2, 1, 2, 1, 3, 2, 1, ...
                               ...                        
29935     [4, 4, 2, 3, 4, 0, 1, 2, 3, 3, 3, 3, 2, 2, 3, 3]
29950    [3, 1, 2, 3, 4, 1, 0, 1, 1, 4, 4, 2, 2, 2, 4, ...
29965                 [2, 4, 2, 2, 0, 1, 2, 2, 2, 3, 1, 3]
29980                       [2, 1, 3, 2, 2, 3, 3, 0, 4, 2]
29995    [3, 2, 0, 1, 2, 4, 1, 3, 3, 1, 2, 3, 3, 2, 3, ...
Length: 1987, dtype: object

In [247]:
def get_scores(y_true, y_pred):
    res = dict()
    res['ndcg@5'] = score(y_true, y_pred, ndcg(k=5))
    res['ndcg@20'] = score(y_true, y_pred, ndcg(k=20))
    res['ndcg'] = score(y_true, y_pred, ndcg())
    res['precision@5'] = score(y_true, y_pred, precision_at_k(5))
    res['precision@20'] = score(y_true, y_pred, precision_at_k(20))
    res['precision'] = score(y_true, y_pred, precision_at_k())
    return res

In [248]:
print("Predictions")
print(get_scores(y_true, predictions))

print("PredictionsScaled")
print(get_scores(y_true, predictionsScaled))

Predictions
{'ndcg@5': 0.4126087513977368, 'ndcg@20': 0.5834804520561023, 'ndcg': 0.651500585546478, 'precision@5': 0.03210474978795601, 'precision@20': 0.02078240286031312, 'precision': 0.020003542625106868}
PredictionsScaled
{'ndcg@5': 0.4057266459668763, 'ndcg@20': 0.5789703018974725, 'ndcg': 0.6481029152237677, 'precision@5': 0.3040600791631326, 'precision@20': 0.11683337819535812, 'precision': 0.09844804631940493}


В сравнении с результатами других моделек можно уверенно сказать, что эта модель лучше всех ранжирует документы в списках "средней" длины
Я это объясняю тем, что при задачи позитивных и негативных примеров не так сильно видна разница между "соседними" рангами (такие как 4 и 3, 0 и 1) и из-за этого у модели начинается путаница с тем, что должно стоять выше среди подходящих документов и среди совсем нерелевантных документов.
Я считаю, что эту проблему вполне можно исправить при помощи более хитрового задания триплетов для обучения