In [79]:
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [80]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

In [81]:
class AppDataset(Dataset):
    """Dataset that turns app rows into tokenized multi-label samples.

    Each sample's text is ``app_name [SEP] shortDescription [SEP] full_description``
    (missing fields become empty strings). Alongside the tokens every sample
    carries six pre-computed numeric features and a multi-hot label vector.

    Args:
        df: DataFrame with the text columns, the columns listed in
            ``EXTRA_FEATURE_COLUMNS`` and, when ``is_train`` is true, a
            ``labels_str`` column holding an iterable of label names per row.
        tokenizer: callable invoked as ``tokenizer(text, padding=..., truncation=...,
            max_length=..., return_tensors="pt")`` returning ``input_ids`` and
            ``attention_mask`` with a leading batch dimension of 1.
        max_len: length every sample is padded/truncated to.
        label2id: mapping from label name to class index.
        is_train: when false, labels are not read from ``df`` and an all-zero
            label vector is returned instead.
    """

    # Numeric columns passed to the model next to the text embedding.
    EXTRA_FEATURE_COLUMNS = [
        'app_name_length', 'full_description_length', 'shortDescription_length',
        'app_name_sumbols_per_word', 'full_description_sumbols_per_word',
        'shortDescription_sumbols_per_word',
    ]

    def __init__(self, df, tokenizer, max_len, label2id, is_train=True):
        # NOTE(review): ' [SEP] ' is inserted as plain text; whether it maps to a
        # real separator token depends on the tokenizer in use — confirm.
        # fillna on app_name as well: one NaN would otherwise turn the whole
        # concatenated string into NaN.
        self.texts = (df['app_name'].fillna('') + ' [SEP] ' +
                      df['shortDescription'].fillna('') + ' [SEP] ' +
                      df['full_description'].fillna('')).tolist()
        self.is_train = is_train
        if self.is_train:
            # Labels unknown to label2id are silently dropped.
            self.labels = df['labels_str'].apply(
                lambda labs: [label2id[l] for l in labs if l in label2id]
            ).tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label2id = label2id
        self.num_classes = len(label2id)
        self.extra_features = df[self.EXTRA_FEATURE_COLUMNS].values.astype('float32')

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask``, ``labels`` and ``extra_feats``."""
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        # Always size the target by the label map instead of a hard-coded 45;
        # at inference time it simply stays all-zero.
        target = torch.zeros(self.num_classes, dtype=torch.float)
        if self.is_train:
            for l in self.labels[idx]:
                target[l] = 1
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": target,
            "extra_feats": torch.tensor(self.extra_features[idx], dtype=torch.float)
        }

In [82]:
class MultiLabelClassifier(nn.Module):
    """Pretrained encoder plus a linear multi-label head.

    The head receives the encoder's first-token hidden state concatenated with
    ``extra_dim`` additional numeric features and emits one logit per label.
    """

    def __init__(self, model_name, num_labels, extra_dim=6):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        # The head input is widened to make room for the extra features.
        head_inputs = self.backbone.config.hidden_size + extra_dim
        self.fc = nn.Linear(head_inputs, num_labels)

    def forward(self, input_ids, attention_mask, extra_feats):
        """Return raw (pre-sigmoid) logits for a batch."""
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence is used as the text representation ([CLS]).
        first_token = encoded.last_hidden_state[:, 0, :]
        head_input = torch.cat((first_token, extra_feats), dim=1)
        return self.fc(self.dropout(head_input))

In [83]:
def train_epoch(model, loader, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Every batch is a dict with ``input_ids``, ``attention_mask``, ``labels`` and
    ``extra_feats``. The loss is binary cross-entropy on the raw logits; the
    optimizer and the scheduler are both stepped once per batch.
    """
    model.train()
    loss_fn = nn.BCEWithLogitsLoss()
    running_loss = 0
    for batch in tqdm(loader):
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        targets = batch["labels"].to(device)

        optimizer.zero_grad()
        feats = batch["extra_feats"].to(device)
        batch_loss = loss_fn(model(ids, mask, feats), targets)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += batch_loss.item()
    return running_loss / len(loader)

In [84]:
def hitrate_at_3(preds, labels):
    """Share of samples whose 3 highest-scored classes contain a true label.

    Args:
        preds: array-like of per-class scores, one row per sample
            (higher score = more confident).
        labels: array-like with one row per sample in which true classes
            are marked with 1.

    Returns:
        float: hits divided by the number of label rows; 0.0 when there are
        no samples (instead of raising ZeroDivisionError).
    """
    # Accept plain lists as well as numpy arrays.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    if len(labels) == 0:
        return 0.0

    hits = 0
    for scores, truth in zip(preds, labels):
        # Indices of the three largest scores (all classes if fewer than three).
        top3 = np.argsort(scores)[-3:]
        if np.any(truth[top3] == 1):
            hits += 1
    return hits / len(labels)


def eval_epoch(model, loader, device):
    """Score ``model`` on ``loader`` and return hit-rate@3.

    The model runs in eval mode without gradients; sigmoid probabilities and
    the reference labels of all batches are stacked row-wise and handed to
    ``hitrate_at_3``.
    """
    model.eval()
    prob_chunks = []
    label_chunks = []
    with torch.no_grad():
        for batch in tqdm(loader):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].cpu().numpy()
            logits = model(ids, mask, batch["extra_feats"].to(device))
            prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
            label_chunks.append(batch_labels)
    stacked_probs = np.vstack(prob_chunks)
    stacked_labels = np.vstack(label_chunks)
    return hitrate_at_3(stacked_probs, stacked_labels)


In [85]:
def predict(model, loader, device):
    """Return sigmoid probabilities for every sample in ``loader``.

    The model runs in eval mode without gradients; per-batch outputs are
    stacked row-wise into a single numpy array.
    """
    model.eval()
    prob_chunks = []
    with torch.no_grad():
        for batch in tqdm(loader):
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            logits = model(ids, mask, batch["extra_feats"].to(device))
            prob_chunks.append(torch.sigmoid(logits).cpu().numpy())
    return np.vstack(prob_chunks)

In [86]:
# Load the tab-separated training data and preview the first rows.
train = pd.read_csv('data/train.tsv', sep='\t')
train.head()

Unnamed: 0,app_name,full_description,shortDescription,labels_str
0,Матрона Московская.,Хочешь знать что будет? Загляни в будущие. Мат...,Узнать будущее. Предсказания . Магия волшебног...,lifestyle
1,"Run and Jump - ""Бесконечный раннер""",Run and Jump это новая увлекательная Аркада - ...,Платформенная Аркада - Раннер с захватывающим ...,action|arcade
2,Ghost Maze,"Играя за приведение, собирай необходимые комби...",Игра-головоломка. Попробуй найти выход из лаби...,arcade|puzzle
3,LabTools Mobile: ЛАБОРАТОРНЫЕ ПРИБОРЫ,AR-приложение «LabTools Mobile: Лабораторные п...,AR-приложение которое предназначено для изучен...,education
4,Mario Anime Coloring,\nРаскрасьте своих любимых персонажей из mario...,Раскрасьте своих любимых персонажей из mario я...,children|family


In [87]:
# Turn the pipe-separated label string ("action|arcade") into a list of names.
# Values that are already lists are kept as they are, so re-running the cell
# does not crash; anything else (e.g. a missing value) becomes an empty list.
train['labels_str'] = train['labels_str'].apply(
    lambda value: value.split('|') if isinstance(value, str)
    else (value if isinstance(value, list) else [])
)

# фичи

In [88]:
# Keep only rows whose full description is not made up purely of digits.
# .copy() detaches the result from the original frame so that the column
# assignments in the following cells do not trigger SettingWithCopyWarning.
train = train[~train['full_description'].apply(str).str.isdigit()].copy()

In [89]:
# Word/character statistics for each text field of the training frame.
# Every value goes through str() first, so a missing cell is measured as the
# literal 'nan' instead of raising — the treatment shortDescription already had.
text_columns = ('app_name', 'full_description', 'shortDescription')
train_text = {col: train[col].apply(str) for col in text_columns}

# Number of space-separated tokens (always >= 1, so the division below is safe).
for col in text_columns:
    train[f'{col}_length'] = train_text[col].apply(lambda text: len(text.split(' ')))

# Length in characters.
for col in text_columns:
    train[f'{col}_length_in_symbols'] = train_text[col].apply(len)

# Characters per word, floored to an integer.
for col in text_columns:
    train[f'{col}_sumbols_per_word'] = train[f'{col}_length_in_symbols'] // train[f'{col}_length']

In [90]:
# The raw character counts were only needed to derive the per-word ratios.
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
train = train.drop(
    columns=['app_name_length_in_symbols', 'full_description_length_in_symbols', 'shortDescription_length_in_symbols'],
    errors='ignore',
)

In [91]:
from transformers import AutoTokenizer

In [92]:
from sklearn.preprocessing import StandardScaler

In [93]:
# Pretrained checkpoint identifier; the tokenizer is loaded from the same name.
model_name = "ai-forever/ruRoberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [94]:
# Alphabetically sorted label vocabulary and its name -> index mapping.
all_labels = sorted({label for row_labels in train['labels_str'] for label in row_labels})
label2id = dict(zip(all_labels, range(len(all_labels))))

In [95]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val = train_test_split(train, test_size=0.2, random_state=185)

In [96]:
# Work on independent copies so later in-place scaling cannot touch `train`.
X_train, X_val = X_train.copy(), X_val.copy()

In [97]:
# Numeric columns that get standardised before being fed to the model.
length_features = ['app_name_length', 'full_description_length', 'shortDescription_length']
ratio_features = [
    'app_name_sumbols_per_word',
    'full_description_sumbols_per_word',
    'shortDescription_sumbols_per_word',
]
features_to_norm = length_features + ratio_features

In [98]:
# Fit the scaler on the training split only, then reuse it for validation.
# The columns are replaced as a whole (df[cols] = ...) instead of written through
# .loc[:, cols]: the originals hold integers, and setting float values into them
# in place is a deprecated incompatible-dtype assignment in recent pandas.
scaler = StandardScaler()
X_train[features_to_norm] = scaler.fit_transform(X_train[features_to_norm])

X_val[features_to_norm] = scaler.transform(X_val[features_to_norm])

In [100]:
# Both splits share the tokenizer, the label map and the 300-token length cap.
train_set, val_set = (
    AppDataset(split, tokenizer, 300, label2id) for split in (X_train, X_val)
)

In [101]:
# Only the training loader shuffles; validation keeps the dataset order.
loader_kwargs = {"batch_size": 16}
train_loader = DataLoader(train_set, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_set, **loader_kwargs)

In [102]:
# Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiLabelClassifier(model_name, num_labels=len(label2id)).to(device)
# Uncomment to freeze the encoder and train only the classification head:
# for param in model.backbone.parameters():
#     param.requires_grad = False

# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are set explicitly to the values the transformers class defaulted to.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# 5 epochs — must stay in sync with the epoch count of the training loop,
# otherwise the linear schedule ends too early or too late.
num_training_steps = len(train_loader) * 5
# Linear warm-up over the first 10% of the steps, then linear decay.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps,
)

  return self.fget.__get__(instance, owner)()
Some weights of RobertaModel were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [103]:
from pathlib import Path

In [104]:
# Directory that receives the model checkpoints; created if it does not exist yet.
save_dir = Path("checkpoints")
save_dir.mkdir(exist_ok=True)

# Best validation hit-rate@3 seen so far; the training loop compares against it.
best_h3 = 0.0

In [105]:
# model.load_state_dict(torch.load(save_dir / "model_epoch3.pt"))

In [106]:
# Train for a fixed number of epochs, checkpointing every epoch and keeping
# a separate copy of the weights with the best validation hit-rate@3.
epochs = 5
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_h3 = eval_epoch(model, val_loader, device)
    print(f"Epoch {epoch+1}/{epochs} | Train loss: {train_loss:.4f} | Val H@3: {val_h3:.4f}")

    # Every epoch gets its own checkpoint file.
    epoch_checkpoint = save_dir / f"model_epoch{epoch+1}.pt"
    torch.save(model.state_dict(), epoch_checkpoint)

    # Nothing more to do unless the validation metric strictly improved.
    if not val_h3 > best_h3:
        continue
    best_h3 = val_h3
    torch.save(model.state_dict(), save_dir / "best_model.pt")
    print(f"✓ Saved new best model (H@3={best_h3:.4f})")


100%|██████████| 2674/2674 [39:21<00:00,  1.13it/s]
100%|██████████| 669/669 [03:13<00:00,  3.46it/s]


Epoch 1/5 | Train loss: 0.1021 | Val H@3: 0.9095
✓ Saved new best model (H@3=0.9095)


100%|██████████| 2674/2674 [39:26<00:00,  1.13it/s]
100%|██████████| 669/669 [03:14<00:00,  3.45it/s]


Epoch 2/5 | Train loss: 0.0445 | Val H@3: 0.9242
✓ Saved new best model (H@3=0.9242)


100%|██████████| 2674/2674 [39:34<00:00,  1.13it/s]
100%|██████████| 669/669 [03:22<00:00,  3.31it/s]


Epoch 3/5 | Train loss: 0.0388 | Val H@3: 0.9279
✓ Saved new best model (H@3=0.9279)


100%|██████████| 2674/2674 [39:33<00:00,  1.13it/s]
100%|██████████| 669/669 [03:18<00:00,  3.36it/s]


Epoch 4/5 | Train loss: 0.0331 | Val H@3: 0.9311
✓ Saved new best model (H@3=0.9311)


100%|██████████| 2674/2674 [39:26<00:00,  1.13it/s]
100%|██████████| 669/669 [03:12<00:00,  3.47it/s]


Epoch 5/5 | Train loss: 0.0288 | Val H@3: 0.9304


In [107]:
# Restore the best checkpoint. map_location remaps the stored tensors onto the
# current device, so a checkpoint written on GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load(save_dir / "best_model.pt", map_location=device))

<All keys matched successfully>

# Inference

In [108]:
# Load the tab-separated test data and preview the first rows.
test = pd.read_csv('data/test.tsv', sep='\t')
test.head()

Unnamed: 0,app_name,full_description,shortDescription
0,Lemon clicker,Lemon clicker простая игра в казуальном жанре ...,Это игра типа: кликер в котором надо кликать з...
1,Memo Английский язык,Приложение для изучения английского языка на о...,Приложение для изучения английского языка на о...
2,Slave Man Rescue,Посреди густого леса живут коренные жители это...,Игра-побег: помогите похищенному пленнику сбеж...
3,Taking Care of Granny,Дедушки и бабушки - лучшие спутники в нашем юн...,Увлекательная игра в которой надо помочь одино...
4,Escape From Classic Room,Escape from Classic Room - это игра-головоломк...,"Игра-головоломка, в которой вам нужно найти вы..."


In [109]:
# Same word/character statistics as computed for the training frame.
# Every value goes through str() first, so a missing cell is measured as the
# literal 'nan' instead of raising — the treatment shortDescription already had.
text_columns = ('app_name', 'full_description', 'shortDescription')
test_text = {col: test[col].apply(str) for col in text_columns}

# Number of space-separated tokens (always >= 1, so the division below is safe).
for col in text_columns:
    test[f'{col}_length'] = test_text[col].apply(lambda text: len(text.split(' ')))

# Length in characters.
for col in text_columns:
    test[f'{col}_length_in_symbols'] = test_text[col].apply(len)

# Characters per word, floored to an integer.
for col in text_columns:
    test[f'{col}_sumbols_per_word'] = test[f'{col}_length_in_symbols'] // test[f'{col}_length']

# The raw character counts were only needed to derive the ratios.
test = test.drop(columns=['app_name_length_in_symbols', 'full_description_length_in_symbols', 'shortDescription_length_in_symbols'])

In [110]:
# Reuse the scaler fitted on the training split (transform only, no re-fit).
test[features_to_norm] = scaler.transform(test[features_to_norm] )

In [111]:
# Inference dataset: no labels are read, and row order is preserved so that
# predictions line up with the rows of `test`.
test_set = AppDataset(test, tokenizer, max_len=300, label2id=label2id, is_train=False)
test_loader = DataLoader(dataset=test_set, batch_size=16, shuffle=False)

In [112]:
# Per-class sigmoid probabilities for every test row, in loader order.
preds = predict(model, test_loader, device)

100%|██████████| 941/941 [04:32<00:00,  3.45it/s]


In [113]:
# Persist the raw probabilities for later reuse.
# NOTE(review): the file name says "rubert-base-cased", while model_name in this
# notebook is "ai-forever/ruRoberta-large" — confirm the name is intentional.
np.save('rubert-base-cased.probs.npy', preds)

In [114]:
# For every row keep the indices of the three highest probabilities, best first.
top3_preds = [np.argsort(row)[-3:][::-1] for row in preds]

In [115]:
# Invert the name -> index mapping so class indices can be turned back into names.
idx2label = {index: name for name, index in label2id.items()}

# One pipe-separated label string per test row, best prediction first.
pred_labels = [
    "|".join(idx2label[class_idx] for class_idx in row_top3)
    for row_top3 in top3_preds
]

In [116]:
# Two-column submission: the app name and its predicted label string.
submission = test[["app_name"]].assign(labels_str=pred_labels)

# Written tab-separated, without the index column.
submission.to_csv("sub13.tsv", sep="\t", index=False)