In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [2]:
import pandas as pd
import optuna
import torch
import torch.nn as nn
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining
from sklearn.model_selection import ParameterGrid
import json
from transformers import LongformerTokenizer, LongformerForSequenceClassification, AutoTokenizer
from utils.load_dataset import create_loaders
from utils.training_loop import training_loop, calculate_metrics
from utils.models import Baseline, SmallTransformer, TextCNN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [3]:
device = 'cuda:0'

train_ds = pd.read_csv('data/train_ds.csv', converters={'embs': lambda x: json.loads(x.replace("'", '"'))})
val_ds = pd.read_csv('data/val_ds.csv', converters={'embs': lambda x: json.loads(x.replace("'", '"'))})
test_ds = pd.read_csv('data/test_ds.csv', converters={'embs': lambda x: json.loads(x.replace("'", '"'))})

train_loader, val_loader, test_loader = create_loaders(train_ds, val_ds, test_ds, batch_size=64)

## 1. Эксперименты с моделями без тюнинга гиперпараметров  (3 балла)

- Усложним бейзлайн, заменив RNN на LSTM
- Попробуем CNN для текстов
- Попробуем маленькую transformer модельку
- TF-IDF + LogisticRegression

(Конечно, можно было бы взять большую предобученную модель, качество будет на порядки выше, но предположим, что наши бизнес-ограничения требуют очень дешевое обучение и инференс)

In [4]:
model_lstm = Baseline(embed_dim=64, hidden_size=64, output_size=7, use_rnn=False, num_layers=2).to(device)
training_loop(model_lstm, 'LSTM', device, train_loader, test_loader)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mgoodevening13[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 20/20 [04:54<00:00, 14.74s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▂▅▁▆▃▆▆▆▇▇▇▇███▇▇██
test_recall_n,▅▃▅█▆▆▅▅▄▄▃▁▃▂▂▁▃▂▁▂
train_f1,▁▁▂▁▂▃▄▄▅▅▆▆▆▇▇▇▇▇██
train_loss,█▆▆▆▅▅▄▄▄▃▃▃▂▂▂▂▁▁▁▁
train_recall_n,▆▅▆█▇▅▅▅▄▆▄▁▆▆▅▅▇▇▇▇

0,1
epoch,19.0
test_f1,0.76227
test_recall_n,0.86155
train_f1,0.92719
train_loss,0.25357
train_recall_n,0.98446


In [5]:
del model_lstm

In [6]:
model_cnn = TextCNN(embed_dim=64, output_size=7).to(device)
training_loop(model_cnn, 'CNN', device, train_loader, test_loader)

100%|██████████| 20/20 [04:44<00:00, 14.23s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▄▇▄▇▆█▆▆▄▆▇▃▇▅▆▆▄▅▅
test_recall_n,▆█▇▄▇▃█▄▆▂▂▅▁▄▂▁▃▃▁▁
train_f1,▁▂▂▃▃▄▄▅▅▅▆▆▆▇▇▇████
train_loss,█▇▆▆▅▅▅▄▄▄▃▃▃▂▂▂▂▁▁▁
train_recall_n,▇▇▇▄▆▂█▃▆▁▄▇▃▇▆▅▇█▇▆

0,1
epoch,19.0
test_f1,0.72833
test_recall_n,0.81673
train_f1,0.93503
train_loss,0.23738
train_recall_n,0.96582


In [7]:
del model_cnn

In [8]:
model_t = SmallTransformer(embed_dim=64, num_heads=2, hidden_dim=64, output_size=7).to(device)
training_loop(model_t, 'SmallTransformer', device, train_loader, test_loader)

100%|██████████| 20/20 [05:26<00:00, 16.32s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▂▃▅▅▇▇▇▇██████▇█▇██
test_recall_n,▅▅▆█▇▇▇▇▄▅▄▄▄▃▆▃▅▁▃▃
train_f1,▁▁▂▂▂▃▃▄▅▅▆▆▆▇▇▇▇███
train_loss,█▇▆▆▅▅▅▄▄▄▃▃▃▂▂▂▂▁▁▁
train_recall_n,▃▆▅█▇▅▇▇▁▂▃▄▂▃▆▄▆▁▄▆

0,1
epoch,19.0
test_f1,0.74093
test_recall_n,0.84064
train_f1,0.93329
train_loss,0.28997
train_recall_n,0.9836


In [9]:
del model_t

In [4]:
used_tokenizer = AutoTokenizer.from_pretrained('unsloth/Meta-Llama-3.1-8B-Instruct')

def decode(ids):
    return used_tokenizer.decode(ids)

train_ds['text'] = train_ds['embs'].apply(decode)
val_ds['text'] = val_ds['embs'].apply(decode)
test_ds['text'] = test_ds['embs'].apply(decode)

In [5]:
vectorizer = TfidfVectorizer(max_features=128256)
X_train = vectorizer.fit_transform(train_ds['text'])
model = LogisticRegression(max_iter=500)
model.fit(X_train, train_ds['enc_label'])

In [6]:
predictions = model.predict(X_train)
X_test = vectorizer.transform(test_ds['text'])
predictions_test = model.predict(X_test)

In [7]:
labels = train_ds['enc_label'].to_numpy()
labels_test = test_ds['enc_label'].to_numpy()

In [8]:
import wandb
from torcheval.metrics.functional import multiclass_f1_score
def calculate_recall_normal(predictions, labels):
    normal_idx = [i for i in range(len(labels)) if labels[i] == 1.]
    recall_normal = sum([1 if predictions[idx] == 1. else 0 for idx in normal_idx]) / len(normal_idx)
    return recall_normal

f1_train = multiclass_f1_score(torch.Tensor(predictions).to(dtype=torch.int64),
                               torch.Tensor(labels).to(dtype=torch.int64), average='weighted', num_classes=7)

f1_test = multiclass_f1_score(torch.Tensor(predictions_test).to(dtype=torch.int64), 
                               torch.Tensor(labels_test).to(dtype=torch.int64), average='weighted', num_classes=7)
recall_n_train = calculate_recall_normal(predictions, labels)
recall_n_test = calculate_recall_normal(predictions_test, labels_test)

wandb.init(
        project="ml_sys_design",
        name='TF-IDF Classifier',
        group='model_comparison'
    )
for i in range(20):
    wandb.log({
            "epoch": i,
            "train_loss": 0,
            "train_recall_n": recall_n_train,
            "train_f1": f1_train,
            "test_recall_n": recall_n_test,
            "test_f1": f1_test,
        })
wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mgoodevening13[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
test_recall_n,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_f1,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_recall_n,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,19.0
test_f1,0.75609
test_recall_n,0.96713
train_f1,0.78055
train_loss,0.0
train_recall_n,0.99032


## 2. Эксперименты с моделями с тюнингом гиперпараметров  (4 балла)

берем две модели: LSTM и SmallTransformer

Сначала подбираем параметры для LSTM

In [5]:
def objective_optuna(trial):
    params = {
        "hidden_size": trial.suggest_categorical("hidden_dim", [32, 64, 128]),
        "num_layers": trial.suggest_int("num_layers", 1, 4),
        "dropout": trial.suggest_float("dropout", 0.1, 0.5)
    }
    
    model = Baseline(embed_dim=params['hidden_size'], hidden_size=params['hidden_size'], output_size=7, use_rnn=False, 
                     num_layers=params['num_layers']).to(device)
    
    val_f1 = training_loop(model, 'LSTM', device, train_loader, val_loader, params=params)
    return val_f1

study = optuna.create_study(direction="maximize")
study.optimize(objective_optuna, n_trials=10)

[I 2025-03-25 20:43:26,520] A new study created in memory with name: no-name-200122c6-00cc-4c72-9ae2-89d0deb956e1
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mgoodevening13[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 20/20 [06:18<00:00, 18.94s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▂▃▃▇███▇▇▇███▇▇▇▇▇▇
test_recall_n,▅▆█▇█▇▆▅▃▅▂▃▃▄▂▂▂▂▂▁
train_f1,▁▁▂▂▃▃▄▅▅▆▆▇▇▇▇▇▇███
train_loss,█▇▆▆▅▅▄▄▄▃▃▃▂▂▂▂▂▁▁▁
train_recall_n,▃▅█▇█▇▆▁▁▅▁▂▆▆▅▆▇▆▇▅

0,1
epoch,19.0
test_f1,0.74277
test_recall_n,0.83528
train_f1,0.94039
train_loss,0.19723
train_recall_n,0.98182


[I 2025-03-25 20:49:49,829] Trial 0 finished with value: 0.7427650690078735 and parameters: {'hidden_dim': 128, 'num_layers': 3, 'dropout': 0.4464886137511739}. Best is trial 0 with value: 0.7427650690078735.


100%|██████████| 20/20 [05:14<00:00, 15.74s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▂▄▄▅▃▆▇▇███▇▇▇▇▇▇▇▇
test_recall_n,▆▇▇▇▇█▇▇▆▆▅▅▄▁▃▃▄▃▃▃
train_f1,▁▁▂▂▃▃▄▅▅▆▆▆▇▇▇▇▇███
train_loss,█▆▆▅▅▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁
train_recall_n,▆▆▇▆▅▇▆▇▆▇▆▇▇▁▆▅▇▅██

0,1
epoch,19.0
test_f1,0.7493
test_recall_n,0.87135
train_f1,0.94354
train_loss,0.20884
train_recall_n,0.9888


[I 2025-03-25 20:55:08,231] Trial 1 finished with value: 0.7492995858192444 and parameters: {'hidden_dim': 64, 'num_layers': 1, 'dropout': 0.36044941024150834}. Best is trial 1 with value: 0.7492995858192444.


100%|██████████| 20/20 [06:11<00:00, 18.58s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▃▄▄▄▄▆▄▆▆▇▇▇▇▇▇█▇▇█
test_recall_n,▅█▆█▆▇▆▅▃▃▆▅▅▅▃▄▄▂▁▄
train_f1,▁▂▂▂▃▃▄▄▅▅▄▆▆▆▇▇▇███
train_loss,█▅▅▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁
train_recall_n,▅█▅█▇▇▆▅▁▃█▅▅▆▃▆▆▄▁▆

0,1
epoch,19.0
test_f1,0.73582
test_recall_n,0.89376
train_f1,0.85171
train_loss,0.41991
train_recall_n,0.98268


[I 2025-03-25 21:01:22,426] Trial 2 finished with value: 0.7358205318450928 and parameters: {'hidden_dim': 32, 'num_layers': 4, 'dropout': 0.26072583518065967}. Best is trial 1 with value: 0.7492995858192444.


100%|██████████| 20/20 [05:13<00:00, 15.68s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▂▄▆▇▇▇▇█████▇▇▇▇▇▆▇
test_recall_n,█▆▇█▆▄▅▆▅▅▅▄▄▁▁▂▃▂▂▂
train_f1,▁▁▂▂▃▄▄▅▅▆▆▆▇▇▇▇████
train_loss,█▆▆▅▅▅▄▄▃▃▃▃▂▂▂▂▁▁▁▁
train_recall_n,█▇▇█▇▁▄▃▄▅▆▆▆▅▅▆▇▇▇▇

0,1
epoch,19.0
test_f1,0.7489
test_recall_n,0.88889
train_f1,0.94655
train_loss,0.19139
train_recall_n,0.98808


[I 2025-03-25 21:06:38,430] Trial 3 finished with value: 0.7488998174667358 and parameters: {'hidden_dim': 64, 'num_layers': 1, 'dropout': 0.25140622923018885}. Best is trial 1 with value: 0.7492995858192444.


100%|██████████| 20/20 [06:15<00:00, 18.77s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▂▂▅▆▃▅███████▇█▇███
test_recall_n,▆▆▆█▆█▅▅▄▃▂▄▃▂▃▂▁▂▁▁
train_f1,▁▁▁▂▃▃▄▅▅▆▆▆▇▇▇▇▇███
train_loss,█▇▆▆▅▅▄▄▃▃▃▃▂▂▂▂▁▁▁▁
train_recall_n,▄▄▄█▁▇▁▅▄▃▃▆▆▅▆▅▇█▇█

0,1
epoch,19.0
test_f1,0.7485
test_recall_n,0.83821
train_f1,0.94678
train_loss,0.19721
train_recall_n,0.99203


[I 2025-03-25 21:12:56,251] Trial 4 finished with value: 0.7485037446022034 and parameters: {'hidden_dim': 128, 'num_layers': 3, 'dropout': 0.47835899429406614}. Best is trial 1 with value: 0.7492995858192444.


100%|██████████| 20/20 [05:09<00:00, 15.47s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▂▂▃▃▃▃▅▅▅▆█▇▄▇▆▆▆██
test_recall_n,▄▇▅█▅▇▃▇▆▅▆▇▄▂▁▃▂▂▁▁
train_f1,▁▁▂▂▂▂▃▄▄▅▅▅▆▆▇▆▇███
train_loss,█▆▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁
train_recall_n,▂▆▅█▅▇▁▅▂▁▄▇▃▃▁▂▄▄▄▅

0,1
epoch,19.0
test_f1,0.73768
test_recall_n,0.88109
train_f1,0.85756
train_loss,0.3936
train_recall_n,0.98064


[I 2025-03-25 21:18:08,080] Trial 5 finished with value: 0.7376782894134521 and parameters: {'hidden_dim': 32, 'num_layers': 2, 'dropout': 0.4177951562393335}. Best is trial 1 with value: 0.7492995858192444.


100%|██████████| 20/20 [05:49<00:00, 17.47s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▅▅▆▇▇████▇▇▇▇▇▇▇▇▇▇
test_recall_n,▅█▆▇▆▆▇▆▅▅▃▄▄▃▄▃▂▂▁▃
train_f1,▁▁▂▃▄▄▅▆▆▇▇▇▇███████
train_loss,█▇▆▅▅▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁
train_recall_n,▃█▄▅▁▅▅▄▆▆▃▅▆▆█▇▇▇▆█

0,1
epoch,19.0
test_f1,0.74821
test_recall_n,0.87817
train_f1,0.97838
train_loss,0.08921
train_recall_n,0.99236


[I 2025-03-25 21:24:00,549] Trial 6 finished with value: 0.748207151889801 and parameters: {'hidden_dim': 128, 'num_layers': 1, 'dropout': 0.4957924025577741}. Best is trial 1 with value: 0.7492995858192444.


100%|██████████| 20/20 [05:33<00:00, 16.69s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▂▂▂▅▅▇▇███▇██▇██▇▇▇
test_recall_n,▅▅▆██▅▅▄▅▅▅▃▃▄▃▃▃▁▂▂
train_f1,▁▁▂▂▃▄▄▅▅▆▆▆▇▇▇▇████
train_loss,█▆▆▅▅▅▄▄▃▃▃▂▂▂▂▂▁▁▁▁
train_recall_n,▄▅▆█▇▃▃▁▅▅▆▄▄▆▅▆▇▅▆▇

0,1
epoch,19.0
test_f1,0.75263
test_recall_n,0.85283
train_f1,0.91943
train_loss,0.24591
train_recall_n,0.98432


[I 2025-03-25 21:29:36,901] Trial 7 finished with value: 0.7526282668113708 and parameters: {'hidden_dim': 64, 'num_layers': 2, 'dropout': 0.3926072895884507}. Best is trial 7 with value: 0.7526282668113708.


100%|██████████| 20/20 [04:44<00:00, 14.21s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▃▄▄▄▁▄▄▅▄▇▇▅▇▇▇█████
test_recall_n,▄▄▃▄█▆▇▆▄▆▅▃▄▃▄▄▄▃▂▁
train_f1,▁▂▂▂▁▃▃▃▃▄▅▅▅▆▆▆▇▇██
train_loss,█▆▅▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train_recall_n,▃▄▃▄█▆▇▆▄▆▅▄▃▁▅▅▅▃▄▂

0,1
epoch,19.0
test_f1,0.74166
test_recall_n,0.84211
train_f1,0.86327
train_loss,0.40918
train_recall_n,0.96404


[I 2025-03-25 21:34:23,506] Trial 8 finished with value: 0.7416589856147766 and parameters: {'hidden_dim': 32, 'num_layers': 2, 'dropout': 0.1915978973911459}. Best is trial 7 with value: 0.7526282668113708.


100%|██████████| 20/20 [06:43<00:00, 20.20s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▃▃▃▃▅▅▆▆██▇▇▇██▇███
test_recall_n,█▇▇▇▇▅▄▆▄▄▄▄▃▃▃▁▂▂▁▂
train_f1,▁▂▂▂▂▃▄▅▅▆▆▆▆▇▇▇████
train_loss,█▇▆▆▅▅▄▄▃▃▃▃▂▂▂▂▂▁▁▁
train_recall_n,█▅▆▇█▆▁▆▁▄▅▄▇▇▇▄▇▅▃▆

0,1
epoch,19.0
test_f1,0.75078
test_recall_n,0.85283
train_f1,0.9316
train_loss,0.23706
train_recall_n,0.98465


[I 2025-03-25 21:41:10,286] Trial 9 finished with value: 0.7507827281951904 and parameters: {'hidden_dim': 128, 'num_layers': 4, 'dropout': 0.48174889407109966}. Best is trial 7 with value: 0.7526282668113708.


In [21]:
! pip install -q tensorboardx

[33mDEPRECATION: Loading egg at /extra_disk_1/alinashutova/anaconda3/envs/aquakv/lib/python3.11/site-packages/fast_hadamard_transform-1.0.4.post1-py3.11-linux-x86_64.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m

In [14]:
import torch.optim as optim


grid_params = {
    "lr": [1e-3, 1e-4],
    "optimizer": [optim.Adam, optim.RMSprop],
}

best_val_f1 = 0
for params in ParameterGrid(grid_params):
    model = Baseline(embed_dim=64, hidden_size=64, output_size=7, use_rnn=False, 
                    num_layers=1).to(device)

    val_f1 = training_loop(model, 'LSTM', device, train_loader, val_loader, params=params, optimizer=params["optimizer"], lr=params['lr'])
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_params = params
    
    print("Лучшие параметры (Grid Search):", best_params)

100%|██████████| 20/20 [04:43<00:00, 14.18s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▂▃▂▅▆▇▇▇▇▇██▇▇▇▇▇▇▇
test_recall_n,▆▇▆█▇▅▇▅▄▄▅▄▄▃▂▂▂▄▂▁
train_f1,▁▁▂▂▂▃▄▅▅▆▆▆▆▇▇▇█▇██
train_loss,█▆▆▅▅▅▄▄▃▃▃▂▂▂▂▂▁▁▁▁
train_recall_n,▅▆▅█▆▄▄▁▁▂▄▆▄▃▄▃▄▇▄▄

0,1
epoch,19.0
test_f1,0.75255
test_recall_n,0.87037
train_f1,0.94375
train_loss,0.21868
train_recall_n,0.97899


Лучшие параметры (Grid Search): {'lr': 0.001, 'optimizer': <class 'torch.optim.adam.Adam'>}


100%|██████████| 20/20 [04:35<00:00, 13.75s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▂▁▃▄▄▆▆▅▇▆▅▇▇█▇▇▇▇▇█
test_recall_n,▃█▇▇▃▃▅▅▄▂▂▄▃▃▂▃▃▃▁▄
train_f1,▁▁▂▂▃▄▄▄▅▅▅▆▆▆▇▇▇███
train_loss,█▇▆▆▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁▁
train_recall_n,▅█▇█▅▁▃▄▆▃▃▅▆▇▄▇▇▆▄▇

0,1
epoch,19.0
test_f1,0.76108
test_recall_n,0.92105
train_f1,0.90429
train_loss,0.28166
train_recall_n,0.99065


Лучшие параметры (Grid Search): {'lr': 0.001, 'optimizer': <class 'torch.optim.rmsprop.RMSprop'>}


100%|██████████| 20/20 [04:28<00:00, 13.41s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▅▇▇▇▇▇▇▇▇▇▇▇▇▇█████
test_recall_n,█▃▁▂▂▄▃▂▃▄▃▂▂▅▃▄▃▂▃▃
train_f1,▁▄▆▆▆▇▇▇▇▇▇▇▇▇▇█████
train_loss,█▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁
train_recall_n,█▃▁▄▃▃▃▃▃▄▃▂▂▅▃▄▄▃▄▃

0,1
epoch,19.0
test_f1,0.69011
test_recall_n,0.9347
train_f1,0.67688
train_loss,0.76432
train_recall_n,0.98169


Лучшие параметры (Grid Search): {'lr': 0.001, 'optimizer': <class 'torch.optim.rmsprop.RMSprop'>}


100%|██████████| 20/20 [04:35<00:00, 13.75s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▅▇▇▇▇▇█████████████
test_recall_n,█▄▂▁▃▂▄▄▃▄▄▄▆▄▃▅▃▄▄▃
train_f1,▁▆▆▇▇▇▇▇▇███████████
train_loss,█▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁
train_recall_n,█▄▃▁▄▃▄▄▂▄▄▄▆▅▃▅▃▅▅▄

0,1
epoch,19.0
test_f1,0.64938
test_recall_n,0.92105
train_f1,0.66467
train_loss,0.79458
train_recall_n,0.97576


Лучшие параметры (Grid Search): {'lr': 0.001, 'optimizer': <class 'torch.optim.rmsprop.RMSprop'>}


In [None]:
import torch.optim as optim


def train_pbt(config):
    model = Baseline(embed_dim=64, hidden_size=64, output_size=7, use_rnn=False, num_layers=1, dropout=config["dropout"]).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    optimizer = optim.RMSprop(model.parameters(), lr=config["lr"])

    for epoch in range(20):
        model.train()
        running_loss = 0.0
        for i, (inputs, labels, extra, lengths) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs, lengths)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        model.eval()
        val_f1, val_recall_n = calculate_metrics(test_loader, model, device, 1, 7, pretrained=False)
        train_f1, train_recall_n = calculate_metrics(train_loader, model, device, 1, 7, pretrained=False)
        tune.report({
            "epoch": float(epoch),
            "train_loss": float(running_loss / len(train_loader)),
            "train_recall_n": float(train_recall_n),
            "train_f1": float(train_f1),
            "val_recall_n": float(val_recall_n),
            "val_f1": float(val_f1),
        })
        
        

pbt_scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="val_f1",
    mode="max",
    perturbation_interval=2,
    hyperparam_mutations={
        "lr": tune.loguniform(1e-5, 1e-3),
        "dropout": tune.uniform(0.1, 0.5),
    }
)

tune.run(
    train_pbt,
    config={
        "lr": 1e-4,
        "dropout": 0.3,
    },
    scheduler=pbt_scheduler,
    num_samples=5,
    resources_per_trial={
        "cpu": 4,
        "gpu": 0.5 if torch.cuda.is_available() else 0
    }
)

2025-03-26 01:22:15,468	INFO worker.py:1852 -- Started a local Ray instance.
2025-03-26 01:22:18,416	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2025-03-26 01:22:18,421	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2025-03-26 01:56:56
Running for:,00:34:37.71
Memory:,396.2/917.3 GiB

Trial name,status,loc,iter,total time (s),epoch,train_loss,train_recall_n
train_pbt_9cf58_00000,TERMINATED,192.168.1.2:65746,20,110.344,19,0.416506,0.978199
train_pbt_9cf58_00001,TERMINATED,192.168.1.2:71412,20,110.22,19,0.336853,0.980109
train_pbt_9cf58_00002,TERMINATED,192.168.1.2:77689,20,108.842,19,0.364481,0.978068
train_pbt_9cf58_00003,TERMINATED,192.168.1.2:85349,20,110.633,19,0.41488,0.976223
train_pbt_9cf58_00004,TERMINATED,192.168.1.2:91006,20,178.639,19,0.54759,0.953171






[33m(raylet)[0m [2025-03-26 01:22:25,463 E 907034 907067] (raylet) file_system_monitor.cc:116: /var/tmp/ray/session_2025-03-26_01-22-13_421315_893989 is over 95% full, available space: 72.6015 GB; capacity: 2558.08 GB. Object creation will fail if spilling is required.


Trial name,epoch,train_f1,train_loss,train_recall_n,val_f1,val_recall_n
train_pbt_9cf58_00000,19,0.847908,0.416506,0.978199,0.721012,0.869522
train_pbt_9cf58_00001,19,0.881591,0.336853,0.980109,0.728849,0.860558
train_pbt_9cf58_00002,19,0.865369,0.364481,0.978068,0.688118,0.840637
train_pbt_9cf58_00003,19,0.852606,0.41488,0.976223,0.68874,0.846614
train_pbt_9cf58_00004,19,0.78848,0.54759,0.953171,0.681952,0.86255


[33m(raylet)[0m [2025-03-26 01:22:35,484 E 907034 907067] (raylet) file_system_monitor.cc:116: /var/tmp/ray/session_2025-03-26_01-22-13_421315_893989 is over 95% full, available space: 72.5943 GB; capacity: 2558.08 GB. Object creation will fail if spilling is required.
[33m(raylet)[0m [2025-03-26 01:22:45,502 E 907034 907067] (raylet) file_system_monitor.cc:116: /var/tmp/ray/session_2025-03-26_01-22-13_421315_893989 is over 95% full, available space: 72.5932 GB; capacity: 2558.08 GB. Object creation will fail if spilling is required.
[33m(raylet)[0m [2025-03-26 01:22:55,537 E 907034 907067] (raylet) file_system_monitor.cc:116: /var/tmp/ray/session_2025-03-26_01-22-13_421315_893989 is over 95% full, available space: 72.5931 GB; capacity: 2558.08 GB. Object creation will fail if spilling is required.
[36m(train_pbt pid=925824)[0m *** SIGSEGV received at time=1742941379 on cpu 23 ***
[36m(train_pbt pid=925824)[0m PC: @     0x7f3b0d3765de  (unknown)  ray::gcs::TaskInfoAccessor::A

<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x7f11a40f7d50>

[33m(raylet)[0m [2025-03-26 01:57:00,538 E 907034 907067] (raylet) file_system_monitor.cc:116: /var/tmp/ray/session_2025-03-26_01-22-13_421315_893989 is over 95% full, available space: 72.6379 GB; capacity: 2558.08 GB. Object creation will fail if spilling is required.
[33m(raylet)[0m [2025-03-26 01:57:10,610 E 907034 907067] (raylet) file_system_monitor.cc:116: /var/tmp/ray/session_2025-03-26_01-22-13_421315_893989 is over 95% full, available space: 72.6368 GB; capacity: 2558.08 GB. Object creation will fail if spilling is required.
[33m(raylet)[0m [2025-03-26 01:57:20,708 E 907034 907067] (raylet) file_system_monitor.cc:116: /var/tmp/ray/session_2025-03-26_01-22-13_421315_893989 is over 95% full, available space: 72.6349 GB; capacity: 2558.08 GB. Object creation will fail if spilling is required.
[33m(raylet)[0m [2025-03-26 01:57:30,753 E 907034 907067] (raylet) file_system_monitor.cc:116: /var/tmp/ray/session_2025-03-26_01-22-13_421315_893989 is over 95% full, available spac

Видно, что результаты PopulationBasedTraining строго хуже

То же самое для CNN

In [16]:
def objective_optuna(trial):
    params = {
        "embed_dim": trial.suggest_categorical("embed_dim", [32, 64, 128]),
        "num_convs": trial.suggest_int("num_convs", 2, 5),
        "dropout": trial.suggest_float("dropout", 0.1, 0.5)
    }
    
    model = TextCNN(embed_dim=params['embed_dim'], output_size=7, num_convs=params['num_convs'], dropout=params['dropout']).to(device)
    
    val_f1 = training_loop(model, 'CNN', device, train_loader, val_loader, params=params)
    return val_f1

study = optuna.create_study(direction="maximize")
study.optimize(objective_optuna, n_trials=6)

[I 2025-03-25 23:35:23,653] A new study created in memory with name: no-name-a20f00a2-b78a-44b0-8914-b6eaef0976aa


100%|██████████| 20/20 [04:36<00:00, 13.83s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▂▁▅▅▆▅▇▇▇█▇██▇▇▇▄█▇▇
test_recall_n,▇▅▆█▆▇▆▅▆▆▆▆▆▅▅▆▁▆▄▅
train_f1,▁▁▂▂▃▃▄▄▅▆▆▆▆▆▇▇▇███
train_loss,█▇▆▅▅▅▄▄▄▄▃▃▃▂▂▂▂▁▁▁
train_recall_n,█▆▇█▇▇▅▆▆▅▆▆▇▇▆▇▁▇▆▆

0,1
epoch,19.0
test_f1,0.74704
test_recall_n,0.87135
train_f1,0.89788
train_loss,0.34612
train_recall_n,0.9647


[I 2025-03-25 23:40:02,650] Trial 0 finished with value: 0.7470376491546631 and parameters: {'embed_dim': 32, 'num_convs': 3, 'dropout': 0.15071331742827523}. Best is trial 0 with value: 0.7470376491546631.


100%|██████████| 20/20 [04:28<00:00, 13.42s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▂▅▇▇▂▇█▇█▆▆▆▇█▆▆▆▆▆
test_recall_n,▇▃▆█▆▇▅▇▅▅▄▆▁▅▅▃▃▂▁▁
train_f1,▁▁▂▂▃▂▄▄▅▅▅▅▆▇▇▇████
train_loss,█▇▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▁▁▁
train_recall_n,▇▅▆█▇▇▁▆▄▅▆█▁▇▇▅▅▅▆▅

0,1
epoch,19.0
test_f1,0.73967
test_recall_n,0.82359
train_f1,0.92219
train_loss,0.28821
train_recall_n,0.96285


[I 2025-03-25 23:44:32,906] Trial 1 finished with value: 0.7396743297576904 and parameters: {'embed_dim': 64, 'num_convs': 2, 'dropout': 0.39229083512627383}. Best is trial 0 with value: 0.7470376491546631.


100%|██████████| 20/20 [04:49<00:00, 14.47s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▃▅▄▆▁▅▄▇█▇▇▇▇▇█▆▅▇▇▇
test_recall_n,▆█▁▅▃▇▇▆▄▇▆▅▅▆▆▇▂▃▁▄
train_f1,▁▂▂▃▃▃▃▅▅▅▅▆▆▆▇▆▇███
train_loss,█▇▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁
train_recall_n,▇█▁▇▅▇█▇▆▇▇▇▇▇▇█▇▇▆█

0,1
epoch,19.0
test_f1,0.75031
test_recall_n,0.87817
train_f1,0.91061
train_loss,0.30432
train_recall_n,0.98762


[I 2025-03-25 23:49:24,868] Trial 2 finished with value: 0.7503054738044739 and parameters: {'embed_dim': 32, 'num_convs': 5, 'dropout': 0.10028882933049982}. Best is trial 2 with value: 0.7503054738044739.


100%|██████████| 20/20 [04:38<00:00, 13.93s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▃█▆▇▇▅▇▅▅▃▆▇▇▇▆▇▄▇▆
test_recall_n,██▇▅▅▆▅▃▆▅▁▂▂▄▃▁▄▃▂▁
train_f1,▁▂▃▄▄▅▅▆▅▆▆▇▇▇██████
train_loss,█▇▆▆▅▅▄▄▃▃▃▃▂▂▂▂▁▁▁▁
train_recall_n,█▇▇▁▃▆▃▃▇▇▂▂▄▆▅▅█▇▆▆

0,1
epoch,19.0
test_f1,0.74914
test_recall_n,0.83821
train_f1,0.96251
train_loss,0.17022
train_recall_n,0.97952


[I 2025-03-25 23:54:05,977] Trial 3 finished with value: 0.749137818813324 and parameters: {'embed_dim': 128, 'num_convs': 3, 'dropout': 0.2009823567262941}. Best is trial 2 with value: 0.7503054738044739.


100%|██████████| 20/20 [04:44<00:00, 14.24s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▄▁▅▆▇▇▆▂▇▇▄▅▆▇█▆████
test_recall_n,▆▆▇▇██▅█▅▅▅▇▇▆▅▁▅▄▅▅
train_f1,▁▁▂▂▃▃▃▃▅▅▄▄▅▆▇▆▇▇██
train_loss,█▇▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁
train_recall_n,▇▇▇███▇█▅▆▇██▇▇▁▇▆█▇

0,1
epoch,19.0
test_f1,0.75318
test_recall_n,0.86647
train_f1,0.89725
train_loss,0.37167
train_recall_n,0.95528


[I 2025-03-25 23:58:53,050] Trial 4 finished with value: 0.7531791925430298 and parameters: {'embed_dim': 32, 'num_convs': 5, 'dropout': 0.35235017826646153}. Best is trial 4 with value: 0.7531791925430298.


100%|██████████| 20/20 [04:40<00:00, 14.04s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▆▆▄▆▇▅▇█▆█▇█▇▇▇▇▇▇▇
test_recall_n,▇▆▃▅▂▄▅▂▅▃█▃▇▅▄▁▁▂▁▂
train_f1,▁▂▃▂▄▄▄▅▅▅▆▆▆▇▇▇▇███
train_loss,█▇▆▆▆▅▅▄▄▄▃▃▃▃▂▂▂▂▁▁
train_recall_n,▇▇▅▇▁▆▅▄▆▅█▅█▇▆▆▆▇▇▇

0,1
epoch,19.0
test_f1,0.7388
test_recall_n,0.82359
train_f1,0.93257
train_loss,0.25429
train_recall_n,0.96252


[I 2025-03-26 00:03:36,461] Trial 5 finished with value: 0.7387993931770325 and parameters: {'embed_dim': 64, 'num_convs': 3, 'dropout': 0.3569675213020953}. Best is trial 4 with value: 0.7531791925430298.


In [18]:

grid_params = {
    "lr": [1e-3, 1e-4],
    "optimizer": [optim.Adam, optim.RMSprop],
}
best_val_f1 = 0
for params in ParameterGrid(grid_params):
    model = TextCNN(embed_dim=32, output_size=7, dropout=0.352, num_convs=5).to(device)

    val_f1 = training_loop(model, 'CNN', device, train_loader, val_loader, params=params, optimizer=params["optimizer"], lr=params['lr'])
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_params = params

print("Лучшие параметры (Grid Search):", best_params)

100%|██████████| 20/20 [06:51<00:00, 20.56s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▄▂▇▇▅▆▆▆▅▇▆█▇██▇▇▇▇
test_recall_n,▃▃▁█▅▆▂▆▇▂▆▆▅▃▅▆▄▆▃▄
train_f1,▁▂▂▃▃▃▄▄▄▅▅▅▇▆▇▇█▇██
train_loss,█▇▆▆▅▅▅▄▄▄▃▃▃▃▂▂▂▁▁▁
train_recall_n,▅▆▄█▇▇▄▆▇▁█▇▆▆▇█▆█▆▇

0,1
epoch,19.0
test_f1,0.74395
test_recall_n,0.85283
train_f1,0.88489
train_loss,0.35594
train_recall_n,0.95765


100%|██████████| 20/20 [04:59<00:00, 14.95s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▃▆▅▂▅▁██▇▂▅▆▆▇▄▅▆▅▅▇
test_recall_n,▅▇▇▂█▇▇▇▇▁▆▄▇▇▄▅▅▁▆▆
train_f1,▄▅▅▁▅▅▆▇▇▃▆▆▇█▆▇▇▃▇█
train_loss,█▆▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁
train_recall_n,▇▆█▁█▇█▇█▂▇▅█▇▆▇▆▁▇▇

0,1
epoch,19.0
test_f1,0.71516
test_recall_n,0.82651
train_f1,0.8295
train_loss,0.47988
train_recall_n,0.93447


100%|██████████| 20/20 [04:50<00:00, 14.53s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▂▄▄▄▆▆▅▆▄▇▇▇▇▇██▇██
test_recall_n,█▆▅▄▂▅▁▂▄▂▃▅▃▅▄▆█▂▆▅
train_f1,▁▂▃▃▃▅▅▅▅▄▆▇▇▇▇▇▇███
train_loss,█▅▅▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁
train_recall_n,█▇▆▆▅▆▄▄▅▄▄▆▄▆▅▆▇▁▆▅

0,1
epoch,19.0
test_f1,0.71986
test_recall_n,0.90253
train_f1,0.69622
train_loss,0.77858
train_recall_n,0.96555


100%|██████████| 20/20 [04:51<00:00, 14.58s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▂▂▅▅▅▇▄▄▅█▆▅█▇▇█▄▆▇
test_recall_n,▇▇▄▅▇▇▇▅▆▇▆▅▅▇█▇▃▁▃▆
train_f1,▁▁▂▃▃▄▅▄▄▅▆▆▅▇▆▆█▆▇▇
train_loss,█▆▆▅▅▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁
train_recall_n,██▅▆███▆▇▇▇▆▅▇█▇▂▁▅▇

0,1
epoch,19.0
test_f1,0.68947
test_recall_n,0.90936
train_f1,0.68312
train_loss,0.78377
train_recall_n,0.96924


Лучшие параметры (Grid Search): {'lr': 0.001, 'optimizer': <class 'torch.optim.adam.Adam'>}


## 4. Демо инференса модели  (2 балл)

В репозитории по ссылке https://github.com/goodevening13/last_hse

Все скриншоты также в репозитории

In [4]:
from utils.models import MainModel, MainModelConfig
import torch.optim as optim

config = MainModelConfig(embed_dim=64, hidden_size=64, output_size=7, num_layers=1, dropout=0.25)
model = MainModel(config).to(device)
    
_ = training_loop(model, 'LSTM', device, train_loader, val_loader, params=None, lr=0.001, optimizer=optim.RMSprop)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mgoodevening13[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 20/20 [04:29<00:00, 13.47s/it]


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
test_f1,▁▄▄▅▅▆▇▇▆█▇▇▇█▇▇█▇▇▇
test_recall_n,█▇▇▇▆█▅▇█▇▆▆▃▆▁▅▅▅▄▂
train_f1,▁▂▂▂▃▂▄▄▄▅▅▆▆▆▆▇▇███
train_loss,█▇▆▆▅▅▅▄▄▄▃▃▃▂▂▂▂▁▁▁
train_recall_n,████▇█▆▇▇█▇▇▅▇▁█▇▇▇▆

0,1
epoch,19.0
test_f1,0.75014
test_recall_n,0.83626
train_f1,0.91885
train_loss,0.28122
train_recall_n,0.96641


In [5]:
from huggingface_hub import login
with open("token", "r") as f:
        hf_token = f.read().strip()
login(token=hf_token)

In [7]:
model.push_to_hub("model_hse_hw", config=config)

model.safetensors:   0%|          | 0.00/33.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/goodevening13/model_hse_hw/commit/759fd10b988c9f6dc8541abd6b2d6adb5cf0d8d2', commit_message='Upload model', commit_description='', oid='759fd10b988c9f6dc8541abd6b2d6adb5cf0d8d2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/goodevening13/model_hse_hw', endpoint='https://huggingface.co', repo_type='model', repo_id='goodevening13/model_hse_hw'), pr_revision=None, pr_num=None)