In [1]:
!pip install lightning gensim torch

from time import perf_counter
from contextlib import contextmanager
from typing import Optional
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
from pathlib import Path
import lightning as L
from torch.utils.data import DataLoader, Dataset
from sklearn.utils.class_weight import compute_class_weight
from torchmetrics import MetricCollection
from torchmetrics.classification import MulticlassAccuracy, MulticlassPrecision, MulticlassRecall, MulticlassF1Score
import torch.optim as optim
from lightning.pytorch.callbacks import TQDMProgressBar, ModelCheckpoint
from lightning.pytorch.loggers.tensorboard import TensorBoardLogger
from gensim.models import KeyedVectors
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
from lightning.pytorch.tuner import Tuner

Collecting lightning
  Obtaining dependency information for lightning from https://files.pythonhosted.org/packages/8c/a1/b2a6c33675510bc3e1ca6d010b244ac0dd9c81fc1723a37e7491aa586041/lightning-2.1.3-py3-none-any.whl.metadata


  Downloading lightning-2.1.3-py3-none-any.whl.metadata (56 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/56.8 kB[0m [31m?[0m eta [36m-:--:--[0m


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
















Downloading lightning-2.1.3-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m


[2K   [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/2.0 MB[0m [31m15.1 MB/s[0m eta [36m0:00:01[0m


[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.0/2.0 MB[0m [31m36.1 MB/s[0m eta [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25h

Installing collected packages: lightning


Successfully installed lightning-2.1.3




## Models training

In [2]:
@contextmanager
def measure_time() -> float:
    start = perf_counter()
    yield lambda: perf_counter() - start

In [3]:
POLISH_TRANSFORMER_MODEL_NAME = "dkleczek/bert-base-polish-cased-v1"
DATA_PATH = Path("/kaggle/input/poleval")
CWD_PATH = Path.cwd()

In [4]:
!wget https://github.com/sdadas/polish-nlp-resources/releases/download/v1.0/glove.zip
!unzip /kaggle/working/glove.zip
!rm /kaggle/working/glove.zip

--2023-12-28 21:23:39--  https://github.com/sdadas/polish-nlp-resources/releases/download/v1.0/glove.zip
Resolving github.com (github.com)... 

140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 

302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/151131563/b3ad1180-acb6-11e9-83f1-dcfed2e65aca?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20231228%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20231228T212340Z&X-Amz-Expires=300&X-Amz-Signature=e3124c9c16fc4112786adc9e34534a0ba353760c44100d886435043ee03a9895&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=151131563&response-content-disposition=attachment%3B%20filename%3Dglove.zip&response-content-type=application%2Foctet-stream [following]
--2023-12-28 21:23:40--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/151131563/b3ad1180-acb6-11e9-83f1-dcfed2e65aca?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20231228%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20231228T212340Z&X-Amz-Expires=300&X-Amz-Signature=e3124c9c16fc4112786adc9e34534a0ba353760c44100d886435043ee03a9895&X-Amz-SignedHeaders=host&actor_id

HTTP request sent, awaiting response... 

200 OK
Length: 656379892 (626M) [application/octet-stream]
Saving to: 'glove.zip'


glove.zip             0%[                    ]       0  --.-KB/s               


glove.zip             4%[                    ]  25.41M   127MB/s               


glove.zip            11%[=>                  ]  73.65M   184MB/s               


glove.zip            22%[===>                ] 140.32M   234MB/s               


glove.zip            30%[=====>              ] 191.04M   235MB/s               























2023-12-28 21:23:43 (217 MB/s) - 'glove.zip' saved [656379892/656379892]



Archive:  /kaggle/working/glove.zip
  inflating: glove_100_3_polish.txt  




In [5]:
class TransformerWrapper(nn.Module):

    def __init__(self, model_name: str = POLISH_TRANSFORMER_MODEL_NAME, start_training_layer: int = -1, num_classes: int = 2):
        super().__init__()

        self.model, model_out_channels = self._get_transformer(model_name=model_name, start_training_layer=start_training_layer)

        self.classifier = nn.Sequential(
            nn.Linear(in_features=model_out_channels, out_features=1024),
            nn.SiLU(),
            nn.Dropout(0.2),
            nn.Linear(in_features=1024, out_features=num_classes),
            nn.LogSoftmax(dim=1)
        )

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor):
        pooler_output = self.model(input_ids, attention_mask=attention_mask)["pooler_output"]

        return self.classifier(pooler_output)
    
    def _get_transformer(self, model_name: str, start_training_layer: int):
        """Get pretrained Transformer model.

        Args:
            start_training_layer (int): Get number of layer from which model will be unfrozen. Pass -1 if unfreeze none of them.
        """
        model = AutoModel.from_pretrained(model_name)

        if start_training_layer == -1:
            for param in model.parameters():
                param.requires_grad = False
            return model, model.pooler.dense.out_features

        start_training_index = start_training_layer * 16

        for param in model.embeddings.parameters():
            param.requires_grad = False

        for idx, param in enumerate(model.encoder.layer.parameters()):
            param.requires_grad = False if idx < start_training_index else True

        for param in model.pooler.parameters():
            param.requires_grad = True if start_training_layer != -1 else False

        return model, model.pooler.dense.out_features

In [6]:
class TransfromerDataset(Dataset):
    def __init__(self, data_df: pd.DataFrame, target_column: str, text_column: str, model_name: str = POLISH_TRANSFORMER_MODEL_NAME):
        super().__init__()

        self.data, self.target = self._prepare_data_to_transformer(
            data_df=data_df,
            target_column=target_column,
            text_column=text_column,
            model_name=model_name
        )

        self.class_mapping = {
            class_name: idx for idx, class_name in enumerate((np.unique(self.target)))
        }

        self.num_classes = max(list(self.class_mapping.values())) + 1

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        sample_data_input_id = torch.tensor(self.data["input_ids"][index])
        sample_data_attention_mask = torch.tensor(self.data["attention_mask"][index])
        sample_target = F.one_hot(
            torch.tensor(self.class_mapping[self.target[index]]), num_classes=self.num_classes
        ).float()

        return sample_data_input_id, sample_data_attention_mask, sample_target
    
    def _prepare_data_to_transformer(
        self, data_df: pd.DataFrame, target_column: str, text_column: str, model_name: str = POLISH_TRANSFORMER_MODEL_NAME
    ):
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        data = tokenizer.batch_encode_plus(
            data_df[text_column].tolist(),
            max_length = 512,
            padding='max_length',
            truncation=True
        )

        target = data_df[target_column].tolist()

        return data, target
    
    def __len__(self) -> int:
        return len(self.target)
    
    def get_labels(self) -> list[int]:
        return [self.class_mapping[label] for label in self.target]

In [7]:
class TransformerDatasetModule(L.LightningDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.save_hyperparameters()

    def setup(self, stage: Optional[str] = None):
        self.train = TransfromerDataset(
            data_df=pd.read_csv(self.hparams.data_root / "train.csv"),
            target_column=self.hparams.target_column,
            text_column=self.hparams.text_column,
            model_name=self.hparams.model_name
        )
        self.test = TransfromerDataset(
            data_df=pd.read_csv(self.hparams.data_root / "test.csv"),
            target_column=self.hparams.target_column,
            text_column=self.hparams.text_column,
            model_name=self.hparams.model_name
        )

    def train_dataloader(self):
        return DataLoader(self.train, batch_size=self.hparams.batch_size, shuffle=True)

    def test_dataloader(self):
        return DataLoader(self.test, batch_size=self.hparams.batch_size, shuffle=False)
    
    def get_class_weights(self) -> list[float]:
        labels = self.train.get_labels()
        return torch.tensor(compute_class_weight('balanced', classes=np.unique(labels), y=labels))

In [8]:
class TransformerModule(L.LightningModule):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.save_hyperparameters()

        self.model = TransformerWrapper(
            model_name=self.hparams.model_name,
            start_training_layer=self.hparams.start_training_layer,
            num_classes=self.hparams.num_classes
        )

        metrics = MetricCollection([
            MulticlassAccuracy(self.hparams.num_classes, average=None),
            MulticlassPrecision(self.hparams.num_classes, average=None),
            MulticlassRecall(self.hparams.num_classes, average=None),
            MulticlassF1Score(self.hparams.num_classes, average=None)
        ])
        self.metrics = {
            "train": metrics.clone(prefix='train_'),
            "test": metrics.clone(prefix='test_')
        }

        self.criterion = nn.CrossEntropyLoss(weight=self.hparams.class_weights)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor):
        return self.model(input_ids, attention_mask)

    def training_step(self, batch, batch_idx):
        return self._shared_eval(batch, batch_idx, "train")

    def test_step(self, batch, batch_idx):
        return self._shared_eval(batch, batch_idx, "test")

    def _shared_eval(self, batch, batch_idx, stage):
        input_ids, attention_mask, targets = batch
        logits = self(input_ids, attention_mask)

        loss = self.criterion(logits, targets)

        self.metrics[stage].update(torch.argmax(logits, -1).detach().cpu(), torch.argmax(targets, -1).detach().cpu())

        self.log(f"{stage}_loss", loss.detach().cpu(), on_epoch=True, on_step=True)
        return loss
    
    def on_train_epoch_end(self) -> None:
        metrics = self.metrics["train"].compute()

        for metric_name, values in metrics.items():
            for idx, value in enumerate(values):
                self.log(f"{metric_name}_class_{idx}", value, on_epoch=True)

        self.metrics["train"].reset()

    def on_test_epoch_end(self) -> None:
        metrics = self.metrics["test"].compute()

        for metric_name, values in metrics.items():
            for idx, value in enumerate(values):
                self.log(f"{metric_name}_class_{idx}", value, on_epoch=True)

        self.metrics["test"].reset()

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr)
        scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.97)
        return [optimizer], [scheduler]

# Transformer Training

## Setup
- only classification head
- unfreeze last encoder layer + classification head
- unfreeze last 2 encoder layers + classification head

In [9]:
transformer_scores = {}

for start_training_layer in [-1, 8, 9, 10]:
    datamodule = TransformerDatasetModule(
        target_column="label",
        text_column="preprocessed_text",
        batch_size=64,
        model_name=POLISH_TRANSFORMER_MODEL_NAME,
        data_root=DATA_PATH
    )
    datamodule.setup()

    model = TransformerModule(
        model_name=POLISH_TRANSFORMER_MODEL_NAME,
        num_classes=2,
        start_training_layer=start_training_layer,
        lr=2e-5,
        class_weights=datamodule.get_class_weights()
    )

    trainer = L.Trainer(
        max_epochs=50,
        accelerator="gpu",
        devices="auto",
        callbacks=[TQDMProgressBar(refresh_rate=2), ModelCheckpoint(
            dirpath="checkpoints/",
            filename="{epoch}-{train_loss:.2f}",
            mode="min",
            monitor='train_loss',
            
        )],
        logger=TensorBoardLogger(save_dir="logs/"),
        log_every_n_steps=2,
    )

    tuner = Tuner(trainer)

    tuner.lr_find(
        model=model,
        datamodule=datamodule,
        method="fit"
    )

    trainer.fit(model, datamodule=datamodule)
    
    best_model_path = trainer.checkpoint_callback.best_model_path
    best_model = TransformerModule.load_from_checkpoint(best_model_path)
    
    transformer_test_scores = trainer.test(best_model, datamodule=datamodule)

    transformer_scores[start_training_layer] = transformer_test_scores[0]

tokenizer_config.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/489k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/531M [00:00<?, ?B/s]

INFO: GPU available: True (cuda), used: True


INFO: TPU available: False, using: 0 TPU cores


INFO: IPU available: False, using: 0 IPUs


INFO: HPU available: False, using: 0 HPUs




INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

INFO: LR finder stopped early after 91 steps due to diverging loss.


INFO: Learning rate set to 0.09120108393559097


INFO: Restoring states from the checkpoint path at /kaggle/working/.lr_find_83f20473-2ca6-4862-9c83-71aebc98ade1.ckpt


INFO: Restored all states from the checkpoint at /kaggle/working/.lr_find_83f20473-2ca6-4862-9c83-71aebc98ade1.ckpt


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


INFO: 
  | Name      | Type               | Params
-------------------------------------------------
0 | model     | TransformerWrapper | 132 M 
1 | criterion | CrossEntropyLoss   | 0     
-------------------------------------------------
789 K     Trainable params
132 M     Non-trainable params
132 M     Total params
531.643   Total estimated model params size (MB)


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=50` reached.


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

INFO: GPU available: True (cuda), used: True


INFO: TPU available: False, using: 0 TPU cores


INFO: IPU available: False, using: 0 IPUs


INFO: HPU available: False, using: 0 HPUs


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /kaggle/working/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

INFO: LR finder stopped early after 80 steps due to diverging loss.


INFO: Learning rate set to 3.311311214825911e-05


INFO: Restoring states from the checkpoint path at /kaggle/working/.lr_find_fd8f04ae-89f2-4e6a-bd5b-0d59a23f58cb.ckpt


INFO: Restored all states from the checkpoint at /kaggle/working/.lr_find_fd8f04ae-89f2-4e6a-bd5b-0d59a23f58cb.ckpt


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /kaggle/working/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


INFO: 
  | Name      | Type               | Params
-------------------------------------------------
0 | model     | TransformerWrapper | 132 M 
1 | criterion | CrossEntropyLoss   | 0     
-------------------------------------------------
29.7 M    Trainable params
103 M     Non-trainable params
132 M     Total params
531.643   Total estimated model params size (MB)


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=50` reached.


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

INFO: GPU available: True (cuda), used: True


INFO: TPU available: False, using: 0 TPU cores


INFO: IPU available: False, using: 0 IPUs


INFO: HPU available: False, using: 0 HPUs


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /kaggle/working/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

INFO: LR finder stopped early after 76 steps due to diverging loss.


INFO: Learning rate set to 0.001584893192461114


INFO: Restoring states from the checkpoint path at /kaggle/working/.lr_find_3d3aa8f3-d3cd-4676-b9a1-e32fa7cc7899.ckpt


INFO: Restored all states from the checkpoint at /kaggle/working/.lr_find_3d3aa8f3-d3cd-4676-b9a1-e32fa7cc7899.ckpt


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /kaggle/working/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


INFO: 
  | Name      | Type               | Params
-------------------------------------------------
0 | model     | TransformerWrapper | 132 M 
1 | criterion | CrossEntropyLoss   | 0     
-------------------------------------------------
22.6 M    Trainable params
110 M     Non-trainable params
132 M     Total params
531.643   Total estimated model params size (MB)


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=50` reached.


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

INFO: GPU available: True (cuda), used: True


INFO: TPU available: False, using: 0 TPU cores


INFO: IPU available: False, using: 0 IPUs


INFO: HPU available: False, using: 0 HPUs


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /kaggle/working/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

INFO: LR finder stopped early after 76 steps due to diverging loss.


INFO: Learning rate set to 4.365158322401661e-06


INFO: Restoring states from the checkpoint path at /kaggle/working/.lr_find_a6b16f69-9ef6-491b-8455-c048cc8f5825.ckpt


INFO: Restored all states from the checkpoint at /kaggle/working/.lr_find_a6b16f69-9ef6-491b-8455-c048cc8f5825.ckpt


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /kaggle/working/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


INFO: 
  | Name      | Type               | Params
-------------------------------------------------
0 | model     | TransformerWrapper | 132 M 
1 | criterion | CrossEntropyLoss   | 0     
-------------------------------------------------
15.6 M    Trainable params
117 M     Non-trainable params
132 M     Total params
531.643   Total estimated model params size (MB)


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=50` reached.


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

In [10]:
pd.DataFrame.from_dict(transformer_scores)

Unnamed: 0,-1,8,9,10
test_loss_epoch,1.013691,1.925034,0.69253,0.590184
test_MulticlassAccuracy_class_0,0.095704,0.970092,0.0,0.898314
test_MulticlassAccuracy_class_1,0.941176,0.570588,1.0,0.758824
test_MulticlassPrecision_class_0,0.946237,0.960689,0.0,0.975783
test_MulticlassPrecision_class_1,0.087767,0.638158,0.084619,0.408228
test_MulticlassRecall_class_0,0.095704,0.970092,0.0,0.898314
test_MulticlassRecall_class_1,0.941176,0.570588,1.0,0.758824
test_MulticlassF1Score_class_0,0.173827,0.965368,0.0,0.935447
test_MulticlassF1Score_class_1,0.160562,0.602484,0.156035,0.530864


In [11]:
class Word2VecWrapper(nn.Module):

    def __init__(self, num_classes: int = 2, num_layers: int = 1):
        super().__init__()

        self.lstm = nn.LSTM(input_size=100, hidden_size=256, batch_first=True, num_layers=num_layers, bidirectional=True, dropout=0.2)

        self.fcn = nn.Sequential(
            nn.Linear(256, 512),
            nn.SiLU(),
            nn.Linear(512, num_classes)
        )

    def forward(self, sequence):
        _, (last_hidden, _) = self.lstm(sequence)

        return self.fcn(last_hidden[-1])
    
    def _get_word2vec(self, model_path: str):
        return KeyedVectors.load_word2vec_format(model_path)

In [12]:
class LSTMModule(L.LightningModule):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.save_hyperparameters()

        self.model = Word2VecWrapper(
            num_classes=self.hparams.num_classes,
            num_layers=self.hparams.num_layers
        )

        metrics = MetricCollection([
            MulticlassAccuracy(self.hparams.num_classes, average=None),
            MulticlassPrecision(self.hparams.num_classes, average=None),
            MulticlassRecall(self.hparams.num_classes, average=None),
            MulticlassF1Score(self.hparams.num_classes, average=None)
        ])
        self.metrics = {
            "train": metrics.clone(prefix='train_'),
            "test": metrics.clone(prefix='test_')
        }

        self.criterion = nn.CrossEntropyLoss(weight=self.hparams.class_weights)

    def forward(self, sequence):
        return self.model(sequence)

    def training_step(self, batch, batch_idx):
        return self._shared_eval(batch, batch_idx, "train")

    def test_step(self, batch, batch_idx):
        return self._shared_eval(batch, batch_idx, "test")

    def _shared_eval(self, batch, batch_idx, stage):
        sequences, targets = batch
        batch_size = targets.shape[0]
        logits = self(sequences)

        loss = self.criterion(logits, targets)

        self.metrics[stage].update(torch.argmax(logits, -1).detach().cpu(), torch.argmax(targets, -1).detach().cpu())

        self.log(f"{stage}_loss", loss.detach().cpu(), on_epoch=True, on_step=True, batch_size=batch_size)
        return loss
    
    def on_train_epoch_end(self) -> None:
        metrics = self.metrics["train"].compute()

        for metric_name, values in metrics.items():
            for idx, value in enumerate(values):
                self.log(f"{metric_name}_class_{idx}", value, on_epoch=True)

        self.metrics["train"].reset()

    def on_test_epoch_end(self) -> None:
        metrics = self.metrics["test"].compute()

        for metric_name, values in metrics.items():
            for idx, value in enumerate(values):
                self.log(f"{metric_name}_class_{idx}", value, on_epoch=True)

        self.metrics["test"].reset()

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.hparams.lr)
        scheduler = optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.97)
        return [optimizer], [scheduler]

In [13]:
class LSTMDataset(Dataset):
    def __init__(self, data_df: pd.DataFrame, target_column: str, text_column: str, model_path: str = "glove_100_3_polish.txt"):
        super().__init__()

        self.word2vec = KeyedVectors.load_word2vec_format(model_path)

        self.data, self.target = self._prepare_data_to_transformer(
            data_df=data_df,
            target_column=target_column,
            text_column=text_column,
        )

        self.class_mapping = {
            class_name: idx for idx, class_name in enumerate((np.unique(self.target)))
        }

        self.num_classes = max(list(self.class_mapping.values())) + 1

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        sample_data = torch.tensor(self.data[index]).float()

        sample_target = F.one_hot(
            torch.tensor(self.class_mapping[self.target[index]]), num_classes=self.num_classes
        ).float()

        return sample_data, sample_target
    
    def _prepare_data_to_transformer(
        self, data_df: pd.DataFrame, target_column: str, text_column: str
    ):
        data = data_df[text_column].tolist()

        data = [
            element.split(" ") for element in data
        ]

        oov_embedding = np.random.random(self.word2vec.vector_size)

        data = [
            [
                self.word2vec.get_vector(word) if word in self.word2vec.key_to_index else oov_embedding for word  in words
            ] for words in data
        ]

        target = data_df[target_column].tolist()

        return data, target
    
    def __len__(self) -> int:
        return len(self.target)
    
    def get_labels(self) -> list[int]:
        return [self.class_mapping[label] for label in self.target]

In [14]:
class LSTMDatasetModule(L.LightningDataModule):
    def __init__(self, *args, **kwargs):
        super().__init__()
        self.save_hyperparameters()

    def setup(self, stage: Optional[str] = None):
        self.train = LSTMDataset(
            data_df=pd.read_csv(self.hparams.data_root / "train.csv"),
            target_column=self.hparams.target_column,
            text_column=self.hparams.text_column,
        )
        self.test = LSTMDataset(
            data_df=pd.read_csv(self.hparams.data_root / "test.csv"),
            target_column=self.hparams.target_column,
            text_column=self.hparams.text_column,
        )

    def train_dataloader(self):
        return DataLoader(self.train, batch_size=self.hparams.batch_size, shuffle=True, collate_fn=self._collate_fn)

    def test_dataloader(self):
        return DataLoader(self.test, batch_size=self.hparams.batch_size, shuffle=False, collate_fn=self._collate_fn)
    
    def _collate_fn(self, batch: list[tuple[torch.Tensor, torch.Tensor]]):
        sequences, targets = [seq for seq, _ in batch], [target for _, target in batch]
        
        lengths = [len(seq) for seq in sequences]
        
        padded_seqs = pad_sequence(sequences, batch_first=True)
        
        packed_seqs = pack_padded_sequence(padded_seqs, lengths, batch_first=True, enforce_sorted=False)

        return packed_seqs, torch.stack(targets)
    
    def get_class_weights(self) -> list[float]:
        labels = self.train.get_labels()
        return torch.tensor(compute_class_weight('balanced', classes=np.unique(labels), y=labels))

# LSTM Training

## Setup
- Word embeddings from GloVe + LSTM

In [15]:
lstm_test_scores = {}

for num_layers in [1, 3, 5, 10]:
    datamodule = LSTMDatasetModule(
        target_column="label",
        text_column="preprocessed_text",
        batch_size=128,
        model_path=CWD_PATH / "glove_100_3_polish.txt",
        data_root=DATA_PATH
    )
    datamodule.setup()

    model = LSTMModule(
        model_path=CWD_PATH / "glove_100_3_polish.txt",
        num_classes=2,
        lr=1e-3,
        class_weights=datamodule.get_class_weights(),
        num_layers=num_layers
    )

    trainer = L.Trainer(
        max_epochs=50,
        accelerator="gpu",
        devices="auto",
        callbacks=[TQDMProgressBar(refresh_rate=2), ModelCheckpoint(
            dirpath="checkpoints/",
            filename="{epoch}-{train_loss:.2f}",
            mode="min",
            monitor='train_loss',

        )],
        logger=TensorBoardLogger(save_dir="logs/"),
        log_every_n_steps=2,
    )

    tuner = Tuner(trainer)

    tuner.lr_find(
        model=model,
        datamodule=datamodule,
        method="fit"
    )

    trainer.fit(model, datamodule=datamodule)
    lstm_test_scores[num_layers] = trainer.test(model, datamodule=datamodule)[0]

INFO: GPU available: True (cuda), used: True


INFO: TPU available: False, using: 0 TPU cores


INFO: IPU available: False, using: 0 IPUs


INFO: HPU available: False, using: 0 HPUs


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /kaggle/working/checkpoints exists and is not empty.


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
  sample_data = torch.tensor(self.data[index]).float()


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

INFO: LR finder stopped early after 95 steps due to diverging loss.


INFO: Learning rate set to 0.00017378008287493763


INFO: Restoring states from the checkpoint path at /kaggle/working/.lr_find_d0807e5f-c7ab-48ac-9a49-a0bd91c3e72e.ckpt


INFO: Restored all states from the checkpoint at /kaggle/working/.lr_find_d0807e5f-c7ab-48ac-9a49-a0bd91c3e72e.ckpt


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /kaggle/working/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


INFO: 
  | Name      | Type             | Params
-----------------------------------------------
0 | model     | Word2VecWrapper  | 865 K 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
865 K     Trainable params
0         Non-trainable params
865 K     Total params
3.463     Total estimated model params size (MB)


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=50` reached.


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

INFO: GPU available: True (cuda), used: True


INFO: TPU available: False, using: 0 TPU cores


INFO: IPU available: False, using: 0 IPUs


INFO: HPU available: False, using: 0 HPUs


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /kaggle/working/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

INFO: LR finder stopped early after 91 steps due to diverging loss.


INFO: Learning rate set to 0.0022908676527677745


INFO: Restoring states from the checkpoint path at /kaggle/working/.lr_find_7659d74d-9db9-4e23-b1b4-cda6e2c656e7.ckpt


INFO: Restored all states from the checkpoint at /kaggle/working/.lr_find_7659d74d-9db9-4e23-b1b4-cda6e2c656e7.ckpt


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:639: Checkpoint directory /kaggle/working/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


INFO: 
  | Name      | Type             | Params
-----------------------------------------------
0 | model     | Word2VecWrapper  | 4.0 M 
1 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
4.0 M     Trainable params
0         Non-trainable params
4.0 M     Total params
16.079    Total estimated model params size (MB)


/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

In [None]:
pd.DataFrame.from_dict(lstm_test_scores)

## Data generation

In [None]:
from transformers import BertForMaskedLM, BertTokenizer, pipeline
from typing import Iterable, Iterator


class FillingMaskDataGenerator:
    def __init__(self) -> None:
        model = BertForMaskedLM.from_pretrained("dkleczek/bert-base-polish-uncased-v1")
        tokenizer = BertTokenizer.from_pretrained("dkleczek/bert-base-polish-uncased-v1")
        self.nlp = pipeline('fill-mask', model=model, tokenizer=tokenizer, top_k=3)

    def get_for_single(self, masked_sentence: str, n: int = 3) -> Iterator[str]:
        """Create n examples with filled mask

        Args:
            masked_sentence (str): Sentence with '[MASK]' where to fill
            n (int, optional): n examples. Defaults to 3.
        """
        yield from [result["sequence"] for result in self.nlp(masked_sentence)]
    
    def get_for_iterable(self, masked_sequences: Iterable[str], n: int = 3) -> Iterator[str]:
        for masked_sequence in masked_sequences:
            yield from self.get_for_single(masked_sentence=masked_sequence, n=n)