In [1]:
import pytorch_lightning as pl
import torch
import json
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer

from transformers import BertForTokenClassification, AdamW, get_linear_schedule_with_warmup, logging
from tokenizers import BertWordPieceTokenizer
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

# from flat_dataset import FlatNERELDataset, collate_to_max_length
from baseline import BaselineRuBERT
from iobes_flat_dataset import IOBESFlatRuNNEDataset, collate_to_max_length
from score import Evaluator

In [2]:
# --- Tokenizer / evaluation resources ---------------------------------------
VOCAB_PATH = "./vocab.txt"          # vocabulary file given to the WordPiece tokenizer
NERS_PATH = "./eval/ref/ners.txt"   # passed to the datasets as `ners_path`
IN_PATH = "./eval"                  # passed to the datasets and the model as `in_path`
OUT_PATH = "./eval"                 # passed to the model as `out_path`

# --- Dataset locations ------------------------------------------------------
TRAIN_PATH = "../data/train"
DEV_PATH = "../data/dev"
# TEST_PATH = "./data/test"         # (disabled)

# Files passed to the datasets as `format_path`
TRAIN_IDS_PATH = "../public_data/train.jsonl"
DEV_IDS_PATH = "../public_data/dev.jsonl"
# TEST_IDS_PATH = "./data/test.jsonl"   # (disabled)

# --- Checkpoint output ------------------------------------------------------
CKPT_PATH = "./checkpoints"

# --- Hyper-parameters -------------------------------------------------------
MAX_LEN = 128         # `max_length` handed to the datasets
BATCH_SIZE = 1        # DataLoader batch size (train and dev)
NUM_WORKERS = 8       # DataLoader worker processes
MAX_EPOCHS = 1        # number of epochs used when computing the model's total_steps
LR = 1e-4             # learning rate passed to the model
WEIGHT_DECAY = 0.02   # weight decay passed to the model

In [3]:
# `logging` here is the transformers logging module (see the imports cell);
# restrict its output to error-level messages to keep cell output readable.
logging.set_verbosity_error()

In [4]:
# Build the WordPiece tokenizer from the vocabulary file.
# lowercase=False: input text is not lowercased before tokenization.
bertwptokenizer = BertWordPieceTokenizer(VOCAB_PATH, lowercase=False)

In [5]:
# Training split: dataset object plus a shuffling DataLoader over it.
train_dataset = IOBESFlatRuNNEDataset(
    dataset_name="train",
    dataset_path=TRAIN_PATH,
    ners_path=NERS_PATH,
    format_path=TRAIN_IDS_PATH,
    in_path=IN_PATH,
    tokenizer=bertwptokenizer,
    max_length=MAX_LEN,
)

# Batches are shuffled for training.
# NOTE(review): collate_to_max_length presumably pads samples within a batch
# to a common length — confirm in iobes_flat_dataset.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    collate_fn=collate_to_max_length,
)

Loading train:


  0%|          | 0/922 [00:00<?, ?it/s]

In [6]:
# Validation split: dataset object plus a fixed-order DataLoader over it.
dev_dataset = IOBESFlatRuNNEDataset(
    dataset_name="dev",
    dataset_path=DEV_PATH,
    ners_path=NERS_PATH,
    format_path=DEV_IDS_PATH,
    in_path=IN_PATH,
    tokenizer=bertwptokenizer,
    max_length=MAX_LEN,
)

# No shuffling here, so batches come in dataset order on every evaluation pass.
# NOTE(review): collate_to_max_length presumably pads samples within a batch
# to a common length — confirm in iobes_flat_dataset.
dev_dataloader = DataLoader(
    dev_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    collate_fn=collate_to_max_length,
)

Loading dev:


  0%|          | 0/323 [00:00<?, ?it/s]

In [7]:
# Instantiate the baseline model.
#
# total_steps is derived from the number of batches the training DataLoader
# actually yields per epoch (len(train_dataloader)) instead of
# len(train_dataset) // BATCH_SIZE. The floor division undercounts by one step
# per epoch whenever the dataset size is not a multiple of BATCH_SIZE, because
# the loader is created with the default drop_last=False and therefore keeps
# the final partial batch. With BATCH_SIZE = 1 both expressions are equal.
# NOTE(review): total_steps is presumably the LR-scheduler horizon inside
# BaselineRuBERT — confirm in baseline.py.
model = BaselineRuBERT(
    in_path=IN_PATH,
    out_path=OUT_PATH,
    tag_to_id=train_dataset.tag_to_id,
    total_steps=len(train_dataloader) * MAX_EPOCHS,
    lr=LR,
    weight_decay=WEIGHT_DECAY,
)

In [8]:
# Checkpointing: keep only the single best checkpoint, ranked by the
# 'macro_f1' metric (higher is better).
checkpoint_callback = ModelCheckpoint(
    dirpath=CKPT_PATH,   # directory the checkpoints are written to
    monitor="macro_f1",  # metric used to rank checkpoints
    mode="max",          # the checkpoint with the highest metric value wins
    save_top_k=1,
    verbose=True,
)

In [9]:
# Build the Lightning trainer.
trainer = Trainer(
    # gpus = -1,
    callbacks=[checkpoint_callback],
    # -1 runs the entire validation set as the pre-training sanity check.
    num_sanity_val_steps=-1,
    # Use the shared constant rather than a literal so the epoch count always
    # matches the value used to compute the model's total_steps.
    max_epochs=MAX_EPOCHS,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [10]:
trainer.fit(model, train_dataloader, dev_dataloader) # Start training and validation, with monitoring


  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | BertForTokenClassification | 177 M 
-----------------------------------------------------
177 M     Trainable params
0         Non-trainable params
177 M     Total params
709.411   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]


Loss on dev: 5589.921964
Mention F1: 0.00%
Mention recall: 0.00%
Mention precision: 0.00%
Macro F1: 0.00%
Macro F1 few-shot: 0.00%


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
