In [1]:
!pip install naeval
!pip install slovnet
!pip install razdel
!pip install corus
!pip install intervaltree
!pip install ipymarkup
# !pip install -r requirements.txt

Collecting naeval
  Downloading naeval-0.2.0-py3-none-any.whl (52 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m798.2 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: naeval
Successfully installed naeval-0.2.0
Collecting slovnet
  Downloading slovnet-0.6.0-py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.7/46.7 kB[0m [31m738.2 kB/s[0m eta [36m0:00:00[0m
Collecting razdel (from slovnet)
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Collecting navec (from slovnet)
  Downloading navec-0.10.0-py3-none-any.whl (23 kB)
Installing collected packages: razdel, navec, slovnet
Successfully installed navec-0.10.0 razdel-0.5.0 slovnet-0.6.0
Collecting corus
  Downloading corus-0.10.0-py3-none-any.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.7/83.7 kB[0m [31m989.1 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: corus
Successfully i

In [2]:

from os import getenv, environ
from os.path import exists, join, expanduser
from random import seed, sample, randint, uniform
from subprocess import run

from tqdm.notebook import tqdm as log_progress

import torch
from torch import optim

from naeval.ner.datasets import (
    load_factru,
    load_ne5,
)

# from slovnet.s3 import S3
from slovnet.io import (
    format_jl,
    parse_jl,

    load_gz_lines,
    dump_gz_lines
)
from slovnet.board import (
    TensorBoard,
    LogBoard,
    MultiBoard
)
from slovnet.const import (
    TRAIN, TEST,
    PER, LOC, ORG,
    CUDA0,
)
from slovnet.token import tokenize

from slovnet.model.bert import (
    RuBERTConfig,
    BERTEmbedding,
    BERTEncoder,
    BERTNERHead,
    BERTNER
)
from slovnet.markup import (
    SpanMarkup,
    show_span_markup
)
from slovnet.vocab import BERTVocab, BIOTagsVocab
from slovnet.encoders.bert import BERTNERTrainEncoder, BERTInferEncoder
from slovnet.score import (
    NERBatchScore,
    NERScoreMeter,
    score_ner_batch
)
from slovnet.mask import (
    Masked,
    split_masked,
    pad_masked
)

from slovnet.infer.bert import BERTNERInfer, BERTTagDecoder

DATA_DIR = 'data'
MODEL_DIR = 'model'
BERT_DIR = 'bert'

RAW_DIR = join(DATA_DIR, 'raw')

CORUS_NE5 = join(RAW_DIR, 'Collection5')
CORUS_FACTRU = join(RAW_DIR, 'factRuEval-2016-master')

NE5 = join(DATA_DIR, 'ne5.jl.gz')
FACTRU = join(DATA_DIR, 'factru.jl.gz')

S3_DIR = '02_bert_ner'
S3_NE5 = join(S3_DIR, NE5)
S3_FACTRU = join(S3_DIR, FACTRU)

VOCAB = 'vocab.txt'
EMB = 'emb.pt'
ENCODER = 'encoder.pt'
NER = 'ner.pt'

BERT_VOCAB = join(BERT_DIR, VOCAB)
BERT_EMB = join(BERT_DIR, EMB)
BERT_ENCODER = join(BERT_DIR, ENCODER)

S3_RUBERT_DIR = '01_bert_news/rubert'
S3_MLM_DIR = '01_bert_news/model'
S3_BERT_VOCAB = join(S3_RUBERT_DIR, VOCAB)
S3_BERT_EMB = join(S3_MLM_DIR, EMB)
S3_BERT_ENCODER = join(S3_MLM_DIR, ENCODER)

MODEL_ENCODER = join(MODEL_DIR, ENCODER)
MODEL_NER = join(MODEL_DIR, NER)

S3_MODEL_ENCODER = join(S3_DIR, MODEL_ENCODER)
S3_MODEL_NER = join(S3_DIR, MODEL_NER)

BOARD_NAME = getenv('board_name', '02_bert_ner')
RUNS_DIR = 'runs'

TRAIN_BOARD = '01_train'
TEST_BOARD = '02_test'

SEED = int(getenv('seed', 72))
DEVICE = getenv('device', CUDA0)
BERT_LR = float(getenv('bert_lr', 0.000045))
LR = float(getenv('lr', 0.0075))
LR_GAMMA = float(getenv('lr_gamma', 0.45))
EPOCHS = int(getenv('epochs', 5))

SEQ_LEN = int(getenv('SEQ_LEN', 256))
BATCH_SIZE = int(getenv('BATCH_SIZE', 64))

#####################
#
#  CUSTOM TAGS TUNING
#
############### START

CUSTOM_TUNING = True # Set this flag to true in order to use your custom dataset and tags
# CUSTOM_TEXTS = join(DATA_DIR, 'custom-dataset.jl.gz') # Put your own data into the data dir
CUSTOM_TEXTS = join(DATA_DIR, 'new_data.jl.gz') # Put your own data into the data dir
TAGS = ['ORG', 'NUM', 'NAME_EMPLOYEE', 'LINK', 'DATE', 'ACRONYM', 'MAIL', 'TELEPHONE', 'TECH', 'NAME', 'PERCENT'] if CUSTOM_TUNING else [PER, LOC, ORG] # List all your custom tags

################# END


def process_batch(model, criterion, batch):
    input, target = batch

    pred = model(input.value)
    pred = pad_masked(pred, input.mask)
    mask = pad_masked(input.mask, input.mask)

    loss = criterion(pred, target.value, target.mask)

    pred = Masked(pred, mask)
    return batch.processed(loss, pred)

In [3]:
%load_ext autoreload
%autoreload 2

!mkdir -p {DATA_DIR} {MODEL_DIR}
# s3 = S3()

In [4]:
!mkdir data
!wget https://github.com/duusha/hackx5/raw/master/scripts/02_bert_ner/data/new_data.jl.gz
!mv new_data.jl.gz data
!wget https://github.com/duusha/hackx5/raw/master/scripts/02_bert_ner/bert.zip
!jar xvf bert.zip

mkdir: cannot create directory ‘data’: File exists
--2024-05-26 14:46:29--  https://github.com/duusha/hackx5/raw/master/scripts/02_bert_ner/data/new_data.jl.gz
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/duusha/hackx5/master/scripts/02_bert_ner/data/new_data.jl.gz [following]
--2024-05-26 14:46:29--  https://raw.githubusercontent.com/duusha/hackx5/master/scripts/02_bert_ner/data/new_data.jl.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 77565 (76K) [application/octet-stream]
Saving to: ‘new_data.jl.gz’


2024-05-26 14:46:29 (3.32 MB/s) - ‘new_data.jl.gz’ saved [77565/77565]

--2024-05-26

In [5]:
#if not exists(NE5):
#    s3.download(S3_NE5, NE5)
#    s3.download(S3_FACTRU, FACTRU)

In [6]:
# if not exists(BERT_VOCAB):
    # s3.download(S3_BERT_VOCAB, BERT_VOCAB)
    # s3.download(S3_BERT_EMB, BERT_EMB)
    # s3.download(S3_BERT_ENCODER, BERT_ENCODER)

In [7]:
words_vocab = BERTVocab.load(BERT_VOCAB)
tags_vocab = BIOTagsVocab(TAGS)

In [8]:
torch.manual_seed(SEED)
seed(SEED)

In [9]:
config = RuBERTConfig()
emb = BERTEmbedding.from_config(config)
encoder = BERTEncoder.from_config(config)
ner = BERTNERHead(config.emb_dim, len(tags_vocab))
model = BERTNER(emb, encoder, ner)

for param in emb.parameters():
    param.requires_grad = False

model.emb.load(BERT_EMB)
model.encoder.load(BERT_ENCODER)
model = model.to(DEVICE)

In [10]:
records = []
lines = load_gz_lines(CUSTOM_TEXTS) if CUSTOM_TUNING else load_gz_lines(NE5)
# lines = load_gz_lines(NE5)
items = parse_jl(lines)
items = log_progress(items)

for item in items:
    record = SpanMarkup.from_json(item)
    tokens = list(tokenize(record.text))
    record = record.to_bio(tokens)
    records.append(record)

size = round(len(records) * 0.2)

markups = {
    TEST: records[:size],
    TRAIN: records[size:]
}

0it [00:00, ?it/s]

In [11]:
encode = BERTNERTrainEncoder(
    words_vocab, tags_vocab,
    seq_len=128,
    batch_size=32,
    shuffle_size=10000
)

batches = {}
for name in [TEST, TRAIN]:
    batches[name] = [_.to(DEVICE) for _ in encode(markups[name])]
    # batches[name] = [_ for _ in encode(markups[name])]

In [12]:
board = MultiBoard([
    TensorBoard(BOARD_NAME, RUNS_DIR),
    LogBoard()
])
boards = {
    TRAIN: board.section(TRAIN_BOARD),
    TEST: board.section(TEST_BOARD),
}

In [13]:
optimizer = optim.Adam([
    dict(params=encoder.parameters(), lr=BERT_LR),
    dict(params=ner.parameters(), lr=LR),
])
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, LR_GAMMA)

In [14]:
meters = {
    TRAIN: NERScoreMeter(),
    TEST: NERScoreMeter(),
}

for epoch in log_progress(range(EPOCHS)):
    model.train()
    for batch in log_progress(batches[TRAIN], leave=False):
        optimizer.zero_grad()
        batch = process_batch(model, ner.crf, batch)
        batch.loss.backward()
        optimizer.step()

        score = NERBatchScore(batch.loss)
        meters[TRAIN].add(score)

    meters[TRAIN].write(boards[TRAIN])
    meters[TRAIN].reset()

    model.eval()
    with torch.no_grad():
        for batch in log_progress(batches[TEST], leave=False, desc=TEST):
            batch = process_batch(model, ner.crf, batch)
            batch.target = split_masked(batch.target.value, batch.target.mask)
            batch.pred = ner.crf.decode(batch.pred.value, batch.pred.mask)
            score = score_ner_batch(batch, tags_vocab)
            meters[TEST].add(score)

        meters[TEST].write(boards[TEST])
        meters[TEST].reset()

    scheduler.step()
    board.step()

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

[2024-05-26 14:46:56]    0 50.9338 01_train/01_loss


test:   0%|          | 0/2 [00:00<?, ?it/s]

[2024-05-26 14:46:56]    0 0.0008 02_test/01_loss
[2024-05-26 14:46:56]    0 0.0000 02_test/02_ORG
[2024-05-26 14:46:56]    0 0.0000 02_test/03_NUM
[2024-05-26 14:46:56]    0 0.0000 02_test/04_NAME_EMPLOYEE
[2024-05-26 14:46:56]    0 0.0000 02_test/05_LINK
[2024-05-26 14:46:56]    0 0.0000 02_test/06_DATE
[2024-05-26 14:46:56]    0 0.0000 02_test/07_ACRONYM
[2024-05-26 14:46:56]    0 0.0000 02_test/08_MAIL
[2024-05-26 14:46:56]    0 0.0000 02_test/09_TELEPHONE
[2024-05-26 14:46:56]    0 0.0000 02_test/10_TECH
[2024-05-26 14:46:56]    0 0.0000 02_test/11_NAME
[2024-05-26 14:46:56]    0 0.0000 02_test/12_PERCENT


  0%|          | 0/7 [00:00<?, ?it/s]

[2024-05-26 14:47:01]    1 0.0001 01_train/01_loss


test:   0%|          | 0/2 [00:00<?, ?it/s]

[2024-05-26 14:47:02]    1 0.0005 02_test/01_loss
[2024-05-26 14:47:02]    1 0.0000 02_test/02_ORG
[2024-05-26 14:47:02]    1 0.0000 02_test/03_NUM
[2024-05-26 14:47:02]    1 0.0000 02_test/04_NAME_EMPLOYEE
[2024-05-26 14:47:02]    1 0.0000 02_test/05_LINK
[2024-05-26 14:47:02]    1 0.0000 02_test/06_DATE
[2024-05-26 14:47:02]    1 0.0000 02_test/07_ACRONYM
[2024-05-26 14:47:02]    1 0.0000 02_test/08_MAIL
[2024-05-26 14:47:02]    1 0.0000 02_test/09_TELEPHONE
[2024-05-26 14:47:02]    1 0.0000 02_test/10_TECH
[2024-05-26 14:47:02]    1 0.0000 02_test/11_NAME
[2024-05-26 14:47:02]    1 0.0000 02_test/12_PERCENT


  0%|          | 0/7 [00:00<?, ?it/s]

[2024-05-26 14:47:07]    2 0.0002 01_train/01_loss


test:   0%|          | 0/2 [00:00<?, ?it/s]

[2024-05-26 14:47:08]    2 0.0012 02_test/01_loss
[2024-05-26 14:47:08]    2 0.0000 02_test/02_ORG
[2024-05-26 14:47:08]    2 0.0000 02_test/03_NUM
[2024-05-26 14:47:08]    2 0.0000 02_test/04_NAME_EMPLOYEE
[2024-05-26 14:47:08]    2 0.0000 02_test/05_LINK
[2024-05-26 14:47:08]    2 0.0000 02_test/06_DATE
[2024-05-26 14:47:08]    2 0.0000 02_test/07_ACRONYM
[2024-05-26 14:47:08]    2 0.0000 02_test/08_MAIL
[2024-05-26 14:47:08]    2 0.0000 02_test/09_TELEPHONE
[2024-05-26 14:47:08]    2 0.0000 02_test/10_TECH
[2024-05-26 14:47:08]    2 0.0000 02_test/11_NAME
[2024-05-26 14:47:08]    2 0.0000 02_test/12_PERCENT


  0%|          | 0/7 [00:00<?, ?it/s]

[2024-05-26 14:47:12]    3 -0.0002 01_train/01_loss


test:   0%|          | 0/2 [00:00<?, ?it/s]

[2024-05-26 14:47:13]    3 -0.0006 02_test/01_loss
[2024-05-26 14:47:13]    3 0.0000 02_test/02_ORG
[2024-05-26 14:47:13]    3 0.0000 02_test/03_NUM
[2024-05-26 14:47:13]    3 0.0000 02_test/04_NAME_EMPLOYEE
[2024-05-26 14:47:13]    3 0.0000 02_test/05_LINK
[2024-05-26 14:47:13]    3 0.0000 02_test/06_DATE
[2024-05-26 14:47:13]    3 0.0000 02_test/07_ACRONYM
[2024-05-26 14:47:13]    3 0.0000 02_test/08_MAIL
[2024-05-26 14:47:13]    3 0.0000 02_test/09_TELEPHONE
[2024-05-26 14:47:13]    3 0.0000 02_test/10_TECH
[2024-05-26 14:47:13]    3 0.0000 02_test/11_NAME
[2024-05-26 14:47:13]    3 0.0000 02_test/12_PERCENT


  0%|          | 0/7 [00:00<?, ?it/s]

[2024-05-26 14:47:18]    4 0.0006 01_train/01_loss


test:   0%|          | 0/2 [00:00<?, ?it/s]

[2024-05-26 14:47:19]    4 0.0000 02_test/01_loss
[2024-05-26 14:47:19]    4 0.0000 02_test/02_ORG
[2024-05-26 14:47:19]    4 0.0000 02_test/03_NUM
[2024-05-26 14:47:19]    4 0.0000 02_test/04_NAME_EMPLOYEE
[2024-05-26 14:47:19]    4 0.0000 02_test/05_LINK
[2024-05-26 14:47:19]    4 0.0000 02_test/06_DATE
[2024-05-26 14:47:19]    4 0.0000 02_test/07_ACRONYM
[2024-05-26 14:47:19]    4 0.0000 02_test/08_MAIL
[2024-05-26 14:47:19]    4 0.0000 02_test/09_TELEPHONE
[2024-05-26 14:47:19]    4 0.0000 02_test/10_TECH
[2024-05-26 14:47:19]    4 0.0000 02_test/11_NAME
[2024-05-26 14:47:19]    4 0.0000 02_test/12_PERCENT


In [15]:
# [2020-03-31 14:05:40]    0 14.3334 01_train/01_loss
# [2020-03-31 14:05:43]    0 2.3965 02_test/01_loss
# [2020-03-31 14:05:43]    0 0.9962 02_test/02_PER
# [2020-03-31 14:05:43]    0 0.9807 02_test/03_LOC
# [2020-03-31 14:05:43]    0 0.9691 02_test/04_ORG
# [2020-03-31 14:06:10]    1 1.8448 01_train/01_loss
# [2020-03-31 14:06:13]    1 2.1326 02_test/01_loss
# [2020-03-31 14:06:13]    1 0.9975 02_test/02_PER
# [2020-03-31 14:06:13]    1 0.9862 02_test/03_LOC
# [2020-03-31 14:06:13]    1 0.9710 02_test/04_ORG
# [2020-03-31 14:06:40]    2 1.2753 01_train/01_loss
# [2020-03-31 14:06:43]    2 2.1436 02_test/01_loss
# [2020-03-31 14:06:43]    2 0.9972 02_test/02_PER
# [2020-03-31 14:06:43]    2 0.9867 02_test/03_LOC
# [2020-03-31 14:06:43]    2 0.9705 02_test/04_ORG
# [2020-03-31 14:07:10]    3 1.1283 01_train/01_loss
# [2020-03-31 14:07:13]    3 2.1885 02_test/01_loss
# [2020-03-31 14:07:13]    3 0.9975 02_test/02_PER
# [2020-03-31 14:07:13]    3 0.9867 02_test/03_LOC
# [2020-03-31 14:07:13]    3 0.9719 02_test/04_ORG
# [2020-03-31 14:07:40]    4 1.0464 01_train/01_loss

# [2020-03-31 14:07:43]    4 2.1705 02_test/01_loss
# [2020-03-31 14:07:43]    4 0.9977 02_test/02_PER
# [2020-03-31 14:07:43]    4 0.9862 02_test/03_LOC
# [2020-03-31 14:07:43]    4 0.9722 02_test/04_ORG

In [16]:
model.encoder.dump(MODEL_ENCODER)
ner.dump(MODEL_NER)

# s3.upload(MODEL_ENCODER, S3_MODEL_ENCODER)
# s3.upload(MODEL_NER, S3_MODEL_NER)

In [17]:
!zip -r model.zip model

  adding: model/ (stored 0%)
  adding: model/encoder.pt (deflated 7%)
  adding: model/ner.pt (deflated 9%)
