In [1]:
import torch
from transformers import get_constant_schedule_with_warmup
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

from dataloader import CoNLLReader
from tqdm import tqdm

In [2]:
try:
    del globals()['NERmodelbase']
except:
    pass
from NERmodel import NERmodelbase

In [3]:
encoder_model = 'bert-base-multilingual-uncased'
tokenizer = AutoTokenizer.from_pretrained(encoder_model)

In [4]:
def collate_batch(batch):
    batch_ = list(zip(*batch))
    tokens, masks, token_masks, gold_spans, tags = batch_[0], batch_[1], batch_[2], batch_[3], batch_[4]

    max_len = max([len(token) for token in tokens])
    token_tensor = torch.empty(size=(len(tokens), max_len), dtype=torch.long).fill_(tokenizer.pad_token_id)
    tag_tensor = torch.empty(size=(len(tokens), max_len), dtype=torch.long).fill_(mconern['O'])
    mask_tensor = torch.zeros(size=(len(tokens), max_len), dtype=torch.bool)
    token_masks_tensor = torch.zeros(size=(len(tokens), max_len), dtype=torch.bool)

    for i in range(len(tokens)):
        tokens_ = tokens[i]
        seq_len = len(tokens_)

        token_tensor[i, :seq_len] = tokens_
        tag_tensor[i, :seq_len] = tags[i]
        mask_tensor[i, :seq_len] = masks[i]
        token_masks_tensor[i, :seq_len] = token_masks[i]
#     print(tokenizer.convert_ids_to_tokens(token_tensor[0]))
#     print("Hello")
    return token_tensor, tag_tensor, mask_tensor, token_masks_tensor, gold_spans

In [5]:
def get_optimizer(net, opt=False):
    optimizer = torch.optim.AdamW(net.parameters(), lr=1e-4, weight_decay=0.03)
    if opt:
        scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEP)
        return [optimizer], [scheduler]
    return [optimizer]


In [6]:
NUM_EPOCH = 1
BATCH_SIZE = 64
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [7]:
# wnut_iob = {'B-CORP': 0, 'I-CORP': 1, 'B-CW': 2, 'I-CW': 3, 'B-GRP': 4, 'I-GRP': 5, 'B-LOC': 6, 'I-LOC': 7,
#             'B-PER': 8, 'I-PER': 9, 'B-PROD': 10, 'I-PROD': 11, 'O': 12}
mconern = {'B-AerospaceManufacturer': 0, 'I-AerospaceManufacturer': 1, 'B-AnatomicalStructure': 2,
           'I-AnatomicalStructure': 3,
           'B-ArtWork': 4, 'I-ArtWork': 5, 'B-Artist': 6, 'I-Artist': 7, 'B-Athlete': 8, 'I-Athlete': 9,
           'B-CarManufacturer': 10, 'I-CarManufacturer': 11, 'B-Cleric': 12, 'I-Cleric': 13, 'B-Clothing': 14,
           'I-Clothing': 15, 'B-Disease': 16, 'I-Disease': 17, 'B-Drink': 18, 'I-Drink': 19, 'B-Facility': 20,
           'I-Facility': 21, 'B-Food': 22, 'I-Food': 23, 'B-HumanSettlement': 24, 'I-HumanSettlement': 25,
           'B-MedicalProcedure': 26,
           'I-MedicalProcedure': 27, 'B-Medication/Vaccine': 28, 'I-Medication/Vaccine': 29, 'B-MusicalGRP': 30,
           'I-MusicalGRP': 31,
           'B-MusicalWork': 32, 'I-MusicalWork': 33, 'O': 34, 'B-ORG': 35, 'I-ORG': 36, 'B-OtherLOC': 37,
           'I-OtherLOC': 38, 'B-OtherPER': 39,
           'I-OtherPER': 40, 'B-OtherPROD': 41, 'I-OtherPROD': 42, 'B-Politician': 43, 'I-Politician': 44,
           'B-PrivateCorp': 45, 'I-PrivateCorp': 46,
           'B-PublicCorp': 47, 'I-PublicCorp': 48, 'B-Scientist': 49, 'I-Scientist': 50, 'B-Software': 51,
           'I-Software': 52, 'B-SportsGRP': 53,
           'I-SportsGRP': 54, 'B-SportsManager': 55, 'I-SportsManager': 56, 'B-Station': 57, 'I-Station': 58,
           'B-Symptom': 59, 'I-Symptom': 60, 'B-Vehicle': 61,
           'I-Vehicle': 62, 'B-VisualWork': 63, 'I-VisualWork': 64, 'B-WrittenWork': 65, 'I-WrittenWork': 66}

In [17]:
ds = CoNLLReader(target_vocab=mconern, encoder_model=encoder_model)
ds.read_data(data=r'C:\Users\Rah12937\PycharmProjects\mconer\multiconer2023\train_dev\en-dev.conll')


Reading file C:\Users\Rah12937\PycharmProjects\mconer\multiconer2023\train_dev\en-dev.conll
Finished reading 871 instances from file C:\Users\Rah12937\PycharmProjects\mconer\multiconer2023\train_dev\en-dev.conll


In [9]:

valid = CoNLLReader(target_vocab=mconern, encoder_model=encoder_model)
valid.read_data(data=r'C:\Users\Rah12937\PycharmProjects\mconer\multiconer2023\train_dev\en-dev.conll')

Reading file C:\Users\Rah12937\PycharmProjects\mconer\multiconer2023\train_dev\en-dev.conll
Finished reading 871 instances from file C:\Users\Rah12937\PycharmProjects\mconer\multiconer2023\train_dev\en-dev.conll


In [10]:
model = NERmodelbase(tag_to_id=mconern, device=device, encoder_model=encoder_model, dropout=0.3).to(device)
criterion = torch.nn.CrossEntropyLoss()

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
trainloader = DataLoader(ds, batch_size=BATCH_SIZE, collate_fn=collate_batch, num_workers=0, shuffle=False)
validloader = DataLoader(valid, batch_size=BATCH_SIZE, collate_fn=collate_batch, num_workers=0)

In [12]:
WARMUP_STEP = int(len(trainloader) * NUM_EPOCH * 0.05)
print(WARMUP_STEP)
optim, scheduler = get_optimizer(model, True)

13


In [16]:
for epoch in range(NUM_EPOCH):
    with tqdm(trainloader, unit='batch') as tepoch:
        # model.train()
        tepoch.set_description(f"Epoch {epoch}")
        running_loss = 0
        for i, data in enumerate(tepoch):
            optim[0].zero_grad()
            outputs = model(data)
            loss = outputs['loss']
            running_loss += loss
            loss.backward()
            optim[0].step()
            scheduler[0].step()
            # if i % 10 == 0:  # print every 2000 mini-batches
#     model.spanf1.reset()
#     # run validation
#     with torch.no_grad():
#         with tqdm(validloader, unit='batch') as tepoch:
#             val_loss = 0
#             # model.eval()
#             for i, data in enumerate(tepoch):
#                 outputs = model(data)
#                 val_loss += outputs['loss']

    print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / len(trainloader):.3f}')
#     print(f'[{epoch + 1}, {i + 1:5d}] loss: {val_loss / len(validloader):.3f}')
    # running_loss = 0.0
    print(f"{outputs['results']}")
    model.spanf1.reset()

Epoch 0:  24%|███████████████████████████████████████████████████▌                                                                                                                                                                   | 63/263 [00:32<01:42,  1.95batch/s]


KeyboardInterrupt: 

In [14]:
tokenizer.convert_ids_to_tokens(next(iter(trainloader))[0][0])

['[PAD]',
 'robert',
 'gott',
 '##sch',
 '##al',
 '##k',
 '1939',
 'academy',
 'award',
 'winner',
 'and',
 'founder',
 'of',
 'pana',
 '##vision',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [15]:
next(iter(trainloader))

(tensor([[    0, 10918, 29338,  ...,     0,     0,     0],
         [    0, 10770, 10103,  ...,     0,     0,     0],
         [    0, 14194, 11865,  ...,     0,     0,     0],
         ...,
         [    0, 14106, 10739,  ...,     0,     0,     0],
         [    0, 10103, 10201,  ...,     0,     0,     0],
         [    0, 10431,   100,  ...,     0,     0,     0]]),
 tensor([[34, 39, 40,  ..., 34, 34, 34],
         [34, 34, 34,  ..., 34, 34, 34],
         [34, 34, 34,  ..., 34, 34, 34],
         ...,
         [34, 39, 40,  ..., 34, 34, 34],
         [34, 34, 34,  ..., 34, 34, 34],
         [34, 34, 34,  ..., 34, 34, 34]]),
 tensor([[ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         ...,
         [ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False]])

In [22]:
model.crf.transitions.cpu().detach().numpy()

array([[ 0.32475677,  0.02010572,  0.09323062, ..., -0.04281631,
        -0.13109145,  0.05667825],
       [-0.04844541, -0.02674677, -0.00335966, ...,  0.1421236 ,
        -0.18490355, -0.16127391],
       [-0.0008058 ,  0.09693959,  0.28803515, ...,  0.01462255,
         0.05248784, -0.13044989],
       ...,
       [-0.08800284,  0.1114422 , -0.0198133 , ...,  0.04746086,
         0.07426888, -0.282844  ],
       [ 0.00893064, -0.17262436,  0.17598367, ...,  0.01136907,
        -0.00776488, -0.10469054],
       [ 0.1291324 , -0.08140508,  0.26613107, ...,  0.03656916,
         0.24330777, -0.10258736]], dtype=float32)