In [1]:
import os
import json
from functools import partial
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer
from transformers import BertModel
from sklearn.metrics import classification_report

In [2]:
from dataset import Dataset_NER, ner_collate_fn
from tag_id_converter import Tag_ID_Converter

In [3]:
# Checkpoint name shared by the tokenizer and the encoder so the two always match.
PRETRAINED_MODEL_NAME = 'bert-base-multilingual-cased'
# Backward-compatible alias: the constant was originally spelled without the first "R".
PRETAINED_MODEL_NAME = PRETRAINED_MODEL_NAME
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
bert = BertModel.from_pretrained(PRETRAINED_MODEL_NAME)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Root directory of the preprocessed NER splits.
# Alternatives that were left commented out in this cell (presumably for
# another dataset — confirm before re-enabling): PATH_dir '../0_data/En_NER_POS',
# dev file 'prepro_valid.json', and a tag-count file 'prepro_tag_cnt.json'.
PATH_dir = '../data/ko_ner_data'

# One JSON file per split, all living directly under PATH_dir.
PATH_ko_train, PATH_ko_test, PATH_ko_dev = (
    os.path.join(PATH_dir, file_name)
    for file_name in ('prepro_train.json', 'prepro_test.json', 'prepro_dev.json')
)

In [5]:
# Load the three splits in train / test / dev order.
dataset_train, dataset_test, dataset_dev = (
    Dataset_NER(split_path)
    for split_path in (PATH_ko_train, PATH_ko_test, PATH_ko_dev)
)

In [6]:
# Number of examples in the train, test and dev splits (space separated, in that order).
print(*(len(split) for split in (dataset_train, dataset_test, dataset_dev)))

4250 500 250


In [7]:
# Build the tag <-> id converter from the per-split tag-list files found under PATH_dir.
# NOTE(review): the vocabulary displayed in the following cells contains labels
# such as 'B-', 'I-', 'I-<휠' and 'B-조선' — this looks like a preprocessing
# problem in the tag-list files; confirm against the raw data.
tag_converter = Tag_ID_Converter(PATH_dir, ['prepro_train_tag_list.json', 'prepro_test_tag_list.json', 'prepro_dev_tag_list.json'])

In [8]:
# Inspect the id -> tag mapping held by the converter.
tag_converter.id_to_tag

{0: '[PAD]',
 1: 'I-TI',
 2: 'I-PS',
 3: 'I-1',
 4: 'I-<휠',
 5: 'I-OG',
 6: 'I-',
 7: 'B-OG',
 8: 'I-DT',
 9: 'B-<휠',
 10: 'B-PS',
 11: 'B-조선',
 12: 'B-LC',
 13: 'B-TI',
 14: 'I-조선',
 15: 'B-1',
 16: 'B-',
 17: 'I-LC',
 18: 'I-목소',
 19: 'B-목소',
 20: 'O',
 21: 'B-DT'}

In [9]:

# Inspect the inverse tag -> id mapping.
tag_converter.tag_to_id

{'[PAD]': 0,
 'I-TI': 1,
 'I-PS': 2,
 'I-1': 3,
 'I-<휠': 4,
 'I-OG': 5,
 'I-': 6,
 'B-OG': 7,
 'I-DT': 8,
 'B-<휠': 9,
 'B-PS': 10,
 'B-조선': 11,
 'B-LC': 12,
 'B-TI': 13,
 'I-조선': 14,
 'B-1': 15,
 'B-': 16,
 'I-LC': 17,
 'I-목소': 18,
 'B-목소': 19,
 'O': 20,
 'B-DT': 21}

In [10]:
# Mini-batch size shared by every DataLoader below.
batch_size = 16
# Pre-bind the tokenizer and tag converter as the first two positional arguments
# of ner_collate_fn, so the DataLoader only has to supply the remaining argument.
partial_collate_fn = partial(ner_collate_fn, tokenizer, tag_converter)

In [11]:
# Settings common to every split: same batch size, same collate function.
loader_options = dict(batch_size=batch_size, collate_fn=partial_collate_fn)

# Only the training split is shuffled; test and dev keep their stored order.
dataloader_train = DataLoader(dataset_train, shuffle=True, **loader_options)
dataloader_test = DataLoader(dataset_test, shuffle=False, **loader_options)
dataloader_dev = DataLoader(dataset_dev, shuffle=False, **loader_options)

In [12]:
class Bert_NER(nn.Module):
    """Token tagger: an encoder followed by dropout and a linear classifier.

    Args:
        bert: Encoder module. Calling it with the keyword arguments given to
            ``forward`` must return an object indexable with
            ``'last_hidden_state'``.
        output: Number of tag classes; this becomes the size of the last
            dimension of the tensor returned by ``forward``.
    """

    def __init__(self, bert, output):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(p = 0.1)
        # Take the classifier input width from the encoder's config instead of
        # hard-coding 768, so encoders of other sizes work too. Encoders that
        # expose no ``config.hidden_size`` keep the previous default of 768.
        hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)
        self.lin = nn.Linear(hidden_size, output)
        # Not applied in forward(): the model returns raw scores. The attribute
        # is kept only so existing code that accesses ``model.softmax`` still works.
        self.softmax = nn.Softmax(2)

    def forward(self,**kargs):
        """Run the encoder and classify every position.

        Args:
            **kargs: Passed through unchanged to the encoder.

        Returns:
            The linear layer's output for the dropout-regularised
            ``last_hidden_state`` (unnormalised scores; presumably shaped
            (batch, sequence, output) — confirm against the encoder).
        """
        emb = self.bert(**kargs)
        e = self.dropout(emb['last_hidden_state'])
        w = self.lin(e)
        return w

In [13]:
# One output class per entry of the tag vocabulary (the count includes '[PAD]').
tag_len = len(tag_converter.tag_to_id)
model = Bert_NER(bert, tag_len)

In [14]:
# Exclude padded label positions from the loss. Use the converter's pad id
# rather than a literal 0 so this stays in sync with the padding mask used
# during evaluation.
# NOTE(review): assumes ner_collate_fn pads labels with tag_converter.pad_id
# ('[PAD]' is id 0 in the printed mapping) — confirm in dataset.py.
CELoss = nn.CrossEntropyLoss(ignore_index=tag_converter.pad_id)
# The optimizer updates every parameter of the model, encoder included.
optimizer = AdamW(model.parameters(), lr=1.0e-5)

In [15]:
# Prefer the GPU, but fall back to the CPU instead of crashing when CUDA is
# unavailable.
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
# Read the device back from the encoder so later cells place batches on the
# device the weights actually live on.
device = model.bert.device
device

device(type='cuda', index=0)

In [16]:
# Number of passes over the training DataLoader performed by the training loop.
train_epoch = 100

In [17]:
for epoch in range(train_epoch):
    # Put the model (and its dropout layer) back into training mode each epoch.
    model.train()

    for iteration, batch in enumerate(dataloader_train):
        # batch[0] is a dict of encoder inputs, batch[1] the label tensor.
        # .to(device) follows whatever device the model is on, whereas
        # .cuda(device) only works for CUDA devices.
        batch_inputs = {k: v.to(device) for k, v in batch[0].items()}
        batch_labels = batch[1].to(device)

        output = model(**batch_inputs)
        # Flatten every leading dimension so each position is scored as one
        # classification example.
        loss = CELoss(output.view(-1, output.size(-1)), batch_labels.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log the loss of the current batch every 10 iterations.
        if (iteration + 1) % 10 == 0:
            print(f'epoch - {epoch}: {iteration + 1} - loss: {loss.item()}')

    print(f'epoch {epoch} END')
  

epoch - 0: 10 - loss: 1.3251761198043823
epoch - 0: 20 - loss: 1.1230926513671875
epoch - 0: 30 - loss: 0.8182607889175415
epoch - 0: 40 - loss: 0.9185926914215088
epoch - 0: 50 - loss: 0.4299090802669525
epoch - 0: 60 - loss: 0.5123492479324341
epoch - 0: 70 - loss: 0.6249414682388306
epoch - 0: 80 - loss: 0.46355965733528137
epoch - 0: 90 - loss: 0.5088244080543518
epoch - 0: 100 - loss: 0.36648431420326233
epoch - 0: 110 - loss: 0.47082483768463135
epoch - 0: 120 - loss: 0.4174671471118927
epoch - 0: 130 - loss: 0.37461042404174805
epoch - 0: 140 - loss: 0.34939849376678467
epoch - 0: 150 - loss: 1.6007741689682007
epoch - 0: 160 - loss: 0.40960147976875305
epoch - 0: 170 - loss: 0.36753809452056885
epoch - 0: 180 - loss: 0.2508137822151184
epoch - 0: 190 - loss: 0.3249797224998474
epoch - 0: 200 - loss: 0.1752331256866455
epoch - 0: 210 - loss: 0.4371016025543213
epoch - 0: 220 - loss: 0.27138054370880127
epoch - 0: 230 - loss: 0.21345990896224976
epoch - 0: 240 - loss: 0.487175613

In [18]:
# Evaluation mode: disables dropout.
model.eval()

# Per-sentence gold and predicted tag sequences, padding removed.
gold_list = []
pred_list = []

with torch.no_grad():
    for iteration, batch in enumerate(dataloader_test):
        # .to(device) follows whatever device the model is on, whereas
        # .cuda(device) only works for CUDA devices.
        batch_inputs = {k: v.to(device) for k, v in batch[0].items()}
        batch_labels = batch[1].to(device)

        output = model(**batch_inputs)
        loss = CELoss(output.view(-1, output.size(-1)), batch_labels.view(-1))

        print('loss:', loss.item())
        # Highest-scoring class id at every position.
        pred_ids = torch.argmax(output, dim=-1)

        for g, p in zip(batch_labels, pred_ids):
            # Keep only positions whose gold label is not padding; the same
            # mask is applied to the predictions so both lists stay aligned.
            gold_mask = g != tag_converter.pad_id

            gold = tag_converter.convert_id_to_tag_list(g[gold_mask].tolist())
            pred = tag_converter.convert_id_to_tag_list(p[gold_mask].tolist())
            gold_list.append(gold)
            pred_list.append(pred)

            print(gold)
            print(pred)

loss: 0.1204371452331543
['B-PS', 'I-PS', 'I-PS', 'O', 'B-DT', 'I-DT', 'O', 'B-LC', 'O', 'O', 'O', 'B-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OG', 'I-OG', 'I-OG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PS', 'I-PS', 'I-PS', 'I-PS', 'I-PS', 'I-PS', 'I-PS', 'I-PS', 'I-PS', 'I-PS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['B-PS', 'I-PS', 'I-PS', 'O', 'B-DT', 'I-DT', 'O', 'B-LC', 'O', 'O', 'O', 'B-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'I-OG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OG', 'I-OG', 'I-OG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PS', 'I-PS', 'I-PS', 'I-PS', 'B-PS', 'I-PS', 'I-PS', 'I-PS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'B-PS', 'I-PS', 'I-PS', 'I-PS', 'I-PS', 'I-PS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PS', 'I

In [19]:
# Concatenate the per-sentence tag lists into two flat, position-aligned lists
# for the token-level report.
gold_list_flat, pred_list_flat = [], []
for gold_seq, pred_seq in zip(gold_list, pred_list):
    gold_list_flat.extend(gold_seq)
    pred_list_flat.extend(pred_seq)

In [20]:
# Labels to report on: every tag except the outside tag 'O'.
# Build a new list rather than calling .remove() on tag_converter.tag_list:
# if that attribute is the converter's own list, removing in place would
# change the converter and make this cell fail with ValueError when re-run.
tags = [tag for tag in tag_converter.tag_list if tag != 'O']
print(tags)

['I-TI', 'I-PS', 'I-1', 'I-<휠', 'I-OG', 'I-', 'B-OG', 'I-DT', 'B-<휠', 'B-PS', 'B-조선', 'B-LC', 'B-TI', 'I-조선', 'B-1', 'B-', 'I-LC', 'I-목소', 'B-목소', 'B-DT']


In [21]:

# Token-level report restricted to the non-'O' labels. Several of those labels
# have no gold or predicted samples, which makes precision/recall undefined;
# zero_division=0 reports them as 0 (the same value as before) without
# emitting a warning per metric.
print(classification_report(gold_list_flat, pred_list_flat, digits=5, labels=tags, zero_division=0))

              precision    recall  f1-score   support

        I-TI    0.92157   0.98947   0.95431        95
        I-PS    0.95664   0.81265   0.87879      1249
         I-1    0.00000   0.00000   0.00000         5
        I-<휠    0.00000   0.00000   0.00000         0
        I-OG    0.88924   0.87267   0.88088       966
          I-    0.00000   0.00000   0.00000         0
        B-OG    0.90553   0.87919   0.89217       447
        I-DT    0.94667   0.74737   0.83529       380
        B-<휠    0.00000   0.00000   0.00000         0
        B-PS    0.93812   0.93812   0.93812       404
        B-조선    0.00000   0.00000   0.00000         0
        B-LC    0.88288   0.85217   0.86726       115
        B-TI    0.85366   0.92105   0.88608        38
        I-조선    0.00000   0.00000   0.00000         0
         B-1    0.00000   0.00000   0.00000         1
          B-    0.00000   0.00000   0.00000         0
        I-LC    0.72932   0.59877   0.65763       162
        I-목소    0.00000   0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
def get_chunk_type(tag_name):
    """Split a tag such as ``'B-PS'`` into its prefix and its entity type.

    Only the first hyphen separates the two parts, so an entity type that
    itself contains a hyphen (``'B-X-Y'`` -> ``('B', 'X-Y')``) is kept whole
    instead of being cut down to its last segment.

    Args:
        tag_name: Tag string.

    Returns:
        Tuple ``(tag_class, tag_type)``. A tag without any hyphen (e.g.
        ``'O'``) is returned as both elements.
    """
    parts = tag_name.split('-', 1)
    tag_class = parts[0]
    # With no hyphen there is a single part, so [-1] is the tag itself.
    tag_type = parts[-1]
    return tag_class, tag_type

In [23]:
def get_chunks(seq):
    """Group a tag sequence into entity chunks.

    Args:
        seq: Sequence of tag strings; ``"O"`` marks positions outside any chunk.

    Returns:
        List of ``(chunk_type, start, end)`` tuples in order of appearance,
        where ``start`` is inclusive and ``end`` is exclusive.
    """
    outside = "O"
    found = []
    open_type = None
    open_start = None

    for pos, tag in enumerate(seq):
        if tag == outside:
            # An outside tag closes whatever chunk is currently open.
            if open_type is not None:
                found.append((open_type, open_start, pos))
                open_type = open_start = None
            continue

        prefix, entity = get_chunk_type(tag)
        if open_type is None:
            # First tagged position after a gap opens a chunk, whatever its prefix.
            open_type, open_start = entity, pos
        elif prefix == "B" or entity != open_type:
            # An explicit begin tag or a change of entity type closes the
            # running chunk and opens a new one at this position.
            found.append((open_type, open_start, pos))
            open_type, open_start = entity, pos

    # A chunk still open at the end extends to the end of the sequence.
    if open_type is not None:
        found.append((open_type, open_start, len(seq)))

    return found

In [24]:
def evaluate_ner_F1(total_answers, total_preds):
    """Compute chunk-level precision, recall and F1 as percentages.

    A predicted chunk counts as a match only if a gold chunk of the same
    sentence has the same type, start and end.

    Args:
        total_answers: Iterable of gold tag sequences, one per sentence.
        total_preds: Iterable of predicted tag sequences, paired with
            ``total_answers`` by position.

    Returns:
        Tuple ``(precision, recall, F1)``, each in the range 0-100. A metric
        whose denominator is zero (no predicted chunks, no gold chunks, or no
        matches at all) is reported as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    num_match = num_preds = num_answers = 0

    for answers, preds in zip(total_answers, total_preds):
        # Sets of (type, start, end) so the intersection counts exact matches.
        answer_seg_result = set(get_chunks(answers))
        pred_seg_result = set(get_chunks(preds))

        num_match += len(answer_seg_result & pred_seg_result)
        num_answers += len(answer_seg_result)
        num_preds += len(pred_seg_result)

    precision = 100.0 * num_match / num_preds if num_preds else 0.0
    recall = 100.0 * num_match / num_answers if num_answers else 0.0
    denominator = precision + recall
    F1 = 2 * precision * recall / denominator if denominator else 0.0

    return precision, recall, F1

In [25]:
# Chunk-level (precision, recall, F1) in percent for the collected gold/predicted sequences.
evaluate_ner_F1(gold_list, pred_list)

(87.8839590443686, 88.18493150684931, 88.03418803418802)