# NER Baseline

## 1. Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import easydict
import numpy as np
from tqdm.auto import tqdm
from einops import rearrange
from seqeval.scheme import IOB2
from seqeval.metrics import f1_score as entity_f1_score
from sklearn.metrics import f1_score as char_f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F

from datasets import load_dataset
from transformers import AutoTokenizer, get_scheduler
from transformers.adapters import XLMRobertaAdapterModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
cfg = easydict.EasyDict(
    model_name = 'xlm-roberta-base',
    batch_size = 32,
    num_epochs = 5,
    device = 'cuda:2',
)

## 2. Data

In [4]:
label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        text = ' '.join(item['tokens'])
        
        tags = [] # 각 스펠 단위별로 라벨링을 만드는 리스트 
        # 현재 잘려 있는 token과 앞으로 자를 토큰이 다르기 때문에 이걸 해주는 거구나 
        for token, tag in zip(item['tokens'], item['ner_tags']):
            label = label_list[tag]
            tags.append(tag)
            for _ in range(len(token)-1): # for문이 끝나고 tags.append(0)을 해주기 때문에 -1을 해줌 
                if label[0] == 'B': # token은 여러 글자를 가지고 있고 같은 token이라면 라벨이 B로 시작한다. 그러나, token의 첫번째 글자만 B로 마킹하고자 함 
                    tags.append(tag+1)
                else:
                    tags.append(tag)
            tags.append(0)
        tags = tags[:-1]
        
        inputs = self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
        input_ids = inputs.input_ids[0]
        attention_mask = inputs.attention_mask[0]

        labels = []
        for i in range(self.max_length):
            char_span = inputs.token_to_chars(i) # 각 토큰의 시작과 끝을 알려주는 것이기 때문에 max_length 만큼 존재함 
            if char_span is None:
                label = -100
            else:
                label = tags[char_span.start]
            labels.append(label)
            
        labels = torch.tensor(labels)
        return input_ids, attention_mask, labels

In [5]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

In [6]:
data = load_dataset('conll2003')

Found cached dataset conll2003 (/home/kds/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)
100%|██████████| 3/3 [00:00<00:00, 482.94it/s]


In [7]:
train_dataset = Dataset(data['train'], tokenizer)
valid_dataset = Dataset(data['validation'], tokenizer)

In [8]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=cfg.batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=cfg.batch_size, shuffle=False)

In [9]:
batch = next(iter(train_loader))
input_ids, attention_mask, labels = batch
input_ids.shape, attention_mask.shape, labels.shape

(torch.Size([32, 128]), torch.Size([32, 128]), torch.Size([32, 128]))

## 3. Model

In [10]:
class EntityHead(nn.Module):
    label_list = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
    # 정의 할때는 dropout_porb과 hidden_size가 필요한 것이고 
    # 정의한 Entityhead를 사용할 때는 forward에 정의된 변수(hidden_state)가 필요함 
    def __init__(self, dropout_prob, hidden_size):
        super().__init__()
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(hidden_size, len(self.label_list))
    
    def forward(self, hidden_state):
        hidden_state = self.dropout(hidden_state)
        logits = self.classifier(hidden_state)
        return logits

## 4. Train

In [11]:
def metric_fn(total_preds, total_labels, label_list):
    total_preds_label = [label_list[p] for p in total_preds]
    total_labels_label = [label_list[l] for l in total_labels]
    
    entity_f1 = entity_f1_score([total_labels_label], [total_preds_label], average='macro', mode='strict', scheme=IOB2)
    char_f1 = char_f1_score(total_labels, total_preds, labels=list(range(len(label_list)-1)), average='macro', zero_division=True)
    
    entity_f1 = entity_f1 * 100
    char_f1 = char_f1 * 100
    return entity_f1, char_f1


def evaluate(backbone, head, loader, cfg):
    total_preds, total_labels = [], []
    for batch in tqdm(loader):
        batch = [b.to(cfg.device) for b in batch]
        input_ids, attention_mask, labels = batch

        with torch.no_grad():    
            hidden_state = backbone(input_ids, attention_mask).last_hidden_state
            logits = head(hidden_state)

        preds = logits.argmax(dim=-1)
        preds = preds.flatten().cpu().numpy()
        labels = labels.flatten().cpu().numpy()
        
        for p, l in zip(preds, labels):
            if l in [-100, 12]: continue
            total_preds.append(p)
            total_labels.append(l)
    
    entity_f1, char_f1 = metric_fn(total_preds, total_labels, head.label_list)
    return entity_f1, char_f1

In [12]:
backbone = XLMRobertaAdapterModel.from_pretrained(cfg.model_name)
backbone.add_adapter('entity', overwrite_ok=True) # 똑같은 이름을 가지고 있는 adapter가 있으면 덮어씀 
backbone.set_active_adapters('entity')
_ = backbone.train().to(cfg.device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaAdapterModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaAdapterModel were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for prediction

In [13]:
head = EntityHead(backbone.config.hidden_dropout_prob, backbone.config.hidden_size)
_ = head.train().to(cfg.device)

In [14]:
adapter_params = [param for name, param in backbone.named_parameters() if 'adapter' in name]
head_params = list(head.parameters())
params = adapter_params + head_params
optimizer = torch.optim.AdamW(params, lr=1e-4, weight_decay=1e-3)

In [15]:
num_epochs = 10
num_training_steps = len(train_loader) * num_epochs
num_warmup_steps = int(num_training_steps * 0.1)
scheduler = get_scheduler('cosine', optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

In [16]:
for ep in range(num_epochs):
    pbar = tqdm(train_loader)
    for batch in pbar:
        batch = [b.to(cfg.device) for b in batch]
        input_ids, attention_mask, labels = batch
        
        hidden_state = backbone(input_ids, attention_mask).last_hidden_state
        logits = head(hidden_state) 
        
        # b : batch_size 
        # s : sequence의 줄임말로 max_length 만큼 출력됨 
        # c : label list에 있는 것들이 각각의 일어날 확률을 말하는 거구나?
        # 다른 곳에서는 이렇게 rearange를 안해주는데 왜 여기서만 해주는거지?
        logits = rearrange(logits, 'b s c -> (b s) c') # logits : torch.Size([32, 128, 9]) -> logits : (torch.Size([4096, 9]) == (batch_size, number of class)
        # tensor의 rearrange를 하는 이유는 간단하다. 연산을 간변하게 하기 위해서 이다. 
        # [32, 128, ?] -> [4096, ?]로 변경된 것은 32개의 [128, 9]의 행렬을 하나로 합쳐놓은 거라고 생각하면 된다. 
        # 이해가 되지 않으면 torch.randn(2, 3, 5)를 활용하여 간단하게 계산해보기 바란다. 
        labels = rearrange(labels, 'b s -> (b s)') #  labels : torch.Size([32, 128] -> labels : torch.Size([4096])
        loss = F.cross_entropy(logits, labels) 
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        pbar.set_postfix({'loss': loss.item()})
        
    entity_f1, char_f1 = evaluate(backbone, head, valid_loader, cfg)
    print(f'ep {ep:01d} | entity f1 {entity_f1:.2f} | char f1 {char_f1:.2f}')

100%|██████████| 439/439 [01:39<00:00,  4.41it/s, loss=0.227]
100%|██████████| 102/102 [00:09<00:00, 11.25it/s]


ep 0 | entity f1 40.84 | char f1 59.92


100%|██████████| 439/439 [01:43<00:00,  4.23it/s, loss=0.103] 
100%|██████████| 102/102 [00:09<00:00, 11.20it/s]


ep 1 | entity f1 80.96 | char f1 88.60


100%|██████████| 439/439 [01:43<00:00,  4.24it/s, loss=0.101] 
100%|██████████| 102/102 [00:09<00:00, 11.21it/s]


ep 2 | entity f1 85.58 | char f1 91.41


100%|██████████| 439/439 [01:42<00:00,  4.30it/s, loss=0.0282]
100%|██████████| 102/102 [00:09<00:00, 11.23it/s]


ep 3 | entity f1 87.24 | char f1 91.98


100%|██████████| 439/439 [01:41<00:00,  4.31it/s, loss=0.063] 
100%|██████████| 102/102 [00:09<00:00, 11.21it/s]


ep 4 | entity f1 87.01 | char f1 92.04


100%|██████████| 439/439 [01:41<00:00,  4.31it/s, loss=0.0619] 
100%|██████████| 102/102 [00:09<00:00, 11.21it/s]


ep 5 | entity f1 88.48 | char f1 93.31


100%|██████████| 439/439 [01:41<00:00,  4.30it/s, loss=0.0465] 
100%|██████████| 102/102 [00:09<00:00, 11.22it/s]


ep 6 | entity f1 89.03 | char f1 93.48


100%|██████████| 439/439 [01:41<00:00,  4.31it/s, loss=0.072]  
100%|██████████| 102/102 [00:09<00:00, 11.21it/s]


ep 7 | entity f1 89.85 | char f1 93.87


100%|██████████| 439/439 [01:41<00:00,  4.31it/s, loss=0.0336] 
100%|██████████| 102/102 [00:09<00:00, 11.18it/s]


ep 8 | entity f1 89.53 | char f1 93.88


100%|██████████| 439/439 [01:41<00:00,  4.31it/s, loss=0.115]  
100%|██████████| 102/102 [00:09<00:00, 11.19it/s]


ep 9 | entity f1 89.58 | char f1 93.95


In [None]:
backbone.save_adapter('models/adapters/entity', 'entity')
torch.save(head.state_dict(), 'models/heads/entity.pt')

## 5. Predict

In [None]:
import easydict
import numpy as np

import torch
import torch.nn as nn
from transformers import AutoTokenizer
# from adapter_transformers.src.transformers.adapters import XLMRobertaAdapterModel
from transformers.adapters import XLMRobertaAdapterModel

In [None]:
cfg = easydict.EasyDict(
    model_name = 'xlm-roberta-base',
    device = 'cuda:2',
)

In [None]:
def postprocess(inputs, logits, label_list):
    num_texts = logits.size(0) # logits size : torch.Size([1, 문장길이, 9]) 9는 분류하고자하는 라벨의 종류 수 
    input_ids = inputs.input_ids.cpu().numpy()
    
    scores = logits.softmax(dim=-1)
    scores, preds = scores.max(dim=-1)
    preds = preds.cpu().numpy()
    scores = scores.cpu().numpy()
    """
    preds : [[0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 3 4 4 0 0 0 0 3 4 4 0 0 0 0 3
              4 4 0 3 4 0 0 0 0 3 4 4 4 0 3 4 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
    scores : [[0.33479157 0.46193483 0.98810464 0.8788983  0.7549816  0.66784626
                0.40162045 0.51870275 0.84481525 0.99972934 0.9996375  0.99967337
                0.9997254  0.99979717 0.99983394 0.99981886 0.9997837  0.99967706
                0.99987686 0.9998398  0.9996594  0.99090004 0.9956403  0.9963806
                0.9998048  0.9996369  0.99984634 0.9990269  0.99177706 0.99515235
                0.9960179  0.99981445 0.9995647  0.9998305  0.9994886  0.99170285
                0.99539775 0.99576026 0.9765133  0.9661636  0.9937236  0.9618989
                0.9997024  0.9995864  0.9995679  0.98423475 0.9917414  0.9933913
                0.9942795  0.97152    0.97173023 0.9844504  0.9859282  0.93398285
                0.99957556 0.99927396 0.99951065 0.99959224 0.9973635  0.99931073
                0.9998578  0.99954706 0.9965514  0.99979395 0.9997625  0.9998196
                0.99960154 0.9995741  0.999689   0.9994893  0.34495312]]
    """

    total_spans = []
    for i in range(num_texts):
        pred = preds[i]
        pred = [label_list[p] for p in pred]
        
        spans, buffer = [], []
        for j, p in enumerate(pred):
            if p[0] == 'B':
                if input_ids[i, j] == 6: continue # 6은 ''를 가르킴 
                if buffer: # 빈리스트가 아닐때를 의미함 
                    spans.append(buffer)
                    buffer = []
                
                # batch_size가 1일때는 inputs.token_to_chars이라고 하면되는데 
                # batch_size가 1 이상일 때는 앞에 batch_index, token_index 순으로 넣어주어야합니다. 
                # 그래서 아래는 i, j라는 구조로 입력이 들어가게 되는 것입니다. 
                start, end = inputs.token_to_chars(i, j)
                buffer.append({'entity': p[-2:], 'start': start, 'end': end, 'score': scores[i,j]})

        
            elif p[0] == 'I':
                if not buffer: continue      
                start, end = inputs.token_to_chars(i, j)
                buffer.append({'entity': p[-2:], 'start': start, 'end': end, 'score': scores[i,j]})


            elif p[0] == 'O':
                if not buffer: continue
                spans.append(buffer)
                buffer = []
                    
        total_spans.append(spans)
    return total_spans


def grouping(span, text):
    start = span[0]['start']
    end = span[-1]['end']
    word = text[start:end]
    entity = span[0]['entity']
    score = np.prod([s['score'] for s in span])
    return {
        'word': word,
        'entity': entity,
        'start': start,
        'end': end,
        'score': score
    }


def predict(backbone, head, tokenizer, texts):
    inputs = tokenizer(texts, padding=True, return_tensors='pt').to(cfg.device)

    hidden_state = backbone(**inputs).last_hidden_state
    logits = head(hidden_state)
    
    total_spans = postprocess(inputs, logits, head.label_list)
    total_groups = []
    for spans, text in zip(total_spans, texts):
        groups = [grouping(span, text) for span in spans]
        total_groups.append(groups)
        
    return total_groups

In [None]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

In [None]:
backbone = XLMRobertaAdapterModel.from_pretrained(cfg.model_name)
backbone.load_adapter('models/adapters/entity')
_ = backbone.eval().requires_grad_(False).to(cfg.device)
backbone.set_active_adapters('entity')

In [None]:
head = EntityHead(backbone.config.hidden_dropout_prob, backbone.config.hidden_size)
head.load_state_dict(torch.load('models/heads/entity.pt'))
_ = head.eval().requires_grad_(False).to(cfg.device)

In [None]:
texts = [
    '국제 신용평가사 피치가 내년 성장률을 1.9%로 전망한 것을 시작으로 한국경제연구원(1.9%), 한국금융연구원(1.7%), 한국개발연구원(KDI·1.8%), 경제협력개발기구(OECD·1.8%) 등 주요 기관들은 줄줄이 1%대로 낮춰잡았다.'
]

In [None]:
results = predict(backbone, head, tokenizer, texts)

In [None]:
results

[[{'word': '한국경제연구원',
   'entity': 'RG',
   'start': 39,
   'end': 46,
   'score': 0.9830092},
  {'word': '한국금융연구원',
   'entity': 'RG',
   'start': 54,
   'end': 61,
   'score': 0.9830391},
  {'word': '한국개발연구원',
   'entity': 'RG',
   'start': 69,
   'end': 76,
   'score': 0.9829536},
  {'word': 'KDI', 'entity': 'RG', 'start': 77, 'end': 80, 'score': 0.9600995},
  {'word': '경제협력개발기구',
   'entity': 'RG',
   'start': 88,
   'end': 96,
   'score': 0.9641086},
  {'word': 'OECD',
   'entity': 'RG',
   'start': 97,
   'end': 101,
   'score': 0.9431588}]]