In [1]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory usage).
!nvidia-smi

Sun Dec  5 09:45:33 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    22W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers -q

[K     |████████████████████████████████| 3.1 MB 4.1 MB/s 
[K     |████████████████████████████████| 61 kB 579 kB/s 
[K     |████████████████████████████████| 895 kB 61.0 MB/s 
[K     |████████████████████████████████| 3.3 MB 53.8 MB/s 
[K     |████████████████████████████████| 596 kB 77.7 MB/s 
[?25h

In [3]:
!pip install pytorch_lightning==1.2.8

Collecting pytorch_lightning==1.2.8
  Downloading pytorch_lightning-1.2.8-py3-none-any.whl (841 kB)
[K     |████████████████████████████████| 841 kB 4.3 MB/s 
Collecting fsspec[http]>=0.8.1
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 66.1 MB/s 
Collecting torchmetrics>=0.2.0
  Downloading torchmetrics-0.6.0-py3-none-any.whl (329 kB)
[K     |████████████████████████████████| 329 kB 53.1 MB/s 
[?25hCollecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 52.9 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 51.3 MB/s 
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting asynctest==0.13.0
  Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)
Collecting fro

In [4]:
!pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9672 sha256=fe4b60d80ef0e8378251392df70cbfe94c01dee74bb0491c9098afe2df89043e
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [5]:
# Mount Google Drive at /content/drive; the test CSV, the model checkpoint and
# the result CSV used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import os
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import random
import easydict
import re
from tqdm import tqdm
from collections import Counter
import requests
import json
import wget
import math

import warnings
warnings.filterwarnings('ignore')

from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertModel, AdamW, get_linear_schedule_with_warmup

from sklearn.metrics import classification_report
import pytorch_lightning as pl
from pytorch_lightning.metrics.functional import accuracy, f1, auroc
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

In [7]:
# Display the installed PyTorch version so the run can be reproduced.
torch.__version__

'1.10.0+cu111'

In [8]:
# Display the installed PyTorch Lightning version (expected to match the pinned install).
pl.__version__

'1.2.8'

In [9]:
# Run configuration, gathered in a single EasyDict so that every setting is
# available as an attribute (e.g. ``args.batch_size``).
args = easydict.EasyDict({
    'seed': 42,
    # model / data-loading hyperparameters
    'batch_size': 12,
    'hidden_size': 768,
    'n_class': 97,
    'num_workers': 2,
    'epochs': 5,
    # this notebook only runs inference
    'train': False,
    'bert_model': 'datawhales/korean-relation-extraction',
    'max_token_len': 512,
    # a class is predicted when its probability is strictly above this value
    'max_acc_threshold': 0.6,
    'test_data': '/content/drive/MyDrive/Korean_RE/data/news_processed/specific_domain_test.csv',
    # "ALLCC": concatenate all four entity markers; "ENTMARK": only the two opening markers
    'mode': "ALLCC",
})

In [10]:
# Display the full configuration for the record.
args

{'batch_size': 12,
 'bert_model': 'datawhales/korean-relation-extraction',
 'epochs': 5,
 'hidden_size': 768,
 'max_acc_threshold': 0.6,
 'max_token_len': 512,
 'mode': 'ALLCC',
 'n_class': 97,
 'num_workers': 2,
 'seed': 42,
 'test_data': '/content/drive/MyDrive/Korean_RE/data/news_processed/specific_domain_test.csv',
 'train': False}

In [11]:
def entity_markers_added(sent: str, subj_range: list, obj_range: list) -> str:
    """Insert entity marker tokens around the subject and object spans of a sentence.

    The subject span is wrapped in ``[E1]`` / ``[/E1]`` and the object span in
    ``[E2]`` / ``[/E2]``. Every marker is inserted with a space on each side, and
    the final string is stripped of leading/trailing whitespace.

    Args:
        sent: The raw sentence.
        subj_range: ``[start, end]`` character offsets of the subject, end exclusive
            (``sent[start:end]`` is the subject text).
        obj_range: ``[start, end]`` character offsets of the object, end exclusive.

    Returns:
        The sentence with the four marker tokens inserted.

    Example:
        sent = '모토로라 레이저 M는 모토로라 모빌리티에서 제조/판매하는 안드로이드 스마트폰이다.'
        subj_range = [0, 10]   # sent[subj_range[0]: subj_range[1]] => '모토로라 레이저 M'
        obj_range = [12, 21]   # sent[obj_range[0]: obj_range[1]] => '모토로라 모빌리티'

        returns
        '[E1] 모토로라 레이저 M [/E1] 는  [E2] 모토로라 모빌리티 [/E2] 에서 제조/판매하는 안드로이드 스마트폰이다.'
    """
    subj_start, subj_end = subj_range[0], subj_range[1]
    obj_start, obj_end = obj_range[0], obj_range[1]

    # Collect pieces and join once instead of growing a string character by character.
    pieces = []

    for i, char in enumerate(sent):
        # Closing markers go first so that an entity ending at position i is
        # closed before another entity starting at the same position is opened
        # (e.g. the object ends exactly where the subject begins).
        # A zero-length span (start == end) only gets its opening marker here.
        if i == subj_end and i != subj_start:
            pieces.append(' [/E1] ')
        if i == obj_end and i != obj_start:
            pieces.append(' [/E2] ')
        if i == subj_start:
            pieces.append(' [E1] ')
        if i == obj_start:
            pieces.append(' [E2] ')
        pieces.append(char)

    # Spans that end at the very end of the sentence are never hit inside the
    # loop. Both are checked independently so neither closing marker is lost
    # when the subject and the object both end there.
    if subj_end == len(sent):
        pieces.append(' [/E1]')
    if obj_end == len(sent):
        pieces.append(' [/E2]')

    return ''.join(pieces).strip()

In [12]:
class KREDataset(Dataset):
    """Dataset serving tokenized, entity-marked sentences for Korean relation extraction.

    Each row of ``data`` is read for the columns ``sentence``, ``subj_name``,
    ``obj_name``, ``subj_type``, ``obj_type``, ``subj_start_pos``,
    ``subj_end_pos``, ``obj_start_pos`` and ``obj_end_pos``.

    Args:
        data: DataFrame holding one example per row.
        args: Configuration exposing ``bert_model`` (tokenizer to load) and
            ``max_token_len`` (length every example is padded/truncated to).
    """

    def __init__(self, data: pd.DataFrame, args):
        super().__init__()

        self.args = args
        self.data = data

        # Tokenizer of the pretrained model given in the configuration.
        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)

        self.max_token_len = args.max_token_len

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx: int):
        """Build the model inputs and entity metadata for the row at position ``idx``.

        Returns:
            dict with the marked sentence (``sentence``), the flattened
            ``input_ids`` and ``attention_mask`` tensors, and the row's
            ``subj_name``, ``subj_type``, ``obj_name`` and ``obj_type``.
        """
        row = self.data.iloc[idx]

        # Raw sentence and entity metadata.
        raw_sentence = row.sentence
        subj_name, obj_name = row.subj_name, row.obj_name
        subj_type, obj_type = row.subj_type, row.obj_type

        # Character spans of both entities.
        subj_span = [row['subj_start_pos'], row['subj_end_pos']]
        obj_span = [row['obj_start_pos'], row['obj_end_pos']]

        # Wrap both entities in marker tokens before tokenizing.
        marked_sentence = entity_markers_added(raw_sentence, subj_span, obj_span)

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        encoded = self.tokenizer.encode_plus(marked_sentence, **tokenizer_options)

        return {
            'sentence': marked_sentence,
            # flatten() is applied to the tensors returned by the tokenizer
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'subj_name': subj_name,
            'subj_type': subj_type,
            'obj_name': obj_name,
            'obj_type': obj_type,
        }

In [13]:
class KREModel(pl.LightningModule):
    """BERT-based multi-label classifier for the Korean Relation Extraction dataset.

    The input is encoded with BERT, the last-layer hidden states at the entity
    marker tokens are concatenated, and a linear layer followed by a sigmoid
    produces one independent score per relation class.

    Args:
        args: Configuration exposing ``bert_model`` (pretrained model name or
            path), ``n_class`` (number of relation classes) and ``mode``:
            ``"ALLCC"`` concatenates the states of all four entity markers,
            ``"ENTMARK"`` only those of the two opening markers.
        n_training_steps: Stored on the instance; not used inside this class.
        n_warmup_steps: Stored on the instance; not used inside this class.

    Raises:
        ValueError: If ``args.mode`` is not one of the supported modes.
    """

    # Token ids searched for in ``input_ids``.
    # NOTE(review): presumably the vocabulary ids of '[E1]', '[/E1]', '[E2]' and
    # '[/E2]' in the tokenizer of ``args.bert_model`` — confirm against the tokenizer.
    E1_START_ID = 20000
    E1_END_ID = 20001
    E2_START_ID = 20002
    E2_END_ID = 20003

    # mode -> marker ids whose hidden states are concatenated, in this order.
    MARKER_IDS_BY_MODE = {
        "ALLCC": (E1_START_ID, E1_END_ID, E2_START_ID, E2_END_ID),
        "ENTMARK": (E1_START_ID, E2_START_ID),
    }

    def __init__(self, args, n_training_steps=None, n_warmup_steps=None):
        super().__init__()

        # Reject an unknown mode up front instead of failing later on a missing attribute.
        if args.mode not in self.MARKER_IDS_BY_MODE:
            raise ValueError(
                f"Unsupported mode {args.mode!r}; expected one of {sorted(self.MARKER_IDS_BY_MODE)}"
            )

        self.args = args
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps

        self.bert = BertModel.from_pretrained(args.bert_model, return_dict=True)

        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)
        # entity markers tokens
        # special_tokens_dict = {'additional_special_tokens': ['[E1]', '[/E1]', '[E2]', '[/E2]']}
        # num_added_toks = self.tokenizer.add_special_tokens(special_tokens_dict)   # num_added_toks: 4

        # self.bert.resize_token_embeddings(len(self.tokenizer))

        self.marker_ids = self.MARKER_IDS_BY_MODE[args.mode]
        # Number of concatenated marker states: 4 for "ALLCC", 2 for "ENTMARK".
        self.scale = len(self.marker_ids)

        self.classifier = nn.Linear(self.bert.config.hidden_size * self.scale, args.n_class)

        # forward() applies the sigmoid itself, so the loss works on probabilities.
        self.criterion = nn.BCELoss()

    @staticmethod
    def _gather_marker_states(input_ids, last_hidden_state, marker_ids):
        """Concatenate, per batch row, the hidden states at the given marker tokens.

        Positions are resolved row by row, so a row can never pick up the marker
        positions of another row. If a marker occurs several times in a row, its
        first occurrence is used.

        Args:
            input_ids: Token ids, shape [batch size, sequence length].
            last_hidden_state: BERT output, shape [batch size, sequence length, hidden size].
            marker_ids: Marker token ids, in concatenation order.

        Returns:
            Tensor of shape [batch size, hidden size * len(marker_ids)].

        Raises:
            ValueError: If some row does not contain one of the markers (for
                example because it was truncated away).
        """
        batch_size, seq_len = input_ids.shape
        row_index = torch.arange(batch_size, device=input_ids.device)
        positions = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, -1)

        states = []
        for marker_id in marker_ids:
            is_marker = input_ids == marker_id
            missing = ~is_marker.any(dim=1)
            if missing.any():
                bad_rows = missing.nonzero().flatten().tolist()
                raise ValueError(
                    f"Entity marker token id {marker_id} not found in batch rows {bad_rows}"
                )
            # Smallest position holding the marker; non-marker positions are
            # replaced by seq_len so they can never be the minimum.
            first_pos = positions.masked_fill(~is_marker, seq_len).min(dim=1).values
            states.append(last_hidden_state[row_index, first_pos])

        return torch.cat(states, dim=1)

    def forward(self, input_ids, attention_mask, labels=None):
        """Score every relation class for a batch of marked sentences.

        Args:
            input_ids: Token ids, shape [batch size, sequence length].
            attention_mask: Attention mask passed through to BERT.
            labels: Optional target tensor passed to ``BCELoss`` together with the output.

        Returns:
            Tuple ``(loss, output)``. ``output`` holds the sigmoid scores with
            shape [batch size, n_class]; ``loss`` is the BCE loss when ``labels``
            is given and ``0`` otherwise.
        """
        bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden_state = bert_outputs.last_hidden_state

        # concat_state shape: [batch size, hidden size * self.scale]
        concat_state = self._gather_marker_states(input_ids, last_hidden_state, self.marker_ids)

        output = self.classifier(concat_state)
        output = torch.sigmoid(output)

        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output
    

In [14]:
# Download the relation-id -> label mapping published with the dataset.
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(
    'https://raw.githubusercontent.com/datawhales/Korean_RE/main/data/relation/relid2label.json',
    timeout=30,
)
# Fail with a clear HTTP error instead of trying to parse an error page as JSON.
r.raise_for_status()
relid2label = json.loads(r.text)

# Relation ids in mapping order; idx2relid() looks class indices up in this list.
relation_list = list(relid2label.keys())

def idx2relid(idx_list):
    """Translate class indices into relation ids via the module-level ``relation_list``.

    Example:
        With relation_list = ['P17', 'P131', 'P530', ...],
        idx2relid([0, 2]) returns ['P17', 'P530'].

    Args:
        idx_list: Iterable of positions into ``relation_list`` (the positions
            that are 1 in a one-hot label).

    Returns:
        The list of matching relation ids, or ``np.nan`` when ``idx_list`` is empty.
    """
    relids = [relation_list[position] for position in idx_list]

    if not relids:
        return np.nan
    return relids

In [15]:
def inference(trained_model, test_dataset, threshold=None,
              output_path='/content/drive/MyDrive/Korean_RE/log/specific_domain_pred_results.csv'):
    """Run the model over every example of ``test_dataset`` and save the predictions as CSV.

    Each example is scored on its own (batch of one). A class counts as
    predicted when its score is strictly greater than ``threshold``. The CSV has
    the columns ``sentence``, ``subj_name``, ``subj_type``, ``obj_name``,
    ``obj_type`` and ``relation``; ``relation`` holds the label of the
    lowest-index predicted class, or NaN when no class passes the threshold.

    Args:
        trained_model: Model called as ``model(input_ids, attention_mask)`` and
            returning ``(loss, scores)``.
        test_dataset: Dataset whose items provide ``input_ids``,
            ``attention_mask``, ``sentence``, ``subj_name``, ``subj_type``,
            ``obj_name`` and ``obj_type``.
        threshold: Decision threshold; defaults to ``args.max_acc_threshold``.
        output_path: Destination of the result CSV.
    """
    if threshold is None:
        threshold = args.max_acc_threshold

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    trained_model = trained_model.to(device)

    predictions = []
    sentence_list = []
    subj_list = []
    subj_type_list = []
    obj_list = []
    obj_type_list = []

    # No gradients are needed for inference.
    with torch.no_grad():
        for item in tqdm(test_dataset):
            _, prediction = trained_model(
                item["input_ids"].unsqueeze(dim=0).to(device),
                item["attention_mask"].unsqueeze(dim=0).to(device)
            )

            # Move each result to the CPU right away so device memory does not
            # grow with the size of the dataset.
            predictions.append(prediction.flatten().detach().cpu())

            # Reuse the item already fetched: indexing the dataset again would
            # re-tokenize the sentence on every access.
            sentence_list.append(item['sentence'])
            subj_list.append(item['subj_name'])
            subj_type_list.append(item['subj_type'])
            obj_list.append(item['obj_name'])
            obj_type_list.append(item['obj_type'])

    if predictions:
        scores = torch.stack(predictions).numpy()
        upper, lower = 1, 0
        y_pred = np.where(scores > threshold, upper, lower)
    else:
        # Empty dataset: nothing to stack, still write a header-only CSV.
        y_pred = np.zeros((0, 0), dtype=int)

    preds_list = []
    for row in y_pred:
        relids = idx2relid(np.where(row == 1)[0])
        if isinstance(relids, list):
            # Only the first (lowest class index) predicted relation is reported.
            preds_list.append(relid2label[relids[0]])
        else:
            # idx2relid returns NaN when no class passed the threshold.
            preds_list.append(relids)

    # Save the results as a CSV file.
    result_df = pd.DataFrame(
        {
            'sentence': sentence_list,
            'subj_name': subj_list,
            'subj_type': subj_type_list,
            'obj_name': obj_list,
            'obj_type': obj_type_list,
            'relation': preds_list,
        },
        columns=['sentence', 'subj_name', 'subj_type', 'obj_name', 'obj_type', 'relation'],
    )

    result_df.to_csv(output_path, index=False)

In [16]:
# Load the test rows and wrap them in the dataset that tokenizes them on access.
test_df = pd.read_csv(args.test_data)
test_dataset = KREDataset(data=test_df, args=args)

# Restore the trained model from the saved checkpoint.
trained_model = KREModel.load_from_checkpoint(
    checkpoint_path='/content/drive/MyDrive/Korean_RE/ckpt/whole_data/best-checkpoint.ckpt',
    args=args,
)

# Put the model in evaluation mode and freeze it before predicting.
trained_model.eval()
trained_model.freeze()

# Score every test example and write the result CSV.
inference(trained_model=trained_model, test_dataset=test_dataset)

Downloading:   0%|          | 0.00/140k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/62.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/177 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/368 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/657 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/388M [00:00<?, ?B/s]

Some weights of the model checkpoint at datawhales/korean-relation-extraction were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 70264/70264 [22:36<00:00, 51.81it/s]
