<a href="https://colab.research.google.com/github/HyunLee103/NER_korean/blob/main/console.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Raw Data 로드

In [6]:
import os

BASE_DIR = '/content/drive/MyDrive/ku-ai/data'

def load_data(file_name):
    with open(os.path.join(BASE_DIR, file_name), 'r', encoding='utf-8') as fp:
        return fp.readlines()

raw_train_data = load_data('ner_train.txt')
raw_test_data = load_data('ner_dev.txt')

In [7]:
tmp_data = raw_train_data[0]
tmp_data

'366\t새 벽 출 조 시 <SP> 야 영 적 극 <SP> 권 장 하 ㅂ 니 다 <SP> .\tB_TI I_TI O O O <SP> O O O O <SP> O O O O O O <SP> O\n'

## Feature 로드

```
{형태소: 161차원 vector}
```

In [8]:
import joblib

train_feature = joblib.load(os.path.join(BASE_DIR, 'train_concat_features'))
test_feature = joblib.load(os.path.join(BASE_DIR, 'dev_concat_features'))

train_feature['새벽'].shape

(184,)

## 통합 데이터셋 만들기

```
{형태소: 161차원 vector}
```

Input: 음절단위 문장

1. 합친다
2. 형태소로짼다
3. 아까했던 헝태소: 벡터 를 (길이) 만큼 한다
4. 그러면 (seq_len, 161) 의 벡터가 데이터 크기만큼 나온다.
5. <SOS\> 벡터, <EOS\> 벡터, <PAD\> 벡터를 준다.
6. ?

In [30]:
train_morph_token = joblib.load(os.path.join(BASE_DIR, 'train_morph_token'))
train_pos_token = joblib.load(os.path.join(BASE_DIR, 'train_pos_token'))

test_morph_token = joblib.load(os.path.join(BASE_DIR, 'test_morph_token'))
test_pos_token = joblib.load(os.path.join(BASE_DIR, 'test_pos_token'))

print(train_morph_token[0])
print(train_pos_token[0])
print(test_morph_token[0])
print(test_pos_token[0])

['새벽', '출조', '시', '<SP>', '야영', '적극', '<SP>', '권장', '하', 'ㅂ니다', '<SP>', '.']
['NNG', 'NNG', 'NNG', '<SP>', 'NNG', 'NNG', '<SP>', 'NNG', 'XSV', 'EF', '<SP>', 'SF']
['6', '일', '<SP>', '유통', '업계', '와', '<SP>', '정유', '업계', '에', '<SP>', '따르', '면', '<SP>', '‘', '이마트', '-', 'SK', '’', '<SP>', '간판', '을', '<SP>', '내걸', 'ㄴ', '<SP>', '주유소', '가', '<SP>', '올해', '<SP>', '안', '에', '<SP>', '등장', '하', 'ㄹ', '<SP>', '것', '이', '<SP>', '확실시', '되', '자', '<SP>', '이마트', '와', '<SP>', '경쟁', '관계', '에', '<SP>', '있', '는', '<SP>', '롯데마트', '<SP>', '홈플러스', '<SP>', '등', '<SP>', '다른', '<SP>', '대형', '<SP>', '마트', '도', '<SP>', '매장', '<SP>', '내', '<SP>', '주유소', '<SP>', '설립', '을', '<SP>', '위하', '아', '<SP>', '정유', '사와', '<SP>', '물밑', '에서', '<SP>', '활발', '하', '게', '<SP>', '접촉', '하', '고', '<SP>', '있', '다', '.']
['SN', 'NNBC', '<SP>', 'NNG', 'NNG', 'JC', '<SP>', 'NNG', 'NNG', 'JKB', '<SP>', 'VV', 'EC', '<SP>', 'SY', 'NNP', 'SY', 'SL', 'SY', '<SP>', 'NNG', 'JKO', '<SP>', 'VV+ETM', 'NNG', '<SP>', 'NNG', 'JKS', '<SP>', 'NNG', '<S

## `Dataset` 객체로 만들기

In [34]:
import os

def convert_sentence(sentence):
    return sentence.replace(' ', '').replace('<SP>', ' ')


def load_tag_dict():
    fp = open(os.path.join(BASE_DIR, 'tag_vocab.txt'), 'r')
    tag_2_idx = {'<UNK>': 0, '<SP>': 1, '<EOS>': 2, '<PAD>': 3}
    idx_2_tag = {0: '<UNK>', 1: '<SP>', 2: '<EOS>', 3: '<PAD>'}

    index = 2
    for line in tqdm(fp.readlines()):
        tag = line.strip()
        tag_2_idx[tag] = index
        idx_2_tag[index] = tag
        index += 1

    return tag_2_idx, idx_2_tag

In [19]:
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset

UNK_TOKEN = 0
SP_TOKEN = 1
EOS_TOKEN = 2
PAD_TOKEN = 3

FEATURE_SIZE = 184

UNK_VECTOR = [UNK_TOKEN] * FEATURE_SIZE
SP_VECTOR = [SP_TOKEN] * FEATURE_SIZE
EOS_VECTOR = [EOS_TOKEN] * FEATURE_SIZE
PAD_VECTOR = [PAD_TOKEN] * FEATURE_SIZE

device = 'cuda' if torch.cuda.is_available() else 'cpu'


class NERDataset(Dataset):
    def __init__(self, raw_data, morphs_list, pos_list, feature=None, device='cpu'):
        super(NERDataset, self).__init__()
        self.source_list = []
        self.target_list = []

        tag_dict = load_tag_dict()
        max_length = self._get_max_length(raw_data)

        for row, morphs, pos in tqdm(zip(raw_data, morphs_list, pos_list)):
            index, syllables, tags = row.rstrip('\n').split('\t')
            syllables_list = syllables.split()
            tags_list = tags.split()

            encoded_syllable_list = []
            if feature:
                for morph in morphs:
                    if morph in feature.keys():
                        morph_size = len(morph) if morph != '<SP>' else 1
                        encoded_syllable_list += [feature[morph]] * morph_size
                    else:
                        encoded_syllable_list += [UNK_VECTOR] * len(morph)

                padding_size = max_length - len(encoded_syllable_list)
                encoded_syllable_list.append(EOS_VECTOR)
                encoded_syllable_list += [PAD_VECTOR] * padding_size

                self.source_list.append(encoded_syllable_list)
            else:
                # TODO: raw feature generation
                pass

            encoded_tag_list = []
            for tag in tags_list:
                if tag in tag_dict.keys():
                    encoded_tag_list.append(tag_dict[tag])
                else:
                    encoded_tag_list.append(tag_dict['<UNK>'])

            padding_size = max_length - len(encoded_tag_list)
            encoded_tag_list.append(tag_dict['<EOS>'])
            encoded_tag_list += [tag_dict['<PAD>']] * padding_size

            self.target_list.append(encoded_tag_list)

        self.source = torch.tensor(self.source_list).to(device)
        self.target = torch.tensor(self.target_list).to(device)

    def _get_max_length(self, raw_data):
        max_length = 0
        for row in raw_data:
            index, syllables, tags = row.rstrip('\n').split('\t')
            syllables_list = syllables.split()
            length = len(syllables_list)
            if max_length < length:
                max_length = length
        return max_length

    def __str__(self):
        return 'source: {}, target: {}'.format(self.source.shape, self.target.shape)

    def __len__(self):
        return len(self.source)

    def __getitem__(self, idx):
        return {
            'source': self.source[idx],
            'target': self.target[idx],
        }


print()

train_dataset = NERDataset(raw_train_data, train_morph_token, train_pos_token, train_feature, device=device)
test_dataset = NERDataset(raw_test_data, test_morph_token, test_pos_token, test_feature, device=device)

print(train_dataset)
print(test_dataset)


100%|██████████| 12/12 [00:00<00:00, 6250.05it/s]
1832it [00:00, 18313.51it/s]




7319it [00:00, 15753.97it/s]


source: torch.Size([7319, 492, 184]), target: torch.Size([7319, 492])


In [None]:
joblib.dump(train_dataset, 'train_dataset')
joblib.dump(test_dataset, 'test_dataset')

In [None]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

print('total steps: {}'.format(len(train_loader)), end='\n\n')

sample = next(iter(train_loader))
print('shape of {}: {}'.format('source', sample['source'].shape))
print('shape of {}: {}'.format('target', sample['target'].shape), end='\n\n')

In [31]:
test_dataset = NERDataset(raw_test_data, test_morph_token, test_pos_token, test_feature, device=device)


100%|██████████| 12/12 [00:00<00:00, 16810.84it/s]
995it [00:00, 22910.51it/s]


In [65]:
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

print('total steps: {}'.format(len(test_loader)), end='\n\n')
sample = next(iter(train_loader))
sample

total steps: 995



{'source': tensor([[[-1.0909e-01,  1.4707e-01, -4.0618e-02,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [-1.0909e-01,  1.4707e-01, -4.0618e-02,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [-4.7728e-01,  6.9660e-01, -1.9908e-01,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          ...,
          [ 3.0000e+00,  3.0000e+00,  3.0000e+00,  ...,  3.0000e+00,
            3.0000e+00,  3.0000e+00],
          [ 3.0000e+00,  3.0000e+00,  3.0000e+00,  ...,  3.0000e+00,
            3.0000e+00,  3.0000e+00],
          [ 3.0000e+00,  3.0000e+00,  3.0000e+00,  ...,  3.0000e+00,
            3.0000e+00,  3.0000e+00]],
 
         [[-5.3568e-02,  1.3946e-01,  1.9815e-02,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [-5.3568e-02,  1.3946e-01,  1.9815e-02,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [-4.7728e-01,  6.9660e-01, -1.9908e-01,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
         

## 모델링

In [22]:
!pip install pytorch-crf

Collecting pytorch-crf
  Downloading https://files.pythonhosted.org/packages/96/7d/4c4688e26ea015fc118a0327e5726e6596836abce9182d3738be8ec2e32a/pytorch_crf-0.7.2-py3-none-any.whl
Installing collected packages: pytorch-crf
Successfully installed pytorch-crf-0.7.2


In [58]:
import torch
import torch.nn as nn
from torchcrf import CRF


class RNN_CRF(nn.Module):
    def __init__(self, pretrained_weight, embedding_size, hidden_size, output_size, dropout=0.5):
        super(RNN_CRF, self).__init__()

        # self.embedding = nn.Embedding.from_pretrained(pretrained_weight)
        self.dropout = nn.Dropout(dropout)

        self.rnn = nn.GRU(
            embedding_size,
            hidden_size,
            batch_first=True,
            bidirectional=True
        )

        # CRF layer
        self.crf = CRF(output_size, batch_first=True)

        # (batch_size, seq_len, hidden_size * 2) -> (batch_size, seq_len, output_size)
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, inputs, labels=None):
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_size)
        inputs = inputs.float()
        outputs, hidden = self.rnn(inputs)

        # (batch_size, seq_len, hidden_size * 2)
        outputs = self.dropout(outputs)

        # (batch_size, seq_len, hidden_size * 2) -> (batch_size, seq_len, output_size)
        logits = self.fc(outputs)

        if labels is not None:
            log_likelihood = self.crf(
                emissions=logits,
                tags=labels,
                reduction="mean"
            )
            loss = log_likelihood * -1.0
            return loss
        else:
            output = self.crf.decode(emissions=logits)
            return output

## Train and Evaluate

In [76]:
import torch
import torch.optim as optim
from seqeval.metrics import classification_report


def train(model, train_data, test_data=None, num_epochs=20):
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    accuracy_list = []

    for epoch in range(num_epochs):
        model.train()
        losses = []

        for step, batch in enumerate(train_data):
            source = batch['source']
            target = batch['target']
            
            loss = model.forward(source, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (step + 1) % 50 == 0:
                print('{} step processed.. current loss : {}'.format(step + 1, loss.data.item()))
            losses.append(loss.data.item())

        print('Average Loss : {}'.format(np.mean(losses)))

        torch.save(model, 'savepoint.model')
        do_test(model, test_data)


def tensor2list(input_tensor):
    return input_tensor.cpu().detach().numpy().tolist()


def do_test(model, test_dataloader):
    model.eval()
    
    _, idx2tag = load_tag_dict()

    predicts, answers = [], []
    
    for step, batch in enumerate(test_dataloader):
        source = batch['source']
        target = batch['target']

        # 예측 라벨 출력
        output = model(source)

        # 성능 평가를 위해 예측 값과 정답 값 리스트에 저장
        for idx, answer in enumerate(tensor2list(target)):
            answers.append([idx2tag[e].replace("_", "-") for e in answer if idx2tag[e] != "<SP>" and idx2tag[e] != "<PAD>"])
            predicts.append([idx2tag[e].replace("_", "-") for i, e in enumerate(output[idx]) if idx2tag[answer[i]] != "<SP>" and idx2tag[answer[i]] != "<PAD>"] )

    print(len(predicts))
    
    # 성능 평가
    print(classification_report(answers, predicts))

In [25]:
# load glove embeddings
import os
import joblib
import torch

train_weights = joblib.load(os.path.join(BASE_DIR, 'train_emb_word_dict_mecab_sp.pickle'))
dev_weights = joblib.load(os.path.join(BASE_DIR, 'dev_emb_word_dict_mecab_sp.pickle'))

weights = torch.FloatTensor(list({**train_weights, **dev_weights}.values())).cuda()

print(weights.shape)

torch.Size([23583, 128])


In [77]:
input_size = 184
hidden_size = 128
output_size = 14
dropout = 0.3

model = RNN_CRF(weights, input_size, hidden_size, output_size, dropout=dropout).to(device)
print(model)

train(model, train_loader, test_loader)

RNN_CRF(
  (dropout): Dropout(p=0.3, inplace=False)
  (rnn): GRU(184, 128, batch_first=True, bidirectional=True)
  (crf): CRF(num_tags=14)
  (fc): Linear(in_features=256, out_features=14, bias=True)
)
50 step processed.. current loss : 26.65117645263672
100 step processed.. current loss : 15.336833953857422


100%|██████████| 12/12 [00:00<00:00, 4899.41it/s]

Average Loss : 41.99581110581108





AttributeError: ignored

In [40]:
!pip install seqeval

Collecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |███████▌                        | 10kB 22.2MB/s eta 0:00:01[K     |███████████████                 | 20kB 29.2MB/s eta 0:00:01[K     |██████████████████████▌         | 30kB 34.8MB/s eta 0:00:01[K     |██████████████████████████████  | 40kB 36.5MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 8.2MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-cp36-none-any.whl size=16171 sha256=0d2a82c0ea7846ca1acd00f5ac82b2b4af8ff7f5e3addcc383a9eb9288bec644
  Stored in directory: /root/.cache/pip/wheels/52/df/1b/45d75646c37428f7e626214704a0e35bd3cfc32eda37e59e5f
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


## Test Macro F1

In [70]:
do_test(model, test_loader)

100%|██████████| 12/12 [00:00<00:00, 29127.11it/s]


995


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          DT       0.75      1.00      0.85    154027
          LC       0.00      0.00      0.00       537
          OG       0.00      0.00      0.00       973
          PS       0.00      0.00      0.00       742
          TI       0.00      0.00      0.00        95

   micro avg       0.75      0.98      0.85    156374
   macro avg       0.15      0.20      0.17    156374
weighted avg       0.74      0.98      0.84    156374

