In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Raw Data 로드

In [1]:
import os

BASE_DIR = 'data'

def load_data(file_name):
    with open(os.path.join(BASE_DIR, file_name), 'r', encoding='utf-8') as fp:
        return fp.readlines()

raw_train_data = load_data('ner_train.txt')
raw_test_data = load_data('ner_dev.txt')

In [2]:
tmp_data = raw_train_data[0]
tmp_data

'366\t새 벽 출 조 시 <SP> 야 영 적 극 <SP> 권 장 하 ㅂ 니 다 <SP> .\tB_TI I_TI O O O <SP> O O O O <SP> O O O O O O <SP> O\n'

## Feature 로드

```
{형태소: 161차원 vector}
```

In [136]:
import joblib

train_feature = joblib.load(os.path.join(BASE_DIR, 'train_concat_features'))
test_feature = joblib.load(os.path.join(BASE_DIR, 'dev_concat_features'))

train_feature['새벽'].shape

(184,)

## 통합 데이터셋 만들기

```
{형태소: 161차원 vector}
```

Input: 음절단위 문장

1. 합친다
2. 형태소로짼다
3. 아까했던 헝태소: 벡터 를 (길이) 만큼 한다
4. 그러면 (seq_len, 161) 의 벡터가 데이터 크기만큼 나온다.
5. <SOS\> 벡터, <EOS\> 벡터, <PAD\> 벡터를 준다.
6. ?

In [127]:
from konlpy.tag import Mecab


def convert_sentence(sentence):
    return sentence.replace(' ', '').replace('<SP>', ' ')


def tag_pos(sentence):
    syllable = sentence.split(' ')
    converted = convert_sentence(sentence)
    morpher = Mecab()
    morph_pos_list = morpher.pos(converted)
    morphs_list = []

    pos_list = []
    index = 0

    for morph_pos in morph_pos_list:
        morph, pos = morph_pos
        morphs_list.append(morph)
        pos_list.append('B_{}'.format(pos))

        for i in range(1, len(morph)):
            pos_list.append('I_{}'.format(pos))

        index += len(morph)
        if index < len(syllable):
            if syllable[index] == '<SP>':
                morphs_list.append('<SP>')
                pos_list.append('<SP>')
                index += 1

    return morphs_list, ' '.join(pos_list)

In [7]:
import joblib
from konlpy.tag import Mecab
from tqdm import tqdm

# <SP> 포함시키기
def getToken_with_sp(file):
    mecab = Mecab()

    with open(file, 'r', encoding='utf-8') as fs:
        total_data = fs.readlines()

    sentences = []
    tags = []
    for data in tqdm(total_data):
        id, sentence, tag = data.rstrip('\n').split('\t')
        sentences.append(sentence.replace(' ', '').replace('<SP>', ' <SP> '))
        tags.append(tag)

    morph_token = []
    for sentence in sentences:
        token_with_sp = sentence.split()

        for ind, word in enumerate(token_with_sp):
            if word != '<SP>':
                morph = mecab.morphs(word)
                token_with_sp[ind] = morph
            else:
                token_with_sp[ind] = ['<SP>']

        token_with_sp = [y for x in token_with_sp for y in x]
        morph_token.append(token_with_sp)

    return morph_token


train_token = getToken_with_sp('data/ner_train.txt')
dev_token = getToken_with_sp('data/ner_dev.txt')

joblib.dump(train_token, 'data/train_morph_token')
joblib.dump(dev_token, 'data/test_morph_token')

100%|██████████| 7319/7319 [00:00<00:00, 208751.16it/s]
100%|██████████| 995/995 [00:00<00:00, 225110.98it/s]


['data/test_morph_token']

In [13]:
# POS_token 만들기
def getPos_with_sp(file):
    mecab = Mecab()
    with open(file, 'r', encoding='utf-8') as fs:
        total_data = fs.readlines()

    sentences = []
    for data in tqdm(total_data):
        id, sentence, tag = data.rstrip('\n').split('\t')
        sentences.append(sentence.replace(' ', '').replace('<SP>', ' <SP> '))

    pos_token = []
    for sentence in sentences:
        pos_with_sp = sentence.split()

        for ind, word in enumerate(pos_with_sp):
            if word != '<SP>':
                morph_pos = mecab.pos(word)
                pos = [pos for morph, pos in morph_pos]
                pos_with_sp[ind] = pos
            else:
                pos_with_sp[ind] = ['<SP>']

        pos_with_sp = [y for x in pos_with_sp for y in x]
        pos_token.append(pos_with_sp)

    return pos_token

train_pos_token = getPos_with_sp('data/ner_train.txt')
dev_pos_token = getPos_with_sp('data/ner_dev.txt')

joblib.dump(train_pos_token, 'data/train_pos_token')
joblib.dump(dev_pos_token, 'data/test_pos_token')

100%|██████████| 7319/7319 [00:00<00:00, 147605.52it/s]
100%|██████████| 995/995 [00:00<00:00, 150033.52it/s]


['data/test_pos_token']

In [5]:
token = joblib.load('data/train_pos_token')

## `Dataset` 객체로 만들기

In [128]:
import os

def load_tag_dict():
    fp = open(os.path.join('data', 'tag_vocab.txt'), 'r')
    tag_dict = {'<UNK>': 0, '<SP>': 1, '<EOS>': 2, '<PAD>': 3}

    index = 2
    for line in tqdm(fp.readlines()):
        tag = line.strip()
        tag_dict[tag] = index
        index += 1

    return tag_dict

In [138]:
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset

UNK_TOKEN = 0
SP_TOKEN = 1
EOS_TOKEN = 2
PAD_TOKEN = 3

FEATURE_SIZE = 184

UNK_VECTOR = [UNK_TOKEN] * FEATURE_SIZE
SP_VECTOR = [SP_TOKEN] * FEATURE_SIZE
EOS_VECTOR = [EOS_TOKEN] * FEATURE_SIZE
PAD_VECTOR = [PAD_TOKEN] * FEATURE_SIZE

device = 'cuda' if torch.cuda.is_available() else 'cpu'


class NERDataset(Dataset):
    def __init__(self, raw_data, feature=None, device='cpu'):
        super(NERDataset, self).__init__()
        self.source_list = []
        self.target_list = []

        tag_dict = load_tag_dict()
        max_length = self._get_max_length(raw_data)

        for row in tqdm(raw_data):
            index, syllables, tags = row.rstrip('\n').split('\t')
            syllables_list = syllables.split()
            morphs_list, pos_list = tag_pos(syllables)
            tags_list = tags.split()

            encoded_syllable_list = []
            if feature:
                for morph in morphs_list:
                    if morph in feature.keys():
                        morph_size = len(morph) if morph != '<SP>' else 1
                        encoded_syllable_list += [feature[morph]] * morph_size
                    else:
                        encoded_syllable_list += [UNK_VECTOR] * len(morph)

                padding_size = max_length - len(encoded_syllable_list)
                encoded_syllable_list.append(EOS_VECTOR)
                encoded_syllable_list += [PAD_VECTOR] * padding_size

                self.source_list.append(encoded_syllable_list)
            else:
                # TODO: raw feature generation
                pass

            encoded_tag_list = []
            for tag in tags_list:
                if tag in tag_dict.keys():
                    encoded_tag_list.append(tag_dict[tag])
                else:
                    encoded_tag_list.append(tag_dict['<UNK>'])

            padding_size = max_length - len(encoded_tag_list)
            encoded_tag_list.append(tag_dict['<EOS>'])
            encoded_tag_list += [tag_dict['<PAD>']] * padding_size

            self.target_list.append(encoded_tag_list)

        self.source = torch.tensor(self.source_list).to(device)
        self.target = torch.tensor(self.target_list).to(device)

    def _get_max_length(self, raw_data):
        max_length = 0
        for row in raw_data:
            index, syllables, tags = row.rstrip('\n').split('\t')
            syllables_list = syllables.split()
            length = len(syllables_list)
            if max_length < length:
                max_length = length
        return max_length

    def __str__(self):
        return 'source: {}, target: {}'.format(self.source.shape, self.target.shape)

    def __len__(self):
        return len(self.source)

    def __getitem__(self, idx):
        return {
            'source': self.source[idx],
            'target': self.target[idx],
        }


print()

train_dataset = NERDataset(raw_train_data, train_feature, device=device)
test_dataset = NERDataset(raw_test_data, test_feature, device=device)

print(train_dataset)
print(test_dataset)

100%|██████████| 12/12 [00:00<00:00, 136400.13it/s]
  0%|          | 3/7319 [00:00<04:33, 26.74it/s]
100%|██████████| 7319/7319 [00:17<00:00, 412.33it/s]
100%|██████████| 12/12 [00:00<00:00, 22104.37it/s]
100%|██████████| 995/995 [00:04<00:00, 232.79it/s]
{'source': tensor([[[-0.0571,  0.0895, -0.0352,  ...,  0.0000,  0.0000,  0.0000],
         [-0.0571,  0.0895, -0.0352,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0209, -0.0194, -0.0035,  ...,  1.0000,  0.0000,  0.0000],
         ...,
         [ 3.0000,  3.0000,  3.0000,  ...,  3.0000,  3.0000,  3.0000],
         [ 3.0000,  3.0000,  3.0000,  ...,  3.0000,  3.0000,  3.0000],
         [ 3.0000,  3.0000,  3.0000,  ...,  3.0000,  3.0000,  3.0000]],

        [[-0.1091,  0.1471, -0.0406,  ...,  0.0000,  0.0000,  0.0000],
         [-0.1091,  0.1471, -0.0406,  ...,  0.0000,  0.0000,  0.0000],
         [-0.4773,  0.6966, -0.1991,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 3.0000,  3.0000,  3.0000,  ...,  3.0000,  3.0000,  3.

In [139]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

print('total steps: {}'.format(len(train_loader)), end='\n\n')

sample = next(iter(test_loader))
print('shape of {}: {}'.format('source', sample['source'].shape))
print('shape of {}: {}'.format('target', sample['target'].shape), end='\n\n')

sample

total steps: 115

shape of source: torch.Size([64, 221, 184])
shape of target: torch.Size([64, 221])



{'source': tensor([[[-1.4930e-01, -2.0743e-04, -2.7540e-01,  ...,  0.0000e+00,
            0.0000e+00,  1.0000e+00],
          [-1.6598e-01, -1.2423e-02, -4.6280e-01,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [-4.7728e-01,  6.9660e-01, -1.9908e-01,  ...,  0.0000e+00,
            0.0000e+00,  1.0000e+00],
          ...,
          [ 3.0000e+00,  3.0000e+00,  3.0000e+00,  ...,  3.0000e+00,
            3.0000e+00,  3.0000e+00],
          [ 3.0000e+00,  3.0000e+00,  3.0000e+00,  ...,  3.0000e+00,
            3.0000e+00,  3.0000e+00],
          [ 3.0000e+00,  3.0000e+00,  3.0000e+00,  ...,  3.0000e+00,
            3.0000e+00,  3.0000e+00]],
 
         [[-1.4132e-01,  4.0622e-02,  1.1855e-02,  ...,  0.0000e+00,
            0.0000e+00,  0.0000e+00],
          [-2.2431e-01, -1.6861e-01, -6.3922e-02,  ...,  0.0000e+00,
            0.0000e+00,  1.0000e+00],
          [-4.8806e-01, -6.9621e-02,  7.1299e-01,  ...,  0.0000e+00,
            0.0000e+00,  1.0000e+00],
         

In [40]:
joblib.dump(train_loader, 'train_loader')

['train_loader']

## 모델링

In [122]:
import torch
import torch.nn as nn
from torchcrf import CRF


class RNN_CRF(nn.Module):
    def __init__(self, pretrained_weight, embedding_size, hidden_size, output_size, dropout=0.5):
        super(RNN_CRF, self).__init__()

        # self.embedding = nn.Embedding.from_pretrained(pretrained_weight)
        self.dropout = nn.Dropout(dropout)

        self.rnn = nn.GRU(
            embedding_size,
            hidden_size,
            batch_first=True,
            bidirectional=True
        )

        # CRF layer
        self.crf = CRF(output_size, batch_first=True)

        # (batch_size, seq_len, hidden_size * 2) -> (batch_size, seq_len, output_size)
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, inputs, labels=None):
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_size)
        inputs = inputs.float()
        outputs, hidden = self.rnn(inputs)

        # (batch_size, seq_len, hidden_size * 2)
        outputs = self.dropout(outputs)

        # (batch_size, seq_len, hidden_size * 2) -> (batch_size, seq_len, output_size)
        logits = self.fc(outputs)

        if labels is not None:
            log_likelihood = self.crf(
                emissions=logits,
                tags=labels,
                reduction="mean"
            )
            loss = log_likelihood * -1.0
            return loss
        else:
            output = self.crf.decode(emissions=logits)
            return output

## Train and Evaluate

In [123]:
import torch
import torch.optim as optim


def train(model, train_data, test_data=None, num_epochs=20):
    optimizer = optim.Adam(model.parameters(), lr=0.005)
    accuracy_list = []

    for epoch in range(num_epochs):
        model.train()
        losses = []

        for step, batch in enumerate(train_data):
            source = batch['source']
            target = batch['target']
            
            loss = model.forward(source, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (step + 1) % 50 == 0:
                print(
                    '{} step processed.. current loss : {}'.format(
                        step + 1,
                        loss.data.item()
                    )
                )
            losses.append(loss.data.item())

        print('Average Loss : {}'.format(np.mean(losses)))

        torch.save(model, 'output/savepoint.model')
        # do_test(model, test_data, idx2tag)

In [124]:
# load glove embeddings
import joblib
import torch

train_weights = joblib.load('data/train_emb_word_dict_mecab_sp.pickle')
dev_weights = joblib.load('data/dev_emb_word_dict_mecab_sp.pickle')

weights = torch.DoubleTensor(list({**train_weights, **dev_weights}.values()))
print(weights.dtype)
print(weights.shape)

torch.float64
torch.Size([23583, 128])


In [125]:
embedding_size = 183
hidden_size = 256
output_size = 14

# glove matrix 풀어넣는 일만 남았어요
model = RNN_CRF(weights, embedding_size, hidden_size, output_size)
print(model)

train(model, train_loader)

RNN_CRF(
  (dropout): Dropout(p=0.5, inplace=False)
  (rnn): GRU(183, 256, batch_first=True, bidirectional=True)
  (crf): CRF(num_tags=14)
  (fc): Linear(in_features=512, out_features=14, bias=True)
)
torch.float32
torch.int64
torch.float32
torch.int64
torch.float32
torch.int64
torch.float32
torch.int64
torch.float32
torch.int64
torch.float32
torch.int64
torch.float32
torch.int64
torch.float32
torch.int64


KeyboardInterrupt: 

## Test Macro F1