<a href="https://colab.research.google.com/github/godpeny/laboratory/blob/master/Study/NLP_Using_Deep_Learning/Bidirectional_Encoder_Representation_From_Transformer/ner_using_bert_ko.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Default Setting For Using TPU in Google Colab

In [1]:
import tensorflow as tf
import os

resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])

tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

strategy = tf.distribute.TPUStrategy(resolver)

In [2]:
pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m548.8 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=1dd188821bf085f2422cd02ab50b532bcf19e34819ce24b40577dc1b7021c54a
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [3]:
import pandas as pd
import numpy as np
import os
import urllib.request

from tqdm import tqdm
from transformers import shape_list, BertTokenizer, TFBertModel
import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from seqeval.metrics import f1_score, classification_report
import tensorflow as tf

In [4]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/18.%20Fine-tuning%20BERT%20(Cls%2C%20NER%2C%20NLI)/dataset/ner_train_data.csv", filename="ner_train_data.csv")
urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/18.%20Fine-tuning%20BERT%20(Cls%2C%20NER%2C%20NLI)/dataset/ner_test_data.csv", filename="ner_test_data.csv")
urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/18.%20Fine-tuning%20BERT%20(Cls%2C%20NER%2C%20NLI)/dataset/ner_label.txt", filename="ner_label.txt")

('ner_label.txt', <http.client.HTTPMessage at 0x7be4b68d5cf0>)

# Data Preprocessing

In [5]:
train_data = pd.read_csv("ner_train_data.csv")
test_data = pd.read_csv("ner_test_data.csv")

print(train_data.shape, len(train_data))
print(test_data.shape, len(test_data))

print(train_data.head(5))

(81000, 2) 81000
(9000, 2) 9000
                                            Sentence  \
0                      정은 씨를 힘들게 한 가스나그, 가만둘 수 없겠죠 .   
1                          ▶ 쿠마리 한동수가 말하는 '가넷 & 에르덴'   
2                    슈나이더의 프레젠테이션은 말 청중을 위한 특별한 쇼다 .   
3  지구 최대 연료탱크 수검 회사 구글이 연내 22명 안팎의 인력을 갖춘 연구개발(R&...   
4  5. <10:00:TI_HOUR> 도이치증권대 <0:1:QT_SPORTS> 연예오락...   

                                                 Tag  
0                              PER-B O O O O O O O O  
1                      O PER-B PER-I O PER-B O PER-B  
2                            PER-B O O CVL-B O O O O  
3  O O TRM-B O O ORG-B DAT-B NUM-B O O O ORG-B LO...  
4                              NUM-B O ORG-B O ORG-B  


In [6]:
train_data_sentences = [sent.split() for sent in train_data['Sentence']] # split by space
train_data_labels = [sent.split() for sent in train_data['Tag']]
test_data_sentences = [sent.split() for sent in test_data['Sentence']]
test_data_labels = [sent.split() for sent in test_data['Tag']]

labels = pd.read_table("ner_label.txt", header=None)
labels = labels[0].values.tolist() # make list

print(train_data_sentences[2])
print(train_data_labels[2])
print(labels)

['슈나이더의', '프레젠테이션은', '말', '청중을', '위한', '특별한', '쇼다', '.']
['PER-B', 'O', 'O', 'CVL-B', 'O', 'O', 'O', 'O']
['O', 'PER-B', 'PER-I', 'FLD-B', 'FLD-I', 'AFW-B', 'AFW-I', 'ORG-B', 'ORG-I', 'LOC-B', 'LOC-I', 'CVL-B', 'CVL-I', 'DAT-B', 'DAT-I', 'TIM-B', 'TIM-I', 'NUM-B', 'NUM-I', 'EVT-B', 'EVT-I', 'ANM-B', 'ANM-I', 'PLT-B', 'PLT-I', 'MAT-B', 'MAT-I', 'TRM-B', 'TRM-I']


In [7]:
tag_to_index = {tag : index for index, tag in enumerate(labels)}
index_to_tag = {index : tag for index, tag in enumerate(labels)}
tag_size = len(tag_to_index)

print(tag_to_index)
print(index_to_tag)
print(len(tag_to_index))

{'O': 0, 'PER-B': 1, 'PER-I': 2, 'FLD-B': 3, 'FLD-I': 4, 'AFW-B': 5, 'AFW-I': 6, 'ORG-B': 7, 'ORG-I': 8, 'LOC-B': 9, 'LOC-I': 10, 'CVL-B': 11, 'CVL-I': 12, 'DAT-B': 13, 'DAT-I': 14, 'TIM-B': 15, 'TIM-I': 16, 'NUM-B': 17, 'NUM-I': 18, 'EVT-B': 19, 'EVT-I': 20, 'ANM-B': 21, 'ANM-I': 22, 'PLT-B': 23, 'PLT-I': 24, 'MAT-B': 25, 'MAT-I': 26, 'TRM-B': 27, 'TRM-I': 28}
{0: 'O', 1: 'PER-B', 2: 'PER-I', 3: 'FLD-B', 4: 'FLD-I', 5: 'AFW-B', 6: 'AFW-I', 7: 'ORG-B', 8: 'ORG-I', 9: 'LOC-B', 10: 'LOC-I', 11: 'CVL-B', 12: 'CVL-I', 13: 'DAT-B', 14: 'DAT-I', 15: 'TIM-B', 16: 'TIM-I', 17: 'NUM-B', 18: 'NUM-I', 19: 'EVT-B', 20: 'EVT-I', 21: 'ANM-B', 22: 'ANM-I', 23: 'PLT-B', 24: 'PLT-I', 25: 'MAT-B', 26: 'MAT-I', 27: 'TRM-B', 28: 'TRM-I'}
29


# Tokenizing

In [8]:
def tokenize(sentences, label_list, max_seq_len, tokenizer, pad_token_id_for_segment=0, pad_token_id_for_label=-100):
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    input_ids, attention_masks, token_type_ids, data_labels = [], [], [], []
    limit_len = max_seq_len - 2

    for sentence, labels in zip(sentences, label_list):
        tokens = []
        label_ids = []

        for word, label in zip(sentence, labels):
            subword_tokens = tokenizer.tokenize(word)
            tokens.extend(subword_tokens) # extend : list + list, append : list + [list]
            # 첫번째 subword 토큰에만 label을 부여하고 나머지 subword 토큰에는 pad_token_id_for_label(=-100)을 부여한다.
            label_id = [tag_to_index[label]] + [pad_token_id_for_label] * (len(subword_tokens) - 1)
            label_ids.extend(label_id)

        # in order to add [CLS] and [SEP] tokens, length of tokens should be less than limit_len('max_seq_len' - 2)
        if len(tokens) > limit_len:
            tokens = tokens[:limit_len]
            label_ids = label_ids[:limit_len]

        tokens = [cls_token] + tokens + [sep_token] # [CLS] + tokens + [SEP]
        label_ids = [pad_token_id_for_label] + label_ids + [pad_token_id_for_label] # [PAD] + label_ids + [PAD]

        # convert tokens to token_ids
        input_id = tokenizer.convert_tokens_to_ids(tokens)

        # padding + attention masking + segment encoding
        padding_count = max_seq_len - len(input_id)

        input_id_padded = input_id + ([pad_token_id] * padding_count)
        data_label_padded = label_ids + ([pad_token_id_for_label] * padding_count)
        attention_mask = ([1] * len(input_id)) + ([0] * padding_count)
        token_type_id = ([pad_token_id_for_segment] * max_seq_len)

        # validation
        assert len(input_id_padded) == max_seq_len, "Error with input length {} vs {}".format(len(input_id_padded), max_seq_len)
        assert len(data_label_padded) == max_seq_len, "Error with input length {} vs {}".format(len(data_label_padded), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with input length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with input length {} vs {}".format(len(token_type_id), max_seq_len)

        input_ids.append(input_id_padded)
        data_labels.append(data_label_padded)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)

    input_ids = np.array(input_ids, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    data_labels = np.asarray(data_labels, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), data_labels

### append() vs extend()
 - append() : list + [list] vs extend() : list + list
```python
x = [1, 2, 3]
x.append([4, 5])
print(x) # [1, 2, 3, [4, 5]]

x = [1, 2, 3]
x.extend([4, 5])
print(x) # [1, 2, 3, 4, 5]
```

### Subword Labeling
 - 한 단어가 subword로 토큰화 되었을 때, 첫번째 subword 토큰에만 label을 부여하고 나머지 subword 토큰에는 pad_token_id_for_label(=-100)을 부여한다.

### np.array vs np.asarray
 - np.array() : copy
 - np.asarray() : reference
```python
# np.array()
a = np.array([1, 2, 3, 4])
aa = np.asarray(a, dtype=int)
print(a)
print(aa)
a[0] = -1
print(a)
print(aa)
```
```
[1 2 3 4]
[1 2 3 4]
[-1  2  3  4]
[-1  2  3  4]
```
```python
# np.asarray()
a = np.array([1, 2, 3, 4])
aa = np.array(a, dtype=int)
print(a)
print(aa)
a[0] = -1
print(a)
print(aa)
```
```
[1 2 3 4]
[1 2 3 4]
[-1  2  3  4]
[1 2 3 4]
```


In [9]:
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")
X_train, y_train = tokenize(train_data_sentences, train_data_labels, 128, tokenizer)
X_test, y_test = tokenize(test_data_sentences, test_data_labels, 128, tokenizer)

tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

In [10]:
# check the result
print(' 기존 원문 :', train_data_sentences[0])
print(' 기존 레이블 :', train_data_labels[0])
print('-' * 50)
print(' 토큰화 후 원문 :', [tokenizer.decode([word]) for word in X_train[0][0]])
print(' 토큰화 후 레이블 :', ['[PAD]' if idx == -100 else index_to_tag[idx] for idx
                           in y_train[0]])
print('-' * 50)
print(' 정수 인코딩 결과 :', X_train[0][0])
print(' 정수 인코딩 레이블 :', y_train[0])

 기존 원문 : ['정은', '씨를', '힘들게', '한', '가스나그,', '가만둘', '수', '없겠죠', '.']
 기존 레이블 : ['PER-B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
--------------------------------------------------
 토큰화 후 원문 : ['[CLS]', '정은', '씨', '##를', '힘들', '##게', '한', '가스', '##나', '##그', ',', '가만', '##둘', '수', '없', '##겠', '##죠', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PA

# Modeling

In [11]:
class TFBertForTokenClassification(keras.Model):
    def __init__(self, model_name, num_labels):
        super(TFBertForTokenClassification, self).__init__()
        self.bert = TFBertModel.from_pretrained(model_name, from_pt=True)
        self.classifier = keras.layers.Dense(units=num_labels,
                                             kernel_initializer=keras.initializers.TruncatedNormal(0.02),
                                             name="classifier")
    def call(self, inputs):
        input_ids, attention_mask, token_type_ids = inputs
        outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sequence_output = outputs[0] # (batch_size, sequence_length, hidden_size)
        prediction = self.classifier(sequence_output)

        return prediction

### Output of Bert Model
 - 1st element : last_hidden_state - sequence of hidden-states at the output of the last layer of the model. (batch_size, sequence_length, hidden_size) -> used for Many-To-Many modeling.
 - 2nd element: pooler_output - last layer hidden-state of the first token of the sequence (classification token == [CLS]) further processed by a Linear layer and a Tanh activation function. (batch_size, hidden_size) -> used for Many-To-One modeling.

In [12]:
def compute_loss(labels, logits):
    # from_logits=True : softmax를 거치지 않은 값이 logits이다.
    fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)

    # ignore_index=-100 : -100은 무시한다.
    active_loss = tf.reshape(labels, (-1,)) != -100 # [-100, 2, 1, -100, ... ] -> [False, True, True, False, ... ]

    # shape_list(labels)[2] : labels의 3번째 차원의 크기를 반환한다.
    reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)
    reduced_labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)

    return fn(reduced_labels, reduced_logits)

### boolean_mask : applying active_loss
```python
labels = tf.constant([[-100, 2, 1, -100]])
logits = tf.constant([[[0.8, 0.1, 0.1],
                       [0.06, 0.04, 0.9],
                       [0.75, 0.1, 0.15],
                       [0.4, 0.5, 0.1]]])

active_loss = tf.reshape(labels, (-1,)) != -100 # [-100, 2, 1, -100, ... ] -> [False, True, True, False, ... ]

reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss)  
# tf.Tensor([[0.06 0.04 0.9 ] [0.75 0.1  0.15]], shape=(2, 3), dtype=float32)  

reduced_labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss)  
# tf.Tensor([2 1], shape=(2,), dtype=int32)  
```

## Custom Callback

In [13]:
class F1Score(tf.keras.callbacks.Callback):
    def __init__(self, X_test, y_test):
        self.X_test = X_test
        self.y_test = y_test

    def sequences_to_tags(self, label_ids, pred_ids):
        label_list, pred_list = [], []

        for i in range(0,len(label_ids)):
            labels_tags, pred_tags = [], []

            for label, pred in zip(label_ids[i], pred_ids[i]):
                if label != -100:
                    labels_tags.append(index_to_tag[label])
                    pred_tags.append(index_to_tag[pred])

            label_list.append(labels_tags)
            pred_list.append(pred_tags)

        return label_list, pred_list


    def on_epoch_end(self, epoch, logs={}):
        pred = self.model.predict(self.X_test)
        pred_argmax = np.argmax(pred, axis=-1)

        true_tags, pred_tags = self.sequences_to_tags(self.y_test, pred_argmax)

        score = f1_score(true_tags, pred_tags)
        print(' - f1: {:04.2f}'.format(score * 100))
        print(classification_report(true_tags, pred_tags, suffix=True))

In [14]:
with strategy.scope():
    model = TFBertForTokenClassification("klue/bert-base", num_labels=tag_size)
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=compute_loss)

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'bert.embeddings.position_ids', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the 

In [15]:
f1score = F1Score(X_test, y_test)
history = model.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_test, y_test), callbacks=[f1score])

Epoch 1/3




   6/2532 [..............................] - ETA: 3:13 - loss: 2.2727







 - f1: 79.29
              precision    recall  f1-score   support

         AFW       0.55      0.61      0.58       394
         ANM       0.71      0.78      0.74       701
         CVL       0.86      0.79      0.82      5758
         DAT       0.89      0.92      0.91      2521
         EVT       0.72      0.77      0.74      1094
         FLD       0.73      0.48      0.58       228
         LOC       0.83      0.84      0.84      2126
         MAT       0.00      0.00      0.00        12
         NUM       0.90      0.93      0.91      5590
         ORG       0.86      0.86      0.86      4086
         PER       0.88      0.89      0.89      4426
         PLT       0.26      0.21      0.23        34
         TIM       0.84      0.91      0.87       314
         TRM       0.68      0.73      0.70      1964

   micro avg       0.84      0.85      0.85     29248
   macro avg       0.69      0.69      0.69     29248
weighted avg       0.84      0.85      0.85     29248

Epoch 2/3
 -

# Prediction

In [16]:
# Predict

In [17]:
def predict(sentences, max_seq_len, tokenizer, pad_token_id_for_segment=0, pad_token_id_for_label=-100):
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    pad_token_id = tokenizer.pad_token_id

    input_ids, attention_masks, token_type_ids, label_masks = [], [], [], []
    limit_len = max_seq_len - 2

    for sentence in sentences:
        tokens = []
        label_mask = []

        for word in sentence:
            subword_tokens = tokenizer.tokenize(word)
            tokens.extend(subword_tokens) # extend : list + list, append : list + [list]
            label_mask.extend([0] + [pad_token_id_for_label] * (len(subword_tokens) - 1)) # [0, -100, -100, ...]

        # in order to add [CLS] and [SEP] tokens, length of tokens should be less than limit_len('max_seq_len' - 2)
        if len(tokens) > limit_len:
            tokens = tokens[:limit_len]
            label_mask = label_mask[:limit_len]

        tokens = [cls_token] + tokens + [sep_token] # [CLS] + tokens + [SEP]
        label_mask = [pad_token_id_for_label] + label_mask + [pad_token_id_for_label] # [PAD] + label_ids + [PAD]

        # convert tokens to token_ids : string -> int
        input_id = tokenizer.convert_tokens_to_ids(tokens)

        # padding + attention masking + segment encoding
        padding_count = max_seq_len - len(tokens)

        input_id_padded = input_id + ([pad_token_id] * padding_count)
        label_mask_padded = label_mask + ([pad_token_id_for_label] * padding_count)
        attention_mask = ([1] * len(input_id)) + ([0] * padding_count)
        token_type_id = ([pad_token_id_for_segment] * max_seq_len)

        # validation
        assert len(input_id_padded) == max_seq_len, "Error with input length {} vs {}".format(len(input_id_padded), max_seq_len)
        assert len(label_mask_padded) == max_seq_len, "Error with input length {} vs {}".format(len(label_mask_padded), max_seq_len)
        assert len(attention_mask) == max_seq_len, "Error with input length {} vs {}".format(len(attention_mask), max_seq_len)
        assert len(token_type_id) == max_seq_len, "Error with input length {} vs {}".format(len(token_type_id), max_seq_len)

        input_ids.append(input_id_padded)
        label_masks.append(label_mask_padded)
        attention_masks.append(attention_mask)
        token_type_ids.append(token_type_id)

    input_ids = np.array(input_ids, dtype=int)
    token_type_ids = np.array(token_type_ids, dtype=int)
    attention_masks = np.array(attention_masks, dtype=int)
    label_masks = np.asarray(label_masks, dtype=np.int32)

    return (input_ids, attention_masks, token_type_ids), label_masks


### label_mask
 - 레이블 마스크는 BERT 토크나이저가 하나의 단어에 대해서 서브워드로 분리할 경우, 첫번째 서브워드를 제외한 나머지 서브워드들에 대해서는 [PAD] 토큰. 즉, ‑100 을 부여합니다.
   이제 모델이 임의의 문장에 대해서 예측했을 때, 레이블 마스크의 값을 참고하여 첫번째 서브워드가 아닌 뒤의 서브워드들에 대한 예측값은 무시합니다.

In [22]:
# check with samples
X_pred, label_masks = predict(
test_data_sentences[:5], max_seq_len=128, tokenizer=tokenizer)

print(' 기 존 원 문 :', test_data_sentences[0])
print('-' * 50)
print(' 토 큰 화 후 원 문 :', [tokenizer.decode([word]) for word in X_pred[0][0]])
print(' 레 이 블 마 스 크 :', ['[PAD]' if idx == -100 else '[FIRST]' for idx in
label_masks[0]])

 기 존 원 문 : ['라티은-원윤정,', '휘닉스파크클래식', '프로골퍼']
--------------------------------------------------
 토 큰 화 후 원 문 : ['[CLS]', '라', '##티', '##은', '-', '원', '##윤', '##정', ',', '휘', '##닉스', '##파크', '##클', '##래', '##식', '프로', '##골', '##퍼', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD

In [42]:
 def ner_predict(sentences, max_seq_len, tokenizer):
    word_list = [sentence.split() for sentence in sentences] # [['나는', '밥을', '먹는다'], ['나는', '밥을', '먹는다']]
    X_pred, label_masks = predict(word_list, max_seq_len, tokenizer)
    pred = model.predict(X_pred)
    pred_argmax = np.argmax(pred, axis=-1)

    print(word_list)

    predict_list = []
    for i in range(0, len(label_masks)):
        pred_tag = []
        """
        ex) 모델의 예측값 디코딩 과정
                예측값(y_predicted) 에서 레이블 마스크(label_masks) 의 값이 -100 인 동일 위치의 값을 삭제
                label_masks : [-100 0 -100 0 -100]
                pred_argmax : [ 0 1 0 2 0 ] ==> [1 2] ==> 최 종 예 측 (pred_tag) : [PER-B PER-I]
        """
        for pred_index, label_index in zip(pred_argmax[i], label_masks[i]):
            if label_index != -100:
                pred_tag.append(index_to_tag[pred_index])

        predict_list.append(pred_tag)

    result_list = []
    for sentence, pred in zip(word_list, predict_list):
        sentence_result = []
        for word, label_token in zip(sentence, pred):
            sentence_result.append((word, label_token))
        result_list.append(sentence_result)

    return result_list

In [43]:
sent1 = '오리온스는 리그 최정상급 포인트가드 김동훈을 앞세우는 빠른 공수전환이 돋보이는 팀이다'
sent2 = '하이신사에 속한 섬들도 위로 솟아 있는데 타인은 살고 있어요'

test_samples = [sent1, sent2]
result_list = ner_predict(test_samples, max_seq_len=128, tokenizer=tokenizer)

print(result_list)

[['오리온스는', '리그', '최정상급', '포인트가드', '김동훈을', '앞세우는', '빠른', '공수전환이', '돋보이는', '팀이다'], ['하이신사에', '속한', '섬들도', '위로', '솟아', '있는데', '타인은', '살고', '있어요']]
[[('오리온스는', 'ORG-B'), ('리그', 'O'), ('최정상급', 'O'), ('포인트가드', 'CVL-B'), ('김동훈을', 'PER-B'), ('앞세우는', 'O'), ('빠른', 'O'), ('공수전환이', 'O'), ('돋보이는', 'O'), ('팀이다', 'O')], [('하이신사에', 'LOC-B'), ('속한', 'O'), ('섬들도', 'O'), ('위로', 'O'), ('솟아', 'O'), ('있는데', 'O'), ('타인은', 'O'), ('살고', 'O'), ('있어요', 'O')]]
