In [3]:
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import os
import torch
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split

In [4]:
train = pd.read_csv('./data/prac_train_data.txt', names=['src', 'tar'], sep='\t')
train = train.reset_index()
train

Unnamed: 0,index,src,tar
0,1,비토리오,PER_B
1,2,양일,DAT_B
2,3,만에,-
3,4,영사관,ORG_B
4,5,감호,CVL_B
...,...,...,...
769060,2,어째,-
769061,3,뭔가,-
769062,4,수상쩍은,-
769063,5,좌담,-


In [5]:
train['src'] = train['src'].str.replace('．', '.', regex=False)
train.loc[train['src']=='.']

Unnamed: 0,index,src,tar
15,6,.,-
30,15,.,-
40,10,.,-
77,24,.,-
96,19,.,-
...,...,...,...
769013,11,.,-
769036,23,.,-
769053,17,.,-
769058,5,.,-


In [6]:
train['src'] = train['src'].astype(str)
train['tar'] = train['tar'].astype(str)

train['src'] = train['src'].str.replace(r'[^ㄱ-ㅣ가-힣0-9a-zA-Z.]+', "", regex=True)

In [7]:
train

Unnamed: 0,index,src,tar
0,1,비토리오,PER_B
1,2,양일,DAT_B
2,3,만에,-
3,4,영사관,ORG_B
4,5,감호,CVL_B
...,...,...,...
769060,2,어째,-
769061,3,뭔가,-
769062,4,수상쩍은,-
769063,5,좌담,-


In [8]:
data = [list(x) for x in train[['index','src','tar']].to_numpy()]

data[:20]

[[1, '비토리오', 'PER_B'],
 [2, '양일', 'DAT_B'],
 [3, '만에', '-'],
 [4, '영사관', 'ORG_B'],
 [5, '감호', 'CVL_B'],
 [6, '용퇴', '-'],
 [7, '항룡', '-'],
 [8, '압력설', '-'],
 [9, '의심만', '-'],
 [10, '가율', '-'],
 [1, '이', '-'],
 [2, '음경동맥의', '-'],
 [3, '직경이', '-'],
 [4, '8', 'NUM_B'],
 [5, '19mm입니다', 'NUM_B'],
 [6, '.', '-'],
 [1, '9세이브로', 'NUM_B'],
 [2, '구완', '-'],
 [3, '30위인', 'NUM_B'],
 [4, 'LG', 'ORG_B']]

In [10]:
label = train['tar'].unique().tolist()
label_dict = {word:i for i, word in enumerate(label)}
label_dict.update({'[PAD]':len(label_dict)})
index_to_ner = {i:j for j, i in label_dict.items()}

In [11]:
label

['PER_B',
 'DAT_B',
 '-',
 'ORG_B',
 'CVL_B',
 'NUM_B',
 'LOC_B',
 'EVT_B',
 'TRM_B',
 'TRM_I',
 'EVT_I',
 'PER_I',
 'CVL_I',
 'NUM_I',
 'TIM_B',
 'TIM_I',
 'ORG_I',
 'DAT_I',
 'ANM_B',
 'MAT_B',
 'MAT_I',
 'AFW_B',
 'FLD_B',
 'LOC_I',
 'AFW_I',
 'PLT_B',
 'FLD_I',
 'ANM_I',
 'PLT_I']

In [12]:
label_dict

{'PER_B': 0,
 'DAT_B': 1,
 '-': 2,
 'ORG_B': 3,
 'CVL_B': 4,
 'NUM_B': 5,
 'LOC_B': 6,
 'EVT_B': 7,
 'TRM_B': 8,
 'TRM_I': 9,
 'EVT_I': 10,
 'PER_I': 11,
 'CVL_I': 12,
 'NUM_I': 13,
 'TIM_B': 14,
 'TIM_I': 15,
 'ORG_I': 16,
 'DAT_I': 17,
 'ANM_B': 18,
 'MAT_B': 19,
 'MAT_I': 20,
 'AFW_B': 21,
 'FLD_B': 22,
 'LOC_I': 23,
 'AFW_I': 24,
 'PLT_B': 25,
 'FLD_I': 26,
 'ANM_I': 27,
 'PLT_I': 28,
 '[PAD]': 29}

In [13]:
index_to_ner

{0: 'PER_B',
 1: 'DAT_B',
 2: '-',
 3: 'ORG_B',
 4: 'CVL_B',
 5: 'NUM_B',
 6: 'LOC_B',
 7: 'EVT_B',
 8: 'TRM_B',
 9: 'TRM_I',
 10: 'EVT_I',
 11: 'PER_I',
 12: 'CVL_I',
 13: 'NUM_I',
 14: 'TIM_B',
 15: 'TIM_I',
 16: 'ORG_I',
 17: 'DAT_I',
 18: 'ANM_B',
 19: 'MAT_B',
 20: 'MAT_I',
 21: 'AFW_B',
 22: 'FLD_B',
 23: 'LOC_I',
 24: 'AFW_I',
 25: 'PLT_B',
 26: 'FLD_I',
 27: 'ANM_I',
 28: 'PLT_I',
 29: '[PAD]'}

In [16]:
tups = []
temp_tup = []
temp_tup.append(data[0][1:])

for i, j, k in data:
    if i != 1:
        temp_tup.append([j, label_dict[k]])
    if i == 1:
        if len(temp_tup) != 0:
            tups.append(temp_tup)
            temp_tup = []
            temp_tup.append([j, label_dict[k]])
            
tups.pop(0)

[['비토리오', 'PER_B']]

In [17]:
print(tups[0])
print(tups[1])

[['비토리오', 0], ['양일', 1], ['만에', 2], ['영사관', 3], ['감호', 4], ['용퇴', 2], ['항룡', 2], ['압력설', 2], ['의심만', 2], ['가율', 2]]
[['이', 2], ['음경동맥의', 2], ['직경이', 2], ['8', 5], ['19mm입니다', 5], ['.', 2]]


In [18]:
sentences = []
targets = []

for tup in tups:
    sentence = []
    target = []
    sentence.append('[CLS]')
    target.append(label_dict['-'])
    
    for i, j in tup:
        sentence.append(i)
        target.append(j)
    
    sentence.append('[SEP]')
    target.append(label_dict['-'])
    sentences.append(sentence)
    targets.append(target)

In [19]:
sentences[0]

['[CLS]',
 '비토리오',
 '양일',
 '만에',
 '영사관',
 '감호',
 '용퇴',
 '항룡',
 '압력설',
 '의심만',
 '가율',
 '[SEP]']

In [20]:
targets[0]

[2, 0, 1, 2, 3, 4, 2, 2, 2, 2, 2, 2]

In [21]:
from tokenization_kobert import KoBertTokenizer
tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')

loading file https://huggingface.co/monologg/kobert/resolve/main/tokenizer_78b3253a26.model from cache at /Users/hoon/.cache/huggingface/transformers/7e55d7972628e6fc1babc614b5dd8bb43ab4f9d8541adc9fb1851112a7a7c5cc.4d2f4af7c2ca9df5b147978a95d38840e84801a378eee25756b008638e0bdc7f
loading file https://huggingface.co/monologg/kobert/resolve/main/vocab.txt from cache at /Users/hoon/.cache/huggingface/transformers/efee434f5f4c5c89b5a7d8d5f30bbb0496f1540349fcfa21729cec5b96cfd2d1.719459e20bc981bc2093e859b02c3a3e51bab724d6b58927b23b512a3981229f
loading file https://huggingface.co/monologg/kobert/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/monologg/kobert/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/monologg/kobert/resolve/main/tokenizer_config.json from cache at /Users/hoon/.cache/huggingface/transformers/d1c07e179f5e00959a3c8e4a150eaa4907dfe26544e4a71f2b0163982a476523.767d1b760a83978bae6c324157fad57ee513a

In [22]:
tokenizer.tokenize('대한민국 만세')

['▁대한민국', '▁만', '세']

In [23]:
def tokenize_and_preserve_labels(sentence, txt_labels):
    tokenized_sent = []
    labels = []
    
    for word, label in zip(sentence, txt_labels):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        
        tokenized_sent.extend(tokenized_word)
        labels.extend([label] * n_subwords)
        
    return tokenized_sent, labels

In [24]:
tokenized_txt_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in zip(sentences, targets)]

In [29]:
tokenized_txt_labels[0]

(['[CLS]',
  '▁비',
  '토',
  '리',
  '오',
  '▁양',
  '일',
  '▁만에',
  '▁영',
  '사',
  '관',
  '▁감',
  '호',
  '▁용',
  '퇴',
  '▁항',
  '룡',
  '▁압력',
  '설',
  '▁의심',
  '만',
  '▁',
  '가',
  '율',
  '[SEP]'],
 [2, 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [30]:
tokenized_txt = [token_label_pair[0] for token_label_pair in tokenized_txt_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_txt_labels]

In [31]:
tokenized_txt[1]

['[CLS]',
 '▁이',
 '▁음',
 '경',
 '동',
 '맥',
 '의',
 '▁직',
 '경',
 '이',
 '▁8',
 '▁19',
 'mm',
 '입니다',
 '▁',
 '.',
 '[SEP]']

In [32]:
labels[1]

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 2, 2, 2]

In [33]:
np.quantile(np.array([len(x) for x in tokenized_txt]), 0.975)

88.0

In [34]:
max_len = 88
batch_size = 32

In [37]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_txt],
                          maxlen=max_len, dtype='int', value=0,
                          truncating='post', padding='post')

input_ids[1]

array([   2, 3647, 3606, 5424, 5872, 6172, 7095, 4349, 5424, 7096,  624,
        548,  424, 7139,  517,   54,    3,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0])

In [38]:
tags = pad_sequences([lab for lab in labels], maxlen=max_len, value=label_dict['[PAD]'],
                     padding='post', dtype='int', truncating='post')

tags[1]

array([ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  5,  5,  5,  5,  2,  2,  2,
       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
       29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
       29, 29, 29])

In [39]:
attention_mask = np.array([[int(i != 0) for i in ii] for ii in input_ids])
attention_mask

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

In [46]:
import torch
from transformers import BertModel

In [50]:
model = BertModel.from_pretrained('monologg/kobert', num_labels=len(label_dict), output_attentions=False, output_hidden_states=False)

loading configuration file https://huggingface.co/monologg/kobert/resolve/main/config.json from cache at /Users/hoon/.cache/huggingface/transformers/31dc8da633439f22ed80bede01f337996bc709eb8429f86f2b24e2103558b039.89a06cdfd16840fd89cc5c2493ef63cd0b6068e85f70ac988a3673e2722cab2e
Model config BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "2

In [48]:
model.embeddings.word_embeddings

Embedding(8002, 768, padding_idx=1)