In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchcrf import CRF

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from transformers import ElectraModel, ElectraTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import os
import random
import time
import pandas as pd

# Data Preparation

## Loading

In [2]:
df_1 = pd.read_excel('dataset/eshc_NER_v2.xlsx').set_axis(['index', 'src', 'tar'], axis='columns')
df_2 = pd.read_excel('dataset/port_NER_v2.xlsx').set_axis(['index', 'src', 'tar'], axis='columns')
df_3 = pd.read_excel('dataset/ulsan_2021-2019_NER_v2.xlsx').set_axis(['index', 'src', 'tar'], axis='columns')
df_4 = pd.read_excel('dataset/ulsan_2022_NER_v2.xlsx').set_axis(['index', 'src', 'tar'], axis='columns')
len(df_1), len(df_2), len(df_3), len(df_4)

(17688, 6819, 69241, 26984)

In [3]:
### start_num

# df_1[100:150] # 0
# df_2[100:150] # 0
# df_3[100:150] # 1
# df_4[100:150] # 1

## Cleansing

In [4]:
def preprocess(df):
    df = df.fillna("-")
    df['src'] = df['src'].astype(str)
    df['src'] = df['src'].str.replace("．", ".", regex=False)
    df['src'] = df['src'].str.replace(r'[^ㄱ-ㅣ가-힣0-9a-zA-Z.]+', "", regex=True)
    df['tar'] = df['tar'].astype(str)
    df['tar'] = df['tar'].str.replace("1", "-", regex=False)
    df['tar'] = df['tar'].str.replace("(삭제)", "-", regex=False)
    df['tar'] = df['tar'].str.replace(r'^AF$', 'AFW_B', regex=True)
    df['tar'] = df['tar'].str.replace("AFW_B", "AWF_B", regex=False)
    df['tar'] = df['tar'].str.replace("AFW_b", "AFW_B", regex=False)
    df['tar'] = df['tar'].str.replace('AFW _B', 'AFW_B', regex=False)
    df['tar'] = df['tar'].str.replace('AWF_B', 'AFW_B', regex=False)
    df['tar'] = df['tar'].str.replace('AFW)B', 'AFW_B', regex=False)
    df['tar'] = df['tar'].str.replace('AFW0B', 'AFW_B', regex=False)
    df['tar'] = df['tar'].str.replace('AFW)I', 'AFW_I', regex=False)
    df['tar'] = df['tar'].str.replace('AFW_O', 'AFW_I', regex=False)
    df['tar'] = df['tar'].str.replace("ANW_B", "ANM_B", regex=False)
    df['tar'] = df['tar'].str.replace("ANM_b", "ANM_B", regex=False)
    df['tar'] = df['tar'].str.replace('DAT0B', 'DAT_B', regex=False)
    df['tar'] = df['tar'].str.replace('DAT+I', 'DAT_I', regex=False)
    df['tar'] = df['tar'].str.replace('DAT_II', 'DAT_I', regex=False)
    df['tar'] = df['tar'].str.replace("LOC_b", "LOC_B", regex=False)
    df['tar'] = df['tar'].str.replace(r'^LOC_$', 'LOC_I', regex=True)
    df['tar'] = df['tar'].str.replace("LOC_2", "LOC_I", regex=False)
    df['tar'] = df['tar'].str.replace(r'^C$', 'MAT_B', regex=True)
    df['tar'] = df['tar'].str.replace(',MAT_B', 'MAT_B', regex=False)
    df['tar'] = df['tar'].str.replace('MAT)B', 'MAT_B', regex=False)
    df['tar'] = df['tar'].str.replace('MAT+I', 'MAT_I', regex=False)
    df['tar'] = df['tar'].str.replace("NUMB", "NUM_B", regex=False)
    df['tar'] = df['tar'].str.replace("NUM_b", "NUM_B", regex=False)
    df['tar'] = df['tar'].str.replace("NIM_B", "NUM_B", regex=False)
    df['tar'] = df['tar'].str.replace(r'^N$', 'NUM_I', regex=True)
    df['tar'] = df['tar'].str.replace("TRM_b", "TRM_B", regex=False)
    df['tar'] = df['tar'].str.replace("TIM_b", "TIM_B", regex=False)
    df['tar'] = df['tar'].str.replace('TIOM_I', 'TIM_I', regex=False)
    df['tar'] = df['tar'].str.replace("WRK_b", "WRK_B", regex=False)
    df['tar'] = df['tar'].str.replace("WRK_L", "WRK_I", regex=False)
    df['tar'] = df['tar'].str.replace(r'^WRK_$', 'WRK_I', regex=True)
    df['tar'] = df['tar'].str.replace('WRL_I', 'WRK_I', regex=False)
    return df

In [5]:
df_1 = preprocess(df_1)
df_2 = preprocess(df_2)
df_3 = preprocess(df_3)
df_4 = preprocess(df_4)

In [6]:
# df_1.isnull().any(), df_2.isnull().any(), df_3.isnull().any(), df_4.isnull().any() # not found

## Reformat / Tokenization / Integrity Check

In [7]:
train = pd.concat([df_1, df_2, df_3, df_4])

In [8]:
label = train['tar'].unique().tolist()
label.sort()
label = ['[PAD]'] +label
label_dict = {word:i for i, word in enumerate(label)}
# label_dict.update({"[PAD]":len(label_dict)})
index_to_ner = {i:j for j, i in label_dict.items()}

num_labels = len(label)

print(label_dict), print(), print(index_to_ner), print(), print(label), print(), print(f'num_labels: {num_labels}')

{'[PAD]': 0, '-': 1, 'AFW_B': 2, 'AFW_I': 3, 'ANM_B': 4, 'ANM_I': 5, 'CVL_B': 6, 'CVL_I': 7, 'DAT_B': 8, 'DAT_I': 9, 'LOC_B': 10, 'LOC_I': 11, 'MAT_B': 12, 'MAT_I': 13, 'NUM_B': 14, 'NUM_I': 15, 'TIM_B': 16, 'TIM_I': 17, 'TRM_B': 18, 'TRM_I': 19, 'WRK_B': 20, 'WRK_I': 21}

{0: '[PAD]', 1: '-', 2: 'AFW_B', 3: 'AFW_I', 4: 'ANM_B', 5: 'ANM_I', 6: 'CVL_B', 7: 'CVL_I', 8: 'DAT_B', 9: 'DAT_I', 10: 'LOC_B', 11: 'LOC_I', 12: 'MAT_B', 13: 'MAT_I', 14: 'NUM_B', 15: 'NUM_I', 16: 'TIM_B', 17: 'TIM_I', 18: 'TRM_B', 19: 'TRM_I', 20: 'WRK_B', 21: 'WRK_I'}

['[PAD]', '-', 'AFW_B', 'AFW_I', 'ANM_B', 'ANM_I', 'CVL_B', 'CVL_I', 'DAT_B', 'DAT_I', 'LOC_B', 'LOC_I', 'MAT_B', 'MAT_I', 'NUM_B', 'NUM_I', 'TIM_B', 'TIM_I', 'TRM_B', 'TRM_I', 'WRK_B', 'WRK_I']

num_labels: 22


(None, None, None, None, None, None, None)

In [9]:
df_1_list = [list(x) for x in df_1[['index', 'src', 'tar']].to_numpy()]
df_2_list = [list(x) for x in df_2[['index', 'src', 'tar']].to_numpy()]
df_3_list = [list(x) for x in df_3[['index', 'src', 'tar']].to_numpy()]
df_4_list = [list(x) for x in df_4[['index', 'src', 'tar']].to_numpy()]

In [10]:
def makeTups(data, start_num):
    tups = []
    temp_tup = []
    temp_tup.append(data[0][1:])
    for i, j, k in data: # index src tgt
        if i != start_num:
            temp_tup.append([j,k])
        if i == start_num:
            if len(temp_tup) != 0:
                tups.append(temp_tup)
                temp_tup = []
                temp_tup.append([j,k])

    tups[0].pop(0)
    return tups

In [11]:
tups_1 = makeTups(df_1_list, 0) # start_num 0
tups_2 = makeTups(df_2_list, 0) # start_num 0
tups_3 = makeTups(df_3_list, 1) # start_num 1
tups_4 = makeTups(df_4_list, 1) # start_num 1
print(len(tups_1), len(tups_2), len(tups_3), len(tups_4))

1028 151 2236 1299


In [12]:
tups = tups_1 + tups_2 + tups_3 + tups_4
print(f'number of samples: {len(tups)}')

number of samples: 4714


### Check If Empty

In [13]:
# check and remove if there is any empty row

empties = []
for i, row in enumerate(tups):
    if len(row) < 2 and row[0][0] == '':
        # print(i, row)
        empties.append(i)

empties.sort(reverse=True)

print(f'{len(empties)} empty elements removed.')

for idx in empties:
    tups.pop(idx)

for i, row in enumerate(tups):
    if len(row) < 2 and row[0][0] == '':
        print(i, row)

print(f'number of samples: {len(tups)}')

16 empty elements removed.
number of samples: 4698


### Load Tokenizer and Check Special Tokens

In [14]:
tokenizer = ElectraTokenizer.from_pretrained('./tokenizer/')

In [15]:
tokenizer.pad_token_id

0

In [16]:
init_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
init_token, pad_token, unk_token, sep_token

('[CLS]', '[PAD]', '[UNK]', '[SEP]')

In [17]:
tups[0]

[['소재', 'LOC_I'],
 ['창고', 'WRK_B'],
 ['대수선', 'WRK_I'],
 ['공사현장에서', 'WRK_I'],
 ['지붕판넬', 'WRK_B'],
 ['철거작업', 'WRK_I'],
 ['중', '-'],
 ['절단', '-'],
 ['및', '-'],
 ['고정볼트가', 'AFW_I'],
 ['해체된', '-'],
 ['지붕판넬을', 'AFW_B'],
 ['밟아', '-'],
 ['떨어짐.', '-']]

In [18]:
sentences = []
targets = []
for tup in tups:
    sentence = []
    target = []
    for i, j in tup:
        sentence.append(i)
        target.append(j)
    sentences.append(sentence)
    targets.append(target)

In [19]:
k = random.randrange(len(sentences))
print(f'random inddex k is {k}')
print(len(sentences)), print(len(targets))
print(len(sentences[k])), print(len(targets[k]))
print(sentences[k]), print(targets[k])

random inddex k is 3691
4698
4698
18
18
['울산', '제조', '현장', '내', '탱크로리', '지입차주가', '제품을', '로딩한', '후', '쿨다운작업을', '위해', '이동하여', '작업', '중', '탱크로리에서', '사망된', '채로', '발견.']
['LOC_B', 'LOC_I', 'LOC_I', '-', 'CVL_B', 'CVL_I', '-', '-', '-', 'WRK_B', '-', '-', '-', '-', 'LOC_B', '-', '-', '-']


(None, None)

In [20]:
### 길이 검사
for sent, tgt in zip(sentences, targets):
    if len(sent) != len(tgt):
        print(sent)
        print(tgt)

### Tokenization and Label Matching

In [21]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []
    for word, label in zip(sentence, text_labels):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)

        if label[-1] == 'B' and n_subwords > 1:
            tail = list(label)
            tail[-1] = 'I'
            tail = ''.join(tail)
            labels.extend([label] + [tail]*(n_subwords-1))
        else:
            labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [22]:
tokenized_texts_and_labels = [
                              tokenize_and_preserve_labels(sent, labs)
                              for sent, labs in zip(sentences, targets)
                              ]

In [23]:
# # [(문장, 개체들), (문장, 개체들),...] 형식으로 저장되어 있음.
# k = random.randrange(len(tokenized_texts_and_labels))
# print(f'random inddex k is {k}')
# print(len(tokenized_texts_and_labels)), print(len(tokenized_texts_and_labels))
# print(len(tokenized_texts_and_labels[k][0])), print(len(tokenized_texts_and_labels[k][1]))
# print(tokenized_texts_and_labels[k][0]), print(), print(tokenized_texts_and_labels[k][1])

In [24]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [25]:
# k = random.randrange(len(tokenized_texts))
# print(f'random inddex k is {k}')
# print(len(tokenized_texts)), print(len(labels))
# print(len(tokenized_texts[k])), print(len(labels[k])), print()
# print(tokenized_texts[k]), print(labels[k])
# for tok, lab in zip(tokenized_texts[k], labels[k]):
#     print(tok, '\t',lab)

In [26]:
error_count = 0
for txt, lab in zip(tokenized_texts, labels):
    if len(txt) != len(lab):
        # print(txt)
        # print(lab)
        error_count += 1
print(error_count)

0


In [27]:
lengths = []
for tokens in tokenized_texts:
    lengths.append(len(tokens))

f'the longest is {max(lengths)}'

'the longest is 395'

In [28]:
tokenized_texts_list, labels_list = tokenized_texts, labels

### Add Special Tokens and Pad Inputs

In [29]:
# print(len(label_dict)), print(len(index_to_ner))
# print(label_dict)
# print(index_to_ner)
print(label)

['[PAD]', '-', 'AFW_B', 'AFW_I', 'ANM_B', 'ANM_I', 'CVL_B', 'CVL_I', 'DAT_B', 'DAT_I', 'LOC_B', 'LOC_I', 'MAT_B', 'MAT_I', 'NUM_B', 'NUM_I', 'TIM_B', 'TIM_I', 'TRM_B', 'TRM_I', 'WRK_B', 'WRK_I']


In [30]:
def make_inputs(tokenized_texts_list, labels_list, max_length):
    max_length = max_length
    input_tokens = []
    input_mask = []
    input_labels = []
    num = len(tokenized_texts_list)
    for i in range(num):
        data = tokenized_texts_list[i]
        labels = labels_list[i]
        tmp_tokens = [init_token] + data
        tmp_labels = ['-'] + labels
        if len(tmp_tokens) < max_length-1:
            tmp_tokens += [sep_token]
            tmp_labels += ['-']
            tmp_mask = [1] * len(tmp_tokens)
            for _ in range(max_length-len(tmp_tokens)):
                tmp_tokens.append(pad_token)
                tmp_labels.append('[PAD]')
                tmp_mask.append(0)
            input_tokens.append(tmp_tokens)
            input_labels.append(tmp_labels)
            input_mask.append(tmp_mask)
        elif len(tmp_tokens) > max_length-1:
            tmp_tokens = tmp_tokens[:max_length-1] + [sep_token]
            tmp_labels = tmp_labels[:max_length-1] + ['-']
            tmp_mask = [1] * max_length
            input_tokens.append(tmp_tokens)
            input_labels.append(tmp_labels)
            input_mask.append(tmp_mask)
        elif len(tmp_tokens) == max_length-1:
            tmp_tokens += [sep_token]
            tmp_labels += [label_dict['-']]
            tmp_mask = [1] * max_length
            input_tokens.append(tmp_tokens)
            input_labels.append(tmp_labels)
            input_mask.append(tmp_mask)

    return input_tokens, input_mask, input_labels

In [31]:
max_length = 256
input_tokens, input_mask, input_labels = make_inputs(tokenized_texts, labels, max_length)

In [32]:
print(f'length of input_tokens: {len(input_tokens)}')
print(f'length of input_labels: {len(input_labels)}')
print(f'length of input_mask: {len(input_mask)}')

k = random.randrange(len(input_tokens))
print(f'random inddex k is {k}')
print(len(input_tokens[k])), print(len(input_labels[k])), print(len(input_mask[k]))
print(), print(input_tokens[k]), print(), print(input_labels[k]), print(), print(input_mask[k])

length of input_tokens: 4698
length of input_labels: 4698
length of input_mask: 4698
random inddex k is 2651
256
256
256

['[CLS]', '2021', '##년', '5', '##월', '2', '4', '일', '3', '##시', '30', '##분', '##경', '서울', '노원구', '지상', '9', '##층', '상가', '건물', '에서', 'H빔', '철거작업', '을', '하', '##던', '60', '##대', '작업자', '6', '아래', '로', '추락', '##하', '##는', '사고', '##가', '발생', '##하', '##였', '##다', '.', '왼쪽', '다리', '##와', '머리', '등', '##을', '다쳐', '인근', '병원', '##으로', '이송', '##되', '##어', '치료', '중', '이', '##다', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

(None, None, None, None, None, None)

### Check Lengths

In [33]:
temp = 0
num = len(input_tokens)
for i in range(num):
    curr_tokens = input_tokens[i]
    curr_label = input_labels[i]
    curr_mask = input_mask[i]
    
    count_tokens, count_label, count_mask = 0,0,0
    for count, token in enumerate(curr_tokens):
        if token == '[PAD]':
            break
    count_tokens = count
    for count, label in enumerate(curr_label):
        if label == '[PAD]':
            break
    count_label = count
    for count, mask in enumerate(curr_mask):
        if mask == 0:
            break
    count_mask = count
    if not (count_tokens == count_label == count_mask):
        # print(count_tokens, count_label, count_mask)
        temp += 1
print(temp)

0


### Pair Inputs and Split to Ratio

In [34]:
input_pair = []
num = len(input_tokens)
for i in range(num):
    tmp_token_idx = input_tokens[i]
    tmp_mask = input_mask[i]
    tmp_label_idx = input_labels[i]
    input_pair.append((tmp_token_idx, tmp_mask, tmp_label_idx))

In [35]:
# k = random.randrange(len(input_tokens))
# print(input_pair[k])

데이터셋을 8:2 비율로 훈련 데이터셋과 검증 데이터셋으로 분할합니다. 데이터셋의 양이 적어 테스트셋은 생성하지 않았습니다.

In [36]:
train, valid = train_test_split(input_pair, test_size=0.2)

In [37]:
print(len(train)), print(len(valid))
# print(train[0]), print(valid[0])

3758
940


(None, None)

In [38]:
# temp = []
# for e in train[0][0]:
#     encoded = tokenizer.convert_tokens_to_ids(str(e))
#     temp.append(encoded)
# print(temp)

### Delcare Dataset (Iterator)

pytorch에서 데이터를 받아들일 수 있도록 커스텀 데이터셋을 생성합니다. 이 때 문장을 구성하는 토큰들을 고유한 인덱스 번호로 변환합니다. 마찬가지로 레이블도 인덱스 번호로 변환합니다.

In [39]:
class CustomDataset(Dataset): 
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
    def __len__(self): 
        return len(self.data)

    def __getitem__(self, idx):
        tokens = self.data[idx][0]
        masks = self.data[idx][1]
        labels = self.data[idx][2]
        token_ids = []
        for token in tokens:
            ids = tokenizer.convert_tokens_to_ids(str(token))
            token_ids.append(ids)
        label_ids = []
        for label in labels:
            label_ids.append(label_dict[label])
                
        return (torch.LongTensor(token_ids), torch.LongTensor(masks), torch.LongTensor(label_ids))

In [40]:
trainset = CustomDataset(train, tokenizer)
validset = CustomDataset(valid, tokenizer)

배치 크기가 128인 데이터로더를 생성합니다.

In [41]:
cpu_num = os.cpu_count()

In [42]:
batch_size = 16
train_loader = DataLoader(trainset, batch_size = batch_size, shuffle = True, num_workers=cpu_num)
valid_loader = DataLoader(validset, batch_size = batch_size, shuffle = True, num_workers=cpu_num)

In [43]:
for i, el in enumerate(train_loader):
    print(len(el[0][0]))
    print(len(el[1][0]))
    print(len(el[2][0]))
    # print(el[0][0])
    # print(el[1][0])
    # print(el[2][0])
    break

256
256
256


# Create Model / Training / Evaluation / Inference

모델을 생성합니다.

In [44]:
class BERT_BiLSTM_CRF(nn.Module):
    
    def __init__(self, bert, config, need_birnn=False, rnn_dim=128):
        super(BERT_BiLSTM_CRF, self).__init__()
        
        self.num_tags = config.num_labels
        self.bert = bert
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        out_dim = config.hidden_size
        self.need_birnn = need_birnn

        # if False, no use of BiLSTM
        if need_birnn:
            self.birnn = nn.LSTM(config.hidden_size, rnn_dim, num_layers=1, bidirectional=True, batch_first=True)
            out_dim = rnn_dim*2
        
        self.hidden2tag = nn.Linear(out_dim, config.num_labels)
        self.crf = CRF(config.num_labels, batch_first=True)
    
    def predict(self, input_ids, input_mask=None):
        emissions = self.tag_outputs(input_ids, input_mask)
        return self.crf.decode(emissions, input_mask.byte())

    def forward(self, input_ids, tags, input_mask=None):
        emissions = self.tag_outputs(input_ids, input_mask)
        loss = -1*self.crf(emissions, tags.long(), input_mask.byte()) # negative log likelihood loss
        return loss

    def tag_outputs(self, input_ids, input_mask=None):
        outputs = self.bert(input_ids, attention_mask=input_mask)
        sequence_output = outputs[0]
        
        if self.need_birnn:
            self.birnn.flatten_parameters()
            sequence_output, _ = self.birnn(sequence_output)

        sequence_output = self.dropout(sequence_output)
        emissions = self.hidden2tag(sequence_output)
        return emissions

In [45]:
ckpt = "monologg/koelectra-base-v3-discriminator"
bert = ElectraModel.from_pretrained(ckpt, num_labels=22)
tokenizer = ElectraTokenizer.from_pretrained('./tokenizer/')
bert.resize_token_embeddings(len(tokenizer))
config = bert.config
print(config.num_labels)
print(config.hidden_dropout_prob)
print(config.hidden_size)

model = BERT_BiLSTM_CRF(bert, config, need_birnn=True, rnn_dim=128)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


22
0.1
768


모델을 학습시키기 위한 옵티마이저와 목적함수(cross entropy)를 호출합니다.

In [46]:
LEARNING_RATE = 5e-5
optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)
# criterion = nn.CrossEntropyLoss(ignore_index = 0)

GPU 연산을 위해 cuda를 호출합니다. 이 때 복수의 연산장치(GPU)가 있을 경우 모든 연산장치를 활용하여 연산의 속도를 높이도록 설정합니다.

In [47]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

NGPU = torch.cuda.device_count()
if NGPU > 1:
    model = torch.nn.DataParallel(model, device_ids=list(range(NGPU))).cuda()
    # model = torch.nn.DataParallel(model, device_ids=[0,1])
    # torch.multiprocessing.set_start_method('spawn', force=True)

# model = model.to(device)
# criterion = criterion.to(device)

In [48]:
# model.module.predict()

정확도를 계산하기 위한 함수를 생성합니다.

In [49]:
def categorical_accuracy(preds, y, tag_pad_idx):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    # max_preds = preds.argmax(dim = -1, keepdim = True) # get the index of the max probability
    non_pad_elements = torch.nonzero((y != tag_pad_idx))
    correct = preds[non_pad_elements].eq(y[non_pad_elements])
    return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]]).to(device)

In [50]:
def categorical_f1(preds, y, tag_pad_idx):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    # max_preds = preds.argmax(dim = -1, keepdim = True) # get the index of the max probability
    non_pad_elements = torch.nonzero((y != tag_pad_idx))
    preds_no_pad = preds[non_pad_elements].squeeze(1).detach().cpu()
    y_no_pad = y[non_pad_elements].detach().cpu()
    
    f1_macro = f1_score(y_no_pad, preds_no_pad, average='macro')
    f1_micro = f1_score(y_no_pad, preds_no_pad, average='micro')    
    
    return f1_macro, f1_micro

모델을 훈련시키기 위한 함수입니다.

In [51]:
def train(model, iterator, optimizer, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        tags = batch[2].to(device)

        optimizer.zero_grad()

        loss = model(input_ids, tags, attention_mask).mean()
        loss =  loss / batch_size
        # if NGPU > 1:
        #     loss = loss.mean() / batch_size # mean() to average on multi-gpu.
        predictions = model.module.predict(input_ids, attention_mask)
        predictions = list(map(lambda x: x + [0 for _ in range(max_length - len(x))], predictions))
        predictions = torch.LongTensor(predictions).to(device)
        predictions = predictions.view(-1)
        tags = tags.view(-1)
        acc = categorical_accuracy(predictions, tags, tag_pad_idx)
        
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

모델의 성능을 평가하기 위한 함수입니다.

In [52]:
def evaluate(model, iterator, tag_pad_idx):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    predictions_set = None
    tags_set = None
    
    with torch.no_grad():
        for batch in iterator:
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            tags = batch[2].to(device)
            
            loss = model(input_ids, tags, attention_mask).mean()
            loss =  loss / batch_size
            # if NGPU > 1:
            #     loss = loss.mean() / batch_size # mean() to average on multi-gpu.
            predictions = model.module.predict(input_ids, attention_mask)
            predictions = list(map(lambda x: x + [0 for _ in range(max_length - len(x))], predictions))
            predictions = torch.LongTensor(predictions).to(device)
            predictions = predictions.view(-1)
            tags = tags.view(-1)
            
            if predictions_set == None:
                predictions_set = predictions
                tags_set = tags
            else:
                predictions_set = torch.cat([predictions_set, predictions], dim=0)
                tags_set = torch.cat([tags_set, tags], dim=0)
            acc = categorical_accuracy(predictions, tags, tag_pad_idx)
            

            epoch_loss += loss.item()
            epoch_acc += acc.item()

        f1_macro, f1_micro = categorical_f1(predictions_set, tags_set, tag_pad_idx)
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), f1_macro, f1_micro

에폭마다 걸리는 시간을 계산하기 위한 함수입니다.

In [53]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

실질적으로 모델을 학습시킵니다. 에폭마다 검증데이터셋에서 성능을 평가하여 검증데이터셋에서 가장 좋은 성능을 보여줬을 때의 모델의 weight를 저장합니다.

In [54]:
N_EPOCHS = 30

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_loader, optimizer, 0)
    valid_loss, valid_acc, f1_mac, f1_mic = evaluate(model, valid_loader, 0)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model-bert-bilstm-crf.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print(f'\t Val. F1 Mac: {f1_mac:.2f} |  Val. F1 Mic: {f1_mic:.2f}')



Epoch: 01 | Epoch Time: 6m 5s
	Train Loss: 17.540 | Train Acc: 66.31%
	 Val. Loss: 10.010 |  Val. Acc: 78.34%
	 Val. F1 Mac: 0.43 |  Val. F1 Mic: 0.79




KeyboardInterrupt: 

학습된 모델을 이용하여 예시 문장으로 inference 합니다.

In [None]:
model.load_state_dict(torch.load("tut2-model-bert-bilstm-crf.pt"))
model.eval()
with torch.no_grad():
    sample = '2013년 11월 18일 인천항 4부두에서 컨테이너 작업 중 핸들러 집게가 풀리면서 피해자 끼임. 허리에 경상을 입음'
    input_ids, token_type_ids, attention_mask = dict(tokenizer(sample, return_tensors='pt')).values()
    input_ids, token_type_ids, attention_mask = input_ids.to(device), token_type_ids.to(device), attention_mask.to(device)
    prediction = model.predict(input_ids, attention_mask)
encoded = tokenizer.encode(sample)
decoded = tokenizer.convert_ids_to_tokens(encoded)
pred2tag = [index_to_ner[idx] for idx in prediction[0]]
for token, tag in zip(decoded, pred2tag):
    print(token, end='\t'), print(tag)

검증 데이터셋에서 5%의 데이터를 임의로 추출하여 inference 한 다음 그 결과를 csv파일로 저장합니다.

In [None]:
import random
import numpy as np

num = len(valid)
random_choice = np.random.choice(num, int(num*0.05), replace=False).tolist()
selected_sample = []

for i in random_choice:
    selected_sample.append(valid[i])
    
print(len(selected_sample))

In [None]:
model.eval()
with torch.no_grad():
    sample_results = []
    for idx, _ in enumerate(selected_sample):
        item = selected_sample[idx]
        token_ids = list(map(lambda x:tokenizer.convert_tokens_to_ids(x), item[0]))
        input_tensor = torch.LongTensor(token_ids).unsqueeze(0).to(device)
        mask_tensor = torch.LongTensor(item[1]).unsqueeze(0).to(device)
        output = model(input_tensor, mask_tensor)
        pred = torch.argmax(output, dim=-1).squeeze().detach().cpu().tolist()
        sample_results.append(pred)

In [None]:
k = random.randrange(len(selected_sample))

id2label = lambda x: index_to_ner[x]
result_labels = []
for id in sample_results[k]:
    result_labels.append(id2label(id))
for tok, id in zip(selected_sample[k][0], result_labels):
    print(tok, id)

In [None]:
f = open('sample_result.csv', 'w', encoding='utf-8-sig')
for idx, item in enumerate(selected_sample):
    all_tokens = item[0]
    if '[PAD]' not in all_tokens:
        continue
    pad_loc = all_tokens.index('[PAD]')
    tokens = all_tokens[:pad_loc][1:-1]
    labels = item[-1][:pad_loc][1:-1]
    predicts = sample_results[idx][:pad_loc][1:-1]
    length = len(tokens)
    
    f.write('token'+','+'label'+','+'predict'+'\n')
    for i in range(length):
        token = tokens[i]
        label = labels[i]
        predict = index_to_ner[predicts[i]]
        f.write(token+','+label+','+predict+'\n')
    f.write('\n')
f.close()

In [None]:
df = pd.read_csv('./sample_result.csv')
df.head(50)

In [None]:
# num = len(df)
# idx_to_label = {0:"pad", 1:"O", 2:"C", 3:"CE", 4:"E"}
# result = []
# for i in range(num):
#     sentence = df.iloc[i,0]
#     tokenizer_result = tokenizer.encode_plus(sentence)
#     tokens = tokenizer.tokenize(sentence)
#     input_ids = tokenizer_result["input_ids"]
#     mask = tokenizer_result["attention_mask"]
#     input_tensor = torch.LongTensor(input_ids).unsqueeze(0).to(device)
#     mask_tensor = torch.LongTensor(mask).unsqueeze(0).to(device)
#     output = model(input_tensor, mask_tensor)
#     pred = torch.argmax(output, dim=-1).squeeze().detach().cpu().tolist()
#     pred_label = list(map(lambda x:idx_to_label[x], pred))
#     result.append([tokens, pred_label[1:-1]])
    

In [None]:
# f = open('./8-9_cause_effec_result.csv', 'w', encoding='utf-8-sig')
# for item in result:
#     length = len(item[0])
#     for i in range(length):
#         f.write(item[0][i]+','+item[1][i]+'\n')
#     f.write('\n')
# f.close()