In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from torchmetrics.classification import MulticlassF1Score

from transformers import ElectraModel, ElectraTokenizer

from sklearn.model_selection import train_test_split

import os
import random
import time
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


# Data Preparation

## Loading

In [2]:
file_list = os.listdir('./dataset')
file_path = []
for file_name in file_list:
    tmp_path = os.path.join('./dataset', file_name)
    file_path.append(tmp_path)
print(file_path)
file_path.sort()
print(file_path)

# './dataset/ulsan_2021-2019_NER_v1.xlsx' start_num 1
# './dataset/port_NER_v1.xlsx' start_num 1
# './dataset/eshc_NER_v1.xlsx' start_num 0
# './dataset/port_NER_v1.xlsx' == './dataset/ulsan_2022_NER_v1.xlsx' (동일한 데이터 )


['./dataset/port_NER_v1.xlsx', './dataset/ulsan_2022_NER_v1.xlsx', './dataset/ulsan_2021-2019_NER_v1.xlsx', './dataset/eshc_NER_v1.xlsx']
['./dataset/eshc_NER_v1.xlsx', './dataset/port_NER_v1.xlsx', './dataset/ulsan_2021-2019_NER_v1.xlsx', './dataset/ulsan_2022_NER_v1.xlsx']


In [3]:
load_df = lambda x: pd.read_excel(x, names=['src', 'tar'])
dfs = [load_df(x) for x in file_path]
df_4, df_2, df_1, df_3 = dfs
df_1, df_2, df_3, df_4 = df_1.reset_index(), df_2.reset_index(), df_3.reset_index(), df_4.reset_index()

# train = pd.read_excel("./ulsan_2021-2019_NER_v1.xlsx", names=['src', 'tar'])
# train = train.reset_index()
# for i, row in train.iterrows():
#     print(row['index'], row.src, row.tar)
#     # print(row)

# (df_2 == df_3).nunique()

# df_1, df_2, df_4

## Cleansing

In [4]:
def preprocess(df):
    df['src'] = df['src'].str.replace("．", ".", regex=False)
    df['tar'] = df['tar'].str.replace("NUMB", "NUM_B", regex=False)
    df['tar'] = df['tar'].str.replace("TRM_b", "TRM_B", regex=False)
    df['tar'] = df['tar'].str.replace("WRK_b", "WRK_B", regex=False)
    df['tar'] = df['tar'].str.replace("NUM_b", "NUM_B", regex=False)
    df['tar'] = df['tar'].str.replace("AFW_B", "AWF_B", regex=False)
    df['tar'] = df['tar'].str.replace("ANM_b", "ANM_B", regex=False)
    df['tar'] = df['tar'].str.replace("NIM_B", "NUM_B", regex=False)
    df['tar'] = df['tar'].str.replace("LOC_b", "LOC_B", regex=False)
    df['tar'] = df['tar'].str.replace("AFW_b", "AFW_B", regex=False)
    # Fix Tags
    df['tar'] = df['tar'].str.replace(r'^AF$', 'AFW_B', regex=True)
    df['tar'] = df['tar'].str.replace(r'^WRK_$', 'WRK_I', regex=True)
    df['tar'] = df['tar'].str.replace(r'^N$', 'NUM_I', regex=True)
    df['tar'] = df['tar'].str.replace(r'^LOC_$', 'LOC_I', regex=True)
    df['tar'] = df['tar'].str.replace('AFW _B', 'AFW_B', regex=False)
    df['tar'] = df['tar'].str.replace('AWF_B', 'AFW_B', regex=False)
    df['tar'] = df['tar'].str.replace('AFW)B', 'AFW_B', regex=False)
    df['tar'] = df['tar'].str.replace('AFW0B', 'AFW_B', regex=False)
    df['tar'] = df['tar'].str.replace('AFW)I', 'AFW_I', regex=False)
    df['tar'] = df['tar'].str.replace('AFW_O', 'AFW_I', regex=False)
    df['tar'] = df['tar'].str.replace('DAT+I', 'DAT_I', regex=False)
    df['tar'] = df['tar'].str.replace('DAT0B', 'DAT_B', regex=False)
    df['tar'] = df['tar'].str.replace('DAT_II', 'DAT_I', regex=False)
    df['tar'] = df['tar'].str.replace(',MAT_B', 'MAT_B', regex=False)
    df['tar'] = df['tar'].str.replace('MAT)B', 'MAT_B', regex=False)
    df['tar'] = df['tar'].str.replace('MAT+I', 'MAT_I', regex=False)
    df['tar'] = df['tar'].str.replace('TIOM_I', 'TIM_I', regex=False)
    df['tar'] = df['tar'].str.replace('WRL_I', 'WRK_I', regex=False)
    # Fix Tags
    df['tar'] = df['tar'].str.replace("(삭제)", "-", regex=False)
    df['tar'] = df['tar'].str.replace("TIM_b", "TIM_B", regex=False)
    df = df.fillna("-")
    df['src'] = df['src'].astype(str)
    df['tar'] = df['tar'].astype(str)
    df['src'] = df['src'].str.replace(r'[^ㄱ-ㅣ가-힣0-9a-zA-Z.]+', "", regex=True)
    return df

In [5]:
df_1 = preprocess(df_1)
df_2 = preprocess(df_2)
df_4 = preprocess(df_4)

In [6]:
# df_1[:50] # start_num 1
# df_2[:50] # start_num 1
# df_4[:50] # start_num 0

In [7]:
# df_1.isnull().any(), df_2.isnull().any(), df_4.isnull().any() # not found

## Reformat / Tokenization / Integrity Check

In [8]:
train = pd.concat([df_1, df_2, df_4])

In [9]:
label = train['tar'].unique().tolist()
label.sort()
label = ['[PAD]'] +label
label_dict = {word:i for i, word in enumerate(label)}
# label_dict.update({"[PAD]":len(label_dict)})
index_to_ner = {i:j for j, i in label_dict.items()}

num_labels = len(label)

print(label_dict), print(), print(index_to_ner), print(), print(label), print(), print(f'num_labels: {num_labels}')

{'[PAD]': 0, '-': 1, 'AFW_B': 2, 'AFW_I': 3, 'ANM_B': 4, 'ANM_I': 5, 'CVL_B': 6, 'CVL_I': 7, 'DAT_B': 8, 'DAT_I': 9, 'LOC_B': 10, 'LOC_I': 11, 'MAT_B': 12, 'MAT_I': 13, 'NUM_B': 14, 'NUM_I': 15, 'TIM_B': 16, 'TIM_I': 17, 'TRM_B': 18, 'TRM_I': 19, 'WRK_B': 20, 'WRK_I': 21}

{0: '[PAD]', 1: '-', 2: 'AFW_B', 3: 'AFW_I', 4: 'ANM_B', 5: 'ANM_I', 6: 'CVL_B', 7: 'CVL_I', 8: 'DAT_B', 9: 'DAT_I', 10: 'LOC_B', 11: 'LOC_I', 12: 'MAT_B', 13: 'MAT_I', 14: 'NUM_B', 15: 'NUM_I', 16: 'TIM_B', 17: 'TIM_I', 18: 'TRM_B', 19: 'TRM_I', 20: 'WRK_B', 21: 'WRK_I'}

['[PAD]', '-', 'AFW_B', 'AFW_I', 'ANM_B', 'ANM_I', 'CVL_B', 'CVL_I', 'DAT_B', 'DAT_I', 'LOC_B', 'LOC_I', 'MAT_B', 'MAT_I', 'NUM_B', 'NUM_I', 'TIM_B', 'TIM_I', 'TRM_B', 'TRM_I', 'WRK_B', 'WRK_I']

num_labels: 22


(None, None, None, None, None, None, None)

In [10]:
df_1_list = [list(x) for x in df_1[['index', 'src', 'tar']].to_numpy()] # start_num 1
df_2_list = [list(x) for x in df_2[['index', 'src', 'tar']].to_numpy()] # start_num 1
df_4_list = [list(x) for x in df_4[['index', 'src', 'tar']].to_numpy()] # start_num 0

In [11]:
def makeTups(data, start_num):
    tups = []
    temp_tup = []
    temp_tup.append(data[0][1:])
    for i, j, k in data: # index src tgt
        if i != start_num:
            temp_tup.append([j,k])
        if i == start_num:
            if len(temp_tup) != 0:
                tups.append(temp_tup)
                temp_tup = []
                temp_tup.append([j,k])

    tups[0].pop(0)
    return tups

In [12]:
tups_1 = makeTups(df_1_list, 1)
tups_2 = makeTups(df_2_list, 1)
tups_4 = makeTups(df_4_list, 0)
print(len(tups_1), len(tups_2), len(tups_4))

2236 1299 1028


In [13]:
tups = tups_1 + tups_2 + tups_4
print(len(tups))

4563


### Check If Empty

In [14]:
# check and remove if there is any empty row

empties = []
for i, row in enumerate(tups):
    if len(row) < 2 and row[0][0] == '':
        # print(i, row)
        empties.append(i)

empties.sort(reverse=True)

print(f'{len(empties)} empty elements')

for idx in empties:
    tups.pop(idx)

for i, row in enumerate(tups):
    if len(row) < 2 and row[0][0] == '':
        print(i, row)

len(tups)

16 empty elements


4547

### Load Tokenizer and Check Special Tokens

In [15]:
tokenizer = ElectraTokenizer.from_pretrained('./tokenizer/')

In [16]:
tokenizer.pad_token_id

0

In [17]:
init_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
init_token, pad_token, unk_token, sep_token

('[CLS]', '[PAD]', '[UNK]', '[SEP]')

In [18]:
sentences = []
targets = []
for tup in tups:
    sentence = []
    target = []
    # sentence.append(init_token)
    # target.append(label_dict['-'])
    for i, j in tup:
        sentence.append(i)
        target.append(j)
    # sentence.append(sep_token)
    # target.append(label_dict['-'])
    sentences.append(sentence)
    targets.append(target)

In [19]:
k = random.randrange(len(sentences))
print(f'random inddex k is {k}')
print(len(sentences)), print(len(targets))
print(len(sentences[k])), print(len(targets[k]))
print(sentences[k]), print(targets[k])

random inddex k is 3478
4547
4547
24
24
['2022년', '2월', '4일', '오후', '4시경', '용인의', '반도체', '장비', '제조업체에서', '작업자가', '기계에', '목부분이', '끼이는', '사고가', '발생하였다.', '이', '사고로', '작업자는', '의식', '불명인', '상태로', '병원으로', '이송이', '되었다.']
['DAT_B', 'DAT_I', 'DAT_I', 'TIM_B', 'TIM_I', 'LOC_B', 'LOC_I', 'LOC_I', 'LOC_I', 'CVL_B', '-', 'ANM_B', '-', '-', '-', '-', '-', 'CVL_B', '-', '-', '-', 'LOC_B', '-', '-']


(None, None)

In [20]:
for sent, tgt in zip(sentences, targets):
    if len(sent) != len(tgt):
        print(sent)
        print(tgt)

### Tokenization and Label Matching

In [21]:
# tokenized_sentence_ = []
# labels_ = []

# word_ = '우리나라만세만세만세'
# label_ = 'WRK_B'

# tokenized_word_ = tokenizer.tokenize(word_)
# n_subwords = len(tokenized_word_)

# tokenized_sentence_.extend(tokenized_word_)
# if label_[-1] == 'B':
#     tail = list(label_)
#     tail[-1] = 'I'
#     tail = ''.join(tail)
#     labels_.extend([label_] + [tail]*(n_subwords-1))
# else:
#     labels_.extend([label_] * n_subwords)

# tokenized_sentence_, labels_

In [22]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []
    for word, label in zip(sentence, text_labels):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)

        if label[-1] == 'B' and n_subwords > 1:
            tail = list(label)
            tail[-1] = 'I'
            tail = ''.join(tail)
            labels.extend([label] + [tail]*(n_subwords-1))
        else:
            labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [23]:
tokenized_texts_and_labels = [
                              tokenize_and_preserve_labels(sent, labs)
                              for sent, labs in zip(sentences, targets)]

In [24]:
# # [(문장, 개체들), (문장, 개체들),...] 형식으로 저장되어 있음.
# k = random.randrange(len(tokenized_texts_and_labels))
# print(f'random inddex k is {k}')
# print(len(tokenized_texts_and_labels)), print(len(tokenized_texts_and_labels))
# print(len(tokenized_texts_and_labels[k][0])), print(len(tokenized_texts_and_labels[k][1]))
# print(tokenized_texts_and_labels[k][0]), print(), print(tokenized_texts_and_labels[k][1])

In [25]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [26]:
# k = random.randrange(len(tokenized_texts))
# print(f'random inddex k is {k}')
# print(len(tokenized_texts)), print(len(labels))
# print(len(tokenized_texts[k])), print(len(labels[k])), print()
# print(tokenized_texts[k]), print(labels[k])
# for tok, lab in zip(tokenized_texts[k], labels[k]):
#     print(tok, '\t',lab)

In [27]:
error_count = 0
for txt, lab in zip(tokenized_texts, labels):
    if len(txt) != len(lab):
        # print(txt)
        # print(lab)
        error_count += 1
print(error_count)

0


In [28]:
lengths = []
for tokens in tokenized_texts:
    lengths.append(len(tokens))

f'the longest is {max(lengths)}'

'the longest is 241'

In [29]:
tokenized_texts_list, labels_list = tokenized_texts, labels

### Add Special Tokens and Pad Inputs

In [30]:
# print(len(label_dict)), print(len(index_to_ner))
# print(label_dict)
# print(index_to_ner)
print(label)

['[PAD]', '-', 'AFW_B', 'AFW_I', 'ANM_B', 'ANM_I', 'CVL_B', 'CVL_I', 'DAT_B', 'DAT_I', 'LOC_B', 'LOC_I', 'MAT_B', 'MAT_I', 'NUM_B', 'NUM_I', 'TIM_B', 'TIM_I', 'TRM_B', 'TRM_I', 'WRK_B', 'WRK_I']


In [31]:
def make_inputs(tokenized_texts_list, labels_list, max_length):
    max_length = max_length
    input_tokens = []
    input_mask = []
    input_labels = []
    num = len(tokenized_texts_list)
    for i in range(num):
        data = tokenized_texts_list[i]
        labels = labels_list[i]
        tmp_tokens = [init_token] + data
        tmp_labels = ['-'] + labels
        if len(tmp_tokens) < max_length-1:
            tmp_tokens += [sep_token]
            tmp_labels += ['-']
            tmp_mask = [1] * len(tmp_tokens)
            for _ in range(max_length-len(tmp_tokens)):
                tmp_tokens.append(pad_token)
                tmp_labels.append('[PAD]')
                tmp_mask.append(0)
            input_tokens.append(tmp_tokens)
            input_labels.append(tmp_labels)
            input_mask.append(tmp_mask)
        elif len(tmp_tokens) > max_length-1:
            tmp_tokens = tmp_tokens[:max_length-1] + [sep_token]
            tmp_labels = tmp_labels[:max_length-1] + ['-']
            tmp_mask = [1] * max_length
            input_tokens.append(tmp_tokens)
            input_labels.append(tmp_labels)
            input_mask.append(tmp_mask)
        elif len(tmp_tokens) == max_length-1:
            tmp_tokens += [sep_token]
            tmp_labels += [label_dict['-']]
            tmp_mask = [1] * max_length
            input_tokens.append(tmp_tokens)
            input_labels.append(tmp_labels)
            input_mask.append(tmp_mask)

    return input_tokens, input_mask, input_labels

In [32]:
max_length = 256
input_tokens, input_mask, input_labels = make_inputs(tokenized_texts, labels, max_length)

In [33]:
print(f'length of input_tokens: {len(input_tokens)}')
print(f'length of input_labels: {len(input_labels)}')
print(f'length of input_mask: {len(input_mask)}')

k = random.randrange(len(input_tokens))
print(f'random inddex k is {k}')
print(len(input_tokens[k])), print(len(input_labels[k])), print(len(input_mask[k]))
print(), print(input_tokens[k]), print(), print(input_labels[k]), print(), print(input_mask[k])

length of input_tokens: 4547
length of input_labels: 4547
length of input_mask: 4547
random inddex k is 1327
256
256
256

['[CLS]', '2021', '##년', '3', '##월', '8', '##일', '오전', '9', '##시', '4', '0', '##분', '##경', '창원', '##의', '한', '사업장', '##에', '##서', '원자로', '설비', '부품', '100', '##토', '##을', '크레인', '을', '이용해', '트레일러', '##에', '싣', '##는', '작업', '을', '하', '##다가', '부품', '##에', '깔리', '##는', '사고', '##가', '발생', '##하', '##였', '##다', '.', '재해자', '인', '운전자', '는', '미끄럼', '방지', '나무', '깔', '##판', '##을', '이', '동', '시키', '##기', '위해', '상체', '##를', '부품', '##과', '트레일러', '사이', '##에', '넣', '##었', '##다가', '부품', '##이', '움직여', '협착', '된', '것', '##으로', '알려졌', '##다', '.', '병원', '##으로', '후송', '##되', '##었', '##으나', '결국', '사망', '##하', '##였', '##다', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD

(None, None, None, None, None, None)

### Check Lengths

In [34]:
temp = 0
num = len(input_tokens)
for i in range(num):
    curr_tokens = input_tokens[i]
    curr_label = input_labels[i]
    curr_mask = input_mask[i]
    
    count_tokens, count_label, count_mask = 0,0,0
    for count, token in enumerate(curr_tokens):
        if token == '[PAD]':
            break
    count_tokens = count
    for count, label in enumerate(curr_label):
        if label == '[PAD]':
            break
    count_label = count
    for count, mask in enumerate(curr_mask):
        if mask == 0:
            break
    count_mask = count
    if not (count_tokens == count_label == count_mask):
        # print(count_tokens, count_label, count_mask)
        temp += 1
print(temp)

0


### Pair Inputs and Split to Ratio

In [35]:
input_pair = []
num = len(input_tokens)
for i in range(num):
    tmp_token_idx = input_tokens[i]
    tmp_mask = input_mask[i]
    tmp_label_idx = input_labels[i]
    input_pair.append((tmp_token_idx, tmp_mask, tmp_label_idx))

In [36]:
# k = random.randrange(len(input_tokens))
# print(input_pair[k])

데이터셋을 8:2 비율로 훈련 데이터셋과 검증 데이터셋으로 분할합니다. 데이터셋의 양이 적어 테스트셋은 생성하지 않았습니다.

In [37]:
train, valid = train_test_split(input_pair, test_size=0.2)

In [38]:
print(len(train)), print(len(valid))
# print(train[0]), print(valid[0])

3637
910


(None, None)

In [39]:
# temp = []
# for e in train[0][0]:
#     encoded = tokenizer.convert_tokens_to_ids(str(e))
#     temp.append(encoded)
# print(temp)

### Delcare Dataset (Iterator)

pytorch에서 데이터를 받아들일 수 있도록 커스텀 데이터셋을 생성합니다. 이 때 문장을 구성하는 토큰들을 고유한 인덱스 번호로 변환합니다. 마찬가지로 레이블도 인덱스 번호로 변환합니다.

In [40]:
class CustomDataset(Dataset): 
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
    def __len__(self): 
        return len(self.data)

    def __getitem__(self, idx):
        tokens = self.data[idx][0]
        masks = self.data[idx][1]
        labels = self.data[idx][2]
        token_ids = []
        for token in tokens:
            ids = tokenizer.convert_tokens_to_ids(str(token))
            token_ids.append(ids)
        label_ids = []
        for label in labels:
            label_ids.append(label_dict[label])
                
        return (torch.LongTensor(token_ids), torch.LongTensor(masks), torch.LongTensor(label_ids))

In [41]:
trainset = CustomDataset(train, tokenizer)
validset = CustomDataset(valid, tokenizer)

배치 크기가 128인 데이터로더를 생성합니다.

In [42]:
train_loader = DataLoader(trainset, batch_size = 128, shuffle = True)
valid_loader = DataLoader(validset, batch_size = 128, shuffle = True)

In [43]:
for i, el in enumerate(train_loader):
    print(len(el[0][0]))
    print(len(el[1][0]))
    print(len(el[2][0]))
    # print(el[0][0])
    # print(el[1][0])
    # print(el[2][0])
    break

256
256
256


# Create Model / Training / Evaluation / Inference

모델을 생성합니다.

In [44]:
class BERTPoSTagger(nn.Module):
    def __init__(self,
                 bert,
                 output_dim, 
                 dropout):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.fc = nn.Linear(embedding_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, mask):
        
        #text = [batch size, sent len]
        embedded = self.dropout(self.bert(text, mask)[0])
        #embedded = [batch size, seq len, emb dim]
        
        predictions = self.fc(self.dropout(embedded))
        
        
        return predictions

In [45]:
bert = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
print(len(tokenizer))

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


36223


In [46]:
OUTPUT_DIM = num_labels
DROPOUT = 0.25

model = BERTPoSTagger(bert,
                      OUTPUT_DIM, 
                      DROPOUT)
model.bert.resize_token_embeddings(len(tokenizer))

Embedding(36223, 768)

In [47]:
model.bert.config.to_dict()['hidden_size']

768

In [48]:
bert_result = model.bert(trainset[0][0].unsqueeze(0), trainset[0][1].unsqueeze(0))
bert_result.last_hidden_state.shape

torch.Size([1, 256, 768])

In [49]:
tagger_result = model(trainset[0][0].unsqueeze(0), trainset[0][1].unsqueeze(0))
tagger_result.shape

torch.Size([1, 256, 22])

모델을 학습시키기 위한 옵티마이저와 목적함수(cross entropy)를 호출합니다.

In [50]:
LEARNING_RATE = 5e-5

optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [51]:
criterion = nn.CrossEntropyLoss(ignore_index = 0)

GPU 연산을 위해 cuda를 호출합니다. 이 때 복수의 연산장치(GPU)가 있을 경우 모든 연산장치를 활용하여 연산의 속도를 높이도록 설정합니다.

In [52]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

NGPU = torch.cuda.device_count()
if NGPU > 1:
    model = torch.nn.DataParallel(model, device_ids=list(range(NGPU)))
    # torch.multiprocessing.set_start_method('spawn', force=True)
model = model.to(device)
criterion = criterion.to(device)

정확도를 계산하기 위한 함수를 생성합니다.

In [53]:
def categorical_accuracy(preds, y, tag_pad_idx):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = -1, keepdim = True) # get the index of the max probability
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]]).to(device)

F1 Macro and F1 Micro

In [54]:
f1_macro = MulticlassF1Score(num_classes=num_labels, average='macro', ignore_index=0).to(device)
f1_micro = MulticlassF1Score(num_classes=num_labels, average='micro', ignore_index=0).to(device)

모델을 훈련시키기 위한 함수입니다.

In [55]:
def train(model, iterator, optimizer, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        text = batch[0].to(device)
        mask = batch[1].to(device)
        tags = batch[2].to(device)

        optimizer.zero_grad()
        
        #text = [sent len, batch size]
        
        # print(text)
        # print()
        # print(mask)
        # print()
        # print(tags)
                
        predictions = model(text, mask)

        # print()
        # print(predictions)
        
        #predictions = [sent len, batch size, output dim]
        #tags = [sent len, batch size]
        # torch.Size([32, 256, 40])
        predictions = predictions.view(-1, predictions.shape[-1]) # shape(batchsize, length, ouputdim) --> shape(batchsize, length*outputdim)
        # torch.Size([8192, 40])
        tags = tags.view(-1)
        
        #predictions = [sent len * batch size, output dim]
        #tags = [sent len * batch size]
        
        loss = criterion(predictions, tags)
                
        acc = categorical_accuracy(predictions, tags, tag_pad_idx)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

모델의 성능을 평가하기 위한 함수입니다.

In [56]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_f1_mac = 0
    epoch_f1_mic = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text = batch[0].to(device)
            mask = batch[1].to(device)
            tags = batch[2].to(device)
            
            predictions = model(text, mask)
            
            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)
            
            loss = criterion(predictions, tags)
            
            acc = categorical_accuracy(predictions, tags, tag_pad_idx)

            f1_mac = f1_macro(predictions, tags)
            f1_mic = f1_micro(predictions, tags)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_f1_mac += f1_mac
            epoch_f1_mic += f1_mic
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_f1_mac / len(iterator), epoch_f1_mic / len(iterator)

에폭마다 걸리는 시간을 계산하기 위한 함수입니다.

In [57]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

실질적으로 모델을 학습시킵니다. 에폭마다 검증데이터셋에서 성능을 평가하여 검증데이터셋에서 가장 좋은 성능을 보여줬을 때의 모델의 weight를 저장합니다.

In [58]:
N_EPOCHS = 15

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, 0)
    valid_loss, valid_acc, f1_mac, f1_mic = evaluate(model, valid_loader, criterion, 0)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print(f'\t Val. F1 Mac: {f1_mac:.2f} |  Val. F1 Mic: {f1_mic:.2f}')

Epoch: 01 | Epoch Time: 0m 59s
	Train Loss: 1.968 | Train Acc: 50.09%
	 Val. Loss: 1.203 |  Val. Acc: 69.46%
	 Val. F1 Mac: 0.16 |  Val. F1 Mic: 0.69
Epoch: 02 | Epoch Time: 0m 52s
	Train Loss: 1.009 | Train Acc: 74.36%
	 Val. Loss: 0.707 |  Val. Acc: 81.79%
	 Val. F1 Mac: 0.47 |  Val. F1 Mic: 0.82
Epoch: 03 | Epoch Time: 0m 53s
	Train Loss: 0.672 | Train Acc: 82.84%
	 Val. Loss: 0.541 |  Val. Acc: 85.06%
	 Val. F1 Mac: 0.57 |  Val. F1 Mic: 0.85
Epoch: 04 | Epoch Time: 0m 53s
	Train Loss: 0.531 | Train Acc: 85.73%
	 Val. Loss: 0.473 |  Val. Acc: 86.28%
	 Val. F1 Mac: 0.61 |  Val. F1 Mic: 0.86
Epoch: 05 | Epoch Time: 0m 53s
	Train Loss: 0.452 | Train Acc: 87.24%
	 Val. Loss: 0.465 |  Val. Acc: 85.62%
	 Val. F1 Mac: 0.61 |  Val. F1 Mic: 0.86
Epoch: 06 | Epoch Time: 0m 53s
	Train Loss: 0.399 | Train Acc: 88.46%
	 Val. Loss: 0.429 |  Val. Acc: 86.74%
	 Val. F1 Mac: 0.66 |  Val. F1 Mic: 0.87
Epoch: 07 | Epoch Time: 0m 53s
	Train Loss: 0.376 | Train Acc: 89.18%
	 Val. Loss: 0.462 |  Val. Acc

학습된 모델을 이용하여 예시 문장으로 inference 합니다.

In [59]:
model.load_state_dict(torch.load("tut2-model.pt"))
model.eval()
with torch.no_grad():
    sample = '2013년 11월 18일 인천항 4부두에서 컨테이너 작업 중 핸들러 집게가 풀리면서 피해자 끼임. 허리에 경상을 입음'
    words = tokenizer.tokenize(sample)
    tokens = tokenizer(sample)["input_ids"]
    mask = tokenizer(sample)['attention_mask']
    tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
    tensor_mask = torch.LongTensor(mask).unsqueeze(0).to(device)
    logit = model(tensor, tensor_mask)
    pred = torch.argmax(logit, dim=-1).squeeze().detach().cpu().tolist()
    for i in range(len(words)):
        print(words[i], index_to_ner[pred[i+1]])

2013 DAT_B
##년 DAT_I
11 DAT_I
##월 DAT_I
18 DAT_I
##일 DAT_I
인천 LOC_B
항 LOC_I
4 LOC_I
부두 LOC_I
에서 LOC_I
컨테이너 WRK_B
작업 WRK_I
중 WRK_I
핸들러 AFW_B
집게 AFW_I
##가 AFW_I
풀리 -
##면 -
##서 -
피해자 CVL_B
끼 -
##임 -
. -
허리 ANM_B
##에 ANM_I
경상 -
##을 -
입 -
##음 -


검증 데이터셋에서 5%의 데이터를 임의로 추출하여 inference 한 다음 그 결과를 csv파일로 저장합니다.

In [None]:
import random
import numpy as np

num = len(valid)
random_choice = np.random.choice(num, int(num*0.05), replace=False).tolist()
selected_sample = []

for i in random_choice:
    selected_sample.append(valid[i])
    
print(len(selected_sample))

38


In [None]:
model.eval()
with torch.no_grad():
    sample_results = []
    for idx, _ in enumerate(selected_sample):
        item = selected_sample[idx]
        token_ids = list(map(lambda x:tokenizer.convert_tokens_to_ids(x), item[0]))
        input_tensor = torch.LongTensor(token_ids).unsqueeze(0).to(device)
        mask_tensor = torch.LongTensor(item[1]).unsqueeze(0).to(device)
        output = model(input_tensor, mask_tensor)
        pred = torch.argmax(output, dim=-1).squeeze().detach().cpu().tolist()
        sample_results.append(pred)

In [None]:
f = open('sample_result.csv', 'w', encoding='utf-8-sig')
idx_to_label = {0:"pad", 1:"O", 2:"C", 3:"CE", 4:"E"}
for idx, item in enumerate(selected_sample):
    all_tokens = item[0]
    if '[PAD]' not in all_tokens:
        continue
    pad_loc = all_tokens.index('[PAD]')
    tokens = all_tokens[:pad_loc][1:-1]
    labels = item[-1][:pad_loc][1:-1]
    predicts = sample_results[idx][:pad_loc][1:-1]
    length = len(tokens)
    for i in range(length):
        token = tokens[i]
        label = labels[i]
        predict = idx_to_label[predicts[i]]
        f.write(token+','+label+','+predict+'\n')
    f.write('\n')
f.close()

In [None]:
df = pd.read_csv('./8-9_cause_effec.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: './8-9_cause_effec.csv'

In [None]:
num = len(df)
idx_to_label = {0:"pad", 1:"O", 2:"C", 3:"CE", 4:"E"}
result = []
for i in range(num):
    sentence = df.iloc[i,0]
    tokenizer_result = tokenizer.encode_plus(sentence)
    tokens = tokenizer.tokenize(sentence)
    input_ids = tokenizer_result["input_ids"]
    mask = tokenizer_result["attention_mask"]
    input_tensor = torch.LongTensor(input_ids).unsqueeze(0).to(device)
    mask_tensor = torch.LongTensor(mask).unsqueeze(0).to(device)
    output = model(input_tensor, mask_tensor)
    pred = torch.argmax(output, dim=-1).squeeze().detach().cpu().tolist()
    pred_label = list(map(lambda x:idx_to_label[x], pred))
    result.append([tokens, pred_label[1:-1]])
    

In [None]:
f = open('./8-9_cause_effec_result.csv', 'w', encoding='utf-8-sig')
for item in result:
    length = len(item[0])
    for i in range(length):
        f.write(item[0][i]+','+item[1][i]+'\n')
    f.write('\n')
f.close()