In [136]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from torchmetrics.classification import MulticlassF1Score

from transformers import ElectraModel, ElectraTokenizer

from sklearn.model_selection import train_test_split

import os
import random
import time
import pandas as pd

## Loading

In [137]:
file_list = os.listdir('./inference_data')
file_path = []
for file_name in file_list:
    tmp_path = os.path.join('./inference_data', file_name)
    file_path.append(tmp_path)
file_path.sort()

In [138]:
load_df = lambda x: pd.read_csv(x)
dfs = [load_df(x) for x in file_path]
infer_data = dfs[0]
infer_data.dropna(inplace=True)
# infer_data[pd.isna(infer_data).any(axis=1)]
# infer_data.isna().nunique()
print(f'length of {file_path[0]}: {len(infer_data)}')

length of ./inference_data/event_port.csv: 171


In [139]:
infer_data.isnull().any()

Event    False
dtype: bool

In [140]:
# for i, row in infer_data.iterrows():
#     print(row.Event)
#     print()

## Cleansing

In [141]:
# def preprocess(df):
#     df['Event'] = df['Event'].str.replace("．", ".", regex=False)
#     df['Event'] = df['Event'].astype(str)
#     # df['Event'] = df['Event'].str.replace(r'[^ㄱ-ㅣ가-힣0-9a-zA-Z.]+', " ", regex=True)
#     return df

In [142]:
# infer_data = preprocess(infer_data)

In [143]:
# infer_data[0]

In [144]:
tokenizer = ElectraTokenizer.from_pretrained('./tokenizer/')

In [145]:
tokenizer.pad_token_id

0

In [146]:
init_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
init_token, pad_token, unk_token, sep_token

('[CLS]', '[PAD]', '[UNK]', '[SEP]')

In [147]:
infer_data_lst = infer_data.Event.apply(lambda x: tokenizer(x, max_length=256, padding='max_length', truncation=True))
input_ids_lst = []
input_masks_lst = []
for e in infer_data_lst:
    input_ids_lst.append(e['input_ids'])
    input_masks_lst.append(e['attention_mask'])
len(input_ids_lst), len(input_masks_lst)

(171, 171)

In [149]:
print(input_ids_lst[0], '\n\n', input_masks_lst[0])
print(input_ids_lst[0], '\n\n', input_masks_lst[0])

[2, 7102, 4556, 23, 4501, 6338, 4366, 6224, 30, 7203, 4158, 35137, 14881, 17303, 3130, 36053, 16030, 21211, 4292, 6237, 35045, 3330, 3242, 9214, 35311, 12, 35156, 13, 6215, 24, 2351, 4292, 3027, 4112, 3826, 25, 4071, 4292, 3027, 4031, 8416, 4044, 16030, 20788, 36077, 3232, 2151, 4219, 12282, 4820, 3330, 35037, 2010, 15893, 24542, 4181, 4129, 2682, 4234, 3330, 3014, 4292, 3245, 4219, 2771, 2672, 4047, 16030, 6314, 6800, 4073, 9885, 26509, 6904, 4283, 7087, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 

In [150]:
class BERTPoSTagger(nn.Module):
    def __init__(self,
                 bert,
                 output_dim, 
                 dropout):
        super().__init__()
        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']
        self.fc = nn.Linear(embedding_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self, text, mask):
        #text = [batch size, sent len]
        embedded = self.dropout(self.bert(text, mask)[0])
        #embedded = [batch size, seq len, emb dim]
        predictions = self.fc(self.dropout(embedded))
        return predictions

In [151]:
bert = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
print(len(tokenizer))

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


36223


In [152]:
OUTPUT_DIM = 22
DROPOUT = 0.25
model = BERTPoSTagger(bert, OUTPUT_DIM, DROPOUT)
model.bert.resize_token_embeddings(len(tokenizer))

Embedding(36223, 768)

In [153]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

NGPU = torch.cuda.device_count()
if NGPU > 1:
    model = torch.nn.DataParallel(model, device_ids=list(range(NGPU)))
    # torch.multiprocessing.set_start_method('spawn', force=True)
model = model.to(device)

In [154]:
model.load_state_dict(torch.load("tut2-model.pt"))

<All keys matched successfully>

In [172]:
label2id = {'[PAD]': 0, '-': 1, 'AFW_B': 2, 'AFW_I': 3, 'ANM_B': 4, 'ANM_I': 5, 'CVL_B': 6, 'CVL_I': 7, 'DAT_B': 8, 'DAT_I': 9, 'LOC_B': 10, 'LOC_I': 11, 'MAT_B': 12, 'MAT_I': 13, 'NUM_B': 14, 'NUM_I': 15, 'TIM_B': 16, 'TIM_I': 17, 'TRM_B': 18, 'TRM_I': 19, 'WRK_B': 20, 'WRK_I': 21}
id2label = {0: '[PAD]', 1: '-', 2: 'AFW_B', 3: 'AFW_I', 4: 'ANM_B', 5: 'ANM_I', 6: 'CVL_B', 7: 'CVL_I', 8: 'DAT_B', 9: 'DAT_I', 10: 'LOC_B', 11: 'LOC_I', 12: 'MAT_B', 13: 'MAT_I', 14: 'NUM_B', 15: 'NUM_I', 16: 'TIM_B', 17: 'TIM_I', 18: 'TRM_B', 19: 'TRM_I', 20: 'WRK_B', 21: 'WRK_I'}
labels = ['[PAD]', '-', 'AFW_B', 'AFW_I', 'ANM_B', 'ANM_I', 'CVL_B', 'CVL_I', 'DAT_B', 'DAT_I', 'LOC_B', 'LOC_I', 'MAT_B', 'MAT_I', 'NUM_B', 'NUM_I', 'TIM_B', 'TIM_I', 'TRM_B', 'TRM_I', 'WRK_B', 'WRK_I']

In [169]:
model.eval()
with torch.no_grad():

    # infer_data.Event.apply(lambda x: tokenizer(x, max_length=256, padding='max_length', truncation=True))


    infer_results = []
    for input_ids, input_masks in zip(input_ids_lst, input_masks_lst): 
        # print(input_ids, input_masks)
        input_tensor = torch.LongTensor(input_ids).unsqueeze(0).to(device)
        mask_tensor = torch.LongTensor(input_masks).unsqueeze(0).to(device)
        output = model(input_tensor, mask_tensor)
        pred = torch.argmax(output, dim=-1).squeeze().detach().cpu().tolist()
        infer_results.append(pred)
        break

    
    # for idx, _ in enumerate(selected_sample):
    #     item = selected_sample[idx]
    #     input_ids_lst = list(map(lambda x:tokenizer.convert_tokens_to_ids(x), item[0]))
    #     input_tensor = torch.LongTensor(input_ids_lst).unsqueeze(0).to(device)
    #     mask_tensor = torch.LongTensor(item[1]).unsqueeze(0).to(device)
    #     output = model(input_tensor, mask_tensor)
    #     pred = torch.argmax(output, dim=-1).squeeze().detach().cpu().tolist()
    #     sample_results.append(pred)

In [212]:
tokenizer.convert_tokens_to_string

'[PAD]'

In [219]:
decoded = tokenizer.decode(input_ids_lst[0], skip_special_tokens=True)
subwords = tokenizer.encode(decoded)[1:-1]
subwords = list(map(lambda x: tokenizer.convert_ids_to_tokens(x), subwords))
# subwords = tokenizer.decode(subwords)
print(subwords)


lenn = len(subwords)
print(lenn)


toLabel = lambda x: id2label[x]
tags = []
for result in infer_results:
    result = result[1:lenn]
    for id in result:
        tags.append(toLabel(id))
print(tags)

['2015', '##년', '3', '##월', '15', '##일', '20', ':', '35', '##경', '○○', '에', '선적', '##을', '위해', '인', '(', ')', '에서', '단', '##을', '쌓', '##은', '후', '5', '##단', '##을', '쌓', '##기', '위하', '##여', '을', '깔', '##고', '내려오', '##던', '가', '발이', '미끄러지', '##면', '##서', '몸', '##의', '심', '##을', '잃', '##고', '면', '##과', '사이', '공간', '##에', '추락', '##하여', '사망', '##한', '사례']
57
['DAT_B', 'DAT_I', 'DAT_I', 'DAT_I', 'DAT_I', 'DAT_I', 'TIM_B', 'TIM_I', 'TIM_I', 'TIM_I', 'LOC_B', 'LOC_I', 'LOC_I', 'LOC_I', '-', '-', '-', '-', '-', '-', '-', '-', 'LOC_B', 'LOC_I', 'LOC_I', '-', 'LOC_I', 'LOC_I', 'NUM_B', 'NUM_I', 'NUM_I', '-', '-', '-', 'NUM_B', 'NUM_I', 'NUM_I', '-', '-', '-', '-', '-', '-', '-', 'WRK_I', '-', '-', '-', '-', '-', 'CVL_B', 'CVL_I', 'ANM_B', '-', '-', '-']


In [226]:
# 날짜(DAT), 시간(TIME), 장소(LOC), 작업 정보(WRK)를 추출해서 txt 파일에 저장

def extract(tagtype, tags):
    # if tag does not exists then notify and pass
    # if not '{tag}_I' then stop
    tag_start = f'{tagtype}_B'
    if tag_start in tags:
        b_idx = tags.index(tag_start)
        for i, tag in enumerate(tags[b_idx:]):
            if i > 0 and tag != f'{tagtype}_I':
                # print(tags[b_idx:b_idx+i])
                tagged = subwords[b_idx:b_idx+i]
                tagged = tokenizer.convert_tokens_to_string(tagged)
                print(tagged)
                break        
    else:
        print(f'{tagtype} does not exist.')
        pass

dat = 'DAT'
tim = 'TIM'
loc = 'LOC'
wrk = 'WRK'

tagtype = loc

extract(tagtype, tags)

### source code
# 
# tagtype = dat
# b_idx = tags.index(f'{tagtype}_B')
# for i, tag in enumerate(tags[b_idx:]):
#     if i > 0 and tag != f'{tagtype}_I':
#         print(i)
#         print(tag)
#         print(tags[b_idx:b_idx+i])
#         print(subwords[b_idx:b_idx+i])
#         print(tokenizer.convert_tokens_to_string(subwords[b_idx:b_idx+i]))
#         break

['LOC_B', 'LOC_I', 'LOC_I', 'LOC_I']
○○ 에 선적을


### Pair Inputs and Split to Ratio

In [None]:
input_pair = []
num = len(input_tokens)
for i in range(num):
    tmp_token_idx = input_tokens[i]
    tmp_mask = input_mask[i]
    tmp_label_idx = input_labels[i]
    input_pair.append((tmp_token_idx, tmp_mask, tmp_label_idx))

In [None]:
# k = random.randrange(len(input_tokens))
# print(input_pair[k])

데이터셋을 8:2 비율로 훈련 데이터셋과 검증 데이터셋으로 분할합니다. 데이터셋의 양이 적어 테스트셋은 생성하지 않았습니다.

In [None]:
train, valid = train_test_split(input_pair, test_size=0.2)

In [None]:
print(len(train)), print(len(valid))
# print(train[0]), print(valid[0])

3637
910


(None, None)

In [None]:
# temp = []
# for e in train[0][0]:
#     encoded = tokenizer.convert_tokens_to_ids(str(e))
#     temp.append(encoded)
# print(temp)

### Delcare Dataset (Iterator)

pytorch에서 데이터를 받아들일 수 있도록 커스텀 데이터셋을 생성합니다. 이 때 문장을 구성하는 토큰들을 고유한 인덱스 번호로 변환합니다. 마찬가지로 레이블도 인덱스 번호로 변환합니다.

In [None]:
class CustomDataset(Dataset): 
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
    def __len__(self): 
        return len(self.data)

    def __getitem__(self, idx):
        tokens = self.data[idx][0]
        input_masks_lst = self.data[idx][1]
        labels = self.data[idx][2]
        input_ids_lst = []
        for token in tokens:
            ids = tokenizer.convert_tokens_to_ids(str(token))
            input_ids_lst.append(ids)
        label_ids = []
        for label in labels:
            label_ids.append(label_dict[label])
                
        return (torch.LongTensor(input_ids_lst), torch.LongTensor(input_masks_lst), torch.LongTensor(label_ids))

In [None]:
trainset = CustomDataset(train, tokenizer)
validset = CustomDataset(valid, tokenizer)

배치 크기가 128인 데이터로더를 생성합니다.

In [None]:
train_loader = DataLoader(trainset, batch_size = 128, shuffle = True)
valid_loader = DataLoader(validset, batch_size = 128, shuffle = True)

In [None]:
for i, el in enumerate(train_loader):
    print(len(el[0][0]))
    print(len(el[1][0]))
    print(len(el[2][0]))
    # print(el[0][0])
    # print(el[1][0])
    # print(el[2][0])
    break

256
256
256
