In [268]:
import os
import pandas as pd
from transformers import ElectraModel, ElectraTokenizer

## Load Tokenizer

In [269]:
# Checkpoint name handed to ElectraTokenizer.from_pretrained below.
ckpt = 'monologg/koelectra-base-v3-discriminator'

# Tokenizer for that checkpoint; tokenize_and_preserve_labels later calls
# tokenizer.tokenize(word) to break each word into subword pieces.
tokenizer = ElectraTokenizer.from_pretrained(ckpt)

In [275]:
# Scratch check: sorted() returns the items of lst in ascending order.
lst = ['b','a','c']
sorted(lst)

['a', 'b', 'c']

## Load Data

In [284]:
# Prompt (in Korean) for the name of the folder that holds the Excel files.
# NOTE(review): the following cell assigns DATA_PATH = 'data' unconditionally,
# so whatever is typed here is overwritten before it is used - confirm which
# of the two is intended.
DATA_PATH = input('엑셀 파일이 있는 폴더명을 입력하세요')
# Status message (Korean): named-entity-recognition data is being generated.
print('개체명 인식 데이터를 생성합니다.')

In [281]:
DATA_PATH = 'data'

# Collect the workbook paths in a stable (sorted) order. Directories, hidden
# entries (e.g. ".ipynb_checkpoints") and Office lock files ("~$...") are
# skipped: pd.read_excel cannot open them and one stray entry would abort the
# whole load.
sheets = sorted(
    path
    for path in (
        os.path.join(DATA_PATH, name)
        for name in os.listdir(DATA_PATH)
        if not name.startswith(('.', '~$'))
    )
    if os.path.isfile(path)
)

# Read every workbook exactly once and then rename its columns positionally
# ("Col 0", "Col 1", ...). This yields the same frames as passing a full-length
# names= list to read_excel, without parsing each file a second time just to
# count its columns.
dfs = []
for sheet in sheets:
    df = pd.read_excel(sheet)
    df.columns = [f'Col {i}' for i in range(len(df.columns))]
    dfs.append(df)
col_nums = [len(df.columns) for df in dfs]

# One list of row-lists per workbook.
df2lst = [df.values.tolist() for df in dfs]

# Concatenate the rows of all workbooks into a single list of rows.
raw_data = [row for rows in df2lst for row in rows]
len(raw_data)

638

In [282]:
# Explicit paths of the two sample workbooks.
sheet_1 = 'data/eshc 인과관계 학습용(sample)_rev2.xlsx'
sheet_2 = 'data/port 인과관계 학습용(sample)_rev4.xlsx'

# Load each workbook and name its columns positionally ("Col 0", "Col 1", ...).
# The column count is taken from the loaded frame instead of being hard-coded
# (previously 10 and 9), so a sheet that gains or loses a column still loads
# with every column kept as data.
data_1 = pd.read_excel(sheet_1)
data_1.columns = [f'Col {i}' for i in range(len(data_1.columns))]
data_2 = pd.read_excel(sheet_2)
data_2.columns = [f'Col {i}' for i in range(len(data_2.columns))]

# Rows of both workbooks as one list of row-lists.
raw_data = data_1.values.tolist() + data_2.values.tolist()
len(raw_data)
# raw_data = [[el for el in lst if type(el) == str] for lst in raw_data]

638

## Delete Duplicate Objects and Split Sentences

In [214]:
# For every raw row keep only its string cells, stripped of surrounding
# whitespace, dropping repeats while preserving first-seen order.
dups_removed = []
for lst in raw_data:
    temp = []
    for el in lst:
        # Non-string cells (e.g. NaN for empty spreadsheet cells) are ignored.
        if not isinstance(el, str):
            continue
        # Strip BEFORE the membership test: the stripped text is what gets
        # stored, so comparing the unstripped text let "abc" and " abc " both
        # through as duplicates.
        el = el.strip()
        # Whitespace-only cells carry no tokens; keeping them as '' would
        # produce empty markers downstream.
        if el and el not in temp:
            temp.append(el)
    dups_removed.append(temp)

In [215]:
# Whitespace-split every kept string: tokens_lst holds, per row, one list of
# words for each string of that row.
tokens_lst = []
for lst in dups_removed:
    tokens_lst.append([el.split() for el in lst])

## Generate Labels

In [216]:
# Build a label list parallel to every token list.
# The first token list of a sample gets all 'O'; every following token list
# gets 'E_B' for its first token and 'E_I' for the rest.
labels_lst = []
for sample in tokens_lst:
    labels4sample = []
    for idx, tokens in enumerate(sample):
        n_tokens = len(tokens)
        if idx:
            labels = ['E_B'] + ['E_I'] * (n_tokens - 1)
        else:
            labels = ['O'] * n_tokens
        labels4sample.append(labels)
    labels_lst.append(labels4sample)

In [217]:
# Eyeball the first sample: print each token list, then each label list,
# prefixed with its length.
test_tokens = tokens_lst[0]
test_labels = labels_lst[0]

for group in (test_tokens, test_labels):
    for el in group:
        print(len(el), el)

16 ['남양주', '시설', '공사', '현장에서', '철골', '기동의', '수직도를', '맞추는', '작업', '중', '레버풀러의', '체인이', '끊어지며', '튕겨나온', '레버풀러에', '맞음']
3 ['수직도', '맞추는', '작업']
2 ['체인', '끊어지며']
2 ['레버풀러에', '맞음']
16 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
3 ['E_B', 'E_I', 'E_I']
2 ['E_B', 'E_I']
2 ['E_B', 'E_I']


##### 원문의 라벨은 현재 O로 통일되어 있습니다. 객체의 패턴이 원문에서 발견되면 상응하도록 원문의 라벨을 수정합니다.

In [218]:
# For every sample, find where each marker (token lists 1..n) occurs as a
# contiguous run inside the source sentence (token list 0) and record the
# half-open [start, end) token span of every occurrence.
position_pairs_lst = []
for tokens in tokens_lst:
    source = tokens[0]
    len_source = len(source)
    # Skip empty markers: an empty list equals the empty slice at every index,
    # which would record a zero-length span for every source position.
    markers = [marker for marker in tokens[1:] if marker]

    position_pairs = []
    for idx in range(len_source):
        for marker in markers:
            len_marker = len(marker)
            # A slice that runs past the end is shorter than the marker and
            # therefore never compares equal, so no bounds check is needed.
            if source[idx:idx+len_marker] == marker:
                position_pairs.append([idx, idx+len_marker])

    position_pairs_lst.append(position_pairs)

    # print(position_pairs)

In [219]:
# Sanity check: one position-pair list was appended per entry of tokens_lst,
# so both lengths should match.
len(position_pairs_lst), len(tokens_lst)

(638, 638)

In [220]:
# for tokens, entity_position in zip(tokens_lst, position_pairs_lst):
#     print(tokens)
#     for pair in entity_position:
#         print(tokens[0][pair[0]:pair[1]])

In [221]:
# Write the matched spans back into the label list of each source sentence:
# the span's first token becomes 'E_B', any remaining tokens become 'E_I'.
# source_labels is the same list object as labels[0], so labels_lst is
# updated in place.
for labels, position_pairs in zip(labels_lst, position_pairs_lst):
    source_labels = labels[0]
    for pair in position_pairs:
        start, end = pair
        span_len = end - start
        source_labels[start] = 'E_B'
        if span_len > 1:
            source_labels[start+1:end] = ['E_I'] * (span_len - 1)

In [222]:
# Inspect one sample by index: its token lists, the matched spans, and the
# resulting label lists, separated by blank lines.
idx = 555
test_position_pairs = position_pairs_lst[idx]
test_tokens = tokens_lst[idx]
test_labels = labels_lst[idx]

for el in test_tokens:
    print(len(el), el)
print()

for el in test_position_pairs:
    print(el)
print()

for el in test_labels:
    print(len(el), el)

28 ['2015년', '3월', '15일(일)', '20시35분경', '경북', '포항시', '소재', 'OO부두에서', '수출용', '파이프를', '선적하기', '위하여', '피재자가', '선창에', '파이프적재', '받침목을', '깔고', '내려오던', '중', '발이', '미끄러지면서', '선체', '벽과', '파이프', '사이의', '개구부로', '떨어져', '사망']
3 ['받침목', '깔고', '내려오던']
2 ['발이', '미끄러져']
1 ['떨어져']
1 ['사망']

[26, 27]
[27, 28]

28 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'E_B', 'E_B']
3 ['E_B', 'E_I', 'E_I']
2 ['E_B', 'E_I']
1 ['E_B']
1 ['E_B']


In [223]:
# Remove one nesting level: afterwards tokens_lst / labels_lst hold one entry
# per sentence (source or marker) instead of one entry per sample.
# NOTE: both names are rebound to the flattened result, so running this cell a
# second time flattens again.
flat_tokens = []
for tokens in tokens_lst:
    flat_tokens.extend(tokens)

flat_labels = []
for labels in labels_lst:
    flat_labels.extend(labels)

tokens_lst, labels_lst = flat_tokens, flat_labels

In [224]:
def tokenize_and_preserve_labels(tokens_lst, labels_lst):
    """Split words into subword pieces and expand the labels to match.

    Each word is passed to the module-level ``tokenizer.tokenize``. The word's
    label is given to its first piece; the remaining pieces get the same label,
    except that a label ending in ``'B'`` is continued with its last character
    replaced by ``'I'`` (e.g. ``'E_B'`` -> ``'E_I'``).

    Args:
        tokens_lst: Words of one sentence.
        labels_lst: One label string per word, paired with ``tokens_lst`` by
            position (pairing stops at the shorter of the two).

    Returns:
        A ``(pieces, labels)`` tuple of two equally long lists.
    """
    subword_tokens = []
    subword_labels = []

    for word, label in zip(tokens_lst, labels_lst):
        pieces = tokenizer.tokenize(word)
        subword_tokens += pieces

        # Label used for every piece after the first one.
        follow_label = label[:-1] + 'I' if label[-1] == 'B' else label

        if pieces:
            subword_labels.append(label)
            subword_labels += [follow_label] * (len(pieces) - 1)

    return subword_tokens, subword_labels

In [225]:
# Run the subword tokenization for every sentence; each entry is a two-item
# list [subword_tokens, subword_labels].
tokenized_texts_and_labels = []
for words, labs in zip(tokens_lst, labels_lst):
    tokenized_texts_and_labels.append(list(tokenize_and_preserve_labels(words, labs)))

In [226]:
# Terminate every sentence with a None token and a None label (presumably a
# sentence separator row in the exported table - confirm with the consumer).
# NOTE: the lists are extended in place, so re-running this cell adds another
# None to each of them.
for tokens, labels in tokenized_texts_and_labels:
    tokens += [None]
    labels += [None]

In [227]:
# for item in tokenized_texts_and_labels:
#     if len(item[0]) != len(item[1]):
#         print(item)

In [228]:
# for tokens, labels in tokenized_texts_and_labels:
#     print(tokens)
#     print(labels)
#     print()

In [229]:
# Concatenate all sentences into two parallel columns and wrap them in a
# DataFrame with one row per subword token.
data = {'tokens': [], 'labels': []}

for tokens, labels in tokenized_texts_and_labels:
    data['tokens'] += tokens
    data['labels'] += labels

data_df = pd.DataFrame(data)

In [240]:
# data_df.iloc[200:250, :]

# Create the output folder first: to_csv does not create missing parent
# directories and would fail on a fresh checkout.
os.makedirs('./ner_data_gen_results', exist_ok=True)
# index=False leaves the row index out of the file; 'utf-8-sig' prefixes the
# file with a UTF-8 byte-order mark (presumably so spreadsheet tools detect
# the encoding of the Korean text - confirm).
data_df.to_csv('./ner_data_gen_results/ner_data.csv', index=False, encoding='utf-8-sig')

In [287]:
# Read the exported CSV back from disk and show its first 50 rows as a check
# of what was written.
pd.read_csv('./ner_data_gen_results/ner_data.csv').head(50)

Unnamed: 0,tokens,labels
0,남양주,O
1,시설,O
2,공사,O
3,현장,O
4,##에,O
5,##서,O
6,철,O
7,##골,O
8,기동,O
9,##의,O
