In [63]:
import pandas as pd
from transformers import ElectraModel, ElectraTokenizer

In [64]:
# Checkpoint name handed to `from_pretrained` below; it looks like a Hugging Face
# Hub model id (KoELECTRA base v3 discriminator) -- TODO confirm it is not a local path.
ckpt = 'monologg/koelectra-base-v3-discriminator'

# Load the tokenizer that belongs to the checkpoint. In the visible part of this
# notebook it is only referenced by the commented-out subword-tokenization option.
tokenizer = ElectraTokenizer.from_pretrained(ckpt)

In [65]:
def _placeholder_columns(count):
    """Return the generic headers 'Col 0', 'Col 1', ... for `count` columns."""
    return [f'Col {i}' for i in range(count)]


# The two sample workbooks are read with 10 and 9 generic column names respectively.
sheet_1 = 'data/eshc 인과관계 학습용(sample)_rev2.xlsx'
sheet_2 = 'data/port 인과관계 학습용(sample)_rev4.xlsx'

data_1 = pd.read_excel(sheet_1, names=_placeholder_columns(10))
data_2 = pd.read_excel(sheet_2, names=_placeholder_columns(9))

In [66]:
# Pool the rows of both sheets into one list of plain Python lists.
raw_data = [*data_1.values.tolist(), *data_2.values.tolist()]

# Per row: keep only the cells whose type is exactly `str` (anything else, e.g.
# presumably NaN from empty spreadsheet cells, is dropped) and remove repeated
# strings while preserving first-seen order (dict keys are insertion-ordered).
dups_removed = [
    list(dict.fromkeys(cell for cell in row if type(cell) is str))
    for row in raw_data
]

In [67]:
def _labels_for(tokens, is_source):
    """Return one tag per token for a single tokenized text.

    The source sentence (first text of a sample) is tagged 'O' throughout.
    Every other text is tagged 'E_B' on its first token and 'E_I' on the rest.
    A text with no tokens gets an empty tag list, so that tags and tokens
    always have the same length.
    """
    if is_source:
        return ['O'] * len(tokens)
    if not tokens:
        # A whitespace-only cell splits into zero tokens; emitting a lone
        # 'E_B' here would leave a label without a matching token.
        return []
    return ['E_B'] + ['E_I'] * (len(tokens) - 1)


### Tokenization: choose ONE of the two options below.
### subword tokenization
# tokens_lst = [[tokenizer.tokenize(el) for el in lst] for lst in dups_removed]

### split by space
tokens_lst = [[text.split() for text in texts] for texts in dups_removed]

# One label list per token list, in the same nesting as `tokens_lst`.
labels_lst = [
    [_labels_for(tokens, is_source=(position == 0)) for position, tokens in enumerate(sample)]
    for sample in tokens_lst
]

In [68]:
# Spot-check the first sample: print every token list, then every label list,
# each prefixed with its length so tokens and labels can be compared row by row.
test_tokens, test_labels = tokens_lst[0], labels_lst[0]

for sequence in test_tokens + test_labels:
    print(len(sequence), sequence)

16 ['남양주', '시설', '공사', '현장에서', '철골', '기동의', '수직도를', '맞추는', '작업', '중', '레버풀러의', '체인이', '끊어지며', '튕겨나온', '레버풀러에', '맞음']
3 ['수직도', '맞추는', '작업']
2 ['체인', '끊어지며']
2 ['레버풀러에', '맞음']
16 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
3 ['E_B', 'E_I', 'E_I']
2 ['E_B', 'E_I']
2 ['E_B', 'E_I']


In [48]:
def _index_from(sequence, target, start):
    """Return the index of the first `target` in `sequence` at or after `start`, else None."""
    try:
        return sequence.index(target, start)
    except ValueError:
        return None


# For every sample, locate each marker phrase inside the source token list.
#   * sample[0] is the source token list, sample[1:] are the marker token lists.
#   * Only the FIRST and LAST token of a marker are matched (not the whole
#     phrase), scanning left to right: each search starts right after the
#     previous hit, so markers are expected in source order.
#   * Each marker yields [start, end] (indices into the source, `end` being the
#     index of the marker's last token). Either value is None when the token
#     was not found; `end` is also None for single-token markers.
# The result stays aligned with `tokens_lst`: exactly one entry per sample.
entity_positions = []
for sample in tokens_lst:
    position_pairs = []

    # A sample without any text (e.g. a blank spreadsheet row) has no source to
    # search; keep an empty entry instead of failing on sample[0].
    if sample:
        source, markers = sample[0], sample[1:]
        cursor = 0  # first source index that is still eligible for a match

        for marker in markers:
            start = end = None

            # An empty marker (text that tokenized to nothing) cannot be located.
            if marker:
                start = _index_from(source, marker[0], cursor)
                if start is not None:
                    cursor = start + 1

                if len(marker) > 1:
                    end = _index_from(source, marker[-1], cursor)
                    if end is not None:
                        cursor = end + 1

            position_pairs.append([start, end])

    entity_positions.append(position_pairs)

10 11
15 16
21 22
25 26
29 30
34 35
5 6
11 12
25 26
21 22
31 32
32 33
6 7
12 13
32 33
35 36
39 40
23 24
30 31
33 34
40 41
41 42
8 9
18 19
25 26
29 30
41 42
42 43
4 5
8 9
14 15
19 20
20 21
26 27
27 28
32 33
34 35
41 42
42 43
39 40
40 41
66 67
77 78
20 21
22 23
31 32
32 33
18 19
22 23
26 27
30 31
31 32
32 33
9 10
14 15
30 31
33 34
35 36
13 14
15 16
16 17
15 16
20 21
22 23
30 31
35 36
37 38
5 6
8 9
18 19
20 21
23 24
24 25
11 12
28 29
29 30
16 17
17 18
23 24
29 30
30 31
31 32
22 23
24 25
30 31
34 35
48 49
51 52
52 53
10 11
14 15
20 21
24 25
28 29
29 30
11 12
17 18
24 25
30 31
33 34
35 36
36 37
37 38
19 20
25 26
26 27
5 6
13 14
17 18
22 23
23 24
35 36
36 37
38 39
41 42
45 46
46 47
10 11
14 15
23 24
28 29
31 32
32 33
8 9
12 13
23 24
24 25
9 10
17 18
23 24
31 32
32 33
20 21
25 26
30 31
40 41
43 44
44 45
28 29
31 32
39 40
45 46
50 51
51 52
8 9
14 15
19 20
20 21
7 8
10 11
17 18
20 21
40 41
42 43
55 56
62 63
63 64
22 23
24 25
34 35
40 41
42 43
44 45
4 5
23 24
25 26
26 27
23 24
29 30
31 32
40 41


In [49]:
# Every sample must yield exactly one list of position pairs. Assert it so a
# fresh "Restart & Run All" fails loudly instead of relying on a visual check.
assert len(entity_positions) == len(tokens_lst), (
    f'{len(entity_positions)} position lists for {len(tokens_lst)} samples'
)
len(entity_positions), len(tokens_lst)

(638, 638)

In [50]:
# Show the second sample. Per the position-matching loop, element 0 is treated as
# the source token list and the remaining elements as the marker token lists.
tokens_lst[1]

[['남원',
  '공사',
  '현장',
  '##에',
  '##서',
  '전기',
  '개',
  '##폐',
  '##기',
  '교체',
  '##작',
  '##업',
  '현장',
  '##에',
  '##서',
  '후진',
  '##하',
  '##는',
  '활',
  '##선',
  '##작',
  '##업',
  '##차',
  '##에',
  '깔',
  '##림'],
 ['전기', '개', '##폐', '##기', '교체', '##작', '##업'],
 ['작업', '##차', '##에', '깔', '##림']]

In [51]:
# Dump the [start, end] pairs of every sample, one sample per output line.
if entity_positions:
    print(*entity_positions, sep='\n')

[[10, 15], [21, 25], [29, 34]]
[[5, 11], [None, 25]]
[[21, None], [31, 32]]
[[6, 12], [32, 35], [39, None]]
[[23, None], [30, 33], [40, 41]]
[[8, 18], [25, 29], [41, 42]]
[[4, 8], [14, None], [19, 20]]
[[26, 27], [32, 34], [41, 42]]
[[39, 40], [66, 77], [None, None], [None, None]]
[[20, 22], [31, 32]]
[[18, 22], [26, 30], [31, 32]]
[[9, 14], [None, 30], [33, 35]]
[[13, None], [15, None], [16, None]]
[[15, 20], [22, 30], [35, 37]]
[[5, 8], [18, 20], [23, 24]]
[[11, None], [28, 29]]
[[16, 17], [23, 29], [30, 31]]
[[22, 24], [30, 34], [48, 51], [52, None]]
[[10, 14], [20, 24], [28, 29]]
[[11, 17], [24, 30], [33, 35], [36, 37]]
[[19, None], [25, 26]]
[[5, None], [13, 17], [22, 23]]
[[35, 36], [38, 41], [45, 46]]
[[10, 14], [23, 28], [31, 32]]
[[8, 12], [23, 24]]
[[9, None], [17, 23], [31, 32]]
[[20, 25], [30, 40], [43, 44]]
[[28, 31], [39, 45], [50, 51]]
[[8, None], [14, 19], [20, None]]
[[7, 10], [17, 20]]
[[40, 42], [55, None], [62, 63]]
[[22, 24], [34, 40], [42, 44]]
[[4, 23], [25, 26]]