In [163]:
import os
import random
import time
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import ElectraModel, ElectraTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

## Load and Concat dfs

In [164]:
# Relative paths of the four cause/effect CSV files that are read and
# concatenated into a single frame.
data_path_1 = 'data/2019-2021_ulsan_cause_effect_v2.csv'
data_path_2 = 'data/eshc_cause-effect_v2.csv'
data_path_3 = 'data/port_cause_effect_v2.csv'
data_path_4 = 'data/ulsan_2022_cause-effect_v2.csv'

In [165]:
# Every source file shares the same header-less two-column layout. Blank lines
# are kept (skip_blank_lines=False) so they are read as NaN rows instead of
# being skipped.
csv_options = dict(names=['tokens', 'labels'], skip_blank_lines=False)
df_1, df_2, df_3, df_4 = (
    pd.read_csv(path, **csv_options)
    for path in (data_path_1, data_path_2, data_path_3, data_path_4)
)

# Stack the four frames and renumber the rows from 0.
df = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)

In [166]:
# Show dtypes and non-null counts per column of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176557 entries, 0 to 176556
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tokens  172864 non-null  object
 1   labels  172865 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


## Check for rows where tokens is NaN but labels is not

In [167]:
# Rows whose token is missing ...
checker1 = df['tokens'].isna()
temp = df.loc[checker1]
# ... and, among those, the ones that still carry a label.
checker2 = temp['labels'].isna()
temp.loc[~checker2]

Unnamed: 0,tokens,labels
26559,,O


In [168]:
# Drop rows that have a label but no token. They are selected by condition
# rather than by a hard-coded row number so the cell is idempotent: once the
# row is gone and the index has been reset, re-running the cell removes nothing
# instead of silently deleting whichever valid row now sits at the old position.
orphan_label_rows = df.index[df['tokens'].isna() & df['labels'].notna()]
df = df.drop(orphan_label_rows, axis=0).reset_index(drop=True)

## Separate tokens and labels for further investigation

In [169]:
# Rows with a NaN label act as boundaries between samples.
seps = df.index[df['labels'].isna()].to_list()

tokens_lst, labels_lst = [], []
start = 0
for idx in seps:
    # The rows from `start` up to (not including) the boundary form one sample.
    sample = df.iloc[start:idx]
    tokens_lst.append(sample['tokens'].to_list())
    labels_lst.append(sample['labels'].to_list())
    start = idx + 1

len(tokens_lst), len(labels_lst)

(3692, 3692)

## Check if sample length is valid

In [170]:
# Print every boundary row (NaN label) that lies at most two rows after the
# previously visited position.
seps = df.index[df['labels'].isna()].to_list()
start = 0
for idx in seps:
    if idx - start <= 2:
        print(idx)
    start = idx

145322


In [171]:
# Look at the five rows before and the five rows from position 145322 onward
# (the boundary index printed by the length check).
df.iloc[145322-5:145322+5]

Unnamed: 0,tokens,labels
145317,##할,E
145318,뻔한,E
145319,사고,E
145320,,
145321,.,O
145322,,
145323,포항,O
145324,에,O
145325,정박,C
145326,##하여,C


In [172]:
# Remove boundary rows (NaN label) that are followed by another boundary at
# most two rows later. Such a boundary cuts off a fragment of at most one token,
# and dropping it merges the fragment back into the preceding sample.
# The rows are derived from the data instead of a hard-coded row number so the
# cell is idempotent: after the index reset, a second run finds nothing to drop
# rather than deleting an unrelated row.
boundary_idx = df.index[df['labels'].isna()].to_list()
stray_boundaries = [
    prev for prev, nxt in zip(boundary_idx, boundary_idx[1:]) if nxt - prev <= 2
]
df = df.drop(stray_boundaries, axis=0).reset_index(drop=True)

In [173]:
# Report every sample whose token list and label list differ in length.
for tokens, labels in zip(tokens_lst, labels_lst):
    n_tokens, n_labels = len(tokens), len(labels)
    if n_tokens == n_labels:
        continue
    print(tokens, labels, n_tokens, n_labels, sep='\n')

## Delete rows with new lines in tokens if their labels are O

In [174]:
# Convert every token to its str() form so the .str accessor can be used on the
# column (missing tokens become the string 'nan').
df['tokens'] = df['tokens'].map(str)

In [175]:
# Indices of rows whose token contains a newline AND whose label contains 'O'.
# The previous expression took `.index` of the boolean result of
# `labels.str.contains('O')`, which is the index of *every* newline row, so the
# label condition was never applied. Both conditions are now combined as masks;
# na=False makes rows with a missing label count as "not O".
has_newline = df.tokens.str.contains('\n', regex=False, na=False)
has_o_label = df.labels.str.contains('O', na=False)
w_nls = df.index[has_newline & has_o_label].to_list()

In [176]:
# Remove the rows collected in w_nls and renumber the remaining rows from 0.
df = df.drop(w_nls, axis=0).reset_index(drop=True)

In [177]:
# Display any rows whose token still contains a newline.
df[df.tokens.str.contains('\n')]

Unnamed: 0,tokens,labels


## Fix Labels

In [178]:
# Distinct label values, ordered as value_counts() returns them.
label_lst = df['labels'].value_counts().index.to_list()

In [179]:
# Map every noisy label spelling (stray spaces, lower case, numbered variants)
# onto its canonical tag. Matching is on the whole cell value, and no target
# value is itself a key, so one pass equals the chained replacements.
label_fixes = {
    'C ': 'C',
    'c': 'C',
    'CE ': 'CE',
    ' CE1': 'CE',
    'CE1': 'CE',
    'CE2': 'CE',
    'CEE2': 'CE',
    'CE3': 'CE',
    'CE4': 'CE',
    'CE5': 'CE',
    'CE6': 'CE',
    'E1 ': 'E',
    'E1': 'E',
    ' ': 'O',
    '   ': 'O',
}
df['labels'] = df['labels'].replace(label_fixes)

In [180]:
# Recompute the distinct labels after normalization and display them.
label_lst = df['labels'].value_counts().index.to_list()
label_lst

['O', 'CE', 'C', 'E']

## Simplify Labels

In [181]:
# Count the rows per label before the labels are merged.
df.labels.value_counts()

O     130984
CE     18833
C      14209
E       8735
Name: labels, dtype: int64

In [182]:
# Collapse the 'CE' and 'C' tags into the single tag 'E'.
df['labels'] = df['labels'].replace({'CE': 'E', 'C': 'E'})

In [183]:
# Count the rows per label after merging.
df.labels.value_counts()

O    130984
E     41777
Name: labels, dtype: int64

In [184]:
# The two tag names written by the B-I relabeling step; this cell only
# displays them and assigns nothing.
'E_B', 'E_I'

('E_B', 'E_I')

## Random Sample Test

In [185]:
# Rows with a NaN label act as boundaries between samples.
seps = df[df.iloc[:, 1].isna()].index.to_list()

start = 0
tokens_lst, labels_lst = [], []
for idx in seps:
    # The rows from `start` up to (not including) the boundary form one sample.
    tokens_lst.append(df.iloc[start:idx, 0].to_list())
    labels_lst.append(df.iloc[start:idx, 1].to_list())
    start = idx + 1

# Keep the rows after the last boundary as a final sample. Without this, a
# frame that does not end with a boundary row silently loses its last sample.
# When the frame does end with a boundary, start == len(df) and nothing is added.
if start < len(df):
    tokens_lst.append(df.iloc[start:, 0].to_list())
    labels_lst.append(df.iloc[start:, 1].to_list())

len(tokens_lst), len(labels_lst)

(3691, 3691)

In [186]:
# Repeat the short-sample check on the cleaned frame: print every boundary row
# that lies at most two rows after the previously visited position.
seps = df.index[df['labels'].isna()].to_list()
start = 0
for idx in seps:
    if idx - start <= 2:
        print(idx)
    start = idx

In [187]:
# Pick one sample at random and print its tokens and labels.
idx = random.randrange(0, len(tokens_lst))

print(idx)
print(tokens_lst[idx])
print(labels_lst[idx])

# Show the first 'E'-labelled token, if there is one. list.index() raises
# ValueError when the value is absent, so a sample without any 'E' label is
# reported instead of crashing the cell.
if 'E' in labels_lst[idx]:
    first_e = labels_lst[idx].index('E')

    print(first_e)
    print(tokens_lst[idx][first_e])
    print(labels_lst[idx][first_e])
else:
    first_e = None
    print('no E label in this sample')

353
['2020', '##년', '8', '##월', '29', '##일', '오전', '10', '##시', '15', '##분', '##경', '경북', '청송', '군', '현', '동', '면', '내', 'OO', '터널', '및', '국도', '건설공사', '현장', '에서', '교각', '원형', '거푸집', '해체작업', '중', '작업자', '가', '9', '.', '8', '##m', '높이', '에서', '추락', '##하여', '사망', '##하', '##였', '##다', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'E', 'E', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'E', 'O', 'E', 'O', 'O', 'O', 'O']
28
거푸집
E


In [188]:
# tokenizer = ElectraTokenizer.from_pretrained('tokenizer')

# for tokens in tokens_lst:
#     print(tokenizer.convert_tokens_to_string(tokens))
#     print()

In [189]:
# for tokens, labels in zip(df.tokens, df.labels):
#     print(tokens, labels)

## Apply B-I Scheme

In [190]:
# tokens_lst, labels_lst

In [191]:
# Rewrite each 'E' label in place: 'E_B' when it opens a span (first position,
# or directly after an 'O'), otherwise 'E_I'.
for labels in labels_lst:
    for idx, label in enumerate(labels):
        if label != 'E':
            continue
        starts_span = idx == 0 or labels[idx - 1] == 'O'
        labels[idx] = 'E_B' if starts_span else 'E_I'

In [192]:
# One row per sample: a list of tokens and the parallel list of labels.
data = dict(tokens=tokens_lst, labels=labels_lst)
data_df = pd.DataFrame(data)

In [196]:
# Persist the per-sample frame as a pickle file.
data_df.to_pickle('data/preprocessed.pkl')

In [197]:
# Read the pickle file back from the same path and display it.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
pd.read_pickle('data/preprocessed.pkl')

Unnamed: 0,tokens,labels
0,"[2020, ##년, 2, ##월, 13, ##일, 새, 벽, 4, 시, 30, #...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[2019, ##년, 12, ##월, 30, ##일, 오후, 1, ##시, ##30...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[2019, ##년, 12, ##월, 30, ##일, 오후, ##1, ##시, ##...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[2019, ##년, 12, ##월, 29, ##일, 오후, 6, ##시, 4, 0...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[', 2019, ##년, 12, ##월, 30, ##일, 새, 벽, 2, ##시,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
3686,"[2022, ##년, 1, ##월, 5, ##일, 오후, 2, ##시, 15, ##...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3687,"[2021, ##년, 1, ##월, 4, 일, 오후, 4, 시경, 충북, 청주시, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3688,"[2021, ##년, 1, ##월, 4, 일, 오전, 7, ##시, 23, ##분,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3689,"[2022, ##년, 1, ##월, 2, ##일, 오후, 1, ##시, 30, ##...","[O, O, O, O, O, O, O, O, O, O, O, O, O, E_B, E..."
