In [1]:
import os
import random
import time
import pandas as pd
import numpy as np

## Load and Concat dfs

In [2]:
# Annotated cause/effect CSV exports, one per source corpus.
data_path_1, data_path_2, data_path_3, data_path_4 = (
    'data/2019-2021_ulsan_cause_effect_v2.csv',
    'data/eshc_cause-effect_v2.csv',
    'data/port_cause_effect_v2.csv',
    'data/ulsan_2022_cause-effect_v2.csv',
)

In [3]:
# Each CSV holds one (token, label) pair per line and no header row.
# Blank lines are kept (skip_blank_lines=False) so they show up as all-NaN
# rows; later cells use those rows as sample separators.
csv_columns = ['tokens', 'labels']
df_1, df_2, df_3, df_4 = (
    pd.read_csv(csv_path, names=csv_columns, skip_blank_lines=False)
    for csv_path in (data_path_1, data_path_2, data_path_3, data_path_4)
)

# Stack the four corpora and renumber the rows 0..n-1.
df = pd.concat([df_1, df_2, df_3, df_4], ignore_index=True)

In [4]:
# Summarise the concatenated frame: row count, dtypes and non-null counts
# per column (the null rows are the blank-line separators kept at load time).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176557 entries, 0 to 176556
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tokens  172864 non-null  object
 1   labels  172865 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


## Check that rows with NaN tokens also have NaN labels

In [5]:
# Rows whose token (column 0) is missing ...
temp = df[df.iloc[:, 0].isna()]
# ... but whose label (column 1) is present. A proper separator row has
# both columns missing, so anything displayed here is inconsistent.
temp[temp.iloc[:, 1].notna()]

Unnamed: 0,tokens,labels
26559,,O


In [6]:
# Remove rows that carry a label but no token (row 26559 in the check above).
# The rows are derived from the condition rather than hard-coded so the cell
# is safe to re-run: a second run finds nothing to drop instead of deleting
# whichever valid row has moved into that index.
# NOTE(review): the token may be a literal such as "NA"/"null" that
# read_csv parsed as missing -- confirm against the raw CSV.
orphan_label_rows = df.index[df['tokens'].isna() & df['labels'].notna()]
df = df.drop(orphan_label_rows, axis=0).reset_index(drop=True)

## Check if sample length is valid

In [7]:
# Separator rows are the ones whose label (column 1) is missing.
seps = df.index[df.iloc[:, 1].isna()].to_list()

# Print every separator that sits at most two rows after the previous one
# (or after row 0 for the first separator): the sample in between is
# suspiciously short.
start = 0
for sep_pos in seps:
    if sep_pos - start <= 2:
        print(sep_pos)
    start = sep_pos

145322


In [8]:
# Show five rows either side of the separator flagged by the length check.
flagged_sep = 145322
df.iloc[flagged_sep - 5:flagged_sep + 5]

Unnamed: 0,tokens,labels
145317,##할,E
145318,뻔한,E
145319,사고,E
145320,,
145321,.,O
145322,,
145323,포항,O
145324,에,O
145325,정박,C
145326,##하여,C


In [9]:
# Row 145320 is a stray separator that splits the final '.' off its sentence
# (see the rows displayed above). Removing it merges '.' back into the
# preceding sample.
# Guard: only drop while that row is still an all-NaN separator. After the
# first run the '.' token moves into index 145320, and an unguarded re-run
# would silently delete it.
stray_sep = 145320
if df.loc[stray_sep].isna().all():
    df = df.drop(stray_sep, axis=0).reset_index(drop=True)

## Delete rows whose tokens contain a newline when their label is O

In [10]:
# Coerce every token to a Python string so the .str accessor can be used on
# the column; missing tokens (separator rows) go through str() as well.
df['tokens'] = df['tokens'].map(str)

In [11]:
# Index labels of rows whose token contains a newline AND whose label is 'O'.
# The previous expression took `.index` of the boolean result of
# `.labels.str.contains('O')`, which is the index of *every* newline row
# regardless of label, so labelled (non-O) tokens were dropped too.
has_newline = df['tokens'].str.contains('\n', na=False)
is_outside = df['labels'].eq('O')
w_nls = df.index[has_newline & is_outside].to_list()

In [12]:
# Remove the rows collected in w_nls and renumber the index.
df = df.drop(index=w_nls).reset_index(drop=True)

In [13]:
# Display whichever rows still have a newline inside the token.
df.loc[df['tokens'].str.contains('\n')]

Unnamed: 0,tokens,labels


In [14]:
# for token, label in zip(df.tokens, df.labels):
#     print(token, label)

## Count Labels

In [15]:
# Frequency of each raw label string (NaN separator rows are not counted).
df['labels'].value_counts()

O       130880
C        13718
E         8732
CE        7756
CE1       6361
CE2       3638
CE3        877
C          489
CE4        166
           103
CE5         22
CE           7
E1           3
 CE1         3
c            2
CE6          2
             1
CEE2         1
Name: labels, dtype: int64

## Fix Labels

In [16]:
# Distinct raw labels, most frequent first.
label_cats = df['labels'].value_counts().index.to_list()
label_cats

['O',
 'C',
 'E',
 'CE',
 'CE1',
 'CE2',
 'CE3',
 'C ',
 'CE4',
 ' ',
 'CE5',
 'CE ',
 'E1',
 ' CE1',
 'c',
 'CE6',
 '   ',
 'CEE2']

In [17]:
# Normalise the label typos listed in the raw value counts.
#
# 1. Strip surrounding whitespace: covers 'C ', 'CE ', ' CE1', 'E1 ' and
#    turns whitespace-only labels (' ', '   ') into ''.
# 2. Map the remaining misspellings to their canonical label; an empty
#    label is treated as 'O'.
#
# NaN labels (sample separators) pass through both steps unchanged.
LABEL_FIXES = {
    'c': 'C',
    'E1': 'CE1',
    'CEE2': 'CE2',
    '': 'O',
}

df['labels'] = df['labels'].str.strip().replace(LABEL_FIXES)

In [18]:
# Label frequencies after normalisation (NaN separator rows are not counted).
df['labels'].value_counts()

O      130984
C       14209
E        8732
CE       7763
CE1      6367
CE2      3639
CE3       877
CE4       166
CE5        22
CE6         2
Name: labels, dtype: int64

In [19]:
# Distinct labels after normalisation, most frequent first.
label_cats = df['labels'].value_counts().index.to_list()
label_cats

['O', 'C', 'E', 'CE', 'CE1', 'CE2', 'CE3', 'CE4', 'CE5', 'CE6']

## Separate tokens and labels

In [20]:
# Rows with a missing label (column 1) separate consecutive samples.
seps = df[df.iloc[:, 1].isna()].index.to_list()

# Slice the frame between separators into per-sample token / label lists.
start = 0
tokens_lst, labels_lst = [], []
for idx in seps:
    tokens_lst.append(df.iloc[start:idx, 0].to_list())
    labels_lst.append(df.iloc[start:idx, 1].to_list())
    start = idx + 1

# Keep the rows after the last separator as well. Without this, a frame that
# does not end on a blank row silently loses its final sample.
if start < len(df):
    tokens_lst.append(df.iloc[start:, 0].to_list())
    labels_lst.append(df.iloc[start:, 1].to_list())

len(tokens_lst), len(labels_lst)

(3691, 3691)

In [21]:
# Sanity check: report any sample whose token and label lists differ in
# length. Nothing is printed when every sample is consistent.
for sample_tokens, sample_labels in zip(tokens_lst, labels_lst):
    n_tokens, n_labels = len(sample_tokens), len(sample_labels)
    if n_tokens == n_labels:
        continue
    print(sample_tokens)
    print(sample_labels)
    print(n_tokens)
    print(n_labels)

## Apply B-I Scheme

In [22]:
# tokens_lst, labels_lst

In [23]:
# Number of samples (one label list per sample).
len(labels_lst)

3691

In [24]:
# Keep only the entity labels: everything except the outside tag 'O'.
# Filtering by value (instead of pop(0)) does not rely on 'O' being the most
# frequent label, and re-running the cell no longer removes 'C' as well.
label_cats = [cat for cat in label_cats if cat != 'O']
label_cats

['C', 'E', 'CE', 'CE1', 'CE2', 'CE3', 'CE4', 'CE5', 'CE6']

In [25]:
# Tag every entity label in place with a begin/inside suffix:
#   '<label>_I' when the previous position holds the same entity label,
#   '<label>_B' otherwise (start of sample, after 'O', or after a different
#   entity label).
# The previous position has already been suffixed by the time it is read, so
# its last two characters are stripped before comparing.
for seq in labels_lst:
    for pos in range(len(seq)):
        current = seq[pos]
        if current not in label_cats:
            continue
        continues_span = (
            pos > 0
            and seq[pos - 1] != 'O'
            and seq[pos - 1][:-2] == current
        )
        seq[pos] = current + ('_I' if continues_span else '_B')

## Simplify Labels

In [27]:
# Entity label categories that the next cell collapses into a single 'E' tag.
label_cats

['C', 'E', 'CE', 'CE1', 'CE2', 'CE3', 'CE4', 'CE5', 'CE6']

In [28]:
# Collapse every entity category into one tag: 'C_B', 'CE1_B', ... become
# 'E_B', and likewise for '_I'. 'O' is left untouched.
for seq in labels_lst:
    for pos, tag in enumerate(seq):
        if tag == 'O':
            continue
        stem = tag[:-2]  # label without its '_B' / '_I' suffix
        if stem in label_cats:
            seq[pos] = tag.replace(stem, 'E')

## Create Dataset

In [30]:
# One row per sample; each cell holds that sample's full token / label list.
data = dict(tokens=tokens_lst, labels=labels_lst)
data_df = pd.DataFrame(data)

In [31]:
# Eyeball a few samples only. Printing every row floods the notebook with
# thousands of lines of output and bloats the saved file.
PREVIEW_ROWS = 3
for row in data_df.head(PREVIEW_ROWS).itertuples(index=False):
    print(row.tokens)
    print(row.labels)
    print()

['2020', '##년', '2', '##월', '13', '##일', '새', '벽', '4', '시', '30', '##분', '##경', '미국', '뉴', '##버리', '포트', '의', '한', '화학', '물질', '제조공장', '에서', '위험', '##물', '##질', '폭발', '##사고', '##가', '발생', '##하', '##였', '##다', '.', '공장', '건축물', '내부', '##에', '##서', '총', '여섯', '##차례', '##의', '폭발', '##이', '일어났', '##으며', '폭발', '##로', '인하', '##여', '공장', '지붕', '에', '1', '.', '5', '~', '2', '.', '5', '##m', '##의', '구멍', '##이', '발생', '##하', '##였', '##다', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'E_B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'E_B', 'E_I', 'E_I', 'E_I', 'E_I', 'E_I', 'E_I', 'E_I', 'E_I', 'E_I', 'E_I', 'E_I', 'E_I', 'E_I', 'O', 'O', 'O', 'O']

['2019', '##년', '12', '##월', '30', '##일', '오후', '1', '##시', '##30', '##분', '##경', '인도', 'K', '##and', '##la', 'P', '##ort', 'IM', '##C', '탱크', '터미널', '에서', '메탄', '##올', '저장탱크', '를

In [32]:
# Persist the per-sample token / label lists for the downstream steps.
data_df.to_pickle('data/preprocessed.pkl')

In [33]:
# Round-trip check: reload the file written by the previous cell and display it.
# Unpickling can execute arbitrary code, so only load pickle files you trust.
pd.read_pickle('data/preprocessed.pkl')

Unnamed: 0,tokens,labels
0,"[2020, ##년, 2, ##월, 13, ##일, 새, 벽, 4, 시, 30, #...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[2019, ##년, 12, ##월, 30, ##일, 오후, 1, ##시, ##30...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[2019, ##년, 12, ##월, 30, ##일, 오후, ##1, ##시, ##...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[2019, ##년, 12, ##월, 29, ##일, 오후, 6, ##시, 4, 0...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[', 2019, ##년, 12, ##월, 30, ##일, 새, 벽, 2, ##시,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
3686,"[2022, ##년, 1, ##월, 5, ##일, 오후, 2, ##시, 15, ##...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3687,"[2021, ##년, 1, ##월, 4, 일, 오후, 4, 시경, 충북, 청주시, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3688,"[2021, ##년, 1, ##월, 4, 일, 오전, 7, ##시, 23, ##분,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3689,"[2022, ##년, 1, ##월, 2, ##일, 오후, 1, ##시, 30, ##...","[O, O, O, O, O, O, O, O, O, O, O, O, O, E_B, E..."
