In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import csv
import re

## Data Load

In [39]:
# Load the training split and the dev split (used as validation data) from
# the working directory.
data, val_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'dev.csv'))

## Text preprocessing

In [40]:
# Drop rows containing any missing value, then keep only the first occurrence
# of each sentence (column '문장').
data = data.dropna(axis=0).drop_duplicates(subset=['문장'], keep='first')
val_data = val_data.dropna(axis=0).drop_duplicates(subset=['문장'], keep='first')

# Text column ('문장') and label column ('태그') of each split.
text, tag = data['문장'], data['태그']
val_text, val_tag = val_data['문장'], val_data['태그']


In [41]:
# Tokenize on eojeol (single-space) boundaries
def tokenizing_text(texts):
    """Split every sentence into tokens on the single-space character.

    Each item is converted with ``str()`` first, so non-string items are
    tokenized by their string form. Consecutive spaces produce empty-string
    tokens because the split is on exactly one space.

    Args:
        texts: iterable of sentences.

    Returns:
        A list with one token list per input sentence, in input order.
    """
    return [str(sentence).split(' ') for sentence in texts]

In [42]:
# Tokenize both splits with the same tokenizer (training first, then validation).
text, val_text = map(tokenizing_text, (text, val_text))

In [43]:
# Clean up the tokenized training sentences in place, using the same rules as
# the validation cell so both splits are normalised identically:
#   * a leading '줘' or '놔' token is dropped;
#   * a final token of '할.', '등.', '길.', '주.' or '이.' is replaced by '줘.'.
# The original cell checked '길.' twice and never checked '주.', so the train
# split was missing one rule that the validation split applies.
for s in text:
    # `s and ...` guards the index access: dropping the leading token can
    # leave a one-token sentence empty, and [][-1] would raise IndexError.
    if s and s[0] in ('줘', '놔'):
        s.pop(0)
    if s and s[-1] in ('할.', '등.', '길.', '주.', '이.'):
        s[-1] = '줘.'

In [44]:
# Apply the same clean-up to the tokenized validation sentences, in place:
#   * a leading '줘' or '놔' token is dropped;
#   * a final token of '할.', '등.', '길.', '주.' or '이.' is replaced by '줘.'.
# One pass per sentence is equivalent to the former one-loop-per-rule version,
# because the replacement '줘.' is not itself one of the matched endings.
for s in val_text:
    # `s and ...` guards the index access: dropping the leading token can
    # leave a one-token sentence empty, and [][-1] would raise IndexError.
    if s and s[0] in ('줘', '놔'):
        s.pop(0)
    if s and s[-1] in ('할.', '등.', '길.', '주.', '이.'):
        s[-1] = '줘.'

In [45]:
# Re-assemble token lists into sentences
def str_sum(text):
    """Join each token sequence back into one space-separated string.

    Args:
        text: iterable of token sequences (each an iterable of strings).

    Returns:
        A list of joined sentences, one per input sequence, in input order.
    """
    return [' '.join(tokens) for tokens in text]

# Turn the cleaned token lists of both splits back into plain sentences.
text, val_text = map(str_sum, (text, val_text))

## Data augmentation

In [46]:
# Start the augmentation set from the cleaned training frame. This binds a
# second name to the same DataFrame object as `data`; it is not a copy.
aug = data

In [47]:
# Shuffle the augmentation DataFrame: sample every row in random order and
# renumber the index from 0.
shuffled_rows = aug.sample(frac=1)
aug = shuffled_rows.reset_index(drop=True)

# Sentences and tags of the shuffled frame as parallel lists
# (same position means same row).
aug_text, aug_tag = (list(aug[column]) for column in ('문장', '태그'))

In [48]:
import random

def RandSwap(text):
    """Random-swap augmentation: exchange two randomly chosen tokens per sentence.

    Two positions are drawn independently with ``random.randint`` for every
    sentence. When both draws hit the same position the sentence is left
    unchanged. Sentences are modified in place, and the returned list holds
    the same list objects that were passed in.

    Args:
        text: iterable of non-empty token lists.

    Returns:
        A list containing the (mutated) input token lists, in input order.
    """
    swapped = []
    for tokens in text:
        last = len(tokens) - 1
        first_pos = random.randint(0, last)
        second_pos = random.randint(0, last)
        # When both positions coincide this assignment is a no-op, so no
        # explicit inequality check is needed.
        tokens[first_pos], tokens[second_pos] = tokens[second_pos], tokens[first_pos]
        swapped.append(tokens)
    return swapped

# Random-swap variant of every shuffled training sentence:
# tokenize -> swap two tokens -> join back into a sentence.
swap_text = str_sum(RandSwap(tokenizing_text(aug_text)))
# The second positional argument of pd.DataFrame is `index`, so the tags
# become the row index and the sentences the single data column.
aug1 = pd.DataFrame(data=swap_text, index=aug_tag)

#swap_text[:10000]
#aug_tag[:10000]

In [49]:
def RandDel(text):
    """Random-deletion augmentation: drop one randomly chosen token per sentence.

    Every input sentence yields exactly one output sentence, so the result
    stays aligned position-by-position with the label list. Sentences with
    fewer than two tokens are returned unchanged, because deleting their only
    token would produce an empty training sample. Input lists are not mutated.

    Args:
        text: iterable of token lists.

    Returns:
        A list of token lists, one per input sentence, in input order.
    """
    result = []
    for tokens in text:
        if len(tokens) > 1:
            drop = random.randint(0, len(tokens) - 1)
            tokens = tokens[:drop] + tokens[drop + 1:]
        # Append inside the loop: the original appended after the loop and
        # therefore returned only the last sentence.
        result.append(tokens)
    return result

del_text = tokenizing_text(aug_text)
# Apply the deletion augmentation here; the original called RandSwap again,
# so del_text was just a second random-swap set.
del_text = RandDel(del_text)
del_text = str_sum(del_text)
# aug_tag is passed as the second positional argument, i.e. as the index.
aug1 = pd.DataFrame(del_text,aug_tag)

#del_text[:10000]
#aug_tag[:10000]

In [50]:
#aug_tag[:10000]

# Integer-encode the tags of the shuffled augmentation set.
idx_encode = preprocessing.LabelEncoder().fit(aug_tag)
Label_aug = idx_encode.transform(aug_tag)  # each tag -> its integer class id
# Mapping from tag text to its integer id.
known_tags = list(idx_encode.classes_)
label_idx = dict(zip(known_tags, idx_encode.transform(known_tags)))


## Label encoding

In [52]:
# Deletion-augmented sentences as the single data column, indexed by their tags
# (the second positional argument of pd.DataFrame is `index`).
df = pd.DataFrame(data=del_text, index=aug_tag)

In [53]:
# Convert only the validation sentences and tags to NumPy arrays here; the
# training `text` and `tag` are intentionally left as they are at this point.
val_text, val_tag = np.array(val_text), np.array(val_tag)

In [54]:
# train data label encoding
idx_encode = preprocessing.LabelEncoder().fit(tag)
Label_train = idx_encode.transform(tag)  # each tag -> its integer class id

# Mapping from tag text to its integer id.
known_tags = list(idx_encode.classes_)
label_idx = dict(zip(known_tags, idx_encode.transform(known_tags)))

In [55]:
# validation data label encoding
# Reuse the encoder fitted on the training tags (`idx_encode` from the train
# encoding cell) instead of fitting a new one on val_tag. A separately fitted
# encoder numbers the classes of the validation set on its own, so whenever
# the two tag sets differ the same tag gets different integer ids in train and
# validation data, and the saved decoder would describe only the validation ids.
# LabelEncoder.transform raises ValueError if val_tag contains a tag that never
# occurs in the training data, which surfaces the mismatch instead of hiding it.
Label_test = idx_encode.transform(val_tag)  # each tag -> its training-time integer id

# Mapping from tag text to its integer id (the training mapping).
label_idx = dict(zip(list(idx_encode.classes_), idx_encode.transform(list(idx_encode.classes_))))

## decoder만들기

In [56]:
# Build the decoder table from label_idx: one row per tag, with the integer id
# in 'label_num' and the tag text in 'label_text'. Keys and values of a dict
# iterate in the same order, so the two lists stay aligned.
raw = list(label_idx)
encode = list(label_idx.values())
decoder = pd.DataFrame({'label_num': encode, 'label_text': raw})

In [57]:
# Save the decoder table as CSV text and read it back to check the round trip.
# The encoding is spelled out on both sides so the write and the read agree.
# The former bare `open('decoder.txt', ...)` was removed: its handle was never
# read from or closed, so it only leaked an open file.
decoder.to_csv('decoder.txt', index=False, encoding='utf-8')
Decoder = pd.read_csv('decoder.txt', encoding='utf-8')


## Data shuffle

In [59]:
# Extend the training sentences with augmented ones: the first 20000 swap
# samples followed by deletion samples 20000..29999.
# NOTE: this cell grows `text` each time it runs, so run it only once per kernel.
text = text + swap_text[:20000] + del_text[20000:30000]

len(text)

89622

In [60]:
# Turn both encoded label arrays into Python lists so they can be concatenated with `+`.
Label_train, Label_aug = list(Label_train), list(Label_aug)

In [61]:
# Labels for the augmented sentences. The swap and deletion sets are both built
# from aug_text in the same order, so swap[:20000] + del[20000:30000] lines up
# with the first 30000 augmentation labels.
Label_train = Label_train + Label_aug[:30000]
print(len(Label_train))

89622


In [62]:
# NumPy arrays allow both sequences to be reordered with one index array below.
text, Label_train = np.array(text), np.array(Label_train)

In [63]:
# Shuffle the data order: one shared random permutation reorders sentences and
# labels together, so every sentence keeps its own label.
s = np.arange(len(text))
np.random.shuffle(s)
x_train, y_train = text[s], Label_train[s]

# The validation split keeps its original order (the equivalent shuffle for
# val_text / Label_test is disabled).
x_test, y_test = val_text, Label_test
# The dev file is used as the validation set, so train_test_split is not used:
#x_train, x_test = train_test_split(TextData, test_size=0.2, shuffle=False)
#y_train, y_test = train_test_split(LabelData, test_size=0.2, shuffle=False)

## Save in .TSV

In [64]:
#write in tsv
# Each split is written as tab-separated (sentence, label) rows. The files are
# encoded as 'utf-8-sig', i.e. UTF-8 with a leading byte-order mark.
# The progress message now prints the path that is actually written; the
# original printed 'train.tsv' / 'test.tsv' while writing the 0105_* files.
for tsv_path, split_name, rows in (
    ('0105_train.tsv', 'train', zip(x_train, y_train)),
    ('0105_test.tsv', 'test', zip(x_test, y_test)),
):
    with open(tsv_path, 'w', newline='', encoding='utf-8-sig') as f:
        print('Write {} data to {} ...'.format(split_name, tsv_path))
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(rows)

Write train data to train.tsv ...
Write test data to test.tsv ...
