In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import csv
import re

## Data Load

In [None]:
# Read the training split and the dev split (used as validation data) from CSV.
data, val_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'dev.csv'))

## Text preprocessing

In [None]:
# Drop every row that contains a missing value in any column, then keep only
# the first occurrence of each sentence (column '문장').
data = data.dropna(axis=0).drop_duplicates(['문장'], keep='first')
val_data = val_data.dropna(axis=0).drop_duplicates(['문장'], keep='first')

# Separate each frame into its sentence column ('문장') and label column ('태그').
text, tag = data['문장'], data['태그']
val_text, val_tag = val_data['문장'], val_data['태그']


In [None]:
def tokenizing_text(texts):
    """Split each sentence into tokens on single space characters.

    Every element of ``texts`` is converted with ``str()`` first, so
    non-string values are tokenized by their string form. Consecutive
    spaces yield empty-string tokens, and an empty sentence yields ``['']``.

    Args:
        texts: iterable of sentences.

    Returns:
        A list with one list of tokens per input sentence.
    """
    return [str(sentence).split(' ') for sentence in texts]

In [None]:
# Replace both sentence collections with their token lists.
# NOTE: not safe to re-run on its own -- tokenizing_text applies str() to each
# element, so already-tokenized lists would be stringified and split again.
text, val_text = (tokenizing_text(sentences) for sentences in (text, val_text))

In [None]:
# Tokens that are removed when they are the first token of a sentence.
LEADING_TOKENS_TO_DROP = ('줘', '놔')
# Sentence-final tokens that get rewritten, listed in the order they are checked.
ENDINGS_TO_REPLACE = ('할.', '등.', '길.', '주.', '이.')
# Token written in place of any ending listed above.
REPLACEMENT_ENDING = '줘.'


def normalize_sentence_edges(sentences):
    """Clean up the first and last token of every tokenized sentence, in place.

    Two passes are made over ``sentences``:

    1. If the first token is one of ``LEADING_TOKENS_TO_DROP`` it is removed
       (only one leading token is removed per sentence).
    2. For each ending in ``ENDINGS_TO_REPLACE``, every sentence whose last
       token equals that ending has it replaced by ``REPLACEMENT_ENDING``,
       and the changed sentence is printed.

    Sentences with no tokens are skipped. This matters because step 1 can
    empty a one-token sentence, and indexing ``[-1]`` on it would raise
    ``IndexError``.

    Args:
        sentences: list of token lists; the inner lists are mutated.

    Returns:
        None. The input is modified in place.
    """
    for tokens in sentences:
        if tokens and tokens[0] in LEADING_TOKENS_TO_DROP:
            tokens.pop(0)

    # Endings form the outer loop so changed sentences are printed grouped by
    # ending, in the same order as handling one ending at a time.
    for ending in ENDINGS_TO_REPLACE:
        for tokens in sentences:
            if tokens and tokens[-1] == ending:
                tokens[-1] = REPLACEMENT_ENDING
                print(tokens)


# Apply the same clean-up to the training sentences and the validation sentences.
normalize_sentence_edges(text)
normalize_sentence_edges(val_text)

In [None]:
def str_sum(text):
    """Join each token list back into a single space-separated sentence.

    Args:
        text: iterable of token lists (each an iterable of strings).

    Returns:
        A list of strings, one per token list; an empty token list becomes ''.
    """
    return [' '.join(tokens) for tokens in text]

# Rebuild plain sentence strings for both splits from their token lists.
text, val_text = str_sum(text), str_sum(val_text)

## Label encoding

In [None]:
# Convert sentences and labels to NumPy arrays so they can later be reordered
# with an index array.
text, val_text = np.array(text), np.array(val_text)
tag, val_tag = np.array(tag), np.array(val_tag)

In [None]:
# Encode the training labels as integers.
idx_encode = preprocessing.LabelEncoder().fit(tag)  # fit() returns the encoder itself
Label_train = idx_encode.transform(tag)  # map each label to its integer id

# Lookup table: original label -> integer id assigned by the encoder.
encoder_classes = list(idx_encode.classes_)
label_idx = dict(zip(encoder_classes, idx_encode.transform(encoder_classes)))
# Uncomment to inspect the mapping and the encoded labels:
#print(label_idx)
#print(Label_train)

In [None]:
# Encode the validation labels as integers.
# The encoder fitted on the training labels (idx_encode) is reused on purpose.
# Fitting a second LabelEncoder on val_tag would assign ids from the validation
# label set alone, so the same label could get different integers in the train
# and validation files whenever the two label sets differ.
unseen_labels = np.setdiff1d(val_tag, idx_encode.classes_)
if unseen_labels.size:
    # Fail loudly with the offending labels instead of writing inconsistent ids.
    raise ValueError(
        'Validation data contains labels that never occur in the training data: '
        '{}'.format(list(unseen_labels))
    )
Label_test = idx_encode.transform(val_tag)  # map each label to its training-set integer id

# Lookup table: label -> integer id. It stays identical to the training mapping
# because it is derived from the same encoder.
label_idx = dict(zip(list(idx_encode.classes_), idx_encode.transform(list(idx_encode.classes_))))
#print(Label_test)
#print(type(Label_test))

## Data shuffle

In [None]:
# Shuffle the order of the examples.
# A fixed seed keeps the shuffled order, and so the files written from it,
# identical across "Restart Kernel -> Run All".
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

# Sentences and labels are reordered with the same index array, so they must
# have the same length to stay aligned.
assert text.shape[0] == len(Label_train), 'train sentences and labels differ in length'
assert val_text.shape[0] == len(Label_test), 'validation sentences and labels differ in length'

train_order = shuffle_rng.permutation(text.shape[0])
x_train = text[train_order]
y_train = Label_train[train_order]

val_order = shuffle_rng.permutation(val_text.shape[0])
x_test = val_text[val_order]
y_test = Label_test[val_order]

# The dev file (loaded from dev.csv) serves as the validation set, so
# train_test_split is intentionally not used to carve one out of the train data.

In [None]:
# Report the container type of each prepared variable.
for caption, value in (
    ("text ", text),
    ("val_text ", val_text),
    ("tag ", tag),
    ("val_tag ", val_tag),
):
    print(caption, type(value))

text  <class 'numpy.ndarray'>
val_text  <class 'numpy.ndarray'>
tag  <class 'numpy.ndarray'>
val_tag  <class 'numpy.ndarray'>


## Save in .TSV

In [None]:
# Write the shuffled (sentence, label id) pairs to tab-separated files.
# Each entry: (split name used in the log message, output path, rows to write).
tsv_outputs = (
    ('train', '전처리train.tsv', zip(x_train, y_train)),
    ('test', '전처리test.tsv', zip(x_test, y_test)),
)

for split_name, tsv_path, rows in tsv_outputs:
    # Log the path that is actually written; the old message named
    # 'train.tsv' / 'test.tsv', which are not the files this cell creates.
    print('Write {} data to {} ...'.format(split_name, tsv_path))
    # newline='' lets the csv module control line endings; 'utf-8-sig' puts a
    # UTF-8 byte-order mark at the start of the file.
    with open(tsv_path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(rows)

Write train data to train.tsv ...
Write test data to test.tsv ...


In [None]:
# Notes: the next cell keeps archived helper code for reference only (it is not executed).

In [None]:
# NOTE: this cell is one triple-quoted string literal, so none of the code in
# it is executed; as the cell's last expression, its value is only displayed.
# It holds disabled preprocessing helpers: special-character removal,
# whitespace tokenizing, Kkma morpheme tokenizing, stopword filtering,
# a duplicate-sentence check, and sentence re-joining.
# NOTE(review): the text cannot be re-enabled as-is. `stopwords` tests
# membership in `sotp_words`, while the list defined just above it is named
# `stop_words`. Also, `temp1.append(temp2)` sits inside the inner word loop,
# so it would append once per word rather than once per sentence.
'''
#### define preprocessing func ####

# 특수문자 제거
def clean_text(texts):
    corpus = []
    for s in texts:
        result = re.sub('[^ ㄱ-ㅣ가-힣]+','', s) 
        corpus.append(result)
    return corpus


# 어절(띄어쓰기) 기준 tokenizing
def tokenizing_text(texts):
    corpus = []
    for s in texts:
        result = re.split(' ',s)
        corpus.append(result)
    return corpus



## pos tagging (형태소 품사) 기준 tokenizing
from konlpy.tag import Kkma 
kkma = Kkma()

def pos_tokenizing(texts):
    corpus = []
    for s in texts:
        result = kkma.morphs(s)
        corpus.append(result)
    return corpus



## stopwords
stop_words = "하 어 가 아"    # 불용어 지정
stop_words = stop_words.split(' ')

def stopwords(texts):
    temp1 = []
    for sentence in texts:
        temp2 = []
        for words in sentence:
            if words not in sotp_words:
                temp2.append(words)
            temp1.append(temp2)
    return temp1

# 중복 문장 확인
data['문장'].value_counts()

# 문장 다시 합치기
def str_sum(text):
    temp = list()
    for s in text:
        temp.append(' '.join(s))
    return temp



'''


'\n#### define preprocessing func ####\n\n# 특수문자 제거\ndef clean_text(texts):\n    corpus = []\n    for s in texts:\n        result = re.sub(\'[^ ㄱ-ㅣ가-힣]+\',\'\', s) \n        corpus.append(result)\n    return corpus\n\n\n# 어절(띄어쓰기) 기준 tokenizing\ndef tokenizing_text(texts):\n    corpus = []\n    for s in texts:\n        result = re.split(\' \',s)\n        corpus.append(result)\n    return corpus\n\n\n\n## pos tagging (형태소 품사) 기준 tokenizing\nfrom konlpy.tag import Kkma \nkkma = Kkma()\n\ndef pos_tokenizing(texts):\n    corpus = []\n    for s in texts:\n        result = kkma.morphs(s)\n        corpus.append(result)\n    return corpus\n\n\n\n## stopwords\nstop_words = "하 어 가 아"    # 불용어 지정\nstop_words = stop_words.split(\' \')\n\ndef stopwords(texts):\n    temp1 = []\n    for sentence in texts:\n        temp2 = []\n        for words in sentence:\n            if words not in sotp_words:\n                temp2.append(words)\n            temp1.append(temp2)\n    return temp1\n\n# 중복 문장 확인\nda