# 자연어 전처리 테스트 모듈

In [4]:
import numpy as np
from os.path import join
from os import listdir
from pathlib import Path

In [170]:
root_path = '/data1'
translation_data_path = join(root_path, 'translation_data', 'test') # https://github.com/jungyeul/korean-parallel-corpora/tree/master/korean-english-news-v1 다운로드
category_data_path = join(root_path, 'cs_emulator_data', '7category_data', 'test') # 회사 데이터이기 때문에 샘플 약간만 노출

## 설명: 기본 구조는 다음과 같다.
## 1. 폴더 내의 모든 파일은 text 또는 numpy라고 가정하고, 모든 path list를 읽는다. 단, 동일 폴더 내의 모든 파일은 같은 data type을 갖는 것을 가정한다.
## 2. token화 한다.
## 3. token to idx를 구현
## 4. 데이터 세트 0 padding 및 numpy로 가공
## 5. Token화 된 데이터를 seq2seq / category 형태 변환 및 재조립 한다. (seq2seq일 때는 두 가지 documents가 필요)

# 1. 자연어 데이터를 로드한다.

In [72]:
def read_data(data_path_list, data_type):
    all_documents = list()
    all_documents_names = list()
    for data_path in data_path_list:
        if 'numpy' == data_type: # Path(data_path).suffix
            documents = np.load(data_path)
        elif 'text' == data_type:
            documents = list()
            with open(data_path, 'r', encoding='utf-8') as f:
                new_data_lines = f.read().split('\n')
                documents.extend(new_data_lines)
        else:
            print('Proper data_type is not presented.')
        all_documents.append(documents)
        all_documents_names.append(Path(data_path).stem)
    return all_documents, all_documents_names

In [73]:
dpl = listdir(category_data_path)
dpl = [join(category_data_path, d) for d in dpl]
all_documents, all_documents_names = read_data(data_path_list=dpl, data_type='numpy')
print(all_documents_names)
print(all_documents)

['conn', 'acc', 'event', 'bill', 'bug', 'rep', 'game']
[array(['접속이 안되요.. 저캐릭만 접속이 안됩니다  다른캐릭은ㅈ되고요  간단한 문제같은데 빨리 해결좀 해결조 군주캐릭이라 ㅠㅠ',
       '아이폰6 와이파이 접속 아이폰 6에서만 와이파이가 잘 됨에도불구하고 자꾸 네트워크가안좋다고 나옵니다 마치 lte사용을 유도하는것 처럼^^',
       '아직까지 패치자체 않되는 ㅜㅜ', ..., '장비던전 튕김 장비던전 들어가자마자 튕겼습니다.. 보상부탁드립니다.',
       '비정상적인 행동으로 요즘 자주튕기는데 ★비정상적인 행동★ 이거 때문에 여러번 문의 했었는데 지우고 새로도 깔아봤는데 또 튕기네요 왜 이런현상 생기는지 정확하게 답변해주세요 애매모호하게 넘기지 마시구요',
       '안녕 도대체가 게임을 하라고하는거요뭐요 5분도 안되는 시간에 수시로 팅기고 아.욕나오네 니.미 겜 안할거니까 쓴거 다 토해내쇼.아 웬만해야 참지 진짜 앞에 잇으면 패 죽이고싶네 시.벌'],
      dtype=object), array(['연동계정 케릭이 사라짐 군주케릭이라 빠릌복구 부탁드려요', '계정연동', '하딘10섭 현승', ...,
       '캐릭터가삭제되었습니다', '계정 연동이  해제되고  로그아웃하니까 계정이 사라졌습니다  복구부탁드려요',
       '연동안된지모르고휴대폰바꿧어요'], dtype=object), array(['사전예약쿠폰이안대요', '접속다이아 접속다이아가 안들어와요',
       '접속이벤트 보상이 안뜹니다 2일차까지 수령받았는데 3일차부터 수령이 안되네요 확인부탁드립니다', ...,
       '서버이전권 서버이전 하고 싶은데 서버이전권 언제 나오나요??',
       '노트8 쿠폰 입력이 안됩니다  쿠폰 등록이 안됩니다. 노트8 레볼루션 쿠폰번호보다 받은 쿠폰 번호가 하나가 더 많아요',
       '이벤트창을 누름 이벤트로 안넘어가짐'], dtype=object),

# 2. 자연어 데이터를 토큰화 한다.

In [22]:
from konlpy.tag import Okt

In [32]:
okt = Okt()

In [36]:
def tokenize(documents, token_type):
    if token_type == 'char':
        return [char for document in documents for char in document]
    elif token_type == 'word': # token_type == word
        return [okt.morphs(document, norm=False, stem=False) for document in documents]
    elif token_type == 'ngram': # 'ngram'
        tokenized_sentence = sentence.split(' ')
        if kor_tokenizer_max_word_char == -1:
            return [word for word in tokenized_sentence]
        else:
            return [word[:kor_tokenizer_max_word_char] for word in tokenized_sentence]
    else:
        print('Not implemented token type:', token_type)

In [67]:
all_tokenized_documents = list()
for documents in all_documents:
    tokenized_documents = tokenize(documents=documents, token_type='word')
    all_tokenized_documents.append(tokenized_documents)
print(all_tokenized_documents)

[[['접속', '이', '안되요', '..', '저', '캐릭', '만', '접속', '이', '안됩니다', '다른', '캐릭', '은', 'ㅈ', '되고요', '간단한', '문제', '같은데', '빨리', '해결', '좀', '해결', '조', '군주', '캐릭', '이라', 'ㅠㅠ'], ['아이폰', '6', '와이파이', '접속', '아이폰', '6', '에서만', '와이파이', '가', '잘', '됨에도', '불구', '하고', '자꾸', '네트워크', '가', '안좋다고', '나옵니다', '마치', 'lte', '사용', '을', '유도', '하는것', '처럼', '^^'], ['아직', '까지', '패치', '자체', '않', '되는', 'ㅜㅜ'], ['오렌', '10', '러키', '쏭', '케릭', '빠른', '정상화', '바', '래', '요'], ['게임', '중', '팅기더니', '케릭', '이', '접속', '이', '안되요', '게임', '중', '팅기더니', '케릭', '접속', '이', '안되요'], ['게임', '꺼짐', '현상', '튕', '김현', '상', '으로', '인해', '물약', '도', '계속', '나가고', '특히', '오만', '의', '탑', '을', '하다가', '튕', '기면', '보상', '을', '하나', '도', '못', '받네요', '예', '를', '들어', '20', '층', '부터', '30', '층', '을', '연속', '으로', '깨다가', '튕기', '면', '20~29', '층', '보상', '은', '못', '받더군요', '어떻게', '하나요'], ['잘', '하고있다가', '케릭', '하나', '만', '게임', '접속', '이', '안되네요', '..', '다른', '케릭', '키우다가', '4시', '되서', '접속', '하려니', '자꾸', '만들어', '가지네요', ',,', '키우지', '말', '라는', '건가', '..', '이제', '전직', '하려고하는데', 'ㅡㅡ'

# 3. token2idx 및 word dictionary 생성

In [77]:
def token2idx(tokenized_documents, token2idx_dict=dict(), token_counter_dict=dict()):
    """
    1. will change token to idx and 2. token2idx dictionary and 3. counted token dictionary
    """
    indiced_documents = list()
    
    if len(token2idx_dict) == 0:
        token2idx_dict['<pad>'] = 0
        token2idx_dict['<start>'] = 1
        token2idx_dict['<end>'] = 2
        token2idx_dict['<unk>'] = 3
        
    for tokenized_document in tokenized_documents:
        indiced_document = list()
        for token in tokenized_document:
            if token not in token2idx_dict:
                token2idx_dict[token] = len(list(token2idx_dict.keys()))
            if token not in token_counter_dict:
                token_counter_dict[token] = 0
            token_counter_dict[token] += 1
            
            indiced_document.append(token2idx_dict[token])
        indiced_documents.append(indiced_document)
    return indiced_documents, token2idx_dict, token_counter_dict

In [78]:

token2idx_dict = dict()
token_counter_dict = dict()
all_indiced_documents = list()

for tokenized_documents in all_tokenized_documents:
    indiced_documents, token2idx_dict, token_counter_dict = token2idx(tokenized_documents=tokenized_documents, 
                                                                      token2idx_dict=token2idx_dict, 
                                                                      token_counter_dict=token_counter_dict)
    all_indiced_documents.append(indiced_documents)

In [79]:
print(all_indiced_documents)

[[[4, 5, 6, 7, 8, 9, 10, 4, 5, 11, 12, 9, 13, 14, 15, 16, 17, 18, 19, 20, 21, 20, 22, 23, 9, 24, 25], [26, 27, 28, 4, 26, 27, 29, 28, 30, 31, 32, 33, 34, 35, 36, 30, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46], [47, 48, 49, 50, 51, 52, 53], [54, 55, 56, 57, 58, 59, 60, 61, 62, 63], [64, 65, 66, 58, 5, 4, 5, 6, 64, 65, 66, 58, 4, 5, 6], [64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 42, 82, 69, 83, 84, 42, 85, 75, 86, 87, 88, 89, 90, 91, 92, 93, 94, 92, 42, 95, 72, 96, 97, 98, 99, 92, 84, 13, 86, 100, 101, 102], [31, 103, 58, 85, 10, 64, 4, 5, 104, 7, 12, 58, 105, 106, 107, 4, 108, 35, 109, 110, 111, 112, 113, 114, 115, 7, 116, 117, 118, 119], [58, 4, 120, 36, 121, 122, 4, 5, 123, 4, 120, 36, 121, 124, 125, 126, 4, 123, 127, 128, 129, 130, 58, 4, 131], [132, 133, 134, 89, 135, 133, 136, 137], [138, 139, 140, 4, 141, 31, 142, 116], [143, 4, 11, 144], [145, 146, 65, 147, 148, 68], [149, 150, 65, 151, 152, 153, 149, 150, 65, 151, 68, 72, 73, 149, 75, 86, 154, 150, 155, 156, 1

In [80]:
print(token2idx_dict)

{'<pad>': 0, '<start>': 1, '<end>': 2, '<unk>': 3, '접속': 4, '이': 5, '안되요': 6, '..': 7, '저': 8, '캐릭': 9, '만': 10, '안됩니다': 11, '다른': 12, '은': 13, 'ㅈ': 14, '되고요': 15, '간단한': 16, '문제': 17, '같은데': 18, '빨리': 19, '해결': 20, '좀': 21, '조': 22, '군주': 23, '이라': 24, 'ㅠㅠ': 25, '아이폰': 26, '6': 27, '와이파이': 28, '에서만': 29, '가': 30, '잘': 31, '됨에도': 32, '불구': 33, '하고': 34, '자꾸': 35, '네트워크': 36, '안좋다고': 37, '나옵니다': 38, '마치': 39, 'lte': 40, '사용': 41, '을': 42, '유도': 43, '하는것': 44, '처럼': 45, '^^': 46, '아직': 47, '까지': 48, '패치': 49, '자체': 50, '않': 51, '되는': 52, 'ㅜㅜ': 53, '오렌': 54, '10': 55, '러키': 56, '쏭': 57, '케릭': 58, '빠른': 59, '정상화': 60, '바': 61, '래': 62, '요': 63, '게임': 64, '중': 65, '팅기더니': 66, '꺼짐': 67, '현상': 68, '튕': 69, '김현': 70, '상': 71, '으로': 72, '인해': 73, '물약': 74, '도': 75, '계속': 76, '나가고': 77, '특히': 78, '오만': 79, '의': 80, '탑': 81, '하다가': 82, '기면': 83, '보상': 84, '하나': 85, '못': 86, '받네요': 87, '예': 88, '를': 89, '들어': 90, '20': 91, '층': 92, '부터': 93, '30': 94, '연속': 95, '깨다가': 96, '튕기': 97, '면': 98, '20~29

In [81]:
print(token_counter_dict)

{'접속': 2898, '이': 6108, '안되요': 262, '..': 955, '저': 215, '캐릭': 791, '만': 1086, '안됩니다': 433, '다른': 427, '은': 1033, 'ㅈ': 8, '되고요': 5, '간단한': 2, '문제': 371, '같은데': 86, '빨리': 381, '해결': 309, '좀': 848, '조': 72, '군주': 45, '이라': 69, 'ㅠㅠ': 279, '아이폰': 271, '6': 121, '와이파이': 38, '에서만': 13, '가': 3368, '잘': 248, '됨에도': 1, '불구': 16, '하고': 660, '자꾸': 168, '네트워크': 203, '안좋다고': 1, '나옵니다': 45, '마치': 7, 'lte': 5, '사용': 396, '을': 2104, '유도': 18, '하는것': 1, '처럼': 47, '^^': 37, '아직': 93, '까지': 291, '패치': 159, '자체': 66, '않': 103, '되는': 88, 'ㅜㅜ': 143, '오렌': 35, '10': 293, '러키': 1, '쏭': 1, '케릭': 1532, '빠른': 563, '정상화': 1, '바': 42, '래': 52, '요': 1548, '게임': 1048, '중': 873, '팅기더니': 35, '꺼짐': 18, '현상': 434, '튕': 140, '김현': 82, '상': 178, '으로': 1432, '인해': 81, '물약': 143, '도': 1441, '계속': 418, '나가고': 43, '특히': 2, '오만': 42, '의': 569, '탑': 44, '하다가': 76, '기면': 5, '보상': 1902, '하나': 162, '못': 893, '받네요': 12, '예': 24, '를': 1545, '들어': 358, '20': 89, '층': 38, '부터': 182, '30': 105, '연속': 32, '깨다가': 2, '튕기': 38, '면': 236, '

# 4. Padding & formatting

In [101]:
def pad_format(indiced_documents, max_seq_length):
    padded_data = np.empty(shape=(0, max_seq_length))
    
    for indiced_document in indiced_documents:
        np_transformed = np.zeros(shape=(max_seq_length, ))
        for idx, index in enumerate(indiced_document):
            if idx == max_seq_length:
                break
            np_transformed[idx] = index
        padded_data = np.insert(padded_data, padded_data.shape[0], np_transformed, axis=0)
    
    return padded_data
        

In [118]:
all_padded_documents = list()
for indiced_documnets in all_indiced_documents:
    padded_documents = pad_format(indiced_documnets, max_seq_length=50)
    all_padded_documents.append(padded_documents)

# 5. Classification 형태 또는 Seq2seq 형태로 만들어주기

## 5-1. Classification 형태로 만들기

In [137]:
def map_document_label(documents, label, label2idx=dict()):
    x_data = documents
    y_data = list()
    idx = len(label2idx)
    
    if label not in label2idx:
        label2idx[label] = idx
    labels = [label2idx[label]] * documents.shape[0]
    y_data.extend(labels)
    
    return x_data, y_data, label2idx

In [138]:
label2idx = dict()
x_data = list()
y_data = list()

for padded_documents, label in zip(all_padded_documents, all_documents_names):
    _x_data, _y_data, label2idx = map_document_label(documents=padded_documents, label=label, label2idx=label2idx)
    x_data.append(_x_data)
    y_data.append(_y_data)

x_data = np.concatenate(x_data, axis=0)
y_data = np.concatenate(y_data, axis=0)

for idx, (x, y) in enumerate(zip(x_data, y_data)):
    if idx == 10:
        break
    print('label:', y)
    print('data:', x)

label: 0
data: [ 4.  5.  6.  7.  8.  9. 10.  4.  5. 11. 12.  9. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 20. 22. 23.  9. 24. 25.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
label: 0
data: [26. 27. 28.  4. 26. 27. 29. 28. 30. 31. 32. 33. 34. 35. 36. 30. 37. 38.
 39. 40. 41. 42. 43. 44. 45. 46.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
label: 0
data: [47. 48. 49. 50. 51. 52. 53.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
label: 0
data: [54. 55. 56. 57. 58. 59. 60. 61. 62. 63.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
label: 0
data: [64. 65. 66. 58.  5.  4.  5.  6. 64. 65. 66. 58.  4.  5.  6.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.

# 5-2. Seq2seq
## Encoder 데이터 또는 Decoder 데이터인지는 사용자측에서 결정할 수 있도록 inputs와 outputs 생성 함수만 구현한다.

In [160]:
def map_document_ae(documents):
    x_data = documents
    y_data = documents[:, 1:]
    y_data = np.insert(y_data, y_data.shape[1], 0, axis=1)
    return x_data, y_data

In [161]:
x_data = list()
y_data = list()

for padded_documents in all_padded_documents:
    _x_data, _y_data = map_document_ae(documents=padded_documents)
    x_data.append(_x_data)
    y_data.append(_y_data)

x_data = np.concatenate(x_data, axis=0)
y_data = np.concatenate(y_data, axis=0)

In [162]:
x_data

array([[4.000e+00, 5.000e+00, 6.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.600e+01, 2.700e+01, 2.800e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [4.700e+01, 4.800e+01, 4.900e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [3.760e+02, 2.760e+02, 7.473e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.405e+03, 1.440e+02, 1.890e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.971e+03, 3.405e+03, 2.971e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

In [163]:
y_data

array([[5.000e+00, 6.000e+00, 7.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [2.700e+01, 2.800e+01, 4.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [4.800e+01, 4.900e+01, 5.000e+01, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       ...,
       [2.760e+02, 7.473e+03, 6.590e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [1.440e+02, 1.890e+03, 5.730e+02, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [3.405e+03, 2.971e+03, 3.405e+03, ..., 0.000e+00, 0.000e+00,
        0.000e+00]])

# Extras: 활용할 때 어떻게 할지 모르겠다고요? 
# Dataset 모듈로 만들고 활용해보기!

In [175]:
class Dataset:
    def __init__(self):
        self.okt = Okt()
    
    """
    1. 다음의 두 함수는 소단위로 구현된 아래 함수들[2]를 응용하는 함수입니다. 필요에 따라 추후에 merge하면 좋겠지요 ^^
    """
    def get_classification_data(self, data_root_path, data_type, max_seq_length):
        # 1. Read Data
        data_path_list = listdir(data_root_path)
        data_path_list = [join(data_root_path, d) for d in data_path_list]
        all_documents, all_documents_names = self.read_data(data_path_list=data_path_list, data_type=data_type)
#         print(all_documents_names)
#         print(all_documents)

        # 2. Tokenize data
        all_tokenized_documents = list()
        for documents in all_documents:
            tokenized_documents = self.tokenize(documents=documents, token_type='word')
            all_tokenized_documents.append(tokenized_documents)
        print(all_tokenized_documents)
        
        # 3. token to index
        token2idx_dict = dict()
        token_counter_dict = dict()
        all_indiced_documents = list()

        for tokenized_documents in all_tokenized_documents:
            indiced_documents, token2idx_dict, token_counter_dict = self.token2idx(tokenized_documents=tokenized_documents, 
                                                                              token2idx_dict=token2idx_dict, 
                                                                              token_counter_dict=token_counter_dict)
            all_indiced_documents.append(indiced_documents)
        
        # 4. padd & formatting
        all_padded_documents = list()
        for indiced_documnets in all_indiced_documents:
            padded_documents = self.pad_format(indiced_documnets, max_seq_length=max_seq_length)
            all_padded_documents.append(padded_documents)
        
        # 5. Make classification dataset
        label2idx = dict()
        x_data = list()
        y_data = list()

        for padded_documents, label in zip(all_padded_documents, all_documents_names):
            _x_data, _y_data, label2idx =self. map_document_label(documents=padded_documents, label=label, label2idx=label2idx)
            x_data.append(_x_data)
            y_data.append(_y_data)

        x_data = np.concatenate(x_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)

        for idx, (x, y) in enumerate(zip(x_data, y_data)):
            if idx == 10:
                break
#             print('label:', y)
#             print('data:', x)
        
        return x_data, y_data
        
        
    def get_encoder_decoder_data(self, encoder_root_path, decoder_root_path, data_type, encoder_language, decoder_langeuage,
                                max_enc_seq_length, max_dec_seq_length):
        # 1. Read Data
        encoder_path_list = listdir(encoder_root_path)
        encoder_path_list = [join(encoder_root_path, d) for d in encoder_path_list]
        
        decoder_path_list = listdir(decoder_root_path)
        decoder_path_list = [join(decoder_root_path, d) for d in decoder_path_list]
        
        data_path_list = np.concatenate([encoder_path_list, decoder_path_list], axis=0)
        ########## same logic below
        all_documents, all_documents_names = self.read_data(data_path_list=data_path_list, data_type=data_type)
#         print(all_documents_names)
#         print(all_documents)

        # 2. Tokenize data
        all_tokenized_documents = list()
        for documents in all_documents:
            tokenized_documents = self.tokenize(documents=documents, token_type='word')
            all_tokenized_documents.append(tokenized_documents)
        print(all_tokenized_documents)
        
        ########## same logic above
        
        # 3. token to index
        encoder_token2idx_dict = dict()
        encoder_token_counter_dict = dict()
        encoder_all_indiced_documents = list()
        
        for tokenized_documents in all_tokenized_documents[:len(encoder_path_list)]:
            indiced_documents, encoder_token2idx_dict, encoder_token_counter_dict = \
                    self.token2idx(tokenized_documents=tokenized_documents,
                                   token2idx_dict=encoder_token2idx_dict, 
                                   token_counter_dict=encoder_token_counter_dict)
            encoder_all_indiced_documents.append(indiced_documents)
        
        
        decoder_token2idx_dict = dict()
        decoder_token_counter_dict = dict()
        decoder_all_indiced_documents = list()

        for tokenized_documents in all_tokenized_documents[len(encoder_path_list):]:
            indiced_documents, decoder_token2idx_dict, decoder_token_counter_dict = \
                    self.token2idx(tokenized_documents=tokenized_documents,
                                   token2idx_dict=decoder_token2idx_dict, 
                                   token_counter_dict=decoder_token_counter_dict)
            decoder_all_indiced_documents.append(indiced_documents)
            
        
        # 4. padd & formatting
        encoder_all_padded_documents = list()
        for indiced_documnets in encoder_all_indiced_documents:
            padded_documents = self.pad_format(indiced_documnets, max_seq_length=max_enc_seq_length)
            encoder_all_padded_documents.append(padded_documents)
            
        decoder_all_padded_documents = list()
        for indiced_documnets in decoder_all_indiced_documents:
            padded_documents = self.pad_format(indiced_documnets, max_seq_length=max_enc_seq_length)
            decoder_all_padded_documents.append(padded_documents)
            
        # 5. Make encoder decoder dataset
        encoder_inputs = np.concatenate(encoder_all_padded_documents, axis=0)
        decoder_inputs = list()
        decoder_outputs = list()

        for padded_documents in decoder_all_padded_documents:
            _x_data, _y_data = self.map_document_ae(documents=padded_documents)
            decoder_inputs.append(_x_data)
            decoder_outputs.append(_y_data)

        decoder_inputs = np.concatenate(decoder_inputs, axis=0)
        decoder_outputs = np.concatenate(decoder_outputs, axis=0)
        
        return encoder_inputs, decoder_inputs, decoder_outputs
    
    
    """
    2. 아래의 함수들은 위에 구현된 함수들 복사 붙여넣기 후 파라미터에 가장 앞에 self를 추가해줍니다 ^^
    """
    def read_data(self, data_path_list, data_type):
        all_documents = list()
        all_documents_names = list()
        for data_path in data_path_list:
            if 'numpy' == data_type: # Path(data_path).suffix
                documents = np.load(data_path)
            elif 'text' == data_type:
                documents = list()
                with open(data_path, 'r', encoding='utf-8') as f:
                    new_data_lines = f.read().split('\n')
                    documents.extend(new_data_lines)
            else:
                print('Proper data_type is not presented.')
            all_documents.append(documents)
            all_documents_names.append(Path(data_path).stem)
        return all_documents, all_documents_names
    
    def tokenize(self, documents, token_type):
        if token_type == 'char':
            return [char for document in documents for char in document]
        elif token_type == 'word': # token_type == word
            return [self.okt.morphs(document, norm=False, stem=False) for document in documents]
        elif token_type == 'ngram': # 'ngram'
            tokenized_sentence = sentence.split(' ')
            if kor_tokenizer_max_word_char == -1:
                return [word for word in tokenized_sentence]
            else:
                return [word[:kor_tokenizer_max_word_char] for word in tokenized_sentence]
        else:
            print('Not implemented token type:', token_type)
    
    def token2idx(self, tokenized_documents, token2idx_dict=dict(), token_counter_dict=dict()):
        """
        1. will change token to idx and 2. token2idx dictionary and 3. counted token dictionary
        """
        indiced_documents = list()

        if len(token2idx_dict) == 0:
            token2idx_dict['<pad>'] = 0
            token2idx_dict['<start>'] = 1
            token2idx_dict['<end>'] = 2
            token2idx_dict['<unk>'] = 3

        for tokenized_document in tokenized_documents:
            indiced_document = list()
            for token in tokenized_document:
                if token not in token2idx_dict:
                    token2idx_dict[token] = len(list(token2idx_dict.keys()))
                if token not in token_counter_dict:
                    token_counter_dict[token] = 0
                token_counter_dict[token] += 1

                indiced_document.append(token2idx_dict[token])
            indiced_documents.append(indiced_document)
        return indiced_documents, token2idx_dict, token_counter_dict
    
    def pad_format(self, indiced_documents, max_seq_length):
        padded_data = np.empty(shape=(0, max_seq_length))

        for indiced_document in indiced_documents:
            np_transformed = np.zeros(shape=(max_seq_length, ))
            for idx, index in enumerate(indiced_document):
                if idx == max_seq_length:
                    break
                np_transformed[idx] = index
            padded_data = np.insert(padded_data, padded_data.shape[0], np_transformed, axis=0)

        return padded_data
    
    def map_document_label(self, documents, label, label2idx=dict()):
        x_data = documents
        y_data = list()
        idx = len(label2idx)

        if label not in label2idx:
            label2idx[label] = idx
        labels = [label2idx[label]] * documents.shape[0]
        y_data.extend(labels)

        return x_data, y_data, label2idx
    
    def map_document_ae(self, documents):
        x_data = documents
        y_data = documents[:, 1:]
        y_data = np.insert(y_data, y_data.shape[1], 0, axis=1)
        return x_data, y_data

# Test #1. Get Classification Data (분류 모델을 위한 데이터 가지고 오기)

In [165]:
dataset = Dataset()

In [166]:
x_data, y_data = dataset.get_classification_data(data_root_path=category_data_path, data_type='numpy', max_seq_length=50)

[[['접속', '이', '안되요', '..', '저', '캐릭', '만', '접속', '이', '안됩니다', '다른', '캐릭', '은', 'ㅈ', '되고요', '간단한', '문제', '같은데', '빨리', '해결', '좀', '해결', '조', '군주', '캐릭', '이라', 'ㅠㅠ'], ['아이폰', '6', '와이파이', '접속', '아이폰', '6', '에서만', '와이파이', '가', '잘', '됨에도', '불구', '하고', '자꾸', '네트워크', '가', '안좋다고', '나옵니다', '마치', 'lte', '사용', '을', '유도', '하는것', '처럼', '^^'], ['아직', '까지', '패치', '자체', '않', '되는', 'ㅜㅜ'], ['오렌', '10', '러키', '쏭', '케릭', '빠른', '정상화', '바', '래', '요'], ['게임', '중', '팅기더니', '케릭', '이', '접속', '이', '안되요', '게임', '중', '팅기더니', '케릭', '접속', '이', '안되요'], ['게임', '꺼짐', '현상', '튕', '김현', '상', '으로', '인해', '물약', '도', '계속', '나가고', '특히', '오만', '의', '탑', '을', '하다가', '튕', '기면', '보상', '을', '하나', '도', '못', '받네요', '예', '를', '들어', '20', '층', '부터', '30', '층', '을', '연속', '으로', '깨다가', '튕기', '면', '20~29', '층', '보상', '은', '못', '받더군요', '어떻게', '하나요'], ['잘', '하고있다가', '케릭', '하나', '만', '게임', '접속', '이', '안되네요', '..', '다른', '케릭', '키우다가', '4시', '되서', '접속', '하려니', '자꾸', '만들어', '가지네요', ',,', '키우지', '말', '라는', '건가', '..', '이제', '전직', '하려고하는데', 'ㅡㅡ'

In [168]:
y_data.shape

(12912,)

# Test #2. Get Seq2Seq Data (Encoder-Decoder 모델을 위한 데이터 가지고 오기)

In [176]:
seq2seq_dataset = Dataset()

In [177]:
encoder_root_path = join(translation_data_path, 'english')
decoder_root_path = join(translation_data_path, 'korean')

In [178]:
encoder_inputs, decoder_inputs, decoder_outputs = seq2seq_dataset.get_encoder_decoder_data(
    encoder_root_path=encoder_root_path, decoder_root_path=decoder_root_path, 
    data_type='text', encoder_language='english', decoder_langeuage='korean',
    max_enc_seq_length=50, max_dec_seq_length=50)

