In [15]:
# 다대다 문자 단위 RNN(Char RNN) - 원핫인코딩 & 더 많은 데이터

# 모듈 불러오기
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

### 데이터 전처리 ###
# 문자열 데이터
sentence = ("if you want to build a ship, don't drum up people together to "
            "collect wood and don't assign them tasks and work, but rather "
            "teach them to long for the endless immensity of the sea.")

# 단어집합
char_set = list(set(sentence)) # 중복 제거 단어집합

# 정수 인코딩
char_dic = {c: i for i, c in enumerate(char_set)}
print(char_dic)
dic_size = len(char_dic)
print('문자 집합의 크기 : {}'.format(dic_size))

# 하이퍼파라미터 초기화
input_size = dic_size # 입력의 차원
hidden_size = dic_size # 은닉상태 크기
sequence_length = 10 # 시점 개수 - 임의로 10개 단위로 끊어서 샘플 만들어보자
learning_rate = 0.1 # 학습률

{'d': 0, 'n': 1, 's': 2, 'o': 3, 'i': 4, 'u': 5, 'r': 6, 'a': 7, 'g': 8, 'f': 9, 'k': 10, 'm': 11, '.': 12, ',': 13, 'p': 14, 'c': 15, 'y': 16, ' ': 17, 'e': 18, 't': 19, 'b': 20, 'l': 21, "'": 22, 'w': 23, 'h': 24}
문자 집합의 크기 : 25


In [16]:
# 데이터셋 생성
x_data = []
y_data = []
for i in range(0, len(sentence) - sequence_length):
    x_str = sentence[i:i + sequence_length]
    y_str = sentence[i + 1:i + sequence_length + 1]
    print(i, x_str, '->', y_str)
    
    x_data.append([char_dic[c] for c in x_str])
    y_data.append([char_dic[c] for c in y_str])
print(x_data[0]) # if you wan에 해당되는 정수 인코딩
print(y_data[0]) # f you wnat에 해당되는 정수 인코딩

0 if you wan -> f you want
1 f you want ->  you want 
2  you want  -> you want t
3 you want t -> ou want to
4 ou want to -> u want to 
5 u want to  ->  want to b
6  want to b -> want to bu
7 want to bu -> ant to bui
8 ant to bui -> nt to buil
9 nt to buil -> t to build
10 t to build ->  to build 
11  to build  -> to build a
12 to build a -> o build a 
13 o build a  ->  build a s
14  build a s -> build a sh
15 build a sh -> uild a shi
16 uild a shi -> ild a ship
17 ild a ship -> ld a ship,
18 ld a ship, -> d a ship, 
19 d a ship,  ->  a ship, d
20  a ship, d -> a ship, do
21 a ship, do ->  ship, don
22  ship, don -> ship, don'
23 ship, don' -> hip, don't
24 hip, don't -> ip, don't 
25 ip, don't  -> p, don't d
26 p, don't d -> , don't dr
27 , don't dr ->  don't dru
28  don't dru -> don't drum
29 don't drum -> on't drum 
30 on't drum  -> n't drum u
31 n't drum u -> 't drum up
32 't drum up -> t drum up 
33 t drum up  ->  drum up p
34  drum up p -> drum up pe
35 drum up pe -> rum up peo
36

In [17]:
# 데이터 원-핫벡터 변환
x_one_hot = [np.eye(dic_size)[x] for x in x_data]
X = torch.FloatTensor(x_one_hot)
Y = torch.LongTensor(y_data)
print('훈련 데이터의 크기 : {}'.format(X.shape))
print('레이블의 크기 : {}'.format(Y.shape))
print(X[0])
print(Y[0])

훈련 데이터의 크기 : torch.Size([170, 10, 25])
레이블의 크기 : torch.Size([170, 10])
tensor([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.

In [19]:
### RNN ###
# RNN셀 구현 - 은닉층 2개 사용
class Net(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, layers):
        super(Net, self).__init__()
        self.rnn = torch.nn.RNN(input_dim, hidden_dim, num_layers=layers, batch_first=True) # RNN셀 구현 - 은닉층 2개 사용
        self.fc = torch.nn.Linear(hidden_dim, hidden_dim, bias=True) # 출력층 구현
    def forward(self, x):
        x, _status = self.rnn(x)
        x = self.fc(x)
        return x

net = Net(input_size, hidden_size, 2)
outputs = net(X)
print(outputs.shape) # 3차원 텐서
    # RNN셀의 리턴 : 배치 크기, 총 시점 개수, 출력 크기
print(outputs.view(-1, dic_size).shape) # 2차원 텐서 변환
print(Y.shape)
print(Y.view(-1).shape) # 2차원 텐서 변환

# 최적화함수, 손실함수 정의
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), learning_rate)

torch.Size([170, 10, 25])
torch.Size([1700, 25])
torch.Size([170, 10])
torch.Size([1700])


In [20]:
# 학습
for i in range(100):
    # 최적화함수 초기화
    optimizer.zero_grad()
    
    # 학습
    outputs = net(X) # (170, 10, 25) 3D 텐서를 매 에포크마다 모델의 입력으로 사용
    loss = criterion(outputs.view(-1, dic_size), Y.view(-1)) # batch차원 제거를 위해 np.view 사용
    loss.backward() # 손실함수를 이용한 기울기 계산
    optimizer.step() # 최적화함수를 이용한 파라미터 업데이트
    
    # 모델 예측값 확인
    results = outputs.argmax(dim=2) # results 텐서 크기 : (170, 10)
    predict_str = ""
    for j, result in enumerate(results):
        if j == 0: # 처음에는 예측 결과 전부 가져옴
            predict_str += ''.join([char_set[t] for t in result]) # 정수 인코딩 풀어씀
        else: # 그 이후에는 마지막 글자만 반복해서 추가
            predict_str += char_set[result[-1]] # 정수 인코딩 풀어씀
    
    print(predict_str)

ppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppwpppppppp
                                                                                                                                                                                   
t          t          t     t          t                             t                                          t                             t                                    
eht ehohehohhehlhehohhehohehdhehohehehoeheehoehohhhehohehoheetoeehohhehehhhhehhhehohehehehhhehhohhehohehehdlehohelehdhehehehoehohehehohhetlhehdhehhhehohehehehehehohhhhhhoehhhhoo h
elboelketkoeonloemoetoelneeot elnetoemodmnloeeoeekoeknetleeoekneloetoelnlohenlnonmnetoemnetooekneoeleeboeldoeknetoelbonlneeboelneboeloeoeloeloelelnotmn bonlnlneelnoeeeekoemoelkeem
  t tonlnt ton nt r tononln n nlnonmnlen rono t nt rrn nt t n n    nonl   t t ton nlnonln  t tn t n 

p you want to build a ship, don't drum up people together te collect wood and don't assign them tasks and dork, but rather teach them to long for the endless immensity of the eea.
p you want to build a ship, don't arum up people together te collect wood and don't assign them tasks and dork, but rather teach them to long for the endless immensity of the eea.
p you want to build a ship, don't drum up people together to collect wood and don't assign them tasks and dork, but rather toach them to long for the endless immensity of the eea.
p you want to build a ship, don't drum up people together te collect wood and don't assign them tasks and dork, but rather teach them to long for the endless immensity of the eea.
p you want to build a ship, don't drum up people together to collect wood and don't assign them tasks and dork, but rather teach them to long for the endless immensity of the eea.
p you want to build a ship, don't drum up people together to collect wood and don't assign them task