<a href="https://colab.research.google.com/github/chaiminwoo0223/Deep-Learning/blob/main/14%20-%20Char_RNN_Naive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Download

In [1]:
!rm -r data
import os 

# Create the download directory. exist_ok=True makes this a no-op when the
# directory is already there, while genuine failures (e.g. permission errors)
# still raise instead of being silently swallowed.
os.makedirs("./data", exist_ok=True)

# Download the tinyshakespeare input.txt; -P ./data stores it inside the ./data directory.
!wget https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt -P ./data

--2023-05-08 11:34:24--  https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘./data/input.txt’


2023-05-08 11:34:24 (20.6 MB/s) - ‘./data/input.txt’ saved [1115394/1115394]



In [2]:
!pip install unidecode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Import

In [3]:
import torch
import torch.nn as nn
import unidecode
import string
import random
import re
import time, math

# Hyperparameters

In [4]:
# Number of iterations of the training loop; each iteration trains on one random chunk.
num_epochs = 2000
# NOTE(review): not referenced in the visible cells; the training loop logs on a literal `i % 10`.
print_every = 100
# NOTE(review): not referenced in the visible cells.
plot_every = 10
# Chunk length in characters; random_chunk slices chunk_len + 1 characters.
chunk_len = 200
# Size of the RNN hidden state.
hidden_size = 100
# Read as a module-level global inside the RNN class.
batch_size = 1
# NOTE(review): the model is constructed with a literal num_layers=2, so this value is not used.
num_layers = 1
# Dimension of the character embedding.
embedding_size = 70
# Learning rate passed to the Adam optimizer.
lr = 0.002

# Data
## 1.Prepare Characters

In [5]:
# Use every character in string.printable as the vocabulary.
all_characters = string.printable
n_characters = len(all_characters)
print(all_characters)
print("num_chars =", n_characters)

0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 	

num_chars = 100


## 2.Get Text Data

In [6]:
# Read the downloaded text file and pass it through unidecode.
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving it open until garbage collection.
with open('./data/input.txt') as corpus:
    file = unidecode.unidecode(corpus.read())
file_len = len(file)
print("file_len =", file_len)

file_len = 1115394


# Functions For Text Processing
## 1.Random Chunk

In [7]:
def random_chunk(text=None, length=None):
    """Return a random slice of exactly ``length + 1`` characters from ``text``.

    The extra character lets the caller derive an input sequence (all but the
    last character) and a target sequence (all but the first) of ``length``
    characters each.

    Args:
        text: String to sample from. Defaults to the module-level ``file``.
        length: Number of characters per input sequence. Defaults to the
            module-level ``chunk_len``.

    Returns:
        A substring of ``text`` that is ``length + 1`` characters long.

    Raises:
        ValueError: If ``text`` holds fewer than ``length + 1`` characters.
    """
    if text is None:
        text = file
    if length is None:
        length = chunk_len

    span = length + 1
    if len(text) < span:
        raise ValueError(
            "text has %d characters but a chunk needs %d" % (len(text), span)
        )

    # random.randint is inclusive on both ends, so the last valid start is
    # len(text) - span; anything later would yield a truncated slice.
    start_index = random.randint(0, len(text) - span)
    return text[start_index:start_index + span]

# Print one sampled chunk as a quick sanity check.
print(random_chunk())

he haunts
wakes, fairs and bear-baitings.

AUTOLYCUS:
Very true, sir; he, sir, he; that's the rogue that
put me into this apparel.

Clown:
Not a more cowardly rogue in all Bohemia: if you had
but looke


## 2.Character To Tensor

In [8]:
def char_tensor(string):
    """Encode ``string`` as a 1-D LongTensor of positions in ``all_characters``.

    Each character is replaced by its index in ``all_characters``; a character
    that is not part of ``all_characters`` raises ValueError (from ``str.index``).
    """
    positions = [all_characters.index(symbol) for symbol in string]
    return torch.tensor(positions, dtype=torch.long)
print(char_tensor('ABCdef'))

tensor([36, 37, 38, 13, 14, 15])


## 3.Chunk Into Input & Label

In [9]:
def random_training_set():
    """Sample a random chunk and turn it into an (input, target) tensor pair.

    The target is the input shifted by one character,
    e.g. "pytorch" -> input "pytorc", target "ytorch".
    """
    text = random_chunk()
    source_text, target_text = text[:-1], text[1:]
    return char_tensor(source_text), char_tensor(target_text)

# RNN

## 1.Model

In [10]:
class RNN(nn.Module):
    """Character-level RNN: embedding -> nn.RNN -> linear projection.

    Args:
        input_size: Number of distinct input symbols (rows of the embedding).
        embedding_size: Dimension of each symbol embedding.
        hidden_size: Size of the recurrent hidden state.
        output_size: Number of scores produced per step.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers = 1):
        super().__init__()
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.encoder = nn.Embedding(self.input_size, self.embedding_size)
        self.rnn = nn.RNN(self.embedding_size, self.hidden_size, self.num_layers)
        self.decoder = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden):
        """Advance the network by a single time step.

        Args:
            input: Tensor of symbol indices. It is flattened and treated as
                one time step whose batch dimension is ``input.numel()``.
            hidden: Hidden state of shape (num_layers, batch, hidden_size).

        Returns:
            Tuple ``(out, hidden)``: ``out`` holds the raw decoder scores with
            shape (batch, output_size); ``hidden`` is the updated state.
        """
        out = self.encoder(input.view(1, -1))   # (1, batch, embedding_size)
        out, hidden = self.rnn(out, hidden)     # (1, batch, hidden_size)
        # Reshape by hidden_size rather than by a module-level batch_size, so
        # the result is (batch, hidden_size) for whatever batch was passed in.
        out = self.decoder(out.view(-1, self.hidden_size))
        return out, hidden

    def init_hidden(self, n_batch=None):
        """Return an all-zero hidden state.

        Args:
            n_batch: Batch dimension of the state. When omitted, the
                module-level ``batch_size`` is used, which keeps existing
                ``init_hidden()`` calls working unchanged.

        Returns:
            Zero tensor of shape (num_layers, n_batch, hidden_size).
        """
        if n_batch is None:
            n_batch = batch_size
        return torch.zeros(self.num_layers, n_batch, self.hidden_size)

# NOTE(review): num_layers is a literal 2 here, so the `num_layers = 1`
# hyperparameter defined earlier is not what the model is built with.
model = RNN(n_characters, embedding_size, hidden_size, n_characters, num_layers=2)        

In [11]:
# Smoke-test the model: encode one character, run a single forward step and
# print the sizes of the hidden state and of the output.
inp = char_tensor("A")
print(inp)
hidden = model.init_hidden()
print(hidden.size())
out, hidden = model(inp, hidden)
print(out.size())

tensor([36])
torch.Size([2, 1, 100])
torch.Size([1, 100])


## 2.Loss & Optimizer

In [12]:
# Cross-entropy loss, applied in the training loop to the model's output scores and the target index.
loss_func = nn.CrossEntropyLoss()
# Adam over all model parameters, using the learning rate from the hyperparameter cell.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## 3.Test Function

In [13]:
def test(start_str="b", length=200, temperature=0.8):
    """Generate text with the current model and print it as it is sampled.

    The model is fed one character at a time; each next character is drawn
    with torch.multinomial from the temperature-scaled output distribution.

    Args:
        start_str: Seed text. All but its last character only warm up the
            hidden state; generation continues from the last character.
        length: Number of characters to generate after the seed.
        temperature: Divides the output scores before sampling; lower values
            make the samples more conservative.

    Raises:
        ValueError: If ``start_str`` is empty or ``temperature`` is not positive.
    """
    if not start_str:
        raise ValueError("start_str must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    print(start_str, end="")

    # Sampling needs no gradients. Without no_grad every step would extend an
    # autograd graph through the hidden state that is carried along.
    with torch.no_grad():
        hidden = model.init_hidden()
        for seed_char in start_str[:-1]:
            _, hidden = model(char_tensor(seed_char), hidden)
        x = char_tensor(start_str[-1])

        for _ in range(length):
            output, hidden = model(x, hidden)
            # softmax(scores / T) is the normalised form of exp(scores / T),
            # i.e. the same sampling distribution, but it cannot overflow.
            output_dist = torch.softmax(output.view(-1) / temperature, dim=0)
            top_i = torch.multinomial(output_dist, 1).item()
            predicted_char = all_characters[top_i]
            print(predicted_char, end="")
            x = char_tensor(predicted_char)

# Train

In [14]:
for i in range(num_epochs):
    # Sample a random chunk of text and convert it to input / target tensors.
    inp, label = random_training_set()
    hidden = model.init_hidden()
    loss = torch.zeros(1)
    optimizer.zero_grad()

    # Visit every (input, target) pair. Taking the count from the tensor
    # itself includes the final pair, which range(chunk_len-1) left out.
    n_steps = inp.size(0)
    for j in range(n_steps):
        x = inp[j]
        # Give the target a batch dimension of size 1 to match the model output.
        y_ = label[j].unsqueeze(0)
        y, hidden = model(x, hidden)
        loss += loss_func(y, y_)

    # Back-propagate the summed loss of the whole chunk in one go.
    loss.backward()
    optimizer.step()

    # NOTE(review): the interval is a literal 10; the print_every
    # hyperparameter is not consulted here.
    if i % 10 == 0:
        # Divide by the number of summed terms so this is the mean per-character loss.
        print("\n", loss/n_steps, "\n")
        test()
        print("\n", "="*100)


 tensor([4.5949], grad_fn=<DivBackward0>) 

b_xH	M?M!@jcUh"cpP@{>PDJfT%6BiP{z.}^VE}G)3ix`9vv,.<#1Q3r{Y*%T@%(}yxHD?t[-	{D'Kj.#tjU)T[L
x=G;wUM.5;\-^q5{<DBFd/*%#T%_D'&{j|deW%/hyvvXosHI/?h66P5+c1<Odj_sc(l*CLmBn'ss,*IV_{<>[<L(>Y$w^n97'Da'd"
+`


 tensor([3.2770], grad_fn=<DivBackward0>) 

bC@` ie hrus ioe tsyotd1n ,GyneNhdi  s dnup tPkib-pnn teiLi,6  gle  ,aaoedl n eni*ith dtres{ai7uee Wi 
 aii`-.y.tu tn, os V tcsisnsyA aao' amtiitmorh:n fJiuitcuh,-sraowcU u iu nn rstohh o5ooL rl 
ue  o

 tensor([3.3283], grad_fn=<DivBackward0>) 

bn4 
tAP
9b3^["7OK,u Aatn
 smyn, nou r arUl
wemo
 d l th irkher mhi oltm rtn drt

I
nengd n 
edPh ae

wsesl oe f at
s voed er f
hd  d  n  rcrhho 
,an:
oea id eg.r I siem e dhIr
onrnto  Dad!znn t 

 tensor([3.0990], grad_fn=<DivBackward0>) 

bKra hrarCewd thenn  aise tu is te te sos lee  inecttiib; orwhom fby foan ateIrhErabtie oem ensefH oh srrsehVo Scumi ed dwe
o paratn a ar Gotbot tihelf
fhaa yea co me,; mef be
door mo fher nn scegv Kl

 tensor([2.9280],