<a href="https://colab.research.google.com/github/dasom222g/learn-LLM/blob/main/01_1_RNN_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Sequential Data 준비

In [None]:
import torch
import torch.nn as nn

# Hyperparameters
# Shape of the data: time steps (tokens/words) per sequence, and sequences per batch
sequence_length, batch_size = 6, 3
# Width of each input vector (per-token feature dimension)
input_size = 5
# Width of the RNN hidden state
hidden_size = 10

In [None]:
# Random input laid out time-major: (sequence length, batch size, input dimension).
# A later cell transposes it before feeding the batch_first=True RNN.
inputs = torch.randn((sequence_length, batch_size, input_size))
inputs.shape  # sequence length, batch size, input (embedding) dimension

# Terminology:
#   nesting depth of the array  -> "n-D tensor" (its rank)
#   number of elements in a row -> "n-dimensional vector", i.e. the input dimension

torch.Size([6, 3, 5])

## RNN 모델 사용하기 기본

RNN 공식문서 : https://pytorch.org/docs/stable/generated/torch.nn.RNN.html

데이터의 차원과 모델의 입출력 차원을 주의깊게 확인합시다

In [None]:
# Single-layer RNN. batch_first=False is the default (so it could be omitted):
# the input is expected as (seq_len, batch, input_size).
rnn = nn.RNN(input_size, hidden_size, batch_first=False)

# Run the RNN over the whole sequence.
# output: the RNN output at every time step (the values that leave the layer)
# hidden: the hidden state after the last time step (the value carried forward)
# For this single-layer, unidirectional RNN, output[-1] holds the same values
# as hidden[0]; the earlier time steps exist only in output.
output, hidden = rnn(inputs)

# Print the shapes, as the labels say, instead of dumping the full tensors.
print("Output shape:", output.shape)  # (seq_len, batch, hidden_size)
print("Hidden shape:", hidden.shape)  # (num_layers, batch, hidden_size)

Output shape: tensor([[[-7.1568e-01,  1.9286e-01,  4.9208e-01,  4.4872e-01,  8.2368e-01,
          -1.9171e-01, -5.3505e-01, -4.4872e-01, -1.3475e-01, -5.9510e-01],
         [ 6.3350e-02,  2.2056e-01, -3.9921e-01, -5.5007e-01,  1.3412e-01,
          -1.1354e-01, -3.8379e-01,  3.0552e-03,  4.4738e-01,  3.2139e-01],
         [-8.6666e-01, -1.2599e-01,  7.8862e-01,  4.9334e-01,  7.8039e-01,
          -1.3577e-01,  3.3897e-01, -5.8275e-01, -4.9020e-01, -3.3513e-01]],

        [[-5.0238e-01,  2.4940e-02,  1.1960e-01,  3.6320e-01,  3.6992e-01,
          -1.0578e-01, -3.7739e-02, -4.8493e-02,  3.4798e-01, -5.0073e-01],
         [-5.4984e-01,  3.2763e-01,  5.3948e-01,  3.0824e-01,  6.7658e-01,
           5.6096e-03, -5.6853e-01, -3.1793e-01, -1.5277e-01, -1.0775e-01],
         [-8.7698e-01, -5.9686e-01,  4.6487e-01,  1.7759e-01,  4.4985e-01,
          -5.3241e-01,  3.4904e-01,  2.1645e-01, -4.3878e-01, -1.2425e-01]],

        [[-6.1331e-01,  3.8120e-02,  2.5483e-01,  5.0383e-01,  7.2624e-01,
 

In [None]:
# Same kind of RNN, configured for batch-major input: (batch, seq_len, input_size)
rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True)

# inputs was created time-major, so swap the first two axes: (6, 3, 5) -> (3, 6, 5)
batch_major_inputs = inputs.transpose(0, 1)

# output: the RNN output at every time step
# hidden: the hidden state after the last time step
# The last time step of output holds the same values as hidden.
output, hidden = rnn(batch_major_inputs)

print("Output shape:", output.shape)  # (batch_size, sequence_length, hidden_size) -> 3-D tensor
print("Hidden shape:", hidden.shape)  # (num_layers, batch_size, hidden_size)

output

Output shape: torch.Size([3, 6, 10])
Hidden shape: torch.Size([1, 3, 10])


tensor([[[-4.1885e-01,  6.0842e-01,  5.6368e-01,  3.0921e-01,  1.2162e-01,
          -1.9593e-01,  4.2528e-01, -3.3651e-01, -5.5549e-01, -2.5051e-01],
         [-6.4564e-01,  4.1438e-01,  3.7881e-01,  3.5976e-01,  5.3905e-01,
           4.5546e-02,  1.3136e-01, -8.2601e-01, -2.4693e-01,  8.7032e-02],
         [-7.7738e-01, -5.1519e-02,  3.6897e-01, -1.4760e-01,  1.8256e-01,
           1.8522e-01,  4.0770e-01, -7.9235e-01,  1.4749e-01,  2.5158e-01],
         [-5.7527e-01,  1.8924e-01, -3.0826e-01, -3.1468e-01,  3.4373e-01,
           7.7289e-02,  3.0908e-01, -5.7196e-01,  3.3536e-02, -1.0096e-02],
         [-4.4764e-01,  2.6334e-01,  1.6981e-01,  6.8910e-02,  3.8579e-01,
           2.1305e-02,  1.7999e-01, -7.2224e-01, -1.5625e-01,  1.0449e-01],
         [-2.6448e-01,  3.1103e-01,  3.0608e-01,  1.7974e-01, -2.7373e-02,
           1.9205e-01,  2.5127e-01, -4.7278e-01, -1.7284e-01, -1.4480e-01]],

        [[-8.1686e-02,  8.2762e-02, -4.9191e-01, -1.7225e-01,  4.9303e-01,
          -1.2664

In [None]:
# Last sample in the batch: all of its time steps and all hidden dimensions
output[-1]

tensor([[ 0.4811,  0.1955, -0.1521,  0.5460, -0.1141, -0.5754, -0.1652,  0.6976,
         -0.2336, -0.5983],
        [-0.1442, -0.0678,  0.2548, -0.2921, -0.3747, -0.4995,  0.1132,  0.8637,
         -0.1424, -0.7099],
        [-0.8132, -0.6771,  0.2329,  0.0566, -0.4394, -0.6033,  0.0671, -0.5342,
         -0.9211, -0.2553],
        [ 0.3065, -0.5189, -0.3251,  0.5928,  0.3215,  0.3675, -0.2593,  0.5227,
         -0.5719, -0.0708],
        [ 0.5156, -0.3762,  0.0249,  0.0448, -0.3481, -0.5472, -0.3675, -0.0805,
         -0.7982, -0.6370],
        [-0.5113, -0.5349, -0.0451,  0.3965, -0.2692, -0.1819,  0.4943,  0.1494,
         -0.7939,  0.0140]], grad_fn=<SelectBackward0>)

In [None]:
# Every sample in the batch, last time step only, all hidden dimensions
output[:, -1, :]

tensor([[-0.2645,  0.3110,  0.3061,  0.1797, -0.0274,  0.1921,  0.2513, -0.4728,
         -0.1728, -0.1448],
        [ 0.2712, -0.4295,  0.1663, -0.1453, -0.6062,  0.5938,  0.5080, -0.6879,
          0.0030,  0.1728],
        [ 0.0658,  0.2410,  0.0663, -0.0043,  0.0598,  0.1110,  0.3407, -0.3292,
          0.1252,  0.1691]], grad_fn=<SliceBackward0>)

In [None]:
# Final hidden state of layer 0, which is the only layer of this RNN.
# Its values match output[:, -1, :], the last time step of every sample.
hidden[0]

tensor([[-0.2645,  0.3110,  0.3061,  0.1797, -0.0274,  0.1921,  0.2513, -0.4728,
         -0.1728, -0.1448],
        [ 0.2712, -0.4295,  0.1663, -0.1453, -0.6062,  0.5938,  0.5080, -0.6879,
          0.0030,  0.1728],
        [ 0.0658,  0.2410,  0.0663, -0.0043,  0.0598,  0.1110,  0.3407, -0.3292,
          0.1252,  0.1691]], grad_fn=<SelectBackward0>)

### RNN 모델 학습해보기

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Hyperparameters
## Model dimensions: input feature width and hidden state width
input_size, hidden_size = 5, 10

## Data dimensions: time steps per sequence, sequences per batch, target classes
sequence_length, batch_size, num_classes = 6, 3, 2

## Optimisation settings: Adam step size and number of passes over the batch
learning_rate, num_epochs = 0.01, 20

class SimpleRNN(nn.Module):
    """Minimal RNN classifier: one recurrent layer followed by a linear head.

    Args:
        input_size: Feature dimension of each time step.
        hidden_size: Width of the RNN hidden state.
        num_classes: Number of logits produced, i.e. the output dimension.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        # batch_first=True: the RNN takes input as (batch, seq_len, input_size)
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return class logits computed from the RNN's final hidden state."""
        # The per-step outputs are not needed, only the final hidden state.
        _, final_hidden = self.rnn(x)
        # final_hidden is (num_layers, batch, hidden_size); index 0 selects the
        # single layer, leaving the state after the last time step.
        last_state = final_hidden[0]
        return self.fc(last_state)

# Build the model, the loss function and the optimizer
model = SimpleRNN(input_size, hidden_size, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Random inputs plus one label per sequence (batch size is 3, hence three labels)
inputs = torch.randn(batch_size, sequence_length, input_size)
labels = torch.tensor([0, 1, 0])

# Training loop
for epoch in range(num_epochs):
    # Clear the gradients of the previous step before the new backward pass
    optimizer.zero_grad()

    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # Report every fifth epoch
    if epoch % 5 == 4:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [5/20], Loss: 0.4133
Epoch [10/20], Loss: 0.1958
Epoch [15/20], Loss: 0.0717
Epoch [20/20], Loss: 0.0265


### multi layer RNN

In [None]:
# Stack four recurrent layers on top of each other
num_layers = 4
rnn = nn.RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
)
outputs, hidden = rnn(inputs)  # inputs: (batch_size, seq_length, input_size)
print("Output shape:", outputs.shape)  # (batch_size, seq_length, hidden_size)
print("Hidden shape:", hidden.shape)  # (num_layers, batch_size, hidden_size)
inputs.shape

Output shape: torch.Size([6, 3, 10])
Hidden shape: torch.Size([4, 6, 10])


torch.Size([6, 3, 5])

In [None]:
# Display the full output tensor returned by the multi-layer RNN
outputs

tensor([[[-6.0321e-02, -1.1036e-01, -3.8680e-01,  4.0996e-01,  1.3605e-01,
           1.4679e-01,  3.3409e-01, -3.9453e-01,  1.7790e-01, -5.4677e-02],
         [-2.3547e-01, -5.3780e-01, -7.3781e-01,  6.7565e-01, -2.8192e-01,
           1.7024e-01,  1.4052e-01, -4.8983e-01, -3.4341e-01,  3.8422e-02],
         [-9.9470e-02, -2.7106e-01, -6.7541e-01,  5.1435e-01, -2.6804e-01,
          -8.8027e-02,  1.5239e-01, -4.5595e-01, -2.7170e-02, -5.1990e-01]],

        [[-4.8150e-02, -1.6367e-01, -3.6468e-01,  4.2031e-01,  6.2057e-02,
           2.0494e-01,  3.3399e-01, -3.7709e-01,  8.8626e-02, -7.7415e-02],
         [-2.2989e-01, -4.4576e-01, -7.5957e-01,  6.4519e-01, -1.5392e-01,
           3.7693e-02,  1.4025e-01, -4.9696e-01, -2.2738e-01,  5.5977e-03],
         [-1.6598e-01, -3.1098e-01, -6.8307e-01,  5.8199e-01, -2.9750e-01,
           1.5884e-02,  2.0567e-01, -5.0048e-01, -7.2553e-03, -4.6127e-01]],

        [[-8.5282e-02, -8.5344e-02, -3.0648e-01,  4.6345e-01,  7.2852e-02,
           1.76

In [None]:
# Index -1 along dim 1 for every entry of dim 0: with batch_first=True this is
# the last time step of each sequence
outputs[:, -1, :]

tensor([[-0.0780, -0.4109, -0.6987, -0.5977, -0.6827, -0.0499, -0.0374, -0.2129,
          0.1733,  0.4696],
        [-0.0020, -0.3693, -0.6721, -0.5313, -0.6638, -0.0070, -0.0690, -0.2668,
          0.2105,  0.4653],
        [-0.0764, -0.4854, -0.7654, -0.6909, -0.6927, -0.1700, -0.0544, -0.1507,
          0.1012,  0.4473],
        [ 0.1698, -0.3215, -0.6675, -0.5530, -0.6117,  0.0966, -0.1849, -0.2177,
          0.2710,  0.5522],
        [-0.0141, -0.4239, -0.7258, -0.6475, -0.6878, -0.0706, -0.0863, -0.1808,
          0.1743,  0.4917],
        [ 0.1054, -0.4292, -0.7287, -0.6091, -0.6273, -0.0515, -0.1865, -0.1488,
          0.2054,  0.5665]], grad_fn=<SliceBackward0>)

In [None]:
# hidden[0] is the final state of the FIRST layer only. outputs comes from the
# last layer, so these values differ from outputs[:, -1, :]; hidden[-1] is the
# entry that corresponds to it.
hidden[0]

tensor([[ 0.4036,  0.3313,  0.4341, -0.0653, -0.4499, -0.3079,  0.0767, -0.6630,
          0.0095,  0.0548],
        [ 0.2720,  0.0203, -0.0760,  0.0577, -0.7869, -0.3961,  0.3737,  0.1212,
         -0.0698,  0.6628],
        [ 0.4125,  0.8248,  0.2614, -0.8441,  0.3923, -0.4826,  0.7592, -0.2425,
          0.8120,  0.0971],
        [ 0.4810, -0.3693,  0.0860,  0.2265, -0.3943, -0.2195,  0.5495,  0.0157,
         -0.0257,  0.4527],
        [ 0.2624,  0.3486,  0.3045, -0.3112, -0.0886,  0.0737,  0.0924, -0.4091,
          0.5287,  0.1673],
        [ 0.0163, -0.1539,  0.1215, -0.2946,  0.1504, -0.6154,  0.2515,  0.3272,
          0.5939,  0.0699]], grad_fn=<SelectBackward0>)

In [None]:
class MultiLayerRNN(nn.Module):
    """Stacked RNN classifier: num_layers recurrent layers plus a linear head.

    Args:
        input_size: Feature dimension of each time step.
        hidden_size: Width of the hidden state in every layer.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of logits produced.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        # batch_first=True: the RNN takes input as (batch, seq_len, input_size)
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return class logits computed from the last time step of the RNN output."""
        sequence_out, _ = self.rnn(x)
        # Keep only the last time step of every sequence (the final hidden state)
        last_step = sequence_out[:, -1]
        return self.fc(last_step)

# Build a four-layer model together with its loss function and optimizer
num_layers = 4
model = MultiLayerRNN(input_size, hidden_size, num_layers, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Random inputs and one label per sequence
inputs = torch.randn(batch_size, sequence_length, input_size)
labels = torch.tensor([0, 1, 0])

# Training loop
for epoch in range(num_epochs):
    # Reset gradients first so each step only uses the current backward pass
    optimizer.zero_grad()

    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # Log on every fifth epoch
    if not (epoch + 1) % 5:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [5/20], Loss: 0.5117
Epoch [10/20], Loss: 0.1344
Epoch [15/20], Loss: 0.0332
Epoch [20/20], Loss: 0.0134


### 양방향 RNN

In [None]:
# bidirectional=True (used here together with batch_first=True) makes the RNN bidirectional
rnn = nn.RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    batch_first=True,
    bidirectional=True,
)
outputs, hidden = rnn(inputs)

# Reference values for reading the shapes:
#   input_size = 5, hidden_size = 10, sequence_length = 6, batch_size = 3, num_layers = 4
print("Output shape:", outputs.shape)  # (batch_size, seq_length, hidden_size * 2)
print("Hidden shape:", hidden.shape)  # (num_layers * 2, batch_size, hidden_size)

Output shape: torch.Size([3, 6, 20])
Hidden shape: torch.Size([8, 3, 10])


In [None]:
# The RNN's hidden tensor stacks the forward and backward states in order, from the
# first layer to the last. The output tensor instead concatenates both directions
# along the feature axis: forward half first, backward half second.
# Forward direction: its final result sits at the last time step.
out_forward = outputs[:, -1, :hidden_size]
# Backward direction runs from the end to the start, so its final result sits at
# the first time step.
out_backward = outputs[:, 0, hidden_size:]

# Join the forward and backward results along the feature axis
out_combined = torch.cat([out_forward, out_backward], dim=-1)

In [None]:
# Alternatively, build the same combination from hidden: its last two entries are
# the forward and backward states of the last layer.
# https://stackoverflow.com/questions/63121983/bidirectional-rnn-implementation-pytorch
out_combined = torch.cat((hidden[-2], hidden[-1]), dim=1)

In [None]:
class BiDirectionalRNN(nn.Module):
    """Bidirectional stacked RNN classifier with a linear head.

    Args:
        input_size: Feature dimension of each time step.
        hidden_size: Width of the hidden state per direction.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of logits produced.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, so the linear head sees twice the hidden size
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        """Return class logits from the last layer's forward and backward final states."""
        # The per-step outputs are not needed, only the final hidden states.
        _, hidden = self.rnn(x)

        # hidden[-2]: last layer, forward direction, state after the last time step
        # hidden[-1]: last layer, backward direction, state after it has run back
        #             to the first time step
        forward_state, backward_state = hidden[-2], hidden[-1]
        both_directions = torch.cat((forward_state, backward_state), dim=1)
        return self.fc(both_directions)

# Build the two-layer bidirectional model
model = BiDirectionalRNN(input_size, hidden_size, 2, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Random inputs and one label per sequence
inputs = torch.randn(batch_size, sequence_length, input_size)
labels = torch.tensor([0, 1, 0])

# Training loop
for epoch in range(num_epochs):
    # Start every step from cleared gradients
    optimizer.zero_grad()

    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # Print the loss on epochs 5, 10, 15, ...
    if (epoch + 1) % 5 != 0:
        continue
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [5/20], Loss: 2.2887
Epoch [10/20], Loss: 1.8511
Epoch [15/20], Loss: 1.6225
Epoch [20/20], Loss: 1.4792
