### Pad_Packed Sequence

In [1]:
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pack_sequence, pack_padded_sequence, pad_packed_sequence

# Toy corpus: a few randomly generated words/phrases of different lengths,
# used throughout as variable-length character sequences.
data = [
    'hello world',
    'midnight',
    'calculation',
    'path',
    'short circuit',
]
data

['hello world', 'midnight', 'calculation', 'path', 'short circuit']

### char vocab 만들기

In [8]:
# Build the character vocabulary: the '<pad>' token comes first, followed by
# every distinct character that appears anywhere in `data`.
# NOTE(review): the order of the characters comes from iterating a set, so it
# may differ from one interpreter run to the next.
char_set = ['<pad>', *set(ch for text in data for ch in text)]
print('char_set:', char_set)

char_set: ['<pad>', 'c', 'p', ' ', 's', 'h', 'r', 'e', 'a', 'i', 'n', 'g', 't', 'o', 'u', 'l', 'm', 'd', 'w']


In [9]:
# Construct the character -> index dictionary: each vocabulary entry maps to
# its position in char_set (so '<pad>', the first entry, maps to 0).
char2idx = dict(zip(char_set, range(len(char_set))))
print(char2idx)
print('char_set length:', len(char_set))

{'<pad>': 0, 'c': 1, 'p': 2, ' ': 3, 's': 4, 'h': 5, 'r': 6, 'e': 7, 'a': 8, 'i': 9, 'n': 10, 'g': 11, 't': 12, 'o': 13, 'u': 14, 'l': 15, 'm': 16, 'd': 17, 'w': 18}
char_set length: 19


#### 문장별로 idx화

In [10]:
# Encode every string in `data` as a 1-D LongTensor of character indices.
# The result is a list with one variable-length tensor per input string.
X = [torch.LongTensor(list(map(char2idx.__getitem__, text))) for text in data]
X

[tensor([ 5,  7, 15, 15, 13,  3, 18, 13,  6, 15, 17]),
 tensor([16,  9, 17, 10,  9, 11,  5, 12]),
 tensor([ 1,  8, 15,  1, 14, 15,  8, 12,  9, 13, 10]),
 tensor([ 2,  8, 12,  5]),
 tensor([ 4,  5, 13,  6, 12,  3,  1,  9,  6,  1, 14,  9, 12])]

### Pad Sequence

In [13]:
# Build one tensor of shape (batch, max_sequence_length) from the list X.
# Shorter sequences are filled with pad_sequence's default padding value;
# X itself is left unchanged and the padded result lives in padded_sequence.
padded_sequence = pad_sequence(X, batch_first=True)
print(padded_sequence, padded_sequence.size(), sep='\n')

tensor([[ 5,  7, 15, 15, 13,  3, 18, 13,  6, 15, 17,  0,  0],
        [16,  9, 17, 10,  9, 11,  5, 12,  0,  0,  0,  0,  0],
        [ 1,  8, 15,  1, 14, 15,  8, 12,  9, 13, 10,  0,  0],
        [ 2,  8, 12,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 4,  5, 13,  6, 12,  3,  1,  9,  6,  1, 14,  9, 12]])
torch.Size([5, 13])


#### 길이 기준 내림차순 정렬 (pack_sequence는 기본적으로 길이가 긴 순서로 정렬된 입력을 요구한다)

In [21]:
# Record the true (unpadded) length of every sequence in X.
# These lengths are needed later by pack_padded_sequence.
raw_lengths = list(map(len, X))
print('lengths:', raw_lengths)

lengths: [11, 8, 11, 4, 13]


In [22]:
# Indices into X ordered by sequence length, longest first
# (reverse=True gives descending order).
# Python's sort is stable, so sequences of equal length keep their original
# relative order.
sorted_idx = sorted(range(len(raw_lengths)), key=raw_lengths.__getitem__, reverse=True)
sorted_idx

[4, 0, 2, 1, 3]

In [15]:
# Reorder the index tensors according to sorted_idx (longest sequence first)
sorted_X = list(map(X.__getitem__, sorted_idx))
sorted_X

[tensor([ 4,  5, 13,  6, 12,  3,  1,  9,  6,  1, 14,  9, 12]),
 tensor([ 5,  7, 15, 15, 13,  3, 18, 13,  6, 15, 17]),
 tensor([ 1,  8, 15,  1, 14, 15,  8, 12,  9, 13, 10]),
 tensor([16,  9, 17, 10,  9, 11,  5, 12]),
 tensor([ 2,  8, 12,  5])]

### Packed Sequence : 시간 단계(time step)별로 세로로 잘라, 각 단계에 실제로 존재하는 원소만 이어 붙인다

In [25]:
# Pack the length-sorted index tensors into a single PackedSequence.
# enforce_sorted=True (the default) requires the input to be ordered from
# longest to shortest, which is why sorted_X is used instead of X.
packed_sequence = pack_sequence(sorted_X, enforce_sorted=True)
print(packed_sequence)

PackedSequence(data=tensor([ 4,  5,  1, 16,  2,  5,  7,  8,  9,  8, 13, 15, 15, 17, 12,  6, 15,  1,
        10,  5, 12, 13, 14,  9,  3,  3, 15, 11,  1, 18,  8,  5,  9, 13, 12, 12,
         6,  6,  9,  1, 15, 13, 14, 17, 10,  9, 12]), batch_sizes=tensor([5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 1, 1]), sorted_indices=None, unsorted_indices=None)


### One-hot Embedding 해보기

In [27]:
# Lookup table for one-hot embedding: a square identity matrix with one row
# per vocabulary entry, so row i is the one-hot vector for index i.
eye = torch.eye(len(char_set), len(char_set))
eye.size()

torch.Size([19, 19])

#### Pad One-hot : 문장별 무조건 13개씩을 vocab[19] 기준으로 one-hot ( 5 X 13 = 65개를 One-Hot )

In [38]:
# One-hot encode the padded batch by using each character index as a row
# lookup into the identity matrix. Padding positions are looked up as well
# (index 0), so they become the one-hot vector for '<pad>'.
embedded_tensor = eye[padded_sequence]
# Resulting shape: (batch_size, max_sequence_length, number_of_input_tokens)
print(embedded_tensor.size())

torch.Size([5, 13, 19])


#### Packed One-hot : 존재하는 대상 47개만 vocab[19] 기준으로 one-hot ( Pad 대비 효율성 초점 )

In [39]:
# One-hot embedding for the packed representation: one-hot encode each
# sequence separately (longest first, following sorted_idx), then pack the
# resulting variable-length tensors. Only real characters are stored.
embedded_packed_seq = pack_sequence(
    [eye[seq] for seq in map(X.__getitem__, sorted_idx)]
)
print(embedded_packed_seq.data.size())

torch.Size([47, 19])


##### (예) Pad 1개만 확인해보기

In [36]:
# Display the padded index matrix again for comparison with its one-hot form
padded_sequence

tensor([[ 5,  7, 15, 15, 13,  3, 18, 13,  6, 15, 17,  0,  0],
        [16,  9, 17, 10,  9, 11,  5, 12,  0,  0,  0,  0,  0],
        [ 1,  8, 15,  1, 14, 15,  8, 12,  9, 13, 10,  0,  0],
        [ 2,  8, 12,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 4,  5, 13,  6, 12,  3,  1,  9,  6,  1, 14,  9, 12]])

In [37]:
# Display the one-hot rows of the first sequence in the padded batch
embedded_tensor[0]

tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0.],
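        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0.]])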

In [43]:
# Declare an RNN whose input size equals the vocabulary size (one-hot vectors)
# and whose hidden state has 30 units; batch_first=True means padded inputs
# are laid out as (batch, seq, feature).
rnn = torch.nn.RNN(len(char_set), 30, batch_first=True)

# Run the padded one-hot batch through the RNN
rnn_output, hidden = rnn(embedded_tensor)
# Output shape: (batch_size, max_seq_length, hidden_size)
print(rnn_output.size())
# print(hidden.shape)     # shape: (num_layers * num_directions, batch_size, hidden_size)

torch.Size([5, 13, 30])


In [44]:
# Run the same RNN on the packed one-hot input. The output is read through
# its .data attribute, which holds one row per real (non-padded) time step.
rnn_output, hidden = rnn(embedded_packed_seq)
print(rnn_output.data.size())
# print(hidden.data.shape)

torch.Size([47, 30])


### pad ← packed : pad_packed_sequence()

In [45]:
# Convert the packed one-hot sequence back into a padded batch.
# pad_packed_sequence returns the padded tensor together with the length of
# every sequence in the batch.
unpacked_sequence, seq_lengths = pad_packed_sequence(embedded_packed_seq, batch_first=True)
print(unpacked_sequence.size(), seq_lengths, sep='\n')

torch.Size([5, 13, 19])
tensor([13, 11, 11,  8,  4])


### packed ← pad : pack_padded_sequence()

In [47]:
# Build a padded one-hot batch from the length-sorted sequences:
# pad the index tensors first, then look every index up in the identity matrix.
embedded_padded_sequence = eye[pad_sequence(sorted_X, batch_first=True)]
print(embedded_padded_sequence.size())

torch.Size([5, 13, 19])


In [48]:
# Try out pack_padded_sequence: convert a padded batch into a PackedSequence.
# embedded_padded_sequence was built from the length-sorted sequences, so the
# lengths passed alongside it must be in the same descending order; they are
# derived from raw_lengths, the per-sequence lengths computed earlier.
sorted_lengths = sorted(raw_lengths, reverse=True)
new_packed_sequence = pack_padded_sequence(embedded_padded_sequence, sorted_lengths, batch_first=True)
print(new_packed_sequence.data.shape)   # (total number of real time steps, vocabulary size)
print(new_packed_sequence.batch_sizes)  # number of sequences still active at each time step

torch.Size([47, 19])
tensor([5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 1, 1])
