# pytorch master [doc](https://pytorch.org/docs/stable/nn.html)

In [90]:
import torch
import torch.nn as nn
import math
from torch.autograd import Variable

## get some batch 

In [2]:
from process import get_data
from torchtext import data

In [3]:
data_path = "dataset/"
file_name = "train."

In [4]:
txt_en, train_en = get_data(file_path = data_path + file_name + 'en',
                           field_name = 'en')

In [5]:
train_loader = data.Iterator(train_en, batch_size = 3,
                            device = None, # if using GPU, type "cuda" 
                            repeat = False)

In [6]:
# Pull only the first batch out of the iterator; `batch` stays bound after the loop exits.
for batch in train_loader:
    break

In [7]:
a = batch.en

In [8]:
del txt_en, train_en, train_loader, batch.en

In [9]:
a.shape

torch.Size([3, 26])

In [17]:
b = a[1]

## Embeddings
- super().__init__() : nn.Module의 __init__() 상속
[ref](https://rednooby.tistory.com/56)
- nn.Embedding(V, D) : V = #(Vocab), D = #(Dim)
V, D 만큼의 Embedding 생성

In [14]:
class Embeddings(nn.Module):
    """Token-embedding lookup whose output is scaled by sqrt(d_model).

    Args:
        d_model: Size of each embedding vector.
        vocab: Number of rows in the lookup table (vocabulary size).
        padding_idx: Index whose embedding row is initialised to zeros and
            receives no gradient updates. Defaults to 1, the value that was
            previously hard-coded here. Pass ``None`` to disable padding
            handling.
    """

    def __init__(self, d_model, vocab, padding_idx=1):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model, padding_idx=padding_idx)
        self.d_model = d_model

    def forward(self, x):
        """Return the embeddings of index tensor ``x`` times sqrt(d_model).

        The result has shape ``x.shape + (d_model,)``.
        """
        return self.lut(x) * math.sqrt(self.d_model)

In [106]:
emb = nn.Embedding(25000,128, padding_idx = 1)

In [172]:
emb = Embeddings(d_model = 128, vocab = 25000)

In [173]:
x = emb(b).unsqueeze(0)

In [176]:
x

tensor([[[ -2.6353, -15.7527,  -7.7449,  ..., -16.0526,   3.2284,   0.6601],
         [ -0.2340,  -6.2271, -14.6666,  ...,   6.2826,  -1.4128,   9.5129],
         [ 11.2771,   0.1608,  -9.7236,  ..., -11.5052,  -4.3630,  -2.4111],
         ...,
         [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000],
         [  0.0000,   0.0000,   0.0000,  ...,   0.0000,   0.0000,   0.0000]]],
       grad_fn=<UnsqueezeBackward0>)

In [174]:
x.shape

torch.Size([1, 26, 128])

In [175]:
x.size(1)

26

## Positional Encoding

"::" : seq[start:end:step] 

torch.nn.Module.register_buffer : Adds a persistent buffer to the module (saved in state_dict, but not a trainable parameter).
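torch.autograd.Variable : 예전에는 Gradient를 흘려줄 변수로 지정할 때 사용 (PyTorch 0.4부터 deprecated — Tensor와 통합되어 Tensor를 그대로 쓰면 되고, requires_grad = False 이면 Gradient가 흐르지 않음)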

In [165]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The table is computed once in the constructor and stored as the buffer
    ``pe`` with shape ``(1, max_len, d_model)``. As a buffer it is saved in
    the module's ``state_dict`` and moved by ``.to(device)``, but it is not a
    trainable parameter.

    Args:
        d_model: Size of the last dimension of the input. Odd values are
            supported.
        dropout: Dropout probability applied to the summed output.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # exp(-(2i / d_model) * ln(10000)) == 10000 ** (-2i / d_model)
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float)
            / d_model * -math.log(1e+4)
        )
        # Broadcasts (max_len, 1) * (ceil(d_model / 2),) into one angle per
        # (position, frequency) pair.
        angles = position * div_term

        pe = torch.zeros(max_len, d_model, dtype=torch.float)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``.

        ``x`` is indexed along dimension 1 as the sequence axis, and its last
        dimension must match ``d_model`` for the addition to broadcast.
        """
        # A registered buffer never requires grad, so the deprecated
        # ``Variable(..., requires_grad=False)`` wrapper is not needed.
        return self.dropout(x + self.pe[:, :x.size(1)])

In [166]:
d_model = 128
max_len = 5000

In [167]:
PE = PositionalEncoding(d_model = 128, dropout = .1)

In [168]:
x.shape

torch.Size([1, 26, 128])

In [171]:
PE.forward(x)

tensor([[[ 2.2729,  2.7363, -0.4536,  ...,  1.0792, -0.8988,  0.6352],
         [-0.2253,  0.4553,  1.4877,  ...,  2.5368, -0.1377,  0.1559],
         [-0.4945,  1.2978, -0.6154,  ...,  0.8582, -0.0852,  2.1106],
         ...,
         [-0.9402, -0.5920,  0.9734,  ...,  1.1111,  0.0030,  1.1111],
         [-1.0062,  0.4713,  1.0388,  ...,  1.1111,  0.0031,  1.1111],
         [-0.0000,  1.1013,  0.3727,  ...,  1.1111,  0.0032,  1.1111]]],
       grad_fn=<MulBackward0>)

1. 판깔기

In [132]:
pe = torch.zeros(max_len, d_model, dtype = torch.float)

2. 각 텀 만들기

In [133]:
position = torch.arange(0, max_len, dtype = torch.float).unsqueeze(1)

In [134]:
position

tensor([[0.0000e+00],
        [1.0000e+00],
        [2.0000e+00],
        ...,
        [4.9970e+03],
        [4.9980e+03],
        [4.9990e+03]])

In [135]:
# One scale per even column index 2i: exp(-(2i / d_model) * ln(10000)), i.e. 10000 ** (-2i / d_model).
div_term = torch.exp(torch.arange(0, d_model, 2, dtype = torch.float) / d_model * -math.log(1e+4))

In [136]:
# Compare the exp/log form of div_term with computing the same term directly via pow;
# the difference shows the floating-point discrepancy between the two forms.
div_term - torch.pow(10000, -torch.arange(0, d_model, 2, dtype = torch.float) / d_model)

tensor([ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        -2.9802e-08, -2.9802e-08,  2.9802e-08,  0.0000e+00, -2.9802e-08,
        -2.9802e-08,  0.0000e+00,  0.0000e+00, -1.4901e-08,  0.0000e+00,
         0.0000e+00, -7.4506e-09,  0.0000e+00, -7.4506e-09, -7.4506e-09,
        -7.4506e-09,  3.7253e-09,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        -1.8626e-09, -3.7253e-09, -3.7253e-09,  0.0000e+00, -2.7940e-09,
         0.0000e+00, -2.7940e-09,  0.0000e+00,  9.3132e-10, -1.3970e-09,
         4.6566e-10, -9.3132e-10,  0.0000e+00, -9.3132e-10, -2.3283e-10,
        -9.3132e-10, -4.6566e-10,  0.0000e+00, -4.6566e-10,  0.0000e+00,
        -3.4925e-10, -1.1642e-10, -3.4925e-10, -1.1642e-10,  1.1642e-10,
        -1.1642e-10,  5.8208e-11, -1.1642e-10, -2.9104e-11, -1.4552e-10,
         0.0000e+00,  5.8208e-11,  8.7311e-11, -1.0186e-10, -4.3656e-11,
         0.0000e+00,  2.9104e-11, -7.2760e-11, -4.3656e-11])

3. 합치기

In [137]:
print(div_term.shape, position.shape, pe.shape)

torch.Size([64]) torch.Size([5000, 1]) torch.Size([5000, 128])


In [138]:
(position * div_term).shape

torch.Size([5000, 64])

In [139]:
# Even columns get the sine and odd columns the cosine of the same (position x frequency) products.
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)

In [140]:
pe = pe.unsqueeze(0)

In [22]:
torch.arange(0, 5000).unsqueeze(1)

tensor([[   0],
        [   1],
        [   2],
        ...,
        [4997],
        [4998],
        [4999]])

In [141]:
pe.shape

torch.Size([1, 5000, 128])

def forward 분석

In [143]:
pe[:, ].shape

torch.Size([1, 5000, 128])

In [146]:
pe[:, :x.size(1)].shape

torch.Size([1, 26, 128])

In [148]:
Variable(pe[:, :x.size(1)]).shape

torch.Size([1, 26, 128])

tensor([    2,   135,    55,    11,  3553,    35,  9896,  1040,     6,   900,
           26, 21687,    19,  1405,    10,     4,   109,   668,     3,     1,
            1,     1,     1,     1,     1,     1])