In [124]:
import torch
import torch.nn as nn
import numpy as np
import math
import torch.nn.functional as F

query = torch.FloatTensor([[1,2,3],[4,5,6],[7,8,9],[3,2,3],[3,2,1],[1,2,3]])
key = torch.FloatTensor([[2.2,4,3],[6,9,1],[8,2,9],[1,4,5],[2,4,5],[3,2,9]])
value = torch.FloatTensor([[1,1,1],[1,1,1],[1,1,1],[1,1,1],[1,1,1],[4,8,9]])

def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query,key.transpose(-2, -1)) / math.sqrt(d_k)
    
    if mask is not None:
        # masked_fill(mask,value)
        # fills elements of self tensor with value where mask is one
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores,dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
        
    # value에 확률값 곱하고 더한 벡터, 확률값
    return torch.matmul(p_attn,value), p_attn

In [125]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask = None):
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        
        # linear projections in batch from d_model
        query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1,2)
                            for l, x in zip(self.linears, (query, key, value))]
        
        # apply attention on all the projected vectors in batch
        x, self.attn = attention(query, key, value, mask = mask,
                                dropout = self.dropout)
 
        # concat using a view and apply a final linear
        x = x.transpose(1,2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

(tensor([[1.1582, 1.3692, 1.4219],
         [1.0000, 1.0001, 1.0001],
         [1.0000, 1.0000, 1.0000],
         [1.0005, 1.0012, 1.0014],
         [1.0003, 1.0006, 1.0007],
         [1.1582, 1.3692, 1.4219]]),
 tensor([[1.0261e-05, 9.2672e-04, 9.4587e-01, 1.6396e-04, 2.9206e-04, 5.2739e-02],
         [4.6087e-13, 5.4256e-06, 9.9998e-01, 2.9438e-11, 2.9640e-10, 9.6648e-06],
         [1.9581e-20, 3.0047e-08, 1.0000e+00, 4.9996e-18, 2.8452e-16, 1.6753e-09],
         [1.3386e-08, 9.7284e-05, 9.9973e-01, 5.3508e-08, 3.0244e-07, 1.7329e-04],
         [6.8323e-06, 4.9994e-01, 4.9994e-01, 2.7126e-06, 1.5332e-05, 8.6660e-05],
         [1.0261e-05, 9.2672e-04, 9.4587e-01, 1.6396e-04, 2.9206e-04, 5.2739e-02]]))

In [102]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()

tensor([[25, 25, 14, 14],
        [25, 25, 14, 14],
        [25, 25, 14, 14],
        [25, 25, 14, 14]])

In [36]:
query = torch.tensor([[1,2,3],[1,2,3]])
query = torch.tensor([[1,2,3]])

query.transpose(-1)

TypeError: transpose() missing 1 required positional arguments: "dim1"

In [32]:
query = np.array([[1,2,3],[1,2,3]])
query = torch.tensor([[1,2,3],[1,2,3]])

print(query,"\n", key.transpose(-2,-1))

tensor([[1, 2, 3],
        [1, 2, 3]]) 
 tensor([[2, 2],
        [4, 4],
        [5, 5]])


In [None]:
query, key, value = \
        [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
        for l, x in zip(self.linears, (query, key, value))]