import math
from typing import Optional

try:
    import torch
    import torch.nn as nn
except ModuleNotFoundError:
    raise ImportError("These classes require PyTorch to be installed")


class ScaledDotProductAttention(nn.Module):
    """The Scaled Dot Product Attention operation from the `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_ paper.

    Example
    -------
    >>> import torch
    >>> import torch.nn as nn
    >>> from deepchem.models import ScaledDotProductAttention as SDPA
    >>> attn = SDPA()
    >>> x = torch.ones(1, 5)
    >>> # Linear layers for making query, key, value
    >>> Q, K, V = nn.Parameter(torch.ones(5)), nn.Parameter(torch.ones(5)), nn.Parameter(torch.ones(5))
    >>> query, key, value = Q * x, K * x, V * x
    >>> x_out, attn_score = attn(query, key, value)
    """

    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()
        # Large negative fill value used to zero out masked positions after softmax.
        self.epsilon = -1e9
    def forward(self,
                query: torch.Tensor,
                key: torch.Tensor,
                value: torch.Tensor,
                mask: Optional[torch.Tensor] = None,
                dropout: Optional[nn.Dropout] = None):
        """
        Parameters
        ----------
        query: torch.Tensor
            Query tensor for attention
        key: torch.Tensor
            Key tensor for attention
        value: torch.Tensor
            Value tensor for attention
        mask: torch.Tensor (optional)
            Mask to apply during attention computation
        dropout: nn.Dropout (optional)
            Dropout layer for attention weights

        Returns
        -------
        output: torch.Tensor
            The attention-weighted combination of the values
        p_attn: torch.Tensor
            The attention weights after softmax (and dropout, if provided)
        """
        d_k = query.size(-1)
        # Scale the dot products by sqrt(d_k) to keep the softmax well-conditioned.
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

        if mask is not None:
            # Fill masked positions with a large negative value so their weight is ~0 after softmax.
            scores = scores.masked_fill(mask == 0, self.epsilon)

        p_attn = scores.softmax(dim=-1)

        if dropout is not None:
            p_attn = dropout(p_attn)

        return torch.matmul(p_attn, value), p_attn
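

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original API): how a mask and a dropout
# layer can be passed to ScaledDotProductAttention.forward. The helper name and
# all tensor shapes below are arbitrary assumptions chosen for the example.
# ---------------------------------------------------------------------------
def _example_masked_attention():
    """Minimal usage sketch of ScaledDotProductAttention with a causal mask and dropout."""
    batch, seq_len, d_model = 2, 4, 8
    query = torch.randn(batch, seq_len, d_model)
    key = torch.randn(batch, seq_len, d_model)
    value = torch.randn(batch, seq_len, d_model)

    # Causal mask: position i may only attend to positions <= i.
    # Zero entries are replaced by a large negative value inside forward().
    mask = torch.tril(torch.ones(seq_len, seq_len))

    attn = ScaledDotProductAttention()
    output, weights = attn(query, key, value, mask=mask, dropout=nn.Dropout(p=0.1))
    # output: (batch, seq_len, d_model), weights: (batch, seq_len, seq_len)
    return output, weights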


class SelfAttention(nn.Module):
    r"""SelfAttention Layer

    Given $X \in \mathbb{R}^{n \times in\_features}$, the attention weights are calculated as
    $a = softmax(W_2 \tanh(W_1 X^T))$, where $W_1 \in \mathbb{R}^{hidden\_size \times in\_features}$
    and $W_2 \in \mathbb{R}^{out\_features \times hidden\_size}$, so that $a \in \mathbb{R}^{out\_features \times n}$.
    The final output is $y = aX$, where $y \in \mathbb{R}^{out\_features \times in\_features}$.

    Parameters
    ----------
    in_features: int
        Dimension of input features
    out_features: int
        Dimension of output features
    hidden_size: int
        Dimension of hidden layer
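
    Example
    -------
    A minimal usage sketch (assuming the layer is importable from ``deepchem.models``
    like ``ScaledDotProductAttention`` above; adjust the import for your setup):

    >>> import torch
    >>> from deepchem.models import SelfAttention
    >>> attn = SelfAttention(in_features=5, out_features=3, hidden_size=16)
    >>> X = torch.randn(10, 5)
    >>> embedding, attention = attn(X)
    >>> embedding.shape
    torch.Size([3, 5])
    >>> attention.shape
    torch.Size([3, 10])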
"""
def __init__(self, in_features, out_features, hidden_size=128):
super(SelfAttention, self).__init__()
self.w1 = torch.nn.Parameter(torch.FloatTensor(hidden_size,
in_features))
self.w2 = torch.nn.Parameter(
torch.FloatTensor(out_features, hidden_size))
self.reset_parameters()
def reset_parameters(self):
nn.init.xavier_normal_(self.w1)
nn.init.xavier_normal_(self.w2)
    def forward(self, X):
        r"""The forward function.

        Parameters
        ----------
        X: torch.Tensor
            Input feature of shape $\mathbb{R}^{n \times in\_features}$.

        Returns
        -------
        embedding: torch.Tensor
            The final embedding of shape $\mathbb{R}^{out\_features \times in\_features}$
        attention-matrix: torch.Tensor
            The attention matrix of shape $\mathbb{R}^{out\_features \times n}$
        """
        # (hidden_size, in_features) @ (in_features, n) -> (hidden_size, n)
        x = torch.tanh(torch.matmul(self.w1, X.transpose(1, 0)))
        # (out_features, hidden_size) @ (hidden_size, n) -> (out_features, n)
        x = torch.matmul(self.w2, x)
        # Normalize the attention weights over the n input positions.
        attn = torch.nn.functional.softmax(x, dim=-1)
        # (out_features, n) @ (n, in_features) -> (out_features, in_features)
        x = torch.matmul(attn, X)
        return x, attn
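

# Hypothetical entry point (illustration only, not part of the original module):
# exercises the masked-attention sketch above as a quick smoke test when the
# file is run directly.
if __name__ == "__main__":
    out, weights = _example_masked_attention()
    print("attention output shape:", tuple(out.shape))
    print("attention weights shape:", tuple(weights.shape))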