""" Define the PointerSentinel model class """
import torch as th
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F


class PointerSentinel(nn.Module):
    """ Implements the Pointer Sentinel Mixture Model of Merity et al. [1]

    Parameters
    ----------
    lstm_size : int
        size of the hidden representation in the LSTM layer
    lstm_layers : int
        number of LSTM layers to use
    embedding_size : int
        rank of the embedding matrix
    vocab_size : int
        total number of words in the vocabulary
    k : int
        Number of steps to perform before caching the hidden state.
        The hidden state returned by the network during training is NOT the state after
        processing the whole input: it is the state after processing the first `k`
        elements of the sequence.

    Attributes
    ----------
    embed : nn.Embedding
        Holds embeddings of words. Maps integer indices to d-dimensional vectors.
    lstm : nn.LSTM
        LSTM network. Used to compute representations of a sequence of words.
    query : nn.Linear
        Linear projection followed by a tanh nonlinearity; maps the last hidden vector
        to the query vector.
    sentinel : Tensor
        Learned embedding vector for the sentinel, used to compute the value of the
        gating function `g`.
        Size: 3-D (1, 1, hidden_size), expanded over the batch at run time
    U : Tensor
        Linear projection matrix used to compute logits for the whole vocabulary given
        the last hidden state of the LSTM.
        Size: 2-D (hidden_size, vocab_size)
    b : Tensor
        Bias vector added to the vocabulary logits.
        Size: 1-D (vocab_size, )

    References
    ----------
    .. [1] Merity, S., Xiong, C., Bradbury, J., & Socher, R. (2016). Pointer Sentinel Mixture Models.
       arXiv preprint arXiv:1609.07843.
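
    Examples
    --------
    A minimal usage sketch; the hyper-parameters and shapes below are illustrative
    assumptions, not values from the original experiments:

    >>> model = PointerSentinel(lstm_size=128, lstm_layers=1,
    ...                         embedding_size=64, vocab_size=1000, k=10)
    >>> x = th.randint(0, 1000, (4, 20))            # batch of 4 sequences, 20 tokens each
    >>> state = model.initial_state(batch_size=4)
    >>> (ptr_probas, rnn_probas, gates), state = model(x, state)
    >>> ptr_probas.shape, rnn_probas.shape, gates.shape
    (torch.Size([4, 20]), torch.Size([4, 1000]), torch.Size([4, 1]))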
"""

    def __init__(self, lstm_size, lstm_layers, embedding_size, vocab_size, k):
        super().__init__()
        self.lstm_size = lstm_size
        self.n_layers = lstm_layers
        self.k = k
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.lstm = nn.LSTM(input_size=embedding_size, hidden_size=lstm_size,
                            num_layers=lstm_layers, batch_first=True)
        self.query = nn.Linear(in_features=lstm_size, out_features=lstm_size)
        self.sentinel = nn.Parameter(th.zeros(1, 1, lstm_size))
        self.U = nn.Parameter(th.zeros(lstm_size, vocab_size))
        self.b = nn.Parameter(th.zeros(vocab_size, ))
        self.reset_parameters()

    def initial_state(self, batch_size):
        """ Returns a tuple of zero tensors in the shape expected by the LSTM """
        shape = self.n_layers, batch_size, self.lstm_size
        h0 = Variable(self.U.data.new(*shape).zero_())
        c0 = Variable(self.U.data.new(*shape).zero_())
        return h0, c0

    def reset_parameters(self):
        """ Reset the parameters of the model """
        for child in self.children():
            child.reset_parameters()
        self.sentinel.data.normal_(std=0.01)
        self.U.data.normal_(std=0.01)
        self.b.data.zero_()  # bias for the vocabulary logits starts at zero

    def mixture_train(self, ptr_probas, rnn_probas, gates, x, y):
        """ Compute the log-probabilities assigned by the model to
        the target words specified by `y`.

        The mixture probability of a target word is `g * p_vocab(y) + p_ptr(y)`,
        where `p_ptr(y)` is the pointer mass accumulated over every position of `x`
        that holds the target word (the pointer probabilities already sum to `1 - g`
        because the sentinel takes part in the same softmax).

        Parameters
        ----------
        ptr_probas : FloatTensor
            see the return values of `forward`
        rnn_probas : FloatTensor
            see the return values of `forward`
        gates : FloatTensor
            see the return values of `forward`
        x : LongTensor
            indices of the input words for each sequence in the batch.
            size : 2-D, (batch-size, seq-length)
        y : LongTensor
            indices of the target words, one per sequence in the batch.
            size : 1-D, (batch-size, )

        Returns
        -------
        log_p : FloatTensor
            log of the mixture probability of each target word.
            size : 1-D, (batch-size, )
        """
        # mark every input position that holds the target word
        ptr_mask = (x == y.unsqueeze(1).expand_as(x)).type_as(ptr_probas)
        ptr_scores = (ptr_probas * ptr_mask).sum(1)
        rnn_scores = rnn_probas.gather(dim=1, index=y.unsqueeze(1)).squeeze()
        gates = gates.squeeze()
        p = gates * rnn_scores + ptr_scores
        log_p = th.log(p + 1e-12)  # epsilon keeps the log finite when p is zero
        return log_p

    def mixture_sample(self, ptr_probas, rnn_probas, gates, x):
        """ Compute the mixture distribution over the whole vocabulary for a single
        sequence, scattering the pointer mass onto the vocabulary indices found in `x`.
        Only 'batches' of size 1 are supported. """
        if x.size(0) != 1:
            raise RuntimeError(f"Sampling is implemented for 'batches' "
                               f"of size 1, but {x.size(0)} was found.")
        ptr_probas_expanded = th.zeros_like(rnn_probas)
        ptr_probas_expanded.index_add_(1, x.squeeze(), ptr_probas)
        probas = rnn_probas * gates + ptr_probas_expanded
        return probas

    def forward(self, x, lstm_state):
        """ Compute the distribution over words in the sequence, the distribution
        over words in the vocabulary, and the value of the gating function.

        Parameters
        ----------
        x : LongTensor
            batch of word sequences represented as indices from the vocabulary
            size : 2-D, (batch-size, sequence-len)
        lstm_state : Tuple[FloatTensor]
            initial state of the LSTM (hidden + cell)

        Returns
        -------
        (ptr_probas, rnn_probas, gates) : Tuple[FloatTensor]
            ptr_probas : probability mass assigned by the model to the words in the
                input sequence. Size: 2-D, (batch-size, sequence-len)
            rnn_probas : probability mass assigned by the model to the words in the
                vocabulary. Size: 2-D, (batch-size, vocab-size)
            gates : values of the gating function. Size: 2-D, (batch-size, 1)
        sk : Tuple[FloatTensor]
            LSTM state (hidden + cell) after the first `k` steps, meant to be carried
            over to the next call.
        """
        H, sk, hT = self._core(x, lstm_state)
        ptr_probas, gates = self._pointer(H, hT)
        rnn_probas = self._vanilla_softmax(hT)
        return (ptr_probas, rnn_probas, gates), sk

    def _core(self, x, s0):
        """ Core part of the network, shared between the `softmax-rnn` and `pointer` parts

        Parameters
        ----------
        x : LongTensor
            See the `forward` method for details.
        s0 : Tuple[FloatTensor]
            See the `lstm_state` arg of `forward`.

        Returns
        -------
        H : FloatTensor
            Outputs from the last layer of the LSTM
            Size: (batch-size, seq-len, hid-size)
        sk : Tuple[FloatTensor]
            k-th state (includes the cell state) of all LSTM layers
        hT : FloatTensor
            Last hidden state of the last LSTM layer
            Size: (batch-size, hid-size)
        """
        x = self.embed(x)
        # run the first `k` steps and cache the state that is returned to the caller
        H0, sk = self.lstm(x[:, :self.k, :], s0)
        # continue from that state over the remaining steps
        H1, _ = self.lstm(x[:, self.k:, :], sk)
        H = th.cat((H0, H1), 1)
        hT = H[:, -1, :]
        return H, sk, hT

    def _pointer(self, H, last):
        """ The `pointer` part of the network

        Parameters
        ----------
        H : Tensor
            Hidden representations for each timestep extracted from the last layer of the LSTM
            size : (batch-size, sequence-len, hidden-size)
        last : Tensor
            Representations of the whole sequences (last vectors from H, extracted earlier
            for efficiency)
            size : (batch-size, hidden-size)

        Returns
        -------
        probas : Tensor
            pointer probabilities over the input positions, size (batch-size, sequence-len)
        gates : Tensor
            probability mass assigned to the sentinel, size (batch-size, 1)
        """
        batch_size, _, hid_size = H.size()
        sentinel = self.sentinel.expand(batch_size, 1, hid_size)
        latents = th.cat((H, sentinel), 1)              # ::(b, s+1, h)
        query = th.tanh(self.query(last))
        query = query.unsqueeze(2)                      # ::(b, h, 1)
        logits = th.bmm(latents, query).squeeze(2)      # batched-matrix-multiply, ::(b, s+1)
        weights = F.softmax(logits, dim=1)              # ::(b, s+1)
        probas = weights[:, :-1]
        gates = weights[:, -1].unsqueeze(1)
        return probas, gates

    def _vanilla_softmax(self, last):
        """ The `softmax-rnn` part of the network

        Computes the distribution over words in the vocabulary given the last hidden
        state, using a standard linear projection followed by a softmax.
        """
        # `self.b` is the bias declared in `__init__`; without it the parameter is unused
        probas = F.softmax(last @ self.U + self.b, dim=1)
        return probas