Changes the API for LMs to be closer to tagger (#554)
* Changes the API for LMs to be closer to tagger

clean up the LM

* generic name (generator)
* refactor TLM serialization to check old and new attr
* fix broken TLM code
* change names between PyTorch and TF to be the same

update w/ MLM for TF

* update to `generator`

* get rid of legacy wp vectorizer

* update LM docs (WIP)
dpressel committed Apr 26, 2020
1 parent 2fca5a0 commit ccf2874
Showing 15 changed files with 520 additions and 328 deletions.
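
At a glance, the commit maps the PyTorch LM hooks onto the names the tagger API already uses. The sketch below summarizes the renames visible in the diff; the dictionary itself is illustrative, not library code.

```python
# Schematic summary assembled from the diff below; this mapping is purely
# illustrative and is not an API exposed by the library.
LM_API_RENAMES = {
    "init_hidden(batchsz)":        "zero_state(batchsz)",       # state initializer, matches the tagger API
    "init_decode(**kwargs)":       "init_generate(**kwargs)",   # hook that builds the core sequence layer
    "decode(emb, hidden)":         "generate(emb, hidden)",     # runs the core sequence layer
    "self.rnn / self.transformer": "self.generator",            # one generic attribute for that layer
    "self.proj / output(x)":       "self.output_layer",         # projection back to the vocabulary
}

# Also new in this commit: a single create_layers(embeddings, **kwargs)
# construction hook on the base class, and a requires_state property
# (True for the RNN LM, False for the Transformer LM).
```
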
63 changes: 0 additions & 63 deletions addons/embed_tlm_pytorch.py
@@ -14,69 +14,6 @@
from eight_mile.pytorch.serialize import load_tlm_npz
import torch.nn as nn

@register_vectorizer(name='tlm-wordpiece')
class WordPieceVectorizer1D(AbstractVectorizer):
"""Define a Baseline Vectorizer that can do WordPiece with BERT tokenizer
If you use tokens=subword, this vectorizer is used, which then creates a dependency on bert_pretrained_pytorch
"""

def __init__(self, **kwargs):
"""Loads a BertTokenizer using bert_pretrained_pytorch
:param kwargs:
"""
super().__init__(kwargs.get('transform_fn'))
from pytorch_pretrained_bert import BertTokenizer
self.max_seen = 128
handle = kwargs.get('embed_file')
self.tokenizer = BertTokenizer.from_pretrained(handle, do_lower_case=False)
self.mxlen = kwargs.get('mxlen', -1)

def count(self, tokens):
seen = 0
counter = Counter()
for tok in self.iterable(tokens):
counter[tok] += 1
seen += 1
self.max_seen = max(self.max_seen, seen)
return counter

def iterable(self, tokens):
for tok in tokens:
if tok == '<unk>':
yield '[UNK]'
elif tok == '<EOS>':
yield '[SEP]'
else:
for subtok in self.tokenizer.tokenize(tok):
yield subtok
yield '[CLS]'

def _next_element(self, tokens, vocab):
for atom in self.iterable(tokens):
value = vocab.get(atom)
if value is None:
value = vocab['[UNK]']
yield value

def run(self, tokens, vocab):
if self.mxlen < 0:
self.mxlen = self.max_seen
vec1d = np.zeros(self.mxlen, dtype=np.long)
for i, atom in enumerate(self._next_element(tokens, vocab)):
if i == self.mxlen:
i -= 1
vec1d[i] = vocab.get('[CLS]')
break
vec1d[i] = atom
valid_length = i + 1
return vec1d, valid_length

def get_dims(self):
return self.mxlen,


class TransformerLMEmbeddings(PyTorchEmbeddings):
"""Support embeddings trained with the TransformerLanguageModel class
193 changes: 105 additions & 88 deletions baseline/pytorch/lm/model.py
@@ -1,4 +1,3 @@
from baseline.utils import write_json
from baseline.pytorch.torchy import *
from baseline.pytorch.transformer import TransformerEncoderStack, subsequent_mask, MultiHeadedAttention
from baseline.model import LanguageModel, register_model
@@ -26,9 +25,13 @@ def load(cls, filename, **kwargs):
model.gpu = False if device == 'cpu' else model.gpu
return model

def init_hidden(self, batchsz):
def zero_state(self, batchsz):
return None

@property
def requires_state(self):
pass

def make_input(self, batch_dict, numpy_to_tensor=False):
example_dict = dict({})
for key in self.src_keys:
@@ -51,63 +54,28 @@ def make_input(self, batch_dict, numpy_to_tensor=False):
example_dict['y'] = y
return example_dict

def embed(self, input):
all_embeddings = []
for k in self.src_keys:
embedding = self.embeddings[k]
all_embeddings.append(embedding(input[k]))
embedded = torch.cat(all_embeddings, -1)
embedded_dropout = self.embed_dropout(embedded)
if self.embeddings_proj:
embedded_dropout = self.embeddings_proj(embedded_dropout)
return embedded_dropout

def init_embed(self, embeddings, **kwargs):
pdrop = float(kwargs.get('dropout', 0.5))
self.embed_dropout = nn.Dropout(pdrop)
self.embeddings = EmbeddingsStack(embeddings)
input_sz = 0
for k, embedding in embeddings.items():
if k in self.src_keys:
input_sz += embedding.get_dsz()
projsz = kwargs.get('projsz')
if projsz:
self.embeddings_proj = pytorch_linear(input_sz, projsz)
print('Applying a transform from {} to {}'.format(input_sz, projsz))
return projsz
else:
self.embeddings_proj = None

return input_sz

def init_decode(self, vsz, **kwargs):
pass

def decode(self, emb, hidden):
pass

@classmethod
def create(cls, embeddings, **kwargs):

lm = cls()
lm.gpu = kwargs.get('gpu', True)
lm.hsz = int(kwargs['hsz'])
lm.layers = int(kwargs.get('layers', 1))
lm.tgt_key = kwargs.get('tgt_key')
if lm.tgt_key is None:
raise Exception('Need a `tgt_key` to know which source vocabulary should be used for destination ')

lm.src_keys = kwargs.get('src_keys', embeddings.keys())

lm.dsz = lm.init_embed(embeddings, **kwargs)
lm.init_decode(**kwargs)
lm.init_output(embeddings[lm.tgt_key].get_vsz(), **kwargs)
lm.create_layers(embeddings, **kwargs)
return lm

def forward(self, input, hidden):
emb = self.embed(input)
decoded, hidden = self.decode(emb, hidden)
return self.output(decoded), hidden
def create_layers(self, embeddings, **kwargs):
"""This method defines the model itself, and must be overloaded by derived classes
This function will update `self` with the layers required to execute the `call()` method
:param embeddings: The input feature indices
:param kwargs:
:return:
"""

def predict(self, batch_dict, **kwargs):
numpy_to_tensor = bool(kwargs.get('numpy_to_tensor', True))
@@ -116,86 +84,135 @@ def predict(self, batch_dict, **kwargs):
step_softmax, _ = self(batch_dict, hidden)
return F.softmax(step_softmax, dim=-1)

def init_output(self, vsz, **kwargs):
unif = float(kwargs.get('unif', 0.0))
do_weight_tying = bool(kwargs.get('tie_weights', False))
self.proj = pytorch_linear(self.hsz, vsz, unif)
if do_weight_tying and self.hsz == self.embeddings[self.tgt_key].get_dsz():
self.proj.weight = self.embeddings[self.tgt_key].embeddings.weight

def output(self, x):
outputs = self.proj(x)
return outputs
class AbstractGeneratorLanguageModel(LanguageModelBase):

def create_layers(self, embeddings, **kwargs):
self.embeddings = self.init_embed(embeddings, **kwargs)
self.embeddings_proj = self.init_embeddings_proj(**kwargs)
self.generator = self.init_generate(**kwargs)
self.output_layer = self.init_output(**kwargs)

@register_model(task='lm', name='default')
class RNNLanguageModel(LanguageModelBase):
def forward(self, input: Dict[str, TensorDef], hidden: TensorDef) -> Tuple[TensorDef, TensorDef]:
emb = self.embed(input)
output, hidden = self.generate(emb, hidden)
return self.output_layer(output), hidden

def __init__(self):
super(RNNLanguageModel, self).__init__()
def embed(self, input):
embedded_dropout = self.embeddings(input)
return self.embeddings_proj(embedded_dropout)

def init_embed(self, embeddings: Dict[str, TensorDef], **kwargs) -> BaseLayer:
"""This method creates the "embedding" layer of the inputs, with an optional reduction
:param embeddings: A dictionary of embeddings
:Keyword Arguments: See below
* *embeddings_reduction* (defaults to `concat`) An operator to perform on a stack of embeddings
* *embeddings_dropout* (defaults to `0.0`) The dropout rate applied to the stacked embedding output
:return: The output of the embedding stack followed by its reduction. This will typically be an output
with an additional dimension which is the hidden representation of the input
"""
reduction = kwargs.get('embeddings_reduction', 'concat')
embeddings_dropout = float(kwargs.get('embeddings_dropout', 0.0))
return EmbeddingsStack(embeddings, embeddings_dropout, reduction=reduction)

def init_embeddings_proj(self, **kwargs):
input_sz = self.embeddings.output_dim
hsz = kwargs.get('hsz', kwargs.get('d_model'))
if hsz != input_sz:
proj = pytorch_linear(input_sz, hsz)
print('Applying a transform from {} to {}'.format(input_sz, hsz))
else:
proj = nn.Identity()
return proj

def init_hidden(self, batchsz):
weight = next(self.parameters()).data
return (torch.autograd.Variable(weight.new(self.layers, batchsz, self.hsz).zero_()),
torch.autograd.Variable(weight.new(self.layers, batchsz, self.hsz).zero_()))
def init_generate(self, **kwargs):
pass

def init_decode(self, **kwargs):
pdrop = float(kwargs.get('dropout', 0.5))
vdrop = bool(kwargs.get('variational', False))
if vdrop:
self.rnn_dropout = VariationalDropout(pdrop)
def generate(self, emb, hidden):
return self.generator((emb, hidden))

def init_output(self, **kwargs):
vsz = self.embeddings[self.tgt_key].get_vsz()
hsz = kwargs.get('hsz', kwargs.get('d_model'))
unif = float(kwargs.get('unif', 0.0))
do_weight_tying = bool(kwargs.get('tie_weights', False))
if do_weight_tying:
output = WeightTieDense(self.embeddings[self.tgt_key])
else:
self.rnn_dropout = nn.Dropout(pdrop)
output = pytorch_linear(hsz, vsz, unif)
return output

self.rnn = pytorch_lstm(self.dsz, self.hsz, 'lstm', self.layers, pdrop, batch_first=True)

def decode(self, emb, hidden):
output, hidden = self.rnn(emb, hidden)
output = self.rnn_dropout(output).contiguous()
return output, hidden
@register_model(task='lm', name='default')
class RNNLanguageModel(AbstractGeneratorLanguageModel):

def __init__(self):
super().__init__()

def zero_state(self, batchsz):
weight = next(self.parameters()).data
return (torch.autograd.Variable(weight.new(self.num_layers, batchsz, self.hsz).zero_()),
torch.autograd.Variable(weight.new(self.num_layers, batchsz, self.hsz).zero_()))

def _identity(x):
return x
@property
def requires_state(self):
return True

def init_generate(self, **kwargs):
pdrop = float(kwargs.get('dropout', 0.5))
self.num_layers = kwargs.get('layers', kwargs.get('num_layers', 1))
self.hsz = kwargs.get('hsz', kwargs.get('d_model'))
return WithDropoutOnFirst(LSTMEncoderWithState(self.hsz, self.hsz, self.num_layers, pdrop, batch_first=True),
pdrop,
kwargs.get('variational', False))


@register_model(task='lm', name='transformer')
class TransformerLanguageModel(LanguageModelBase):
class TransformerLanguageModel(AbstractGeneratorLanguageModel):

def __init__(self):
super().__init__()

@property
def requires_state(self):
return False

def init_layer_weights(self, module):
if isinstance(module, (nn.Linear, nn.Embedding, nn.LayerNorm)):
module.weight.data.normal_(mean=0.0, std=self.weight_std)
if isinstance(module, (nn.Linear, nn.LayerNorm)) and module.bias is not None:
module.bias.data.zero_()

def init_decode(self, **kwargs):
pdrop = float(kwargs.get('dropout', 0.5))
layers = kwargs.get('layers', 1)
self.weight_std = kwargs.get('weight_std', 0.02)
def init_generate(self, **kwargs):
pdrop = float(kwargs.get('dropout', 0.1))
layers = kwargs.get('layers', kwargs.get('num_layers', 1))
d_model = int(kwargs.get('d_model', kwargs.get('hsz')))
num_heads = kwargs.get('num_heads', 4)
d_ff = int(kwargs.get('d_ff', 4 * d_model))
rpr_k = kwargs.get('rpr_k')
d_k = kwargs.get('d_k')
scale = bool(kwargs.get('scale', True))
self.proj_to_dsz = pytorch_linear(self.dsz, d_model) if self.dsz != d_model else _identity
activation = kwargs.get('activation', 'gelu')
return TransformerEncoderStack(num_heads, d_model=d_model, pdrop=pdrop, scale=scale,
layers=layers, d_ff=d_ff, rpr_k=rpr_k, d_k=d_k,
activation=activation)

self.transformer = TransformerEncoderStack(num_heads, d_model=d_model, pdrop=pdrop, scale=scale,
layers=layers, d_ff=d_ff, rpr_k=rpr_k, d_k=d_k)
def create_layers(self, embeddings, **kwargs):
super().create_layers(embeddings, **kwargs)
self.weight_std = kwargs.get('weight_std', 0.02)
self.apply(self.init_layer_weights)

def create_mask(self, bth):
T = bth.shape[1]
mask = subsequent_mask(T).type_as(bth)
return mask

def decode(self, bth, hidden):
bth = self.proj_to_dsz(bth)
def generate(self, bth, _):
mask = self.create_mask(bth)
return self.transformer((bth, mask)), None
return self.generator((bth, mask)), None


@register_model(task='lm', name='transformer-mlm')
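
Under the new AbstractGeneratorLanguageModel contract, a concrete subclass only supplies its state hooks (zero_state / requires_state) and init_generate(); the embeddings, the optional hidden-size projection, and the output layer all come from the base class's create_layers(). Below is a minimal sketch of such a subclass, as an illustration only: the class name, the choice of an nn.GRU generator, the registration name, and the import paths are assumptions, not part of this commit.

```python
import torch.nn as nn

from baseline.model import register_model
# Import path assumed from the file shown in this diff (baseline/pytorch/lm/model.py)
from baseline.pytorch.lm.model import AbstractGeneratorLanguageModel


@register_model(task='lm', name='gru-example')  # hypothetical registration name
class GRULanguageModel(AbstractGeneratorLanguageModel):
    """Illustrative LM whose generator is a plain nn.GRU (output dropout omitted for brevity)."""

    def zero_state(self, batchsz):
        # A GRU carries a single hidden tensor (no cell state)
        weight = next(self.parameters()).data
        return weight.new_zeros(self.num_layers, batchsz, self.hsz)

    @property
    def requires_state(self):
        return True

    def init_generate(self, **kwargs):
        pdrop = float(kwargs.get('dropout', 0.5))
        self.num_layers = kwargs.get('layers', 1)
        self.hsz = kwargs.get('hsz', kwargs.get('d_model'))
        return nn.GRU(self.hsz, self.hsz, self.num_layers,
                      dropout=pdrop if self.num_layers > 1 else 0.0,
                      batch_first=True)

    def generate(self, emb, hidden):
        # nn.GRU takes positional arguments, unlike the tuple-calling eight_mile layers,
        # so the base-class generate() is overridden here
        return self.generator(emb, hidden)
```

The base class's forward() then drives this as output_layer(generate(embed(input), hidden)), exactly as for the built-in RNN and Transformer models above.
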
6 changes: 3 additions & 3 deletions baseline/pytorch/lm/train.py
@@ -14,7 +14,7 @@
class LanguageModelTrainerPyTorch(Trainer):

def __init__(self, model, **kwargs):
super(LanguageModelTrainerPyTorch, self).__init__()
super().__init__()
if type(model) is dict:
model = create_model_for('lm', **model)
self.model = model
@@ -70,7 +70,7 @@ def test(self, vs, reporting_fns, phase='Valid', **kwargs):
total_loss = 0
total_toks = 0
batchsz, nctx = self._get_dims(vs)
hidden = self._get_pytorch_model().init_hidden(batchsz)
hidden = self._get_pytorch_model().zero_state(batchsz)

for batch_dict in vs:
inputs = self._get_pytorch_model().make_input(batch_dict)
@@ -95,7 +95,7 @@ def train(self, ts, reporting_fns):
epoch_loss = 0
epoch_toks = 0
batchsz, nctx = self._get_dims(ts)
hidden = self._get_pytorch_model().init_hidden(batchsz)
hidden = self._get_pytorch_model().zero_state(batchsz)

for batch_dict in ts:
if hidden is not None:
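
On the caller side, driver code now asks the model for its starting state with zero_state() and can consult requires_state to skip state handling entirely for Transformer LMs. A hedged sketch of such a loop follows; the function name and the loss_fn/optimizer arguments are illustrative, not taken from the trainer above.

```python
def run_epoch(model, batches, batchsz, loss_fn, optimizer):
    """Illustrative loop that threads hidden state only for models that need it."""
    hidden = model.zero_state(batchsz) if model.requires_state else None
    for batch_dict in batches:
        inputs = model.make_input(batch_dict, numpy_to_tensor=True)
        y = inputs.pop('y')
        if hidden is not None:
            # Detach the carried-over state so the graph does not grow across batches
            hidden = tuple(h.detach() for h in hidden) if isinstance(hidden, tuple) else hidden.detach()
        logits, hidden = model(inputs, hidden)
        loss = loss_fn(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
```
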
