This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Commit

Update for Block API (#1261)
- Remove params and prefix arguments for MXNet 2 and update
  parameter sharing implementation
- Remove Block.name_scope() for MXNet 2
- Remove self.params.get() and self.params.get_constant()
leezu committed Jul 17, 2020
1 parent ea9152b commit 70a1887
Showing 34 changed files with 1,280 additions and 1,681 deletions.
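
The three bullet points in the commit message describe an API migration rather than a behaviour change. As a rough orientation before reading the per-file diffs, here is a minimal, hypothetical block written both ways. This is a sketch only, assuming the MXNet 2 nightly Gluon API this commit targets (attribute-path parameter naming, `Block.share_parameters()`, and `gluon.Constant`/`gluon.Parameter` objects replacing `self.params.get*`); exact constructor signatures may differ between nightly builds.

```python
import mxnet as mx
from mxnet.gluon import nn

# MXNet 1.x style removed throughout this commit:
#
#     class Toy(mx.gluon.HybridBlock):
#         def __init__(self, prefix=None, params=None):
#             super().__init__(prefix=prefix, params=params)
#             with self.name_scope():
#                 self.proj = nn.Dense(16, prefix='proj_')
#             self.scale = self.params.get_constant('scale', [2.0])

# MXNet 2 style adopted by this commit (hypothetical toy block):
class Toy(mx.gluon.HybridBlock):
    def __init__(self):
        super().__init__()            # no prefix=/params= arguments
        self.proj = nn.Dense(16)      # no name_scope(); names follow attribute paths
        self.scale = mx.gluon.Constant([2.0])  # replaces self.params.get_constant()

# Parameter sharing is now requested explicitly instead of via params=:
teacher, student = Toy(), Toy()
student.share_parameters(teacher.collect_params())
teacher.initialize()
```
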
4 changes: 2 additions & 2 deletions .github/workflows/unittests.yml
@@ -33,9 +33,9 @@ jobs:
       - name: Install Other Dependencies
         run: |
           python -m pip install --user --upgrade pip
-          python -m pip install --user setuptools pytest pytest-cov
+          python -m pip install --user setuptools pytest pytest-cov contextvars
           python -m pip install --upgrade cython
-          python -m pip install --pre --user "mxnet>=2.0.0b20200604,<=2.0.0b20200619" -f https://dist.mxnet.io/python
+          python -m pip install --pre --user "mxnet>=2.0.0b20200716" -f https://dist.mxnet.io/python
           python -m pip install --user -e .[extras]
       - name: Test project
         run: |

4 changes: 2 additions & 2 deletions README.md
@@ -21,10 +21,10 @@ First of all, install the latest MXNet. You may use the following commands:
 ```bash

 # Install the version with CUDA 10.1
-pip install -U --pre mxnet-cu101>=2.0.0b20200604 -f https://dist.mxnet.io/python
+pip install -U --pre mxnet-cu101>=2.0.0b20200716 -f https://dist.mxnet.io/python

 # Install the cpu-only version
-pip install -U --pre mxnet>=2.0.0b20200604 -f https://dist.mxnet.io/python
+pip install -U --pre mxnet>=2.0.0b20200716 -f https://dist.mxnet.io/python
 ```

7 changes: 3 additions & 4 deletions scripts/conversion_toolkits/convert_electra.py
@@ -265,11 +265,11 @@ def convert_tf_model(model_dir, save_dir, test_conversion, model_size, gpu, elec
         assert_allclose(tf_params[k], backbone_params[k])

     # Build gluon model and initialize
-    gluon_model = ElectraModel.from_cfg(cfg, prefix='electra_')
+    gluon_model = ElectraModel.from_cfg(cfg)
     gluon_model.initialize(ctx=ctx)
     gluon_model.hybridize()

-    gluon_disc_model = ElectraDiscriminator(cfg, prefix='electra_')
+    gluon_disc_model = ElectraDiscriminator(cfg)
     gluon_disc_model.initialize(ctx=ctx)
     gluon_disc_model.hybridize()

@@ -283,8 +283,7 @@ def convert_tf_model(model_dir, save_dir, test_conversion, model_size, gpu, elec
                                       word_embed_params=word_embed_params,
                                       token_type_embed_params=token_type_embed_params,
                                       token_pos_embed_params=token_pos_embed_params,
-                                      embed_layer_norm_params=embed_layer_norm_params,
-                                      prefix='generator_')
+                                      embed_layer_norm_params=embed_layer_norm_params)
     gluon_gen_model.initialize(ctx=ctx)
     gluon_gen_model.hybridize()

2 changes: 1 addition & 1 deletion scripts/conversion_toolkits/convert_mobilebert.py
@@ -270,7 +270,7 @@ def convert_tf_model(model_dir, save_dir, test_conversion, gpu, mobilebert_dir):
     gluon_model.initialize(ctx=ctx)
     gluon_model.hybridize()

-    gluon_pretrain_model = MobileBertForPretrain(cfg, prefix='')
+    gluon_pretrain_model = MobileBertForPretrain(cfg)
     gluon_pretrain_model.initialize(ctx=ctx)
     gluon_pretrain_model.hybridize()

2 changes: 1 addition & 1 deletion scripts/conversion_toolkits/convert_tf_hub_model.py
@@ -358,7 +358,7 @@ def convert_tf_model(hub_model_dir, save_dir, test_conversion, model_type, gpu):
     gluon_model = PretrainedModel.from_cfg(cfg, prefix='', use_pooler=True)
     gluon_model.initialize(ctx=ctx)
     gluon_model.hybridize()
-    gluon_mlm_model = PretrainedMLMModel(backbone_cfg=cfg, prefix='')
+    gluon_mlm_model = PretrainedMLMModel(backbone_cfg=cfg)
     gluon_mlm_model.initialize(ctx=ctx)
     gluon_mlm_model.hybridize()

4 changes: 2 additions & 2 deletions scripts/machine_translation/train_transformer.py
@@ -357,7 +357,7 @@ def train(args):
         for v in model.collect_params().values():
             if v.grad_req != 'null':
                 v.grad_req = 'add'
-        model.collect_params().zero_grad()
+        model.zero_grad()
         model_averager = AverageSGDTracker(model.collect_params())
     log_start_time = time.time()
     num_params, num_fixed_params = None, None
@@ -422,7 +422,7 @@ def train(args):
                     trainer.step(loss_denom.asnumpy() / rescale_loss)
                     accum_count = 0
                     loss_denom = 0
-                    model.collect_params().zero_grad()
+                    model.zero_grad()
                 if (args.epochs > 0 and epoch_id >= args.epochs - args.num_averages) or \
                         (args.max_update > 0 and n_train_iters >= args.max_update - args.num_averages * args.save_interval_update):
                     model_averager.step()

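In the two hunks above, only the call that clears accumulated gradients changes: `model.collect_params().zero_grad()` becomes the `Block.zero_grad()` method. For readers unfamiliar with the surrounding logic, here is a self-contained sketch of the same gradient-accumulation pattern on a toy model (hypothetical data, model, and hyperparameters; assumes an MXNet 2 nightly where `Block.zero_grad()` is available):

```python
import mxnet as mx
from mxnet import autograd, gluon

mx.npx.set_np()

model = gluon.nn.Dense(1)
model.initialize()
trainer = gluon.Trainer(model.collect_params(), 'sgd', {'learning_rate': 0.1})

# Accumulate gradients over several mini-batches before each optimizer step.
for p in model.collect_params().values():
    if p.grad_req != 'null':
        p.grad_req = 'add'          # sum gradients instead of overwriting them

num_accumulated = 4
for step in range(16):
    x = mx.np.random.uniform(size=(8, 4))
    y = mx.np.random.uniform(size=(8, 1))
    with autograd.record():
        loss = ((model(x) - y) ** 2).mean()
    loss.backward()
    if (step + 1) % num_accumulated == 0:
        trainer.step(num_accumulated)
        model.zero_grad()           # the call this commit switches to
```
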
3 changes: 1 addition & 2 deletions scripts/pretraining/run_electra.py
@@ -155,8 +155,7 @@ def get_pretraining_model(model_name, ctx_l,
                                   tied_generator=False,
                                   tied_embeddings=True,
                                   disallow_correct=False,
-                                  weight_initializer=TruncNorm(stdev=0.02),
-                                  prefix='Pretrain_')
+                                  weight_initializer=TruncNorm(stdev=0.02))
     model.initialize(ctx=ctx_l)
     model.hybridize()
     return cfg, tokenizer, model

72 changes: 30 additions & 42 deletions scripts/question_answering/models.py
@@ -14,15 +14,12 @@ class ModelForQABasic(HybridBlock):
     another dense layer to map the contextual embeddings to the start scores and end scores.
     """
-    def __init__(self, backbone, weight_initializer=None, bias_initializer=None,
-                 prefix=None, params=None):
-        super().__init__(prefix=prefix, params=params)
-        with self.name_scope():
-            self.backbone = backbone
-            self.qa_outputs = nn.Dense(units=2, flatten=False,
-                                       weight_initializer=weight_initializer,
-                                       bias_initializer=bias_initializer,
-                                       prefix='qa_outputs_')
+    def __init__(self, backbone, weight_initializer=None, bias_initializer=None):
+        super().__init__()
+        self.backbone = backbone
+        self.qa_outputs = nn.Dense(units=2, flatten=False,
+                                   weight_initializer=weight_initializer,
+                                   bias_initializer=bias_initializer)

     def hybrid_forward(self, F, tokens, token_types, valid_length, p_mask):
         """
@@ -77,39 +74,30 @@ class ModelForQAConditionalV1(HybridBlock):
     """
     def __init__(self, backbone, units=768, layer_norm_eps=1E-12, dropout_prob=0.1,
-                 activation='tanh', weight_initializer=None, bias_initializer=None,
-                 prefix=None, params=None):
-        super().__init__(prefix=prefix, params=params)
-        with self.name_scope():
-            self.backbone = backbone
-            self.start_scores = nn.Dense(1, flatten=False,
-                                         weight_initializer=weight_initializer,
-                                         bias_initializer=bias_initializer,
-                                         prefix='start_scores_')
-            self.end_scores = nn.HybridSequential(prefix='end_scores_')
-            with self.end_scores.name_scope():
-                self.end_scores.add(nn.Dense(units, flatten=False,
-                                             weight_initializer=weight_initializer,
-                                             bias_initializer=bias_initializer,
-                                             prefix='mid_'))
-                self.end_scores.add(get_activation(activation))
-                self.end_scores.add(nn.LayerNorm(epsilon=layer_norm_eps))
-                self.end_scores.add(nn.Dense(1, flatten=False,
-                                             weight_initializer=weight_initializer,
-                                             bias_initializer=bias_initializer,
-                                             prefix='out_'))
-            self.answerable_scores = nn.HybridSequential(prefix='answerable_scores_')
-            with self.answerable_scores.name_scope():
-                self.answerable_scores.add(nn.Dense(units, flatten=False,
-                                                    weight_initializer=weight_initializer,
-                                                    bias_initializer=bias_initializer,
-                                                    prefix='mid_'))
-                self.answerable_scores.add(get_activation(activation))
-                self.answerable_scores.add(nn.Dropout(dropout_prob))
-                self.answerable_scores.add(nn.Dense(2, flatten=False,
-                                                    weight_initializer=weight_initializer,
-                                                    bias_initializer=bias_initializer,
-                                                    prefix='out_'))
+                 activation='tanh', weight_initializer=None, bias_initializer=None):
+        super().__init__()
+        self.backbone = backbone
+        self.start_scores = nn.Dense(1, flatten=False,
+                                     weight_initializer=weight_initializer,
+                                     bias_initializer=bias_initializer)
+        self.end_scores = nn.HybridSequential()
+        self.end_scores.add(nn.Dense(units, flatten=False,
+                                     weight_initializer=weight_initializer,
+                                     bias_initializer=bias_initializer))
+        self.end_scores.add(get_activation(activation))
+        self.end_scores.add(nn.LayerNorm(epsilon=layer_norm_eps))
+        self.end_scores.add(nn.Dense(1, flatten=False,
+                                     weight_initializer=weight_initializer,
+                                     bias_initializer=bias_initializer))
+        self.answerable_scores = nn.HybridSequential()
+        self.answerable_scores.add(nn.Dense(units, flatten=False,
+                                            weight_initializer=weight_initializer,
+                                            bias_initializer=bias_initializer))
+        self.answerable_scores.add(get_activation(activation))
+        self.answerable_scores.add(nn.Dropout(dropout_prob))
+        self.answerable_scores.add(nn.Dense(2, flatten=False,
+                                            weight_initializer=weight_initializer,
+                                            bias_initializer=bias_initializer))

     def get_start_logits(self, F, contextual_embedding, p_mask):
         """

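With the `prefix='qa_outputs_'`, `'mid_'`, and `'out_'` arguments gone, the layers above no longer carry hand-written name prefixes; under the MXNet 2 Block API, parameter names are derived from the attribute path of each child block. A standalone illustration of that naming scheme (the exact key strings shown are illustrative, not copied from GluonNLP):

```python
from mxnet.gluon import nn

# Children registered by attribute assignment or Sequential position get
# structural parameter names; no name_scope() or prefix= is involved.
head = nn.HybridSequential()
head.add(nn.Dense(768, flatten=False))
head.add(nn.LayerNorm())
head.add(nn.Dense(1, flatten=False))
print(list(head.collect_params().keys()))
# expected keys of the form '0.weight', '0.bias', '1.gamma', '1.beta', '2.weight', '2.bias'
```
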
8 changes: 3 additions & 5 deletions scripts/question_answering/run_squad.py
@@ -324,8 +324,7 @@ def get_network(model_name,
                     backbone_params_path, num_params, num_fixed_params))
     qa_net = ModelForQAConditionalV1(backbone=backbone,
                                      dropout_prob=dropout,
-                                     weight_initializer=TruncNorm(stdev=0.02),
-                                     prefix='qa_net_')
+                                     weight_initializer=TruncNorm(stdev=0.02))
     if checkpoint_path is None:
         # Ignore the UserWarning during initialization,
         # There is no need to re-initialize the parameters of backbone
@@ -529,7 +528,7 @@ def train(args):
     log_sample_num = 0
     if args.num_accumulated != 1:
         # set grad to zero for gradient accumulation
-        qa_net.collect_params().zero_grad()
+        qa_net.zero_grad()
     global_tic = time.time()
     while not finish_flag:
         epoch_tic = time.time()
@@ -594,7 +593,7 @@ def train(args):
                 step_num += 1
                 if args.num_accumulated != 1:
                     # set grad to zero for gradient accumulation
-                    qa_net.collect_params().zero_grad()
+                    qa_net.zero_grad()

             # saving
             if step_num % save_interval == 0 or step_num >= num_train_steps:
@@ -964,7 +963,6 @@

 if __name__ == '__main__':
     os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
-    os.environ['MXNET_USE_FUSION'] = '0' # Manually disable pointwise fusion
     args = parse_args()
     logging_config(args.output_dir, name='finetune_squad{}'.format(args.version))
     set_seed(args.seed)

91 changes: 41 additions & 50 deletions src/gluonnlp/attention_cell.py
@@ -601,9 +601,8 @@ class MultiHeadAttentionCell(HybridBlock):
     """
     def __init__(self, query_units=None, num_heads=None, attention_dropout=0.0,
                  scaled: bool = True, normalized: bool = False, eps: float = 1E-6,
-                 dtype='float32', layout='NTK', use_einsum=False,
-                 prefix=None, params=None):
-        super().__init__(prefix=prefix, params=params)
+                 dtype='float32', layout='NTK', use_einsum=False):
+        super().__init__()
         self._query_units = query_units
         self._num_heads = num_heads
         self._attention_dropout = attention_dropout
@@ -705,8 +704,7 @@ def __init__(self, query_units,
                  dropout: float = 0.0,
                  dtype='float32',
                  layout='NTK',
-                 use_einsum=False,
-                 prefix=None, params=None):
+                 use_einsum=False):
         """
         Parameters
@@ -725,10 +723,8 @@ def __init__(self, query_units,
         scaled
         dtype
         layout
-        prefix
-        params
         """
-        super().__init__(prefix=prefix, params=params)
+        super().__init__()
         self._dropout = dropout
         self._method = method
         self._query_units = query_units
@@ -744,49 +740,44 @@
         self._layout = layout
         if self._layout not in ['NKT', 'NTK', 'TNK']:
             raise ValueError('layout="{}" is not supported'.format(self._layout))
-        with self.name_scope():
-            if method == 'transformer_xl':
-                if pos_embed_units is None:
-                    pos_embed_units = self._num_heads * self._head_query_units
-                self._rel_pos_embed = SinusoidalPositionalEmbedding(units=pos_embed_units,
-                                                                    prefix='rel_pos_embed_',
-                                                                    dtype=self._dtype)
-                self._rel_proj = nn.Dense(units=query_units,
-                                          in_units=pos_embed_units,
-                                          flatten=False,
-                                          use_bias=False,
-                                          prefix='rel_proj_',
-                                          dtype=self._dtype)
-                self._dropout_layer = nn.Dropout(dropout)
-            elif method == 'shaw':
-                assert self._max_distance is not None, 'Must set max_distance when method="shaw".'
-                if self._bidirectional:
-                    vocab_size = self._max_distance * 2 + 1
-                else:
-                    vocab_size = self._max_distance + 1
-                self._rel_pos_embed = LearnedPositionalEmbedding(
-                    units=self._num_heads * self._head_query_units,
-                    max_length=vocab_size,
-                    weight_initializer=mx.init.Xavier(rnd_type="gaussian",
-                                                      factor_type="in",
-                                                      magnitude=1),
-                    prefix='rel_pos_embed_',
-                    mode='wrap' if self._bidirectional else 'raise',
-                    dtype=self._dtype)
-            elif method == 't5':
-                if self._num_buckets is None:
-                    self._num_buckets = 32
-                if self._max_distance is None:
-                    self._max_distance = 128
-                self._rel_pos_embed = BucketPositionalEmbedding(
-                    units=num_heads,
-                    num_buckets=self._num_buckets,
-                    max_distance=self._max_distance,
-                    bidirectional=self._bidirectional,
-                    prefix='rel_pos_embed_',
-                    dtype=self._dtype)
+        if method == 'transformer_xl':
+            if pos_embed_units is None:
+                pos_embed_units = self._num_heads * self._head_query_units
+            self._rel_pos_embed = SinusoidalPositionalEmbedding(units=pos_embed_units,
+                                                                dtype=self._dtype)
+            self._rel_proj = nn.Dense(units=query_units,
+                                      in_units=pos_embed_units,
+                                      flatten=False,
+                                      use_bias=False,
+                                      dtype=self._dtype)
+            self._dropout_layer = nn.Dropout(dropout)
+        elif method == 'shaw':
+            assert self._max_distance is not None, 'Must set max_distance when method="shaw".'
+            if self._bidirectional:
+                vocab_size = self._max_distance * 2 + 1
             else:
-                raise NotImplementedError('method="{}" is currently not supported!'.format(method))
+                vocab_size = self._max_distance + 1
+            self._rel_pos_embed = LearnedPositionalEmbedding(
+                units=self._num_heads * self._head_query_units,
+                max_length=vocab_size,
+                weight_initializer=mx.init.Xavier(rnd_type="gaussian",
+                                                  factor_type="in",
+                                                  magnitude=1),
+                mode='wrap' if self._bidirectional else 'raise',
+                dtype=self._dtype)
+        elif method == 't5':
+            if self._num_buckets is None:
+                self._num_buckets = 32
+            if self._max_distance is None:
+                self._max_distance = 128
+            self._rel_pos_embed = BucketPositionalEmbedding(
+                units=num_heads,
+                num_buckets=self._num_buckets,
+                max_distance=self._max_distance,
+                bidirectional=self._bidirectional,
+                dtype=self._dtype)
+        else:
+            raise NotImplementedError('method="{}" is currently not supported!'.format(method))

     def hybrid_forward(self, F, rel_positions, query=None):
         """

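One detail worth calling out in the `method='shaw'` branch above: the learned embedding table is sized `2 * max_distance + 1` when bidirectional and `max_distance + 1` otherwise. That is simply the number of distinct clipped relative offsets; a standalone illustration in plain NumPy, not GluonNLP code (how negative offsets are actually indexed is handled by the `mode='wrap'`/`'raise'` argument visible in the diff above):

```python
import numpy as np

max_distance = 3
query_pos = np.arange(6)[:, None]
key_pos = np.arange(6)[None, :]

# Relative offsets clipped to [-max_distance, max_distance]:
rel = np.clip(key_pos - query_pos, -max_distance, max_distance)
print(np.unique(rel))                             # 2 * max_distance + 1 = 7 distinct values
print(np.unique(np.clip(rel, -max_distance, 0)))  # unidirectional case: max_distance + 1 = 4 values
```
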
4 changes: 2 additions & 2 deletions src/gluonnlp/data/loading.py
@@ -81,7 +81,7 @@ def __init__(self, filename, **kwargs):
         else:
             raise ValueError('Unsupported extension: %s' % filename)
         self._keys = keys
-        super(NumpyDataset, self).__init__(*data)
+        super().__init__(*data)

     @property
     def keys(self):
@@ -125,7 +125,7 @@ def __init__(self, file_pattern):
         files = sorted(files)
         if len(files) == 0:
             raise ValueError('Cannot find any file with path "%s"' % file_pattern)
-        super(_PathDataset, self).__init__(files)
+        super().__init__(files)


 def _dataset_worker_fn(urls, dataset_fn, batch_sampler_fn):

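The two hunks above (like the initializer.py hunk below) are a mechanical Python 3 cleanup: the explicit class-and-instance arguments to `super()` are dropped. A standalone reminder of the equivalence:

```python
class Base:
    def __init__(self, *data):
        self.data = data

class Child(Base):
    def __init__(self):
        super().__init__(1, 2, 3)               # Python 3 zero-argument form used above
        # super(Child, self).__init__(1, 2, 3)  # equivalent Python 2 style being removed
```
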
2 changes: 1 addition & 1 deletion src/gluonnlp/initializer.py
@@ -51,7 +51,7 @@ class TruncNorm(Initializer):
     """
     def __init__(self, mean: float = 0, stdev: float = 0.01,
                  scale=2, **kwargs):
-        super(TruncNorm, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self._mean = mean
         self._stdev = stdev
         self._scale = scale

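For reference, `TruncNorm` is the truncated-normal weight initializer that the fine-tuning scripts touched by this commit pass as `weight_initializer=TruncNorm(stdev=0.02)`. A minimal usage sketch (toy layer and CPU context assumed):

```python
import mxnet as mx
from gluonnlp.initializer import TruncNorm

# Initialize a layer's weights from a truncated normal distribution.
net = mx.gluon.nn.Dense(2, flatten=False, weight_initializer=TruncNorm(stdev=0.02))
net.initialize(ctx=mx.cpu())
```
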
