This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Commit

Use Amazon S3 Transfer Acceleration (#1260)
leezu committed Jul 10, 2020
1 parent cd48efd commit 83e1f13
Showing 8 changed files with 15 additions and 15 deletions.
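All eight files make the same mechanical substitution: download URLs on the regional endpoint gluonnlp-numpy-data.s3-us-west-2.amazonaws.com become URLs on the S3 Transfer Acceleration endpoint gluonnlp-numpy-data.s3-accelerate.amazonaws.com, which routes transfers through AWS edge locations to speed up long-distance downloads. Acceleration also has to be enabled once on the bucket itself, a step outside this repository. A minimal boto3 sketch of that step, assuming admin credentials for the bucket (this is not code from the commit):

# Hypothetical one-time setup, not part of this commit: enable S3 Transfer
# Acceleration so the *.s3-accelerate.amazonaws.com endpoint starts working.
import boto3

s3 = boto3.client('s3')
s3.put_bucket_accelerate_configuration(
    Bucket='gluonnlp-numpy-data',
    AccelerateConfiguration={'Status': 'Enabled'},
)
# Confirm the new status ('Enabled' once the call above succeeds).
print(s3.get_bucket_accelerate_configuration(Bucket='gluonnlp-numpy-data').get('Status'))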
4 changes: 2 additions & 2 deletions scripts/datasets/language_modeling/prepare_lm.py
@@ -50,9 +50,9 @@
     # The original address of Google One Billion Word dataset is
     # http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
     # We uploaded the file to S3 to accelerate the speed
-    'gbw': 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz',
+    'gbw': 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz',
     # The data is obtained from https://raw.githubusercontent.com/rafaljozefowicz/lm/master/1b_word_vocab.txt
-    'gbw_vocab': 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt'
+    'gbw_vocab': 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt'
 }
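The rewritten URLs are plain HTTPS endpoints, so any downloader can fetch them; for example, a short sketch using MXNet's gluon.utils.download helper, with the SHA-1 for 1b_word_vocab.txt taken from the checksum file updated later in this commit:

# Sketch: fetch the accelerated URL with MXNet's downloader; passing the
# recorded SHA-1 makes it verify the file and skip an already-valid copy.
from mxnet import gluon

url = ('https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/'
       'datasets/language_modeling/1b_word_vocab.txt')
path = gluon.utils.download(url, sha1_hash='aa2322a3da82ef628011336c9b5c6059e4f56c3f')
print(path)  # local path of the downloaded vocabulary file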


2 changes: 1 addition & 1 deletion scripts/datasets/machine_translation/prepare_wmt.py
@@ -235,7 +235,7 @@
     # For the CWMT dataset, you can also download them from the official location: http://nlp.nju.edu.cn/cwmt-wmt/
     # Currently, this version is processed via https://gist.github.com/sxjscience/54bedd68ce3fb69b3b1b264377efb5a5
     'cwmt': {
-        'url': 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt.tar.gz',
+        'url': 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt.tar.gz',
         'zh-en': {
             'en': 'cwmt/cwmt-zh-en.en',
             'zh': 'cwmt/cwmt-zh-en.zh'
2 changes: 1 addition & 1 deletion scripts/datasets/pretrain_corpus/prepare_bookcorpus.py
@@ -34,7 +34,7 @@

 _URLS = {
     'gutenberg':
-        'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/pretrain_corpus/Gutenberg.zip',
+        'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/Gutenberg.zip',
 }


2 changes: 1 addition & 1 deletion scripts/datasets/url_checksums/book_corpus.txt
@@ -1 +1 @@
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/pretrain_corpus/Gutenberg.zip 91e842dc3671ed5a917b7ff6a60f5f87397780e2 461506225
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/Gutenberg.zip 91e842dc3671ed5a917b7ff6a60f5f87397780e2 461506225
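Each line in these url_checksums files holds three whitespace-separated fields: the download URL, the file's SHA-1 hex digest, and its size in bytes; only the URL field changes in this commit. A minimal verification sketch under that assumption (verify_download is a hypothetical helper, not GluonNLP API):

# Sketch only: verify_download is a hypothetical helper illustrating how one
# line of a checksum file can be checked against a downloaded file.
import hashlib
import os

def verify_download(path, expected_sha1, expected_size):
    # Cheap size comparison first, then the full SHA-1 digest.
    if os.path.getsize(path) != expected_size:
        return False
    sha1 = hashlib.sha1()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(1 << 20), b''):  # 1 MiB chunks
            sha1.update(chunk)
    return sha1.hexdigest() == expected_sha1

# Fields from the checksum line above.
entry = ('https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/Gutenberg.zip'
         ' 91e842dc3671ed5a917b7ff6a60f5f87397780e2 461506225')
url, sha1_hex, size = entry.split()
# verify_download('Gutenberg.zip', sha1_hex, int(size))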
4 changes: 2 additions & 2 deletions scripts/datasets/url_checksums/language_model.txt
@@ -2,5 +2,5 @@ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 3c914d1
 https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip 0aec09a7537b58d4bb65362fee27650eeaba625a 190229076
 http://mattmahoney.net/dc/enwik8.zip d856b1ccd937c51aeb9c342e47666fb8c38e7e72 36445475
 http://mattmahoney.net/dc/text8.zip 6c70299b93b7e1f927b42cd8f6ac1a31547c7a2e 31344016
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz 4df859766482e12264a5a9d9fb7f0e276020447d 1792209805
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt aa2322a3da82ef628011336c9b5c6059e4f56c3f 9507106
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz 4df859766482e12264a5a9d9fb7f0e276020447d 1792209805
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt aa2322a3da82ef628011336c9b5c6059e4f56c3f 9507106
12 changes: 6 additions & 6 deletions scripts/datasets/url_checksums/wmt.txt
@@ -34,16 +34,16 @@ https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01
 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02 bf6b18a33c8cafa6889fd463fa8a2850d8877d35 306221588
 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00 1bec5f10297512183e483fdd4984d207700657d1 1073741824
 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01 15df2968bc69ef7662cf3029282bbb62cbf107b1 312943879
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt/parallel/casia2015.zip b432394685e4c53797e1ac86851f8a013aef27a2 98159063
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt/parallel/casict2011.zip 769a9a86c24e9507dbf520b950b9026120cb041e 166957775
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt/parallel/datum2015.zip 6d94cc8d296dd4268ed0a10fa3a419267280363e 100118018
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt/parallel/datum2017.zip 480fa06760b2dbe7c9a9bd7c3fd5e5b22b860a45 37389573
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt/parallel/neu2017.zip 532b56ba62f6cffccdc85f4316468873ca739bd1 148681171
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/casia2015.zip b432394685e4c53797e1ac86851f8a013aef27a2 98159063
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/casict2011.zip 769a9a86c24e9507dbf520b950b9026120cb041e 166957775
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/datum2015.zip 6d94cc8d296dd4268ed0a10fa3a419267280363e 100118018
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/datum2017.zip 480fa06760b2dbe7c9a9bd7c3fd5e5b22b860a45 37389573
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/neu2017.zip 532b56ba62f6cffccdc85f4316468873ca739bd1 148681171
 http://data.statmt.org/wmt17/translation-task/rapid2016.tgz 8b173ce0bc77f2a1a57c8134143e3b5ae228a6e2 163416042
 https://s3-eu-west-1.amazonaws.com/tilde-model/rapid2019.de-en.zip aafe431338abb98fc20951b2d6011223a1b91311 111888392
 http://data.statmt.org/wmt19/translation-task/dev.tgz 451ce2cae815c8392212ccb3f54f5dcddb9b2b9e 38654961
 http://data.statmt.org/wmt19/translation-task/test.tgz ce02a36fb2cd41abfa19d36eb8c8d50241ed3346 3533424
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt.tar.gz 88c2f4295169e9f0a9834bf8bff87e3fd4c04055 709032378
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt.tar.gz 88c2f4295169e9f0a9834bf8bff87e3fd4c04055 709032378
 http://data.statmt.org/news-crawl/de/news.2007.de.shuffled.deduped.gz 9d746b9df345f764e6e615119113c70e3fb0858c 90104365
 http://data.statmt.org/news-crawl/de/news.2008.de.shuffled.deduped.gz 185a24e8833844486aee16cb5decf9a64da1c101 308205291
 http://data.statmt.org/news-crawl/de/news.2009.de.shuffled.deduped.gz 9f7645fc6467de88f4205d94f483194838bad8ce 317590378
2 changes: 1 addition & 1 deletion src/gluonnlp/base.py
@@ -57,7 +57,7 @@ def get_model_zoo_checksum_dir():
 def get_repo_url():
     """Return the base URL for Gluon dataset and model repository """
     # TODO(sxjscience) Revise later by calling gluon.utils._get_repo_url
-    default_repo = 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/'
+    default_repo = 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/'
     repo_url = os.environ.get('MXNET_GLUON_REPO', default_repo)
     if repo_url[-1] != '/':
         repo_url = repo_url + '/'
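Because get_repo_url() consults the MXNET_GLUON_REPO environment variable before falling back to the accelerated default, users behind a firewall or on a mirror can opt out without code changes. A small sketch; the override value is just the pre-commit regional endpoint, used here for illustration:

import os

# Set the override before get_repo_url() is called; any mirror URL works.
os.environ['MXNET_GLUON_REPO'] = 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/'

from gluonnlp.base import get_repo_url
print(get_repo_url())  # https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/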
2 changes: 1 addition & 1 deletion src/gluonnlp/data/tokenizers.py
@@ -1259,7 +1259,7 @@ class SentencepieceTokenizer(BaseTokenizerWithVocab):
     Examples
     --------
     >>> from mxnet import gluon
-    >>> url = 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/tokenizer_test_models/sentencepiece/test_ende-a9bee4.model'
+    >>> url = 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/tokenizer_test_models/sentencepiece/test_ende-a9bee4.model'
     >>> model_f = gluon.utils.download(url)
     -etc-
     >>> tokenizer = gluonnlp.data.SentencepieceTokenizer(model_f)
