This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Use Amazon S3 Transfer Acceleration #1260

Merged · 1 commit · Jul 10, 2020
4 changes: 2 additions & 2 deletions scripts/datasets/language_modeling/prepare_lm.py
@@ -50,9 +50,9 @@
     # The original address of Google One Billion Word dataset is
     # http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
     # We uploaded the file to S3 to accelerate the speed
-    'gbw': 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz',
+    'gbw': 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz',
     # The data is obtained from https://raw.githubusercontent.com/rafaljozefowicz/lm/master/1b_word_vocab.txt
-    'gbw_vocab': 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt'
+    'gbw_vocab': 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt'
 }
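Every change in this PR follows the same pattern: the regional endpoint (s3-us-west-2) in each virtual-hosted-style URL is swapped for the global Transfer Acceleration endpoint (s3-accelerate), which routes transfers through the nearest AWS edge location; the bucket name and object key are untouched. A minimal sketch of the rewrite in Python (the helper name is ours, and it assumes Transfer Acceleration is enabled on the gluonnlp-numpy-data bucket, as this PR implies):

    def to_accelerate_endpoint(url: str) -> str:
        # Swap the regional S3 endpoint for the global Transfer
        # Acceleration endpoint; bucket name and key stay unchanged.
        return url.replace('.s3-us-west-2.amazonaws.com',
                           '.s3-accelerate.amazonaws.com')

    url = 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt.tar.gz'
    assert to_accelerate_endpoint(url) == \
        'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt.tar.gz'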
2 changes: 1 addition & 1 deletion scripts/datasets/machine_translation/prepare_wmt.py
@@ -235,7 +235,7 @@
     # For the CWMT dataset, you can also download them from the official location: http://nlp.nju.edu.cn/cwmt-wmt/
     # Currently, this version is processed via https://gist.github.com/sxjscience/54bedd68ce3fb69b3b1b264377efb5a5
     'cwmt': {
-        'url': 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt.tar.gz',
+        'url': 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt.tar.gz',
         'zh-en': {
             'en': 'cwmt/cwmt-zh-en.en',
             'zh': 'cwmt/cwmt-zh-en.zh'
2 changes: 1 addition & 1 deletion scripts/datasets/pretrain_corpus/prepare_bookcorpus.py
@@ -34,7 +34,7 @@

 _URLS = {
     'gutenberg':
-        'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/pretrain_corpus/Gutenberg.zip',
+        'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/Gutenberg.zip',
 }
2 changes: 1 addition & 1 deletion scripts/datasets/url_checksums/book_corpus.txt
@@ -1 +1 @@
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/pretrain_corpus/Gutenberg.zip 91e842dc3671ed5a917b7ff6a60f5f87397780e2 461506225
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/Gutenberg.zip 91e842dc3671ed5a917b7ff6a60f5f87397780e2 461506225
4 changes: 2 additions & 2 deletions scripts/datasets/url_checksums/language_model.txt
@@ -2,5 +2,5 @@ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 3c914d1
 https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip 0aec09a7537b58d4bb65362fee27650eeaba625a 190229076
 http://mattmahoney.net/dc/enwik8.zip d856b1ccd937c51aeb9c342e47666fb8c38e7e72 36445475
 http://mattmahoney.net/dc/text8.zip 6c70299b93b7e1f927b42cd8f6ac1a31547c7a2e 31344016
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz 4df859766482e12264a5a9d9fb7f0e276020447d 1792209805
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt aa2322a3da82ef628011336c9b5c6059e4f56c3f 9507106
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz 4df859766482e12264a5a9d9fb7f0e276020447d 1792209805
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt aa2322a3da82ef628011336c9b5c6059e4f56c3f 9507106
12 changes: 6 additions & 6 deletions scripts/datasets/url_checksums/wmt.txt
@@ -34,16 +34,16 @@ https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01
 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02 bf6b18a33c8cafa6889fd463fa8a2850d8877d35 306221588
 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00 1bec5f10297512183e483fdd4984d207700657d1 1073741824
 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01 15df2968bc69ef7662cf3029282bbb62cbf107b1 312943879
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt/parallel/casia2015.zip b432394685e4c53797e1ac86851f8a013aef27a2 98159063
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt/parallel/casict2011.zip 769a9a86c24e9507dbf520b950b9026120cb041e 166957775
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt/parallel/datum2015.zip 6d94cc8d296dd4268ed0a10fa3a419267280363e 100118018
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt/parallel/datum2017.zip 480fa06760b2dbe7c9a9bd7c3fd5e5b22b860a45 37389573
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt/parallel/neu2017.zip 532b56ba62f6cffccdc85f4316468873ca739bd1 148681171
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/casia2015.zip b432394685e4c53797e1ac86851f8a013aef27a2 98159063
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/casict2011.zip 769a9a86c24e9507dbf520b950b9026120cb041e 166957775
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/datum2015.zip 6d94cc8d296dd4268ed0a10fa3a419267280363e 100118018
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/datum2017.zip 480fa06760b2dbe7c9a9bd7c3fd5e5b22b860a45 37389573
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/neu2017.zip 532b56ba62f6cffccdc85f4316468873ca739bd1 148681171
 http://data.statmt.org/wmt17/translation-task/rapid2016.tgz 8b173ce0bc77f2a1a57c8134143e3b5ae228a6e2 163416042
 https://s3-eu-west-1.amazonaws.com/tilde-model/rapid2019.de-en.zip aafe431338abb98fc20951b2d6011223a1b91311 111888392
 http://data.statmt.org/wmt19/translation-task/dev.tgz 451ce2cae815c8392212ccb3f54f5dcddb9b2b9e 38654961
 http://data.statmt.org/wmt19/translation-task/test.tgz ce02a36fb2cd41abfa19d36eb8c8d50241ed3346 3533424
-https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/wmt/cwmt.tar.gz 88c2f4295169e9f0a9834bf8bff87e3fd4c04055 709032378
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt.tar.gz 88c2f4295169e9f0a9834bf8bff87e3fd4c04055 709032378
 http://data.statmt.org/news-crawl/de/news.2007.de.shuffled.deduped.gz 9d746b9df345f764e6e615119113c70e3fb0858c 90104365
 http://data.statmt.org/news-crawl/de/news.2008.de.shuffled.deduped.gz 185a24e8833844486aee16cb5decf9a64da1c101 308205291
 http://data.statmt.org/news-crawl/de/news.2009.de.shuffled.deduped.gz 9f7645fc6467de88f4205d94f483194838bad8ce 317590378
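Each url_checksums entry pairs a URL with what appears to be a SHA-1 digest and a size in bytes, which lets the download scripts validate whatever arrives from the new endpoint. A hedged sketch of such a check (the verify helper is ours, not the repo's actual validation code):

    import hashlib

    def verify(path: str, expected_sha1: str, expected_size: int) -> bool:
        # Recompute both fields recorded in the checksum file:
        # total byte count and SHA-1 digest of the file contents.
        sha1 = hashlib.sha1()
        size = 0
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(1 << 20), b''):
                sha1.update(chunk)
                size += len(chunk)
        return size == expected_size and sha1.hexdigest() == expected_sha1

    # Entry taken verbatim from wmt.txt above:
    print(verify('cwmt.tar.gz',
                 '88c2f4295169e9f0a9834bf8bff87e3fd4c04055', 709032378))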
2 changes: 1 addition & 1 deletion src/gluonnlp/base.py
@@ -57,7 +57,7 @@ def get_model_zoo_checksum_dir():
 def get_repo_url():
     """Return the base URL for Gluon dataset and model repository """
     # TODO(sxjscience) Revise later by calling gluon.utils._get_repo_url
-    default_repo = 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/'
+    default_repo = 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/'
     repo_url = os.environ.get('MXNET_GLUON_REPO', default_repo)
     if repo_url[-1] != '/':
         repo_url = repo_url + '/'
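Because get_repo_url() consults MXNET_GLUON_REPO before falling back to the default, anyone who prefers the old regional endpoint (for example, when running inside us-west-2, where acceleration may add little) can still override it. A small sketch of that override, using only what the diff above shows:

    import os
    # Point downloads back at the regional bucket; the env var is read
    # each time get_repo_url() is called, so set it before any download.
    os.environ['MXNET_GLUON_REPO'] = \
        'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/'

    from gluonnlp.base import get_repo_url
    print(get_repo_url())  # https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/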
2 changes: 1 addition & 1 deletion src/gluonnlp/data/tokenizers.py
@@ -1259,7 +1259,7 @@ class SentencepieceTokenizer(BaseTokenizerWithVocab):
     Examples
     --------
     >>> from mxnet import gluon
-    >>> url = 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/tokenizer_test_models/sentencepiece/test_ende-a9bee4.model'
+    >>> url = 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/tokenizer_test_models/sentencepiece/test_ende-a9bee4.model'
     >>> model_f = gluon.utils.download(url)
     -etc-
     >>> tokenizer = gluonnlp.data.SentencepieceTokenizer(model_f)