Skip to content

Commit

Permalink
use fastdownload
Browse files Browse the repository at this point in the history
  • Loading branch information
jph00 committed Jul 31, 2021
1 parent 84da60b commit 63b624c
Show file tree
Hide file tree
Showing 3 changed files with 263 additions and 330 deletions.
108 changes: 106 additions & 2 deletions fastai/data/download_checks.py
@@ -1,4 +1,108 @@
{'https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz': (11784419,
{'https://s3.amazonaws.com/fast-ai-coco/coco_sample.tgz': (3245877008,
'006cd55d633d94b36ecaf661467830ec'),
'https://s3.amazonaws.com/fast-ai-coco/coco_tiny.tgz': (801038,
'367467451ac4fba79a647753c2c66d3a'),
'https://s3.amazonaws.com/fast-ai-imageclas/CUB_200_2011.tgz': (1150585339,
'd2acaa99439dff0483c7bbac1bfe2a92'),
'https://s3.amazonaws.com/fast-ai-imageclas/bedroom.tgz': (4579163978,
'35d84f38f8a15fe47e66e460c8800d68'),
'https://s3.amazonaws.com/fast-ai-imageclas/caltech_101.tgz': (131740031,
'd673425306e98ee4619fcdeef8a0e876'),
'https://s3.amazonaws.com/fast-ai-imageclas/cifar100.tgz': (169168619,
'e5e65dcb54b9d3913f7b8a9ad6607e62'),
'https://s3.amazonaws.com/fast-ai-imageclas/food-101.tgz': (5686607260,
'1a540ebf1fb40b2bf3f2294234ba7907'),
'https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz': (99003388,
'c475f8f7617a200ba35b9facc48443c3'),
'https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-320.tgz': (341663724,
'1f1f74f133caff120e575856659b87a2'),
'https://s3.amazonaws.com/fast-ai-imageclas/imagenette2.tgz': (1557161267,
'2c774ecb40005b35d7937d50f5d42336'),
'https://s3.amazonaws.com/fast-ai-imageclas/imagewang-160.tgz': (191498213,
'dab9b1d97b95574ac64122a19bc74ca1'),
'https://s3.amazonaws.com/fast-ai-imageclas/imagewang-320.tgz': (669826647,
'fff8ac034d60e14fcf7789845e625263'),
'https://s3.amazonaws.com/fast-ai-imageclas/imagewang.tgz': (2900347689,
'2b7776bac0fc95db72ac8a3b091a3e30'),
'https://s3.amazonaws.com/fast-ai-imageclas/imagewoof2-160.tgz': (92612825,
'3559b194123981a540b87132dbda412f'),
'https://s3.amazonaws.com/fast-ai-imageclas/imagewoof2-320.tgz': (328387740,
'7a1fd92672a559a85de76e4810a20c21'),
'https://s3.amazonaws.com/fast-ai-imageclas/imagewoof2.tgz': (1343715595,
'2cc1c1e36e20cb6fd24ffb978edcb487'),
'https://s3.amazonaws.com/fast-ai-imageclas/mnist_png.tgz': (15683414,
'03639f83c4e3d19e0a3a53a8a997c487'),
'https://s3.amazonaws.com/fast-ai-imageclas/mnist_var_size_tiny.tgz': (565372,
'b71a930f4eb744a4a143a6c7ff7ed67f'),
'https://s3.amazonaws.com/fast-ai-imageclas/oxford-102-flowers.tgz': (345236087,
'5666e01c1311b4c67fcf20d2b3850a88'),
'https://s3.amazonaws.com/fast-ai-imageclas/oxford-iiit-pet.tgz': (811706944,
'e4db5c768afd933bb91f5f594d7417a4'),
'https://s3.amazonaws.com/fast-ai-imageclas/stanford-cars.tgz': (1957803273,
'9045d6673c9ced0889f41816f6bf2f9f'),
'https://s3.amazonaws.com/fast-ai-imagelocal/biwi_head_pose.tgz': (452316199,
'00f4ccf66e8cba184bc292fdc08fb237'),
'https://s3.amazonaws.com/fast-ai-imagelocal/camvid.tgz': (598913237,
'648371e4f3a833682afb39b08a3ce2aa'),
'https://s3.amazonaws.com/fast-ai-imagelocal/pascal_2007.tgz': (1637796771,
'433b4706eb7c42bd74e7f784e3fdf244'),
'https://s3.amazonaws.com/fast-ai-imagelocal/pascal_2012.tgz': (2618908000,
'd90e29e54a4c76c0c6fba8355dcbaca5'),
'https://s3.amazonaws.com/fast-ai-imagelocal/siim_small.tgz': (33276453,
'2f88ac350dce1d971ecbb5ec722da75f'),
'https://s3.amazonaws.com/fast-ai-imagelocal/tcga_small.tgz': (14744474,
'3ceffe7cf522cb1c60e93dc555d8817f'),
'https://s3.amazonaws.com/fast-ai-modelzoo/transformer.tgz': (432848315,
'024b0d2203ebb0cd1fc64b27cf8af18e'),
'https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz': (11784419,
'b86f328f4dbd072486591cb7a5644dcd'),
'https://s3.amazonaws.com/fast-ai-nlp/amazon_review_full_csv.tgz': (71606272,
'4a1196cf0adaea22f4bc3f592cddde90'),
'https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz': (688339454,
'676f7e5208ec343c8274b4bb085bc938'),
'https://s3.amazonaws.com/fast-ai-nlp/dbpedia_csv.tgz': (68341743,
'239c7837b9e79db34486f3de6a00e38e'),
'https://s3.amazonaws.com/fast-ai-nlp/giga-fren.tgz': (2598183296,
'69573f58e2c850b90f2f954077041d8c'),
'https://s3.amazonaws.com/fast-ai-nlp/imdb.tgz': (144440600,
'90f9b1c4ff43a90d67553c9240dc0249'),
'https://s3.amazonaws.com/fast-ai-nlp/sogou_news_csv.tgz': (384269937,
'950f1366d33be52f5b944f8a8b680902'),
'https://s3.amazonaws.com/fast-ai-nlp/wikitext-103.tgz': (190200704,
'2dd8cf8693b3d27e9c8f0a7df054b2c7'),
'https://s3.amazonaws.com/fast-ai-nlp/wikitext-2.tgz': (4070055,
'2a82d47a7b85c8b6a8e068dc4c1d37e7'),
'https://s3.amazonaws.com/fast-ai-nlp/yahoo_answers_csv.tgz': (319476345,
'0632a0d236ef3a529c0fa4429b339f68'),
'https://s3.amazonaws.com/fast-ai-nlp/yelp_review_full_csv.tgz': (196146755,
'1efd84215ea3e30d90e4c33764b889db'),
'https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz': (166373201,
'48c8451c1ad30472334d856b5d294807'),
'https://s3.amazonaws.com/fast-ai-sample/adult_sample.tgz': (968212,
'64eb9d7e23732de0b138f7372d15492f')}
'64eb9d7e23732de0b138f7372d15492f'),
'https://s3.amazonaws.com/fast-ai-sample/biwi_sample.tgz': (593774,
'9179f4c1435f4b291f0d5b072d60c2c9'),
'https://s3.amazonaws.com/fast-ai-sample/camvid_tiny.tgz': (2314212,
'2cf6daf91b7a2083ecfa3e9968e9d915'),
'https://s3.amazonaws.com/fast-ai-sample/cifar10.tgz': (168168549,
'a5f8c31371b63a406b23368042812d3c'),
'https://s3.amazonaws.com/fast-ai-sample/dogscats.tgz': (839285364,
'3e483c8d6ef2175e9d395a6027eb92b7'),
'https://s3.amazonaws.com/fast-ai-sample/human_numbers.tgz': (30252,
'8a19c3bfa2bcb08cd787e741261f3ea2'),
'https://s3.amazonaws.com/fast-ai-sample/imdb_sample.tgz': (571827,
'0842e61a9867caa2e6fbdb14fa703d61'),
'https://s3.amazonaws.com/fast-ai-sample/mnist_sample.tgz': (3214948,
'2dbc7ec6f9259b583af0072c55816a88'),
'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz': (342207,
'56143e8f24db90d925d82a5a74141875'),
'https://s3.amazonaws.com/fast-ai-sample/movie_lens_sample.tgz': (51790,
'10961384dfe7c5181460390a460c1f77'),
'https://s3.amazonaws.com/fast-ai-sample/planet_sample.tgz': (15523994,
'8bfb174b3162f07fbde09b54555bdb00'),
'https://s3.amazonaws.com/fast-ai-sample/planet_tiny.tgz': (997569,
'490873c5683454d4b2611fb1f00a68a9'),
'https://storage.googleapis.com/ml-animal-sounds-datasets/macaques.zip': (131604586,
'44fec3950e61d6a898f16fe30bc9c88d'),
'https://storage.googleapis.com/ml-animal-sounds-datasets/zebra_finch.zip': (186624896,
'91fa9c4ebfc986b9babc2a805a10e281')}
180 changes: 21 additions & 159 deletions fastai/data/external.py
@@ -1,65 +1,31 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/04_data.external.ipynb (unless otherwise specified).

__all__ = ['Config', 'URLs', 'download_url', 'download_data', 'file_extract', 'newest_folder', 'rename_extracted',
'untar_data']
__all__ = ['fastai_cfg', 'fastai_path', 'URLs', 'untar_data']

# Cell
from ..torch_basics import *
from fastdownload import FastDownload
from functools import lru_cache
import fastai.data

# Cell
class Config:
    "Set up the fastai config at `~/.fastai` (or `$FASTAI_HOME`) unless it already exists."
    # Directory holding the fastai settings; `FASTAI_HOME` overrides the `~/.fastai` default.
    config_path = Path(os.environ.get('FASTAI_HOME', '~/.fastai')).expanduser()
    # YAML settings file located directly inside `config_path`.
    config_file = config_path / 'config.yml'
@lru_cache(maxsize=None)
def fastai_cfg():
    "`Config` object for fastai's `config.ini` (created on first call, then served from the cache)"
    # `FASTAI_HOME` overrides the default `~/.fastai` location.
    cfg_dir = Path(os.getenv('FASTAI_HOME', '~/.fastai'))
    # Entries passed to `Config` through its `create` argument.
    defaults = {'data': 'data', 'archive': 'archive', 'storage': 'tmp', 'model': 'models'}
    return Config(cfg_dir, 'config.ini', create=defaults)

def __init__(self):
    "Make sure the config directory and file exist, then load the settings into `self.d`."
    self.config_path.mkdir(parents=True, exist_ok=True)
    config_missing = not self.config_file.exists()
    if config_missing:
        self.create_config()
    self.d = self.load_config()

def __getitem__(self,k):
k = k.lower()
if k not in self.d: k = k+'_path'
return Path(self.d[k])

def __getattr__(self,k):
if k=='d': raise AttributeError
return self[k]

def __setitem__(self,k,v): self.d[k] = str(v)
def __contains__(self,k): return k in self.d

def load_config(self):
    """Load and return the config if its `version` equals 2, else rewrite the file and reload.

    A file with an older `version` is recreated from its current values via
    `create_config(config)`; a file with no `version` (including an empty or
    non-mapping YAML document) is recreated via `create_config()`.
    """
    with open(self.config_file, 'r') as f:
        config = yaml.safe_load(f)
    # An empty file parses to `None` and a malformed one may parse to a list or
    # scalar; treat anything that is not a mapping as "no version present".
    if not isinstance(config, dict): config = {}
    if 'version' in config and config['version'] == 2: return config
    # The read handle is closed at this point, so the file can be rewritten safely.
    if 'version' in config: self.create_config(config)
    else: self.create_config()
    return self.load_config()

def create_config(self, cfg=None):
    """Create a new config file with default paths and `version` set to 2.

    `cfg`, when given, is combined with the defaults through `merge` with its
    `version` forced to 2 (presumably later dicts win in `merge` - confirm
    against its definition). The caller's `cfg` dict is left unmodified.
    """
    config = {'data_path': str(self.config_path/'data'),
              'archive_path': str(self.config_path/'archive'),
              'storage_path': '/tmp',
              'model_path': str(self.config_path/'models'),
              'version': 2}
    if cfg is not None:
        # Copy instead of assigning `cfg['version']` so the argument is not mutated.
        upgraded = {**cfg, 'version': 2}
        config = merge(config, upgraded)
    self.save_file(config)

def save(self):
    "Write the in-memory settings `self.d` to the config file via `save_file`."
    current = self.d
    self.save_file(current)
def save_file(self, config):
    "Dump `config` as YAML into `self.config_file` (default location `~/.fastai/config.yml`), replacing its contents."
    with self.config_file.open('w') as out:
        yaml.dump(config, out, default_flow_style=False)
# Cell
def fastai_path(folder):
    "Return what `fastai_cfg().path` yields for `folder`"
    cfg = fastai_cfg()
    return cfg.path(folder)

# Cell
class URLs():
"Global constants for dataset and model URLs."
LOCAL_PATH = Path.cwd()
MDL = 'http://files.fast.ai/models/'
GOOGLE = 'https://storage.googleapis.com/'
S3 = 'https://s3.amazonaws.com/fast-ai-'
URL = f'{S3}sample/'

Expand Down Expand Up @@ -131,8 +97,8 @@ class URLs():
PASCAL_2012 = f'{S3_IMAGELOC}pascal_2012.tgz'

# Audio classification datasets
MACAQUES = 'https://storage.googleapis.com/ml-animal-sounds-datasets/macaques.zip'
ZEBRA_FINCH = 'https://storage.googleapis.com/ml-animal-sounds-datasets/zebra_finch.zip'
MACAQUES = f'{GOOGLE}ml-animal-sounds-datasets/macaques.zip'
ZEBRA_FINCH = f'{GOOGLE}ml-animal-sounds-datasets/zebra_finch.zip'

# Medical Imaging datasets
#SKIN_LESION = f'{S3_IMAGELOC}skin_lesion.tgz'
Expand All @@ -149,114 +115,10 @@ def path(url='.', c_key='archive'):
fname = url.split('/')[-1]
local_path = URLs.LOCAL_PATH/('models' if c_key=='models' else 'data')/fname
if local_path.exists(): return local_path
return Config()[c_key]/fname

# Cell
def download_url(url, dest, overwrite=False, pbar=None, show_progress=True, chunk_size=1024*1024,
                 timeout=4, retries=5):
    """Download `url` to `dest` unless it exists and not `overwrite`.

    url: address to fetch. dest: local file path (`str` or `Path`).
    overwrite: download even when `dest` already exists.
    pbar: parent progress bar passed on to `progress_bar`.
    show_progress: show a progress bar; turned off when the response has no
    usable `Content-Length`.
    chunk_size: number of bytes requested per streamed chunk.
    timeout: timeout handed to the GET request.
    retries: `max_retries` of the transport adapters.

    Returns `None`. A `ConnectionError` while streaming is reported with manual
    recovery instructions instead of being raised; whatever was written so far
    stays in `dest`.
    """
    if os.path.exists(dest) and not overwrite: return

    with requests.Session() as s:
        # Mount the retry policy for both schemes: the dataset URLs are https,
        # so an adapter mounted on 'http://' alone would never apply to them.
        s.mount('http://', requests.adapters.HTTPAdapter(max_retries=retries))
        s.mount('https://', requests.adapters.HTTPAdapter(max_retries=retries))
        # additional line to identify as a firefox browser, see fastai/#2438
        s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'})
        u = s.get(url, stream=True, timeout=timeout)
        # Without a parseable Content-Length the total size is unknown, so no bar is shown.
        try: file_size = int(u.headers["Content-Length"])
        except (KeyError, ValueError): show_progress = False

        with open(dest, 'wb') as f:
            nbytes = 0
            if show_progress: pbar = progress_bar(range(file_size), leave=False, parent=pbar)
            try:
                if show_progress: pbar.update(0)
                for chunk in u.iter_content(chunk_size=chunk_size):
                    nbytes += len(chunk)
                    if show_progress: pbar.update(nbytes)
                    f.write(chunk)
            except requests.exceptions.ConnectionError:
                fname = url.split('/')[-1]
                # `dest` may be a plain string, so go through `Path` to get its directory.
                data_dir = Path(dest).parent
                print(f'\n Download of {url} has failed after {retries} retries\n'
                      f' Fix the download manually:\n'
                      f'$ mkdir -p {data_dir}\n'
                      f'$ cd {data_dir}\n'
                      f'$ wget -c {url}\n'
                      f'$ tar xf {fname}\n'
                      f' And re-run your code once the download is successful\n')

# Cell
def download_data(url, fname=None, c_key='archive', force_download=False, timeout=4):
    "Download `url` to `fname` (default: `URLs.path(url, c_key=c_key)`) and return that path."
    # The default location is only computed when no `fname` was supplied.
    target = Path(fname) if fname else Path(URLs.path(url, c_key=c_key))
    target.parent.mkdir(parents=True, exist_ok=True)
    already_there = target.exists()
    if force_download or not already_there:
        download_url(url, target, overwrite=force_download, timeout=timeout)
    return target

# Cell
def _get_check(url):
"internal function to get the hash of the file at `url`."
checks = json.load(open(Path(__file__).parent/'checks.txt', 'r'))
return checks.get(url, '')

def _check_file(fname):
"internal function to get the hash of the local file at `fname`."
size = os.path.getsize(fname)
with open(fname, "rb") as f: hash_nb = hashlib.md5(f.read(2**20)).hexdigest()
return [size,hash_nb]

# Cell
def _add_check(url, fname):
    "Internal function to update the internal check file with `url` and check on `fname`."
    checks_path = Path(__file__).parent/'checks.txt'
    with open(checks_path, 'r') as f:
        checks = json.load(f)
    checks[url] = _check_file(fname)
    # Close the handle explicitly (via `with`) so the rewritten JSON is flushed to disk.
    with open(checks_path, 'w') as f:
        json.dump(checks, f, indent=2)

# Cell
def file_extract(fname, dest=None):
    """Extract `fname` to `dest` using `tarfile` or `zipfile`.

    fname: archive path; names ending in 'gz' are opened as gzip-compressed
    tar, names ending in 'zip' as zip.
    dest: target directory, defaulting to the directory containing `fname`.
    Raises `Exception` for any other file name.
    """
    if dest is None: dest = Path(fname).parent
    fname = str(fname)
    # NOTE(review): `extractall` writes members wherever the archive says;
    # only use this on archives from trusted sources.
    if fname.endswith('gz'):
        # Context managers make sure the archive handle is closed after extraction.
        with tarfile.open(fname, 'r:gz') as archive: archive.extractall(dest)
    elif fname.endswith('zip'):
        with zipfile.ZipFile(fname) as archive: archive.extractall(dest)
    else: raise Exception(f'Unrecognized archive: {fname}')

# Cell
def _try_from_storage(dest, storage):
"an internal function to create symbolic links for files from `storage` to `dest` if `storage` exists"
if not storage.exists(): return
os.makedirs(dest, exist_ok=True)
for f in storage.glob('*'): os.symlink(f, dest/f.name, target_is_directory=f.is_dir())

# Cell
def newest_folder(path):
    "Return the entry directly under `path` with the largest `st_ctime` (files are considered too)"
    def _ctime(entry):
        return entry.stat().st_ctime
    return max(path.glob('*'), key=_ctime)

# Cell
def rename_extracted(dest):
    "Rename the newest entry next to `dest` to `dest`, unless it already has that name"
    newest = newest_folder(dest.parent)
    if newest.name != dest.name:
        newest.rename(dest)
return fastai_path(c_key)/fname

# Cell
def untar_data(url, fname=None, dest=None, c_key='data', force_download=False, extract_func=file_extract, timeout=4):
    """Download `url` to `fname` if `dest` doesn't exist, and un-tgz or unzip to folder `dest`.

    url: archive address. fname: local archive path (default `URLs.path(url)`).
    dest: parent folder for the extracted data (default taken from
    `URLs.path(url, c_key=c_key)`). force_download: delete any existing archive
    and extracted folder first. extract_func: called as `extract_func(fname, dest.parent)`.
    timeout: forwarded to `download_data`.

    Returns the extracted folder path. A failed check after download only
    prints a warning; extraction still proceeds.
    """
    def _mismatch(path):
        "True when a check is recorded for `url` and `path` does not match it."
        # Parse `checks.txt` once per comparison instead of twice.
        expected = _get_check(url)
        return bool(expected) and _check_file(path) != expected

    default_dest = URLs.path(url, c_key=c_key).with_suffix('')
    dest = default_dest if dest is None else Path(dest)/default_dest.name
    fname = Path(fname or URLs.path(url))
    # The recorded check is only consulted when an archive is already present.
    if fname.exists() and _mismatch(fname):
        print("A new version of this dataset is available, downloading...")
        force_download = True
    if force_download:
        if fname.exists(): os.remove(fname)
        if dest.exists(): shutil.rmtree(dest)
    # Prefer linking from the configured storage location over downloading.
    if not dest.exists(): _try_from_storage(dest, URLs.path(url, c_key='storage').with_suffix(''))
    if not dest.exists():
        fname = download_data(url, fname=fname, c_key=c_key, timeout=timeout)
        if _mismatch(fname):
            print(f"File downloaded is broken. Remove {fname} and try again.")
        extract_func(fname, dest.parent)
        rename_extracted(dest)
    return dest
def untar_data(url, archive=None, data=None, c_key='data', force_download=False):
    "Fetch `url` through a `FastDownload` built on `fastai_cfg()` and return the result of its `get` (`c_key` is passed as `extract_key`)"
    # NOTE(review): `base` is fixed to '~/.fastai' while `fastai_cfg` honours FASTAI_HOME -
    # confirm whether `FastDownload` uses `base` at all when a config object is supplied.
    downloader = FastDownload(fastai_cfg(), module=fastai.data, archive=archive, data=data, base='~/.fastai')
    return downloader.get(url, force=force_download, extract_key=c_key)

0 comments on commit 63b624c

Please sign in to comment.