Skip to content


use fastdownload
Browse files Browse the repository at this point in the history
  • Loading branch information
jph00 committed Jul 31, 2021
1 parent 84da60b commit 63b624c
Show file tree
Hide file tree
Showing 3 changed files with 263 additions and 330 deletions.
108 changes: 106 additions & 2 deletions fastai/data/
@@ -1,4 +1,108 @@
{'': (11784419,
{'': (3245877008,
'': (801038,
'': (1150585339,
'': (4579163978,
'': (131740031,
'': (169168619,
'': (5686607260,
'': (99003388,
'': (341663724,
'': (1557161267,
'': (191498213,
'': (669826647,
'': (2900347689,
'': (92612825,
'': (328387740,
'': (1343715595,
'': (15683414,
'': (565372,
'': (345236087,
'': (811706944,
'': (1957803273,
'': (452316199,
'': (598913237,
'': (1637796771,
'': (2618908000,
'': (33276453,
'': (14744474,
'': (432848315,
'': (11784419,
'': (71606272,
'': (688339454,
'': (68341743,
'': (2598183296,
'': (144440600,
'': (384269937,
'': (190200704,
'': (4070055,
'': (319476345,
'': (196146755,
'': (166373201,
'': (968212,
'': (593774,
'': (2314212,
'': (168168549,
'': (839285364,
'': (30252,
'': (571827,
'': (3214948,
'': (342207,
'': (51790,
'': (15523994,
'': (997569,
'': (131604586,
'': (186624896,
180 changes: 21 additions & 159 deletions fastai/data/
@@ -1,65 +1,31 @@
# AUTOGENERATED! DO NOT EDIT! File to edit: nbs/04_data.external.ipynb (unless otherwise specified).

__all__ = ['Config', 'URLs', 'download_url', 'download_data', 'file_extract', 'newest_folder', 'rename_extracted',
__all__ = ['fastai_cfg', 'fastai_path', 'URLs', 'untar_data']

# Cell
from ..torch_basics import *
from fastdownload import FastDownload
from functools import lru_cache

# Cell
class Config:
"Setup config at `~/.fastai` unless it exists already."
config_path = Path(os.getenv('FASTAI_HOME', '~/.fastai')).expanduser()
config_file = config_path/'config.yml'
def fastai_cfg():
    "Return the `Config` object for fastai's `config.ini`, rooted at `FASTAI_HOME` (default `~/.fastai`)."
    home = Path(os.getenv('FASTAI_HOME', '~/.fastai'))
    # Entries handed to `Config` through its `create` argument.
    defaults = {'data': 'data', 'archive': 'archive', 'storage': 'tmp', 'model': 'models'}
    return Config(home, 'config.ini', create=defaults)

def __init__(self):
    "Make sure the config directory and file exist, then load the settings into `self.d`."
    self.config_path.mkdir(parents=True, exist_ok=True)
    config_missing = not self.config_file.exists()
    if config_missing:
        self.create_config()
    self.d = self.load_config()

def __getitem__(self, k):
    "Look up `k` lower-cased, falling back to `k`+'_path' when absent, and return the value as a `Path`."
    key = k.lower()
    key = key if key in self.d else key + '_path'
    return Path(self.d[key])

def __getattr__(self,k):
if k=='d': raise AttributeError
return self[k]

def __setitem__(self, k, v):
    "Store `v` under `k`, converted to `str`."
    self.d[k] = str(v)
def __contains__(self, k):
    "Whether `k` is a key of the loaded settings `self.d` (exact match, no `_path` fallback)."
    return k in self.d

def load_config(self):
    "Return the stored config when its `version` is 2; otherwise rewrite it via `create_config` and load again."
    with open(self.config_file, 'r') as f:
        config = yaml.safe_load(f)
        has_version = 'version' in config
        if has_version and config['version'] == 2:
            return config
        # A config with another version is passed on to `create_config`;
        # an unversioned one is replaced by the defaults.
        if has_version:
            self.create_config(config)
        else:
            self.create_config()
    return self.load_config()

def create_config(self, cfg=None):
    "Create a new config with default paths, overlay `cfg` if given, set `version` to 2 and save it."
    config = {'data_path': str(self.config_path/'data'),
              'archive_path': str(self.config_path/'archive'),
              'storage_path': '/tmp',
              'model_path': str(self.config_path/'models'),
              'version': 2}
    if cfg is not None:
        # Stamp the incoming settings with the current version before merging.
        cfg['version'] = 2
        config = merge(config, cfg)
    # Persist the result: `__init__` and `load_config` read the file right
    # after calling this method, so building the dict alone is not enough.
    self.save_file(config)

def save(self):
    "Write the in-memory settings `self.d` out through `save_file`."
    self.save_file(self.d)
def save_file(self, config):
    "Save `config` as YAML at the config file location (`~/.fastai/config.yml` by default)."
    with open(self.config_file, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)
# Cell
def fastai_path(folder):
    "Resolve `folder` to a path using the config returned by `fastai_cfg`."
    cfg = fastai_cfg()
    return cfg.path(folder)

# Cell
class URLs():
"Global constants for dataset and model URLs."
LOCAL_PATH = Path.cwd()
MDL = ''
S3 = ''
URL = f'{S3}sample/'

Expand Down Expand Up @@ -131,8 +97,8 @@ class URLs():
PASCAL_2012 = f'{S3_IMAGELOC}pascal_2012.tgz'

# Audio classification datasets
MACAQUES = f'{GOOGLE}ml-animal-sounds-datasets/'
ZEBRA_FINCH = f'{GOOGLE}ml-animal-sounds-datasets/'

# Medical Imaging datasets
#SKIN_LESION = f'{S3_IMAGELOC}skin_lesion.tgz'
Expand All @@ -149,114 +115,10 @@ def path(url='.', c_key='archive'):
fname = url.split('/')[-1]
local_path = URLs.LOCAL_PATH/('models' if c_key=='models' else 'data')/fname
if local_path.exists(): return local_path
return Config()[c_key]/fname

# Cell
def download_url(url, dest, overwrite=False, pbar=None, show_progress=True, chunk_size=1024*1024,
                 timeout=4, retries=5):
    """Download `url` to `dest` unless it exists and not `overwrite`.

    The response is streamed in `chunk_size` pieces. A progress bar is shown
    only when `show_progress` is set and the server reports a usable
    `Content-Length`. On a connection error during streaming, manual recovery
    instructions are printed instead of raising.
    """
    if os.path.exists(dest) and not overwrite: return

    s = requests.Session()
    # Apply `retries` to the session so the value reported in the failure
    # message below is what was actually attempted.
    adapter = requests.adapters.HTTPAdapter(max_retries=retries)
    s.mount('http://', adapter)
    s.mount('https://', adapter)
    # additional line to identify as a firefox browser, see fastai/#2438
    s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'})
    u = s.get(url, stream=True, timeout=timeout)
    # Without a parseable size there is nothing to size the progress bar with.
    try: file_size = int(u.headers["Content-Length"])
    except (KeyError, ValueError): show_progress = False

    with open(dest, 'wb') as f:
        nbytes = 0
        if show_progress: pbar = progress_bar(range(file_size), leave=False, parent=pbar)
        try:
            if show_progress: pbar.update(0)
            for chunk in u.iter_content(chunk_size=chunk_size):
                nbytes += len(chunk)
                if show_progress: pbar.update(nbytes)
                f.write(chunk)
        except requests.exceptions.ConnectionError:
            fname = url.split('/')[-1]
            # `dest` may be a plain string, so normalise before taking the parent.
            data_dir = Path(dest).parent
            print(f'\n Download of {url} has failed after {retries} retries\n'
                  f' Fix the download manually:\n'
                  f'$ mkdir -p {data_dir}\n'
                  f'$ cd {data_dir}\n'
                  f'$ wget -c {url}\n'
                  f'$ tar xf {fname}\n'
                  f' And re-run your code once the download is successful\n')

# Cell
def download_data(url, fname=None, c_key='archive', force_download=False, timeout=4):
    "Download `url` to `fname` (default: `URLs.path(url, c_key=c_key)`) and return the local path."
    target = Path(fname) if fname else Path(URLs.path(url, c_key=c_key))
    target.parent.mkdir(parents=True, exist_ok=True)
    needs_download = force_download or not target.exists()
    if needs_download:
        download_url(url, target, overwrite=force_download, timeout=timeout)
    return target

# Cell
def _get_check(url):
    "Internal function to get the check recorded for `url` in `checks.txt` next to this module ('' if none)."
    checks_file = Path(__file__).parent/'checks.txt'
    # Use a context manager so the file handle is closed deterministically.
    with open(checks_file, 'r') as f:
        checks = json.load(f)
    return checks.get(url, '')

def _check_file(fname):
    "Internal function to get `[size, md5 hex digest]` of the local file at `fname`."
    size = os.path.getsize(fname)
    # Only the first 2**20 bytes are hashed; `size` covers the whole file.
    with open(fname, "rb") as f:
        hash_nb = hashlib.md5(f.read(2**20)).hexdigest()
    return [size, hash_nb]

# Cell
def _add_check(url, fname):
    "Internal function to update the internal check file with `url` and check on `fname`."
    checks_file = Path(__file__).parent/'checks.txt'
    # Both the read and the write use context managers so no handle is left open
    # and the rewritten file is flushed before returning.
    with open(checks_file, 'r') as f:
        checks = json.load(f)
    checks[url] = _check_file(fname)
    with open(checks_file, 'w') as f:
        json.dump(checks, f, indent=2)

# Cell
def file_extract(fname, dest=None):
    "Extract `fname` to `dest` (default: the folder containing `fname`) using `tarfile` or `zipfile`."
    if dest is None: dest = Path(fname).parent
    fname = str(fname)
    # NOTE(review): archives are extracted as-is; only use this on trusted files.
    if fname.endswith('gz'):
        with tarfile.open(fname, 'r:gz') as archive:
            archive.extractall(dest)
    elif fname.endswith('zip'):
        with zipfile.ZipFile(fname) as archive:
            archive.extractall(dest)
    else: raise Exception(f'Unrecognized archive: {fname}')

# Cell
def _try_from_storage(dest, storage):
    "An internal function to create symbolic links for files from `storage` to `dest` if `storage` exists"
    if not storage.exists(): return
    os.makedirs(dest, exist_ok=True)
    # Each top-level entry of `storage` is linked under the same name inside `dest`.
    for f in storage.glob('*'):
        os.symlink(f, Path(dest)/f.name, target_is_directory=f.is_dir())

# Cell
def newest_folder(path):
    "Return the entry directly under `path` with the largest `st_ctime`."
    def _ctime(entry):
        return entry.stat().st_ctime
    return max(path.glob('*'), key=_ctime)

# Cell
def rename_extracted(dest):
    "Rename the newest entry next to `dest` to `dest` if its name differs from `dest`'s"
    extracted = newest_folder(dest.parent)
    if extracted.name != dest.name:
        extracted.rename(dest)
return fastai_path(c_key)/fname

# Cell
def untar_data(url, fname=None, dest=None, c_key='data', force_download=False, extract_func=file_extract, timeout=4):
    "Download `url` to `fname` if `dest` doesn't exist, and un-tgz or unzip to folder `dest`."
    # NOTE(review): a second `untar_data` is defined later in this file and
    # would shadow this one at import time; confirm which is meant to remain.
    default_dest = URLs.path(url, c_key=c_key).with_suffix('')
    # A caller-supplied `dest` is treated as the parent folder; the extracted
    # folder keeps the default name inside it.
    dest = default_dest if dest is None else Path(dest)/default_dest.name
    fname = Path(fname or URLs.path(url))
    # A recorded check that no longer matches the local archive forces a re-download.
    if fname.exists() and _get_check(url) and _check_file(fname) != _get_check(url):
        print("A new version of this dataset is available, downloading...")
        force_download = True
    if force_download:
        if fname.exists(): os.remove(fname)
        if dest.exists(): shutil.rmtree(dest)
    # Prefer linking from the storage location before downloading anything.
    if not dest.exists(): _try_from_storage(dest, URLs.path(url, c_key='storage').with_suffix(''))
    if not dest.exists():
        fname = download_data(url, fname=fname, c_key=c_key, timeout=timeout)
        # A mismatch is only reported; extraction is still attempted.
        if _get_check(url) and _check_file(fname) != _get_check(url):
            print(f"File downloaded is broken. Remove {fname} and try again.")
        extract_func(fname, dest.parent)
    return dest
def untar_data(url, archive=None, data=None, c_key='data', force_download=False):
    "Download `url` with `FastDownload` (re-fetching when `force_download`) and return the result of extracting under `c_key`."
    # Imported locally so the `module` argument resolves even if the package
    # name is not already bound in this module's namespace.
    import fastai.data
    # NOTE(review): `module=fastai.data` restores an argument that was lost from
    # this line; confirm against the upstream source.
    # `archive` and `data` are forwarded unchanged to `FastDownload`.
    d = FastDownload(fastai_cfg(), module=fastai.data, archive=archive, data=data, base='~/.fastai')
    return d.get(url, force=force_download, extract_key=c_key)

0 comments on commit 63b624c

Please sign in to comment.