diff --git a/fastai/data/download_checks.py b/fastai/data/download_checks.py index 0bfc8b75f7..a1e13c6f8d 100644 --- a/fastai/data/download_checks.py +++ b/fastai/data/download_checks.py @@ -1,4 +1,108 @@ -{'https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz': (11784419, +{'https://s3.amazonaws.com/fast-ai-coco/coco_sample.tgz': (3245877008, + '006cd55d633d94b36ecaf661467830ec'), + 'https://s3.amazonaws.com/fast-ai-coco/coco_tiny.tgz': (801038, + '367467451ac4fba79a647753c2c66d3a'), + 'https://s3.amazonaws.com/fast-ai-imageclas/CUB_200_2011.tgz': (1150585339, + 'd2acaa99439dff0483c7bbac1bfe2a92'), + 'https://s3.amazonaws.com/fast-ai-imageclas/bedroom.tgz': (4579163978, + '35d84f38f8a15fe47e66e460c8800d68'), + 'https://s3.amazonaws.com/fast-ai-imageclas/caltech_101.tgz': (131740031, + 'd673425306e98ee4619fcdeef8a0e876'), + 'https://s3.amazonaws.com/fast-ai-imageclas/cifar100.tgz': (169168619, + 'e5e65dcb54b9d3913f7b8a9ad6607e62'), + 'https://s3.amazonaws.com/fast-ai-imageclas/food-101.tgz': (5686607260, + '1a540ebf1fb40b2bf3f2294234ba7907'), + 'https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz': (99003388, + 'c475f8f7617a200ba35b9facc48443c3'), + 'https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-320.tgz': (341663724, + '1f1f74f133caff120e575856659b87a2'), + 'https://s3.amazonaws.com/fast-ai-imageclas/imagenette2.tgz': (1557161267, + '2c774ecb40005b35d7937d50f5d42336'), + 'https://s3.amazonaws.com/fast-ai-imageclas/imagewang-160.tgz': (191498213, + 'dab9b1d97b95574ac64122a19bc74ca1'), + 'https://s3.amazonaws.com/fast-ai-imageclas/imagewang-320.tgz': (669826647, + 'fff8ac034d60e14fcf7789845e625263'), + 'https://s3.amazonaws.com/fast-ai-imageclas/imagewang.tgz': (2900347689, + '2b7776bac0fc95db72ac8a3b091a3e30'), + 'https://s3.amazonaws.com/fast-ai-imageclas/imagewoof2-160.tgz': (92612825, + '3559b194123981a540b87132dbda412f'), + 'https://s3.amazonaws.com/fast-ai-imageclas/imagewoof2-320.tgz': (328387740, + '7a1fd92672a559a85de76e4810a20c21'), + 'https://s3.amazonaws.com/fast-ai-imageclas/imagewoof2.tgz': (1343715595, + '2cc1c1e36e20cb6fd24ffb978edcb487'), + 'https://s3.amazonaws.com/fast-ai-imageclas/mnist_png.tgz': (15683414, + '03639f83c4e3d19e0a3a53a8a997c487'), + 'https://s3.amazonaws.com/fast-ai-imageclas/mnist_var_size_tiny.tgz': (565372, + 'b71a930f4eb744a4a143a6c7ff7ed67f'), + 'https://s3.amazonaws.com/fast-ai-imageclas/oxford-102-flowers.tgz': (345236087, + '5666e01c1311b4c67fcf20d2b3850a88'), + 'https://s3.amazonaws.com/fast-ai-imageclas/oxford-iiit-pet.tgz': (811706944, + 'e4db5c768afd933bb91f5f594d7417a4'), + 'https://s3.amazonaws.com/fast-ai-imageclas/stanford-cars.tgz': (1957803273, + '9045d6673c9ced0889f41816f6bf2f9f'), + 'https://s3.amazonaws.com/fast-ai-imagelocal/biwi_head_pose.tgz': (452316199, + '00f4ccf66e8cba184bc292fdc08fb237'), + 'https://s3.amazonaws.com/fast-ai-imagelocal/camvid.tgz': (598913237, + '648371e4f3a833682afb39b08a3ce2aa'), + 'https://s3.amazonaws.com/fast-ai-imagelocal/pascal_2007.tgz': (1637796771, + '433b4706eb7c42bd74e7f784e3fdf244'), + 'https://s3.amazonaws.com/fast-ai-imagelocal/pascal_2012.tgz': (2618908000, + 'd90e29e54a4c76c0c6fba8355dcbaca5'), + 'https://s3.amazonaws.com/fast-ai-imagelocal/siim_small.tgz': (33276453, + '2f88ac350dce1d971ecbb5ec722da75f'), + 'https://s3.amazonaws.com/fast-ai-imagelocal/tcga_small.tgz': (14744474, + '3ceffe7cf522cb1c60e93dc555d8817f'), + 'https://s3.amazonaws.com/fast-ai-modelzoo/transformer.tgz': (432848315, + '024b0d2203ebb0cd1fc64b27cf8af18e'), + 'https://s3.amazonaws.com/fast-ai-nlp/ag_news_csv.tgz': (11784419, 'b86f328f4dbd072486591cb7a5644dcd'), + 'https://s3.amazonaws.com/fast-ai-nlp/amazon_review_full_csv.tgz': (71606272, + '4a1196cf0adaea22f4bc3f592cddde90'), + 'https://s3.amazonaws.com/fast-ai-nlp/amazon_review_polarity_csv.tgz': (688339454, + '676f7e5208ec343c8274b4bb085bc938'), + 'https://s3.amazonaws.com/fast-ai-nlp/dbpedia_csv.tgz': (68341743, + '239c7837b9e79db34486f3de6a00e38e'), + 'https://s3.amazonaws.com/fast-ai-nlp/giga-fren.tgz': (2598183296, + '69573f58e2c850b90f2f954077041d8c'), + 'https://s3.amazonaws.com/fast-ai-nlp/imdb.tgz': (144440600, + '90f9b1c4ff43a90d67553c9240dc0249'), + 'https://s3.amazonaws.com/fast-ai-nlp/sogou_news_csv.tgz': (384269937, + '950f1366d33be52f5b944f8a8b680902'), + 'https://s3.amazonaws.com/fast-ai-nlp/wikitext-103.tgz': (190200704, + '2dd8cf8693b3d27e9c8f0a7df054b2c7'), + 'https://s3.amazonaws.com/fast-ai-nlp/wikitext-2.tgz': (4070055, + '2a82d47a7b85c8b6a8e068dc4c1d37e7'), + 'https://s3.amazonaws.com/fast-ai-nlp/yahoo_answers_csv.tgz': (319476345, + '0632a0d236ef3a529c0fa4429b339f68'), + 'https://s3.amazonaws.com/fast-ai-nlp/yelp_review_full_csv.tgz': (196146755, + '1efd84215ea3e30d90e4c33764b889db'), + 'https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz': (166373201, + '48c8451c1ad30472334d856b5d294807'), 'https://s3.amazonaws.com/fast-ai-sample/adult_sample.tgz': (968212, - '64eb9d7e23732de0b138f7372d15492f')} \ No newline at end of file + '64eb9d7e23732de0b138f7372d15492f'), + 'https://s3.amazonaws.com/fast-ai-sample/biwi_sample.tgz': (593774, + '9179f4c1435f4b291f0d5b072d60c2c9'), + 'https://s3.amazonaws.com/fast-ai-sample/camvid_tiny.tgz': (2314212, + '2cf6daf91b7a2083ecfa3e9968e9d915'), + 'https://s3.amazonaws.com/fast-ai-sample/cifar10.tgz': (168168549, + 'a5f8c31371b63a406b23368042812d3c'), + 'https://s3.amazonaws.com/fast-ai-sample/dogscats.tgz': (839285364, + '3e483c8d6ef2175e9d395a6027eb92b7'), + 'https://s3.amazonaws.com/fast-ai-sample/human_numbers.tgz': (30252, + '8a19c3bfa2bcb08cd787e741261f3ea2'), + 'https://s3.amazonaws.com/fast-ai-sample/imdb_sample.tgz': (571827, + '0842e61a9867caa2e6fbdb14fa703d61'), + 'https://s3.amazonaws.com/fast-ai-sample/mnist_sample.tgz': (3214948, + '2dbc7ec6f9259b583af0072c55816a88'), + 'https://s3.amazonaws.com/fast-ai-sample/mnist_tiny.tgz': (342207, + '56143e8f24db90d925d82a5a74141875'), + 'https://s3.amazonaws.com/fast-ai-sample/movie_lens_sample.tgz': (51790, + '10961384dfe7c5181460390a460c1f77'), + 'https://s3.amazonaws.com/fast-ai-sample/planet_sample.tgz': (15523994, + '8bfb174b3162f07fbde09b54555bdb00'), + 'https://s3.amazonaws.com/fast-ai-sample/planet_tiny.tgz': (997569, + '490873c5683454d4b2611fb1f00a68a9'), + 'https://storage.googleapis.com/ml-animal-sounds-datasets/macaques.zip': (131604586, + '44fec3950e61d6a898f16fe30bc9c88d'), + 'https://storage.googleapis.com/ml-animal-sounds-datasets/zebra_finch.zip': (186624896, + '91fa9c4ebfc986b9babc2a805a10e281')} \ No newline at end of file diff --git a/fastai/data/external.py b/fastai/data/external.py index 41c00c8446..fdbb2b642f 100644 --- a/fastai/data/external.py +++ b/fastai/data/external.py @@ -1,65 +1,31 @@ # AUTOGENERATED! DO NOT EDIT! File to edit: nbs/04_data.external.ipynb (unless otherwise specified). -__all__ = ['Config', 'URLs', 'download_url', 'download_data', 'file_extract', 'newest_folder', 'rename_extracted', - 'untar_data'] +__all__ = ['fastai_cfg', 'fastai_path', 'URLs', 'untar_data'] # Cell from ..torch_basics import * +from fastdownload import FastDownload +from functools import lru_cache +import fastai.data # Cell -class Config: - "Setup config at `~/.fastai` unless it exists already." - config_path = Path(os.getenv('FASTAI_HOME', '~/.fastai')).expanduser() - config_file = config_path/'config.yml' +@lru_cache(maxsize=None) +def fastai_cfg(): + "`Config` object for fastai's `config.ini`" + return Config(Path(os.getenv('FASTAI_HOME', '~/.fastai')), 'config.ini', create=dict( + data = 'data', archive = 'archive', storage = 'tmp', model = 'models')) - def __init__(self): - self.config_path.mkdir(parents=True, exist_ok=True) - if not self.config_file.exists(): self.create_config() - self.d = self.load_config() - - def __getitem__(self,k): - k = k.lower() - if k not in self.d: k = k+'_path' - return Path(self.d[k]) - - def __getattr__(self,k): - if k=='d': raise AttributeError - return self[k] - - def __setitem__(self,k,v): self.d[k] = str(v) - def __contains__(self,k): return k in self.d - - def load_config(self): - "load and return config if version equals 2 in existing, else create new config." - with open(self.config_file, 'r') as f: - config = yaml.safe_load(f) - if 'version' in config and config['version'] == 2: return config - elif 'version' in config: self.create_config(config) - else: self.create_config() - return self.load_config() - - def create_config(self, cfg=None): - "create new config with default paths and set `version` to 2." - config = {'data_path': str(self.config_path/'data'), - 'archive_path': str(self.config_path/'archive'), - 'storage_path': '/tmp', - 'model_path': str(self.config_path/'models'), - 'version': 2} - if cfg is not None: - cfg['version'] = 2 - config = merge(config, cfg) - self.save_file(config) - - def save(self): self.save_file(self.d) - def save_file(self, config): - "save config file at default config location `~/.fastai/config.yml`." - with self.config_file.open('w') as f: yaml.dump(config, f, default_flow_style=False) +# Cell +def fastai_path(folder): + "Path to `folder` in `fastai_cfg`" + return fastai_cfg().path(folder) # Cell class URLs(): "Global constants for dataset and model URLs." LOCAL_PATH = Path.cwd() MDL = 'http://files.fast.ai/models/' + GOOGLE = 'https://storage.googleapis.com/' S3 = 'https://s3.amazonaws.com/fast-ai-' URL = f'{S3}sample/' @@ -131,8 +97,8 @@ class URLs(): PASCAL_2012 = f'{S3_IMAGELOC}pascal_2012.tgz' # Audio classification datasets - MACAQUES = 'https://storage.googleapis.com/ml-animal-sounds-datasets/macaques.zip' - ZEBRA_FINCH = 'https://storage.googleapis.com/ml-animal-sounds-datasets/zebra_finch.zip' + MACAQUES = f'{GOOGLE}ml-animal-sounds-datasets/macaques.zip' + ZEBRA_FINCH = f'{GOOGLE}ml-animal-sounds-datasets/zebra_finch.zip' # Medical Imaging datasets #SKIN_LESION = f'{S3_IMAGELOC}skin_lesion.tgz' @@ -149,114 +115,10 @@ def path(url='.', c_key='archive'): fname = url.split('/')[-1] local_path = URLs.LOCAL_PATH/('models' if c_key=='models' else 'data')/fname if local_path.exists(): return local_path - return Config()[c_key]/fname - -# Cell -def download_url(url, dest, overwrite=False, pbar=None, show_progress=True, chunk_size=1024*1024, - timeout=4, retries=5): - "Download `url` to `dest` unless it exists and not `overwrite`" - if os.path.exists(dest) and not overwrite: return - - s = requests.Session() - s.mount('http://',requests.adapters.HTTPAdapter(max_retries=retries)) - # additional line to identify as a firefox browser, see fastai/#2438 - s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:71.0) Gecko/20100101 Firefox/71.0'}) - u = s.get(url, stream=True, timeout=timeout) - try: file_size = int(u.headers["Content-Length"]) - except: show_progress = False - - with open(dest, 'wb') as f: - nbytes = 0 - if show_progress: pbar = progress_bar(range(file_size), leave=False, parent=pbar) - try: - if show_progress: pbar.update(0) - for chunk in u.iter_content(chunk_size=chunk_size): - nbytes += len(chunk) - if show_progress: pbar.update(nbytes) - f.write(chunk) - except requests.exceptions.ConnectionError as e: - fname = url.split('/')[-1] - data_dir = dest.parent - print(f'\n Download of {url} has failed after {retries} retries\n' - f' Fix the download manually:\n' - f'$ mkdir -p {data_dir}\n' - f'$ cd {data_dir}\n' - f'$ wget -c {url}\n' - f'$ tar xf {fname}\n' - f' And re-run your code once the download is successful\n') - -# Cell -def download_data(url, fname=None, c_key='archive', force_download=False, timeout=4): - "Download `url` to `fname`." - fname = Path(fname or URLs.path(url, c_key=c_key)) - fname.parent.mkdir(parents=True, exist_ok=True) - if not fname.exists() or force_download: download_url(url, fname, overwrite=force_download, timeout=timeout) - return fname - -# Cell -def _get_check(url): - "internal function to get the hash of the file at `url`." - checks = json.load(open(Path(__file__).parent/'checks.txt', 'r')) - return checks.get(url, '') - -def _check_file(fname): - "internal function to get the hash of the local file at `fname`." - size = os.path.getsize(fname) - with open(fname, "rb") as f: hash_nb = hashlib.md5(f.read(2**20)).hexdigest() - return [size,hash_nb] - -# Cell -def _add_check(url, fname): - "Internal function to update the internal check file with `url` and check on `fname`." - checks = json.load(open(Path(__file__).parent/'checks.txt', 'r')) - checks[url] = _check_file(fname) - json.dump(checks, open(Path(__file__).parent/'checks.txt', 'w'), indent=2) - -# Cell -def file_extract(fname, dest=None): - "Extract `fname` to `dest` using `tarfile` or `zipfile`." - if dest is None: dest = Path(fname).parent - fname = str(fname) - if fname.endswith('gz'): tarfile.open(fname, 'r:gz').extractall(dest) - elif fname.endswith('zip'): zipfile.ZipFile(fname ).extractall(dest) - else: raise Exception(f'Unrecognized archive: {fname}') - -# Cell -def _try_from_storage(dest, storage): - "an internal function to create symbolic links for files from `storage` to `dest` if `storage` exists" - if not storage.exists(): return - os.makedirs(dest, exist_ok=True) - for f in storage.glob('*'): os.symlink(f, dest/f.name, target_is_directory=f.is_dir()) - -# Cell -def newest_folder(path): - "Return newest folder on path" - list_of_paths = path.glob('*') - return max(list_of_paths, key=lambda p: p.stat().st_ctime) - -# Cell -def rename_extracted(dest): - "Rename file if different from dest" - extracted = newest_folder(dest.parent) - if not (extracted.name == dest.name): extracted.rename(dest) + return fastai_path(c_key)/fname # Cell -def untar_data(url, fname=None, dest=None, c_key='data', force_download=False, extract_func=file_extract, timeout=4): - "Download `url` to `fname` if `dest` doesn't exist, and un-tgz or unzip to folder `dest`." - default_dest = URLs.path(url, c_key=c_key).with_suffix('') - dest = default_dest if dest is None else Path(dest)/default_dest.name - fname = Path(fname or URLs.path(url)) - if fname.exists() and _get_check(url) and _check_file(fname) != _get_check(url): - print("A new version of this dataset is available, downloading...") - force_download = True - if force_download: - if fname.exists(): os.remove(fname) - if dest.exists(): shutil.rmtree(dest) - if not dest.exists(): _try_from_storage(dest, URLs.path(url, c_key='storage').with_suffix('')) - if not dest.exists(): - fname = download_data(url, fname=fname, c_key=c_key, timeout=timeout) - if _get_check(url) and _check_file(fname) != _get_check(url): - print(f"File downloaded is broken. Remove {fname} and try again.") - extract_func(fname, dest.parent) - rename_extracted(dest) - return dest \ No newline at end of file +def untar_data(url, archive=None, data=None, c_key='data', force_download=False):#, extract_func=file_extract, timeout=4): + "Download `url` to `fname` if `dest` doesn't exist, and extract to folder `dest`" + d = FastDownload(fastai_cfg(), module=fastai.data, archive=archive, data=data, base='~/.fastai') + return d.get(url, force=force_download, extract_key=c_key) \ No newline at end of file diff --git a/nbs/04_data.external.ipynb b/nbs/04_data.external.ipynb index 519d111681..4e5b7eae68 100644 --- a/nbs/04_data.external.ipynb +++ b/nbs/04_data.external.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -65,28 +65,37 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "heading_collapsed": true + }, "source": [ "## Datasets" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ " A complete list of datasets that are available by default inside the library are: " ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "heading_collapsed": true, + "hidden": true + }, "source": [ "### Main datasets" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "1. **ADULT_SAMPLE**: A small of the [adults dataset](https://archive.ics.uci.edu/ml/datasets/Adult) to predict whether income exceeds $50K/yr based on census data. \n", "- **BIWI_SAMPLE**: A [BIWI kinect headpose database](https://www.kaggle.com/kmader/biwi-kinect-head-pose-database). The dataset contains over 15K images of 20 people (6 females and 14 males - 4 people were recorded twice). For each frame, a depth image, the corresponding rgb image (both 640x480 pixels), and the annotation is provided. The head pose range covers about +-75 degrees yaw and +-60 degrees pitch. \n", @@ -117,28 +126,36 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "### Kaggle competition datasets" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "1. **DOGS**: Image dataset consisting of dogs and cats images from [Dogs vs Cats kaggle competition](https://www.kaggle.com/c/dogs-vs-cats). " ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "### Image Classification datasets" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "1. **CALTECH_101**: Pictures of objects belonging to 101 categories. About 40 to 800 images per category. Most categories have about 50 images. Collected in September 2003 by Fei-Fei Li, Marco Andreetto, and Marc 'Aurelio Ranzato.\n", "1. CARS: The [Cars dataset](https://ai.stanford.edu/~jkrause/cars/car_dataset.html) contains 16,185 images of 196 classes of cars. \n", @@ -152,14 +169,18 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "### NLP datasets" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "1. **AG_NEWS**: The AG News corpus consists of news articles from the AG’s corpus of news articles on the web pertaining to the 4 largest classes. The dataset contains 30,000 training and 1,900 testing examples for each class.\n", "1. **AMAZON_REVIEWS**: This dataset contains product reviews and metadata from Amazon, including 142.8 million reviews spanning May 1996 - July 2014.\n", @@ -176,14 +197,18 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "### Image localization datasets" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "1. **BIWI_HEAD_POSE**: A [BIWI kinect headpose database](https://www.kaggle.com/kmader/biwi-kinect-head-pose-database). The dataset contains over 15K images of 20 people (6 females and 14 males - 4 people were recorded twice). For each frame, a depth image, the corresponding rgb image (both 640x480 pixels), and the annotation is provided. The head pose range covers about +-75 degrees yaw and +-60 degrees pitch. \n", "1. **CAMVID**: Consists of driving labelled dataset for segmentation type models.\n", @@ -195,14 +220,18 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "### Audio classification" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "1. **MACAQUES**: [7285 macaque coo calls](https://datadryad.org/stash/dataset/doi:10.5061/dryad.7f4p9) across 8 individuals from [Distributed acoustic cues for caller identity in macaque vocalization](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4806230).\n", "2. **ZEBRA_FINCH**: [3405 zebra finch calls](https://ndownloader.figshare.com/articles/11905533/versions/1) classified [across 11 call types](https://link.springer.com/article/10.1007/s10071-015-0933-6). Additional labels include name of individual making the vocalization and its age." @@ -210,14 +239,18 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "### Medical imaging datasets" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "1. **SIIM_SMALL**: A smaller version of the [SIIM dataset](https://www.kaggle.com/c/siim-acr-pneumothorax-segmentation/overview) where the objective is to classify pneumothorax from a set of chest radiographic images.\n", "2. **TCGA_SMALL**: A smaller version of the [TCGA-OV dataset](http://doi.org/10.7937/K9/TCIA.2016.NDO1MDFQ) with subcutaneous and visceral fat segmentations. Citations:\n", @@ -229,14 +262,18 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "### Pretrained models" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "1. **OPENAI_TRANSFORMER**: The GPT2 Transformer pretrained weights.\n", "1. **WT103_FWD**: The WikiText-103 forward language model weights.\n", @@ -252,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -274,7 +311,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -283,7 +320,7 @@ "('data', Path('/home/jhoward/.fastai/data'))" ] }, - "execution_count": null, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -295,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -307,7 +344,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -316,7 +353,7 @@ "Path('/home/jhoward/.fastai/archive')" ] }, - "execution_count": null, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -327,15 +364,19 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "heading_collapsed": true + }, "source": [ "## URLs -" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 8, + "metadata": { + "hidden": true + }, "outputs": [], "source": [ "#export\n", @@ -438,15 +479,19 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "hidden": true + }, "source": [ "The default local path is at `~/.fastai/archive/` but this can be updated by passing a different `c_key`. Note: `c_key` should be one of `'archive'', 'data', 'model', 'storage'`." ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 9, + "metadata": { + "hidden": true + }, "outputs": [ { "data": { @@ -454,7 +499,7 @@ "Path('/home/jhoward/.fastai/archive/oxford-iiit-pet.tgz')" ] }, - "execution_count": null, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -468,8 +513,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 10, + "metadata": { + "hidden": true + }, "outputs": [ { "data": { @@ -477,7 +524,7 @@ "Path('/home/jhoward/.fastai/models/oxford-iiit-pet.tgz')" ] }, - "execution_count": null, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -497,7 +544,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -517,7 +564,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -526,7 +573,7 @@ "Path('/home/jhoward/.fastai/data/mnist_sample')" ] }, - "execution_count": null, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -537,54 +584,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "#hide\n", - "#Check all URLs are in the checks.txt file and match for downloaded archives\n", - "from fastdownload import read_checks\n", - "fd = FastDownload(module=fastai.data)\n", - "_whitelist = \"MDL LOCAL_PATH URL WT103_BWD WT103_FWD GOOGLE\".split()\n", - "checks = read_checks(fd.module)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for d in dir(URLs): \n", - " if d.upper() == d and not d.startswith(\"S3\") and not d in _whitelist: \n", - " url = getattr(URLs, d)\n", - " print(d,url)\n", - " fd.get(url)\n", - " fd.update(url)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for d in dir(URLs): \n", - " if d.upper() == d and not d.startswith(\"S3\") and not d in _whitelist: \n", - " url = getattr(URLs, d)\n", - " assert url in checks,f\"\"\"{d} is not in the check file for all URLs.\n", - "To fix this, you need to run the following code in this notebook before making a PR (there is a commented cell for this below):\n", - "url = URLs.{d}\n", - "untar_data(url, force_download=True)\n", - "_add_check(url, URLs.path(url))\n", - "\"\"\"\n", - " f = URLs.path(url)\n", - " if f.exists():\n", - " assert checks[url] == _check_file(f),f\"\"\"The log we have for {d} in checks does not match the actual archive.\n", - "To fix this, you need to run the following code in this notebook before making a PR (there is a commented cell for this below):\n", - "url = URLs.{d}\n", - "_add_check(url, URLs.path(url))\n", - "\"\"\"" + "#Check all URLs are in the download_checks.py file and match for downloaded archives\n", + "# from fastdownload import read_checks\n", + "# fd = FastDownload(fastai_cfg(), module=fastai.data)\n", + "# _whitelist = \"MDL LOCAL_PATH URL WT103_BWD WT103_FWD GOOGLE\".split()\n", + "# checks = read_checks(fd.module)\n", + "\n", + "# for d in dir(URLs): \n", + "# if d.upper() == d and not d.startswith(\"S3\") and not d in _whitelist: \n", + "# url = getattr(URLs, d)\n", + "# assert url in checks,f\"\"\"{d} is not in the check file for all URLs.\n", + "# To fix this, you need to run the following code in this notebook before making a PR (there is a commented cell for this below):\n", + "# url = URLs.{d}\n", + "# fd.get(url, force=True)\n", + "# fd.update(url)\n", + "# \"\"\"\n", + "# f = fd.download(url)\n", + "# assert fd.check(url, f),f\"\"\"The log we have for {d} in checks does not match the actual archive.\n", + "# To fix this, you need to run the following code in this notebook before making a PR (there is a commented cell for this below):\n", + "# url = URLs.{d}\n", + "# _add_check(url, URLs.path(url))\n", + "# \"\"\"" ] }, { @@ -596,92 +621,9 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 41, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Converted 00_torch_core.ipynb.\n", - "Converted 01_layers.ipynb.\n", - "Converted 01a_losses.ipynb.\n", - "Converted 02_data.load.ipynb.\n", - "Converted 03_data.core.ipynb.\n", - "Converted 04_data.external.ipynb.\n", - "Converted 05_data.transforms.ipynb.\n", - "Converted 06_data.block.ipynb.\n", - "Converted 07_vision.core.ipynb.\n", - "Converted 08_vision.data.ipynb.\n", - "Converted 09_vision.augment.ipynb.\n", - "Converted 09b_vision.utils.ipynb.\n", - "Converted 09c_vision.widgets.ipynb.\n", - "Converted 10_tutorial.pets.ipynb.\n", - "Converted 10b_tutorial.albumentations.ipynb.\n", - "Converted 11_vision.models.xresnet.ipynb.\n", - "Converted 12_optimizer.ipynb.\n", - "Converted 13_callback.core.ipynb.\n", - "Converted 13a_learner.ipynb.\n", - "Converted 13b_metrics.ipynb.\n", - "Converted 14_callback.schedule.ipynb.\n", - "Converted 14a_callback.data.ipynb.\n", - "Converted 15_callback.hook.ipynb.\n", - "Converted 15a_vision.models.unet.ipynb.\n", - "Converted 16_callback.progress.ipynb.\n", - "Converted 17_callback.tracker.ipynb.\n", - "Converted 18_callback.fp16.ipynb.\n", - "Converted 18a_callback.training.ipynb.\n", - "Converted 18b_callback.preds.ipynb.\n", - "Converted 19_callback.mixup.ipynb.\n", - "Converted 20_interpret.ipynb.\n", - "Converted 20a_distributed.ipynb.\n", - "Converted 21_vision.learner.ipynb.\n", - "Converted 22_tutorial.imagenette.ipynb.\n", - "Converted 23_tutorial.vision.ipynb.\n", - "Converted 24_tutorial.image_sequence.ipynb.\n", - "Converted 24_tutorial.siamese.ipynb.\n", - "Converted 24_vision.gan.ipynb.\n", - "Converted 30_text.core.ipynb.\n", - "Converted 31_text.data.ipynb.\n", - "Converted 32_text.models.awdlstm.ipynb.\n", - "Converted 33_text.models.core.ipynb.\n", - "Converted 34_callback.rnn.ipynb.\n", - "Converted 35_tutorial.wikitext.ipynb.\n", - "Converted 37_text.learner.ipynb.\n", - "Converted 38_tutorial.text.ipynb.\n", - "Converted 39_tutorial.transformers.ipynb.\n", - "Converted 40_tabular.core.ipynb.\n", - "Converted 41_tabular.data.ipynb.\n", - "Converted 42_tabular.model.ipynb.\n", - "Converted 43_tabular.learner.ipynb.\n", - "Converted 44_tutorial.tabular.ipynb.\n", - "Converted 45_collab.ipynb.\n", - "Converted 46_tutorial.collab.ipynb.\n", - "Converted 50_tutorial.datablock.ipynb.\n", - "Converted 60_medical.imaging.ipynb.\n", - "Converted 61_tutorial.medical_imaging.ipynb.\n", - "Converted 65_medical.text.ipynb.\n", - "Converted 70_callback.wandb.ipynb.\n", - "Converted 71_callback.tensorboard.ipynb.\n", - "Converted 72_callback.neptune.ipynb.\n", - "Converted 73_callback.captum.ipynb.\n", - "Converted 74_callback.azureml.ipynb.\n", - "Converted 97_test_utils.ipynb.\n", - "Converted 99_pytorch_doc.ipynb.\n", - "Converted dev-setup.ipynb.\n", - "Converted app_examples.ipynb.\n", - "Converted camvid.ipynb.\n", - "Converted migrating_catalyst.ipynb.\n", - "Converted migrating_ignite.ipynb.\n", - "Converted migrating_lightning.ipynb.\n", - "Converted migrating_pytorch.ipynb.\n", - "Converted ulmfit.ipynb.\n", - "Converted index.ipynb.\n", - "Converted quick_start.ipynb.\n", - "Converted tutorial.ipynb.\n" - ] - } - ], + "outputs": [], "source": [ "#hide\n", "from nbdev.export import notebook2script\n", @@ -704,6 +646,31 @@ "display_name": "Python 3", "language": "python", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": false, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false } }, "nbformat": 4,