# Fetch data and prepare sample

* The complete training dataset has 25k images: 12.5k of cats and 12.5k of dogs.
* We use a small sample of it: per class (cats / dogs), 1000 images for training, 500 for validation and 500 for test.

## Fetch training data

In [1]:
import os
import requests

from tqdm import tqdm_notebook as tqdm

In [2]:
# Resolve the home directory portably: expanduser("~") honours $HOME when it is
# set and still works where it is not (e.g. Windows), unlike os.environ["HOME"].
home_dir = os.path.expanduser("~")
archive_name = "cats_vs_dogs.zip"
# All downloaded and derived data lives below <home>/data/tmp.
data_dir = os.path.join(home_dir, "data", "tmp")
training_data_url = "https://d2b7dn9rofvhjd.cloudfront.net/{}".format(archive_name)
# Local path the archive is downloaded to.
training_data_path = os.path.join(data_dir, archive_name)

In [3]:
def download_from_url(url, dst, chunk_size=1024):
    """
    Download url to dst, resuming a partially downloaded file when possible.

    @param: url to download
    @param: dst path to destination file
    @param: chunk_size number of bytes requested per streamed chunk
    @return: size of the remote file in bytes (taken from Content-Length)
    @raise: requests.HTTPError if the server answers with an error status
    @raise: KeyError if the server does not report a Content-Length
    """
    # requests.head() does not follow redirects by default; follow them so the
    # Content-Length belongs to the actual file and not to a redirect response.
    head = requests.head(url, allow_redirects=True)
    head.raise_for_status()
    file_size = int(head.headers["Content-Length"])
    first_byte = os.path.getsize(dst) if os.path.exists(dst) else 0

    # return early when we are already done
    if first_byte >= file_size:
        return file_size

    # make sure the destination directory exists before opening the file
    dst_dir = os.path.dirname(dst)
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)

    # Byte ranges are inclusive, so ask for an open-ended range starting at the
    # first missing byte. Send no Range header at all for a fresh download.
    header = {"Range": "bytes={}-".format(first_byte)} if first_byte else {}
    req = requests.get(url, headers=header, stream=True)
    try:
        # never write an HTTP error body into the destination file
        req.raise_for_status()

        # 206 means the server honoured the range and we can append. Anything
        # else (typically 200) carries the whole file, so start from scratch
        # instead of appending it to the partial download.
        resume = first_byte > 0 and req.status_code == 206
        if not resume:
            first_byte = 0

        pbar = tqdm(
            total=file_size, initial=first_byte,
            unit='B', unit_scale=True, desc=url.split('/')[-1])
        try:
            with open(dst, 'ab' if resume else 'wb') as f:
                for chunk in req.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        # the last chunk is usually shorter than chunk_size
                        pbar.update(len(chunk))
        finally:
            pbar.close()
    finally:
        # release the connection even if the download fails part-way
        req.close()
    return file_size

In [5]:
# Fetch the archive (or finish a partial download) and remember its size in bytes.
file_size = download_from_url(url=training_data_url, dst=training_data_path)

HBox(children=(IntProgress(value=0, description='cats_vs_dogs.zip', max=571167138), HTML(value='')))

## Unpack archive

In [6]:
import zipfile

In [7]:
%%time
# Unpack everything into data_dir. The context manager closes the archive even
# when extraction fails part-way.
with zipfile.ZipFile(training_data_path, 'r') as archive:
    archive.extractall(data_dir)

CPU times: user 10.3 s, sys: 4.75 s, total: 15 s
Wall time: 15.3 s


In [8]:
# The archive is not needed once it has been unpacked. Guard the removal so
# re-running this cell does not raise FileNotFoundError.
if os.path.exists(training_data_path):
    os.unlink(training_data_path)

In [9]:
# The archive unpacks into a directory named after the archive itself
# (".../data/tmp/cats_vs_dogs" in the recorded run). Build that path explicitly
# instead of taking os.listdir(data_dir)[0]: listdir order is arbitrary and
# data_dir may contain other entries (e.g. the sample directory on a re-run).
original_data_dir = os.path.join(data_dir, os.path.splitext(archive_name)[0])
if not os.path.isdir(original_data_dir):
    raise FileNotFoundError(
        "expected unpacked data in {}".format(original_data_dir))

## Create sample data

In [10]:
from shutil import copy

### Create directories

In [19]:
# Root of the reduced dataset.
sample_dir = os.path.join(data_dir, "cats_vs_dogs_sample")

# Map every (part, category) pair to its leaf directory below sample_dir.
leaf_dirs = {
    (part, category): os.path.join(sample_dir, part, category)
    for part in ("test", "train", "validation")
    for category in ("cats", "dogs")
}

# Create the directory tree; exist_ok keeps this cell re-runnable.
for leaf_dir in leaf_dirs.values():
    os.makedirs(leaf_dir, exist_ok=True)

### Copy files

In [20]:
def copy_files_range(src_dir, dst_dir, file_template, start, stop):
    """
    Copy a numbered run of files from src_dir into dst_dir.

    @param: src_dir directory containing the source files
    @param: dst_dir directory the files are copied into
    @param: file_template format string with one placeholder for the file number
    @param: start first file number to copy (inclusive)
    @param: stop file number to stop at (exclusive)
    """
    # log which directories are involved
    print(src_dir, dst_dir)
    for index in range(start, stop):
        source_path = os.path.join(src_dir, file_template.format(index))
        copy(source_path, dst_dir)

In [21]:
%%time
# File name pattern per category; the placeholder receives the image number.
file_templates = {"cats": "cat.{}.jpg", "dogs": "dog.{}.jpg"}

# (part, first image number, one past the last image number) -- the three
# ranges are consecutive and do not overlap.
data_splits = (
    ("train", 0, 1000),
    ("validation", 1000, 1500),
    ("test", 1500, 2000),
)

for part, start, stop in data_splits:
    for category in ("cats", "dogs"):
        copy_files_range(
            src_dir=original_data_dir,
            dst_dir=leaf_dirs[(part, category)],
            file_template=file_templates[category],
            start=start,
            stop=stop,
        )

/Users/jochen/data/tmp/cats_vs_dogs /Users/jochen/data/tmp/cats_vs_dogs_sample/train/cats
/Users/jochen/data/tmp/cats_vs_dogs /Users/jochen/data/tmp/cats_vs_dogs_sample/train/dogs
/Users/jochen/data/tmp/cats_vs_dogs /Users/jochen/data/tmp/cats_vs_dogs_sample/validation/cats
/Users/jochen/data/tmp/cats_vs_dogs /Users/jochen/data/tmp/cats_vs_dogs_sample/validation/dogs
/Users/jochen/data/tmp/cats_vs_dogs /Users/jochen/data/tmp/cats_vs_dogs_sample/test/cats
/Users/jochen/data/tmp/cats_vs_dogs /Users/jochen/data/tmp/cats_vs_dogs_sample/test/dogs
CPU times: user 495 ms, sys: 1.26 s, total: 1.75 s
Wall time: 2.41 s


In [32]:
# Sanity check: report how many files ended up in each leaf directory.
for part in ("train", "validation", "test"):
    for category in ("cats", "dogs"):
        leaf_dir_entries = os.listdir(leaf_dirs[(part, category)])
        label = "total {} images of {}:".format(part, category).ljust(35)
        print(label, len(leaf_dir_entries))

total train images of cats:         1000
total train images of dogs:         1000
total validation images of cats:    500
total validation images of dogs:    500
total test images of cats:          500
total test images of dogs:          500
