# Fetch data and prepare sample

* The complete training dataset has 25k images, 12.5k from cats and 12.5k from dogs.
* We use 1000 images each for training and 500 for validation / test.

## Fetch training data

In [1]:
import os
import requests

from pathlib import Path
from tqdm import tqdm_notebook as tqdm

In [2]:
archive_name = "cats_vs_dogs.zip"
training_data_url = "https://d2b7dn9rofvhjd.cloudfront.net/{}".format(archive_name)
data_root = Path.home() / "data" / "tmp"
data_root.mkdir(parents=True, exist_ok=True)
training_data_path = data_root / archive_name

In [5]:
def download_from_url(url, dst, chunk_size=1024):
    """
    @param: url to download
    @param: dst path to destination file
    """
    file_size = int(requests.head(url).headers["Content-Length"])
    first_byte = dst.stat().st_size if dst.exists() else 0
    
    # return early when we are already done
    if first_byte >= file_size:
        return file_size
    
    # download first_byte to file_size
    header = {"Range": "bytes={}-{}".format(first_byte, file_size)}
    pbar = tqdm(
        total=file_size, initial=first_byte,
        unit='B', unit_scale=True, desc=url.split('/')[-1])
    req = requests.get(url, headers=header, stream=True)
    with(open(str(dst), 'ab')) as f:
        for chunk in req.iter_content(chunk_size=chunk_size):
            if chunk:
                f.write(chunk)
                pbar.update(chunk_size)
    pbar.close()
    return file_size

In [6]:
file_size = download_from_url(training_data_url, training_data_path)

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


## Unpack archive

In [7]:
import zipfile

In [10]:
data_root

PosixPath('/home/jochen_wersdoerfer/data/tmp')

In [13]:
%%time
archive = zipfile.ZipFile(str(training_data_path), 'r')
archive.extractall(str(data_root))
archive.close()

CPU times: user 6.94 s, sys: 1.74 s, total: 8.68 s
Wall time: 8.7 s


In [14]:
training_data_path.unlink()

In [15]:
original_data_dir = list(data_root.glob("*cats*dogs"))[0]

## Create sample data

In [16]:
from shutil import copy

### Create directories

In [17]:
sample_dir = data_root / "cats_vs_dogs_sample"
for part in ("test", "train", "validation"):
    for category in ("cats", "dogs"):
        (sample_dir / part / category).mkdir(parents=True, exist_ok=True)

### Copy files

In [20]:
def copy_files_range(src_dir, dst_dir, file_template, start, stop):
    print(dst_dir)
    fnames = (file_template.format(i) for i in range(start, stop))
    for fname in fnames:
        src = src_dir / fname
        copy(src, dst_dir)

In [21]:
%%time
file_templates = {"cats": "cat.{}.jpg", "dogs": "dog.{}.jpg"}
data_splits = (
    ("train", 0, 1000),          # Copy first 1000 images to train dirs
    ("validation", 1000, 1500),  # Copy next 500 images into validation dirs
    ("test", 1500, 2000),        # Copy next 500 images into test dirs
)

for (part, start, stop) in data_splits:
    for category in ("cats", "dogs"):
        copy_files_range(
            original_data_dir,
            sample_dir / part / category,
            file_templates[category],
            start,
            stop
        )

/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs_sample/train/cats


TypeError: argument should be string, bytes or integer, not PosixPath

In [None]:
for part in ("train", "validation", "test"):
    for category in ("cats", "dogs"):
        desc = "total {} images of {}:".format(part, category)
        num_files_in_dir = len(list((sample_dir / part / category).iterdir()))
        print(desc.ljust(35), num_files_in_dir)