# Fetch data and prepare sample

* The complete training dataset has 25k images: 12.5k of cats and 12.5k of dogs.
* For the sample we use 1000 images per class for training, and 500 images per class each for validation and for test.

## Fetch training data

In [1]:
import os
import requests

from pathlib import Path
from tqdm import tqdm_notebook as tqdm

In [2]:
# Name of the zip archive; it is also the last path component of the download URL.
archive_name = "cats_vs_dogs.zip"
training_data_url = "https://d2b7dn9rofvhjd.cloudfront.net/{}".format(archive_name)
# Working directory below the user's home directory; created here if it is missing.
data_root = Path.home() / "data" / "tmp"
data_root.mkdir(parents=True, exist_ok=True)
# Local path the archive is downloaded to.
training_data_path = data_root / archive_name

In [5]:
def download_from_url(url, dst, chunk_size=1024):
    """
    Download ``url`` to ``dst``, resuming a partially downloaded file.

    The size of the remote file is taken from the ``Content-Length`` header
    of a HEAD request. If ``dst`` already holds at least that many bytes,
    nothing is downloaded. Otherwise the missing bytes are requested with an
    HTTP ``Range`` header and appended to ``dst``.

    @param url: url to download
    @param dst: path to destination file (``pathlib.Path`` or ``str``)
    @param chunk_size: number of bytes requested per streamed chunk
    @return: size of the remote file in bytes
    @raise requests.HTTPError: if the server answers with an error status
    @raise KeyError: if the server does not send a ``Content-Length`` header
    """
    dst = Path(dst)

    # Follow redirects so Content-Length describes the resource that the
    # GET request below (which follows redirects by default) will fetch.
    head = requests.head(url, allow_redirects=True)
    head.raise_for_status()
    file_size = int(head.headers["Content-Length"])
    first_byte = dst.stat().st_size if dst.exists() else 0

    # return early when we are already done
    if first_byte >= file_size:
        return file_size

    # Byte ranges are inclusive, so use an open-ended range:
    # everything from first_byte up to the end of the file.
    header = {"Range": "bytes={}-".format(first_byte)}
    req = requests.get(url, headers=header, stream=True)
    try:
        # Never write an error page into the destination file.
        req.raise_for_status()

        if req.status_code == 206:
            # Partial content: the server honoured the Range header.
            mode = 'ab'
        else:
            # The server ignored the Range header and sends the whole file;
            # appending would corrupt dst, so start over from byte 0.
            mode = 'wb'
            first_byte = 0

        pbar = tqdm(
            total=file_size, initial=first_byte,
            unit='B', unit_scale=True, desc=url.split('/')[-1])
        try:
            with open(str(dst), mode) as f:
                for chunk in req.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        # The last chunk is usually shorter than chunk_size.
                        pbar.update(len(chunk))
        finally:
            pbar.close()
    finally:
        # Release the connection even if writing failed.
        req.close()
    return file_size

In [6]:
# Download the archive (or resume a partial download); the return value is the remote file size in bytes.
file_size = download_from_url(training_data_url, training_data_path)

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


## Unpack archive

In [7]:
import zipfile

In [10]:
data_root

PosixPath('/home/jochen_wersdoerfer/data/tmp')

In [13]:
%%time
# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile(str(training_data_path), 'r') as archive:
    archive.extractall(str(data_root))

CPU times: user 6.94 s, sys: 1.74 s, total: 8.68 s
Wall time: 8.7 s


In [14]:
# Delete the downloaded archive now that its contents have been extracted.
# Note: re-running the download cell afterwards fetches the whole archive
# again, because the local file no longer exists.
training_data_path.unlink()

In [15]:
# Take the first entry below data_root whose name matches "*cats*dogs";
# raises IndexError if nothing matches (e.g. the archive was not extracted).
original_data_dir = list(data_root.glob("*cats*dogs"))[0]

## Create sample data

In [16]:
from shutil import copy

### Create directories

In [17]:
# Root of the reduced data set; it gets one sub-directory per
# (split, class) combination, e.g. <sample_dir>/train/cats.
sample_dir = data_root / "cats_vs_dogs_sample"
for part in ("test", "train", "validation"):
    part_dir = sample_dir / part
    for category in ("cats", "dogs"):
        # parents=True also creates sample_dir and part_dir on the first pass
        (part_dir / category).mkdir(parents=True, exist_ok=True)

### Copy files

In [26]:
def copy_files_range(src_dir, dst_dir, file_template, start, stop):
    """
    Copy a numbered range of files from ``src_dir`` into ``dst_dir``.

    The file names are built by formatting ``file_template`` with every
    integer in ``range(start, stop)``. ``dst_dir`` is printed once before
    copying starts.

    @param src_dir: directory (``pathlib.Path``) containing the source files
    @param dst_dir: directory the files are copied into
    @param file_template: format string with one ``{}`` placeholder for the index
    @param start: first index to copy (inclusive)
    @param stop: index at which to stop (exclusive)
    """
    print(dst_dir)
    for index in range(start, stop):
        source_file = src_dir / file_template.format(index)
        copy(str(source_file), str(dst_dir))

In [27]:
%%time
# File name pattern of the original images, keyed by class directory name.
file_templates = {"cats": "cat.{}.jpg", "dogs": "dog.{}.jpg"}

# (split name, first image index, one-past-last image index)
data_splits = (
    ("train", 0, 1000),          # first 1000 images of each class
    ("validation", 1000, 1500),  # next 500 images of each class
    ("test", 1500, 2000),        # next 500 images of each class
)

for part, start, stop in data_splits:
    for category in ("cats", "dogs"):
        destination = sample_dir / part / category
        template = file_templates[category]
        copy_files_range(original_data_dir, destination, template, start, stop)

/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs_sample/train/cats
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.0.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.2.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.3.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.4.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.5.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.6.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.7.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.8.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.9.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.10.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.11.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.12.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.13.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.14.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.15.jpg
/home/jochen_wers

/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.427.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.428.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.429.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.430.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.431.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.432.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.433.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.434.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.435.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.436.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.437.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.438.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.439.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.440.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.441.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.442.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.443.j

/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.849.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.850.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.851.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.852.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.853.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.854.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.855.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.856.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.857.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.858.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.859.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.860.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.861.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.862.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.863.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.864.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.865.j

/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.285.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.286.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.287.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.288.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.289.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.290.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.291.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.292.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.293.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.294.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.295.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.296.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.297.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.298.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.299.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.300.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.301.j

/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.722.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.723.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.724.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.725.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.726.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.727.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.728.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.729.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.730.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.731.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.732.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.733.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.734.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.735.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.736.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.737.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.738.j

/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1144.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1145.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1146.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1147.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1148.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1149.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1150.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1151.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1152.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1153.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1154.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1155.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1156.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1157.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1158.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1159.jpg
/home/jochen_wersdoerfer/data/tmp/cats_v

/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1064.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1065.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1066.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1067.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1068.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1069.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1070.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1071.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1072.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1073.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1074.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1075.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1076.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1077.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1078.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1079.jpg
/home/jochen_wersdoerfer/data/tmp/cats_v

/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1481.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1482.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1483.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1484.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1485.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1486.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1487.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1488.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1489.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1490.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1491.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1492.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1493.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1494.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1495.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1496.jpg
/home/jochen_wersdoerfer/data/tmp/cats_v

/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1915.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1916.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1917.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1918.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1919.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1920.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1921.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1922.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1923.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1924.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1925.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1926.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1927.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1928.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1929.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/cat.1930.jpg
/home/jochen_wersdoerfer/data/tmp/cats_v

/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1853.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1854.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1855.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1856.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1857.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1858.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1859.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1860.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1861.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1862.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1863.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1864.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1865.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1866.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1867.jpg
/home/jochen_wersdoerfer/data/tmp/cats_vs_dogs/dog.1868.jpg
/home/jochen_wersdoerfer/data/tmp/cats_v

In [28]:
# Sanity check: report how many files ended up in every split/class directory.
for part in ("train", "validation", "test"):
    for category in ("cats", "dogs"):
        desc = "total {} images of {}:".format(part, category)
        category_dir = sample_dir / part / category
        # count the directory entries without building an intermediate list
        num_files_in_dir = sum(1 for _ in category_dir.iterdir())
        print(desc.ljust(35), num_files_in_dir)

total train images of cats:         1000
total train images of dogs:         1000
total validation images of cats:    500
total validation images of dogs:    500
total test images of cats:          500
total test images of dogs:          500
