In [6]:
from urllib.request import urlretrieve
import os
import hashlib

In [8]:
def download(url, file):
    """
    Download file from <url> unless <file> already exists locally.

    The data is first written to a temporary ``<file>.part`` path and only
    renamed to <file> once the transfer has finished, so an interrupted
    download never leaves a truncated file that a later call would mistake
    for a complete one.  Missing parent directories of <file> are created.

    :param url: URL to file
    :param file: Local file path (a string)
    :raises OSError: if the transfer or the final rename fails (propagated
        from urlretrieve / os.replace)
    """
    if os.path.isfile(file):
        return

    # urlretrieve does not create directories, so make sure the target
    # directory exists before writing into it.
    parent = os.path.dirname(file)
    if parent:
        os.makedirs(parent, exist_ok=True)

    partial = file + '.part'
    print('Downloading ' + file + '...')
    try:
        urlretrieve(url, partial)
        # <file> only appears once the whole payload is on disk.
        os.replace(partial, file)
    finally:
        # After a failed transfer, do not leave the partial file behind.
        if os.path.exists(partial):
            os.remove(partial)
    print('Download Finished')

# Download the test dataset (the training archive download is currently disabled).
# download('https://s3.amazonaws.com/udacity-sdc/notMNIST_train.zip', 'notMNIST_train.zip')
download('https://s3.amazonaws.com/udacity-sdc/notMNIST_test.zip', './data/notMNIST_test.zip')

# Make sure the files aren't corrupted
# assert hashlib.md5(open('notMNIST_train.zip', 'rb').read()).hexdigest() == 'c8673b3f28f489e9cdf3a3d74e2ac8fa',\
#         'notMNIST_train.zip file is corrupted.  Remove the file and try again.'
# Read the archive inside a `with` block so the file handle is closed
# as soon as the digest has been computed.
with open('./data/notMNIST_test.zip', 'rb') as test_archive:
    test_archive_md5 = hashlib.md5(test_archive.read()).hexdigest()
assert test_archive_md5 == '5d3c7e653e63471c88df796156a9dfa9',\
        'notMNIST_test.zip file is corrupted.  Remove the file and try again.'

# Wait until you see that all files have been downloaded.
print('All files downloaded.')

All files downloaded.


In [14]:
from tqdm import tqdm
from zipfile import ZipFile
from PIL import Image
import numpy as np

In [15]:
def uncompress_features_labels(file):
    """
    Read every image stored in a zip archive into memory.

    :param file: The zip file to extract the data from
    :return: Tuple (features, labels) of numpy arrays; each feature is the
        flattened float32 pixel data of one image and each label is the
        first character of that image's file name
    """
    pixel_rows, letters = [], []

    with ZipFile(file) as archive:
        # Wrap the entry list so progress is reported per file
        for entry in tqdm(archive.namelist(), unit='files'):
            # Directory entries carry no image data
            if entry.endswith('/'):
                continue

            with archive.open(entry) as handle:
                picture = Image.open(handle)
                picture.load()
                # float32 keeps the memory footprint down
                pixels = np.array(picture, dtype=np.float32)

            # Store the image as a 1 dimensional array
            pixel_rows.append(pixels.flatten())
            # The image's letter is the leading character of its base name
            letters.append(os.path.basename(entry)[0])

    return np.array(pixel_rows), np.array(letters)

# Load the test archive: features are the flattened float32 images and
# labels are the first character of each image's file name.
test_features, test_labels = uncompress_features_labels('./data/notMNIST_test.zip')



  0%|          | 0/10001 [00:00<?, ?files/s][A[A

  3%|▎         | 328/10001 [00:00<00:02, 3279.18files/s][A[A

  6%|▋         | 629/10001 [00:00<00:02, 3191.21files/s][A[A

  9%|▉         | 947/10001 [00:00<00:02, 3185.41files/s][A
 13%|█▎        | 1255/10001 [00:00<00:02, 3109.22files/s][A
 16%|█▌        | 1567/10001 [00:00<00:02, 3089.40files/s][A
 19%|█▉        | 1877/10001 [00:00<00:02, 3073.45files/s][A
 22%|██▏       | 2163/10001 [00:00<00:02, 2987.16files/s][A
 25%|██▍       | 2455/10001 [00:00<00:02, 2951.77files/s][A
 27%|██▋       | 2741/10001 [00:00<00:02, 2921.43files/s][A
 30%|███       | 3045/10001 [00:01<00:02, 2954.26files/s][A
 33%|███▎      | 3347/10001 [00:01<00:02, 2971.33files/s][A
 36%|███▋      | 3639/10001 [00:01<00:02, 2954.74files/s][A
 40%|███▉      | 3951/10001 [00:01<00:02, 2970.70files/s][A
 42%|████▏     | 4248/10001 [00:01<00:01, 2970.04files/s][A
 45%|████▌     | 4543/10001 [00:01<00:01, 2924.03files/s][A
 49%|████▊     | 4855/1000

In [16]:
# Sanity check: one row per extracted image, one column per flattened pixel.
test_features.shape

(10000, 784)