In [2]:
from urlparse import urlparse
from os.path import basename
import tarfile
import urllib
import sys, os

# Dataset download

First, the data is downloaded from its source URLs. The source provides three datasets:

* Train (http://ufldl.stanford.edu/housenumbers/train.tar.gz)
* Test  (http://ufldl.stanford.edu/housenumbers/test.tar.gz)
* Extra (http://ufldl.stanford.edu/housenumbers/extra.tar.gz)


They will all be downloaded independently.

In [3]:
# One entry per house-numbers archive: the URL to fetch it from and the exact
# size in bytes that the downloaded file is expected to have.
sources = dict(
    train=dict(
        url='http://ufldl.stanford.edu/housenumbers/train.tar.gz',
        bytes=404141560,
    ),
    test=dict(
        url='http://ufldl.stanford.edu/housenumbers/test.tar.gz',
        bytes=276555967,
    ),
    extra=dict(
        url='http://ufldl.stanford.edu/housenumbers/extra.tar.gz',
        bytes=1955489752,
    ),
)

In [4]:
# Last whole percentage written by download_progress_hook (None before any report).
last_percent_reported = None


def download_progress_hook(count, blockSize, totalSize):
    """Report download progress on stdout; meant to be passed as a ``reporthook``.

    Output changes once per 1% of progress: the percentage itself is written at
    every multiple of 5%, and a single dot is written for every other step.
    This is mostly intended for users with slow internet connections.

    Args:
        count: number of blocks transferred so far.
        blockSize: size of one block, in bytes.
        totalSize: total size of the download, in bytes. A non-positive value
            (empty file, or size not reported by the server) produces no output.
    """
    global last_percent_reported

    if totalSize <= 0:
        # No meaningful percentage can be computed; also avoids dividing by zero.
        return

    # count * blockSize may overshoot totalSize on the final block, so cap at 100.
    # Floor division keeps the result an integer on both Python 2 and Python 3.
    percent = min(100, count * blockSize * 100 // totalSize)

    if last_percent_reported != percent:
        sys.stdout.write("%s%%" % percent if percent % 5 == 0 else ".")
        sys.stdout.flush()
        last_percent_reported = percent

def extract_filename(url):
    """Return the final path component of ``url`` (query string and host excluded)."""
    url_path = urlparse(url).path
    return basename(url_path)
    
def download(url, filename, expected_bytes, force=False):
    """Download a file if not present, and make sure it's the right size.

    Args:
        url: address to fetch the file from.
        filename: local path to save to; an existing file at this path is
            reused unless ``force`` is set.
        expected_bytes: exact size in bytes the local file must have.
        force: when True, download even if ``filename`` already exists.

    Raises:
        Exception: if the file on disk is not exactly ``expected_bytes`` long
            (for example a truncated download left over from an earlier run).
    """
    if force or not os.path.exists(filename):
        # %-formatting instead of print(a, b): without print_function, Python 2
        # treats the parenthesised pair as a tuple and prints its repr.
        print('Attempting to download: %s' % filename)
        filename = urllib.urlretrieve(url, filename, reporthook=download_progress_hook)[0]
        print('\nDownload Complete!')

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Include both sizes so a partial download is easy to tell apart from a wrong file.
        raise Exception(
            'Failed to verify %s (expected %d bytes, found %d). '
            'Can you get to it with a browser?' % (filename, expected_bytes, actual_bytes))
    print('Found and verified %s' % filename)

In [5]:
# Store each archive's local file name in its entry, then fetch and size-check it.
# Iterating values() avoids rebinding `_` (IPython's last-output variable) and,
# unlike iteritems(), is not limited to Python 2.
for source in sources.values():
    source['filename'] = extract_filename(source['url'])
    download(source['url'], source['filename'], source['bytes'])

('Attempting to download:', 'test.tar.gz')
0%....5%....10%....15%....20%....25%....30%....35%....40%....45%....50%....55%....60%....65%....70%....75%....80%....85%....90%....95%....100%
Download Complete!
('Found and verified', 'test.tar.gz')
('Attempting to download:', 'train.tar.gz')
0%....5%....10%....15%....20%....25%....30%....35%....40%....45%....50%....55%....60%....65%....70%....75%....80%....85%....90%....95%....100%
Download Complete!
('Found and verified', 'train.tar.gz')
('Attempting to download:', 'extra.tar.gz')
0%....5%....10%....15%....20%....25%....30%....35%....40%....45%....50%....55%....60%....65%....70%....75%....80%....85%....90%....95%....100%
Download Complete!
('Found and verified', 'extra.tar.gz')


## Data extraction

In [6]:
def extract(filename, force=False):
    """Unpack a ``.tar.gz`` archive and return the name of its root folder.

    The folder name is ``filename`` with its last two extensions removed.
    Extraction is skipped when a directory with that name already exists,
    unless ``force`` is set.

    Args:
        filename: path of the archive to unpack.
        force: when True, extract even if the target directory is present.

    Returns:
        The derived folder name (``filename`` without ``.tar.gz``).
    """
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    if os.path.isdir(root) and not force:
        print('%s already present - Skipping extraction of %s.' % (root, filename))
        return root

    print('Extracting data for %s. This may take a while. Please wait.' % root)
    # Flush before the slow part so the message is visible while extraction runs.
    sys.stdout.flush()
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use this on archives from a trusted source.
    # The with-block closes the archive even if extraction raises.
    with tarfile.open(filename) as tar:
        tar.extractall()
    return root

In [8]:
# Unpack every archive and store the resulting folder name in its entry.
# Iterating values() avoids rebinding `_` (IPython's last-output variable) and,
# unlike iteritems(), is not limited to Python 2.
for source in sources.values():
    source['folder'] = extract(source['filename'])

Extracting data for test. This may take a while. Please wait.
Extracting data for train. This may take a while. Please wait.
Extracting data for extra. This may take a while. Please wait.


## Data serialization

In [10]:
import pickle

# Save the per-dataset metadata dictionary to disk.
# The with-block guarantees the file is flushed and closed once the dump finishes.
with open('sources.pickle', 'wb') as sources_file:
    pickle.dump(sources, sources_file)