# PREPROCESS DATA

In [1]:
# Import required modules
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle


In [2]:
# Base URL that download file names are appended to.
url = 'http://ufldl.stanford.edu/housenumbers/'
# Last percentage written by download_progress_hook; None until the first report.
last_percent_reported = None

def download_progress_hook(count, blockSize, totalSize):
  """A hook to report the progress of a download on stdout.

  Intended as a `reporthook` for urlretrieve. Reports every 1% change in
  download progress: the percentage itself at multiples of 5, a dot otherwise.

  Args:
    count: number of blocks transferred so far.
    blockSize: size of one block in bytes.
    totalSize: total size of the file in bytes; zero or negative when the
      size is not known, in which case nothing is reported.
  """
  global last_percent_reported

  # Without a known total there is no percentage to compute: dividing by 0
  # raises ZeroDivisionError and a negative total yields negative percentages.
  if totalSize <= 0:
    return

  # The final block is usually partial, so count * blockSize can overshoot
  # totalSize; clamp so the report never goes past 100%.
  percent = min(100, int(count * blockSize * 100 / totalSize))

  # Same percentage as the previous call: nothing new to show.
  if last_percent_reported == percent:
    return

  sys.stdout.write("%s%%" % percent if percent % 5 == 0 else ".")
  sys.stdout.flush()
  last_percent_reported = percent
        
def maybe_download(filename, force=False, expected_bytes=None):
  """Download a file if not present, then check the size of the local copy.

  Args:
    filename: name of the file; appended to the module-level `url` for the
      download and used as the local destination path.
    force: download even if the file already exists locally.
    expected_bytes: optional exact size in bytes the local file must have.
      When None (the default) the file is only checked for being non-empty.

  Returns:
    The local path of the file.

  Raises:
    Exception: if `expected_bytes` is given and the local size differs.
  """
  if force or not os.path.exists(filename):
    print('Attempting to download:', filename)
    filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook)
    print('\nDownload Complete!')

  size = os.stat(filename).st_size
  if expected_bytes is not None and size != expected_bytes:
    raise Exception(
        'Failed to verify %s: expected %d bytes, found %d. '
        'Re-run with force=True to download it again.' % (filename, expected_bytes, size))

  if size > 0:
    print ('File found: ', filename)
  else:
    # A zero-byte file passes the existence check above, so say so explicitly
    # instead of returning silently.
    print('Warning: %s is empty. Re-run with force=True to download it again.' % filename)

  return filename

# Fetch the three archives from `url`. maybe_download skips the transfer for
# any file that already exists locally and returns the local path either way.
train_filename = maybe_download('train.tar.gz')
test_filename = maybe_download('test.tar.gz')
extra_filename = maybe_download('extra.tar.gz')

File found:  train.tar.gz
File found:  test.tar.gz
File found:  extra.tar.gz


In [3]:
def maybe_extract(filename, force=False):
  """Extract a .tar.gz archive next to itself unless its folder already exists.

  Args:
    filename: path to the archive. The expected folder is this path with its
      last two extensions (".tar.gz") removed.
    force: extract even if that folder is already present.

  Returns:
    The archive path without ".tar.gz". This is where the data is expected,
    assuming the archive holds a top-level folder named after the archive
    (not verified here).
  """
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
  if os.path.isdir(root) and not force:
    # You may override by setting force=True.
    print('%s already present - Skipping extraction of %s.' % (root, filename))
  else:
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    sys.stdout.flush()

    # Extract into the archive's own directory so the result lands at `root`
    # even when `filename` has a directory part (extractall defaults to cwd).
    target_dir = os.path.dirname(root) or '.'

    # The archive comes from the network: where the running Python ships
    # extraction filters, use 'data' to refuse absolute paths and links that
    # point outside the target directory.
    extra_args = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

    # The with-block closes the archive even if extraction fails.
    with tarfile.open(filename) as tar:
      tar.extractall(target_dir, **extra_args)

  print(root)
  return root
  
# Unpack each archive (skipped when its folder already exists). The returned
# value is the archive name without ".tar.gz" and is used below as the folder
# that holds the extracted files.
train_folder = maybe_extract(train_filename)
test_folder = maybe_extract(test_filename)
extra_folder = maybe_extract(extra_filename)

train already present - Skipping extraction of train.tar.gz.
train
test already present - Skipping extraction of test.tar.gz.
test
extra already present - Skipping extraction of extra.tar.gz.
extra


## PROCESS BOUNDING BOXES

In [4]:
import h5py

def _read_name(dsf, name_ref):
    """Decode the file name stored behind `name_ref` as an array of character codes."""
    # `[()]` reads the whole dataset; it replaces the `.value` accessor that
    # newer h5py releases no longer provide.
    codes = dsf[name_ref][()]
    # Join the codes directly instead of stripping NUL bytes from the raw
    # buffer, which only worked with Python 2 byte strings.
    return "".join(chr(int(code)) for code in codes.flatten() if code)


def _read_boxes(dsf, box_group):
    """Return one dict (top/left/height/width/label) per box stored in `box_group`."""
    fields = ("top", "left", "height", "width", "label")
    boxes_count = len(box_group["label"])

    columns = {}
    for field in fields:
        raw = box_group[field][()]
        if boxes_count > 1:
            # Several boxes: each row holds a reference to the actual value.
            columns[field] = [dsf[raw[j].item()][()][0][0] for j in range(boxes_count)]
        else:
            # Single box: the value is stored inline.
            columns[field] = [raw[0][0]]

    return [dict((field, columns[field][j]) for field in fields)
            for j in range(boxes_count)]


def get_data_boxes(dsf, start_range=0, end_range=None):
    """Read the bounding boxes of a range of images from an open digitStruct file.

    Args:
        dsf: open file object exposing `dsf['digitStruct']['name']` and
            `dsf['digitStruct']['bbox']`, and resolving the references stored
            in them via `dsf[ref]`.
        start_range: index of the first entry to read (default 0).
        end_range: index one past the last entry to read; None (the default)
            reads up to the last entry.

    Returns:
        A dict mapping each image file name to a list of box dicts with the
        keys "top", "left", "height", "width" and "label".
    """
    dsf_name = dsf['digitStruct']['name']
    dsf_box = dsf['digitStruct']['bbox']
    if end_range is None:
        end_range = len(dsf_name)

    data_boxes = {}
    for i in range(start_range, end_range):
        img_file = _read_name(dsf, dsf_name[i][0])
        # Resolve the per-image group once instead of once per field and box.
        data_boxes[img_file] = _read_boxes(dsf, dsf[dsf_box[i].item()])

    return data_boxes

In [6]:
# Number of digitStruct entries to read (used as the exclusive end index below).
# Presumably the total number of images in the train set - TODO confirm.
train_size = 33402

# Open the box metadata file inside the extracted train folder, read-only.
# NOTE(review): the handle is not closed in the visible part of the notebook.
dsf_train_file = os.path.join(train_folder, 'digitStruct.mat')
dsf_train = h5py.File(dsf_train_file, 'r')
# Maps each image file name to its list of box dicts for entries [0, train_size).
train_data_boxes = get_data_boxes(dsf_train, 0, train_size)

In [None]:
#valid_size = 64

#valid_data_boxes = get_data_boxes(dsf_train, train_size, train_size+valid_size)

In [5]:
# Number of digitStruct entries to read (used as the exclusive end index below).
# Presumably the total number of images in the test set - TODO confirm.
test_size = 13068

# Open the box metadata file inside the extracted test folder, read-only.
# NOTE(review): the handle is not closed in the visible part of the notebook.
dsf_test_file = os.path.join(test_folder, 'digitStruct.mat')
dsf_test = h5py.File(dsf_test_file, 'r')
# Maps each image file name to its list of box dicts for entries [0, test_size).
test_data_boxes = get_data_boxes(dsf_test, 0, test_size)

In [8]:
# Number of digitStruct entries to read (used as the exclusive end index below).
# Presumably the total number of images in the extra set - TODO confirm.
extra_size = 202353

# Open the box metadata file inside the extracted extra folder, read-only.
# NOTE(review): the handle is not closed in the visible part of the notebook.
dsf_extra_file = os.path.join(extra_folder, 'digitStruct.mat')
dsf_extra = h5py.File(dsf_extra_file, 'r')
# Maps each image file name to its list of box dicts for entries [0, extra_size).
extra_data_boxes = get_data_boxes(dsf_extra, 0, extra_size)

In [9]:
import PIL.Image as Image

def dataset_stats(boxes_data, folder):
    """Find the extreme image dimensions among the images listed in `boxes_data`.

    Args:
        boxes_data: mapping whose keys are image file names relative to
            `folder`; the values are not used here.
        folder: directory containing the image files.

    Returns:
        A 6-tuple (width_max, width_min, height_max, height_min,
        pixel_count_max, pixel_count_min). Each element is a
        (value, img_file) pair naming the first image, in iteration order,
        that reached that extreme. For empty `boxes_data` the maxima are
        (0, '') and the minima are (float('inf'), '').
    """
    # Start the minima at infinity so that any real image replaces them; a
    # finite sentinel would be returned as-is for sets of very large images.
    unbounded = float('inf')
    width_max = (0, '')
    width_min = (unbounded, '')
    height_max = (0, '')
    height_min = (unbounded, '')
    pixel_count_max = (0, '')
    pixel_count_min = (unbounded, '')

    for img_file in boxes_data:
        img_path = os.path.join(folder, img_file)
        # Only the size is needed; the with-block releases the file handle
        # before the next image is opened.
        with Image.open(img_path) as img:
            width, height = img.size
        pixel_count = width * height

        if width > width_max[0]:
            width_max = (width, img_file)
        if width < width_min[0]:
            width_min = (width, img_file)

        if height > height_max[0]:
            height_max = (height, img_file)
        if height < height_min[0]:
            height_min = (height, img_file)

        if pixel_count > pixel_count_max[0]:
            pixel_count_max = (pixel_count, img_file)
        if pixel_count < pixel_count_min[0]:
            pixel_count_min = (pixel_count, img_file)

    return width_max, width_min, height_max, height_min, pixel_count_max, pixel_count_min

# Scan every image referenced by the extra set, then print each extreme as
# "<label> <value> <file name>".
(width_max, width_min,
 height_max, height_min,
 pixel_count_max, pixel_count_min) = dataset_stats(extra_data_boxes, extra_folder)

print('Extra sizes:')
print('')
# Each stat is a (value, file name) pair, so it fills both placeholders.
print('Max Width: %s %s' % width_max)
print('Min Width: %s %s' % width_min)
print('Max Height: %s %s' % height_max)
print('Min Height: %s %s' % height_min)
print('Max Img Size: %s %s' % pixel_count_max)
print('Min Img Size: %s %s' % pixel_count_min)

Extra sizes:

Max Width: 668 198955.png
Min Width: 22 56511.png
Max Height: 415 100972.png
Min Height: 13 105260.png
Max Img Size: 253172 104222.png
Min Img Size: 330 127904.png


# LOAD DATA