In [141]:
%matplotlib notebook
import pandas as pd
import matplotlib.pyplot as plt
from scipy import ndimage
import numpy as np
import glob
import re
import os

In [142]:
# Record the array dimensions of every "<digits>.jpg" file under data/images/.
files = glob.glob('data/images/*')
imageSizes = []

# Raw string so `\d` is a regex digit class rather than a string escape, and
# `\.` so the dot only matches a literal '.'.  The pattern is applied to the
# base name, which keeps it independent of the OS path separator.
imageNamePattern = re.compile(r'(\d+)\.jpg$')

for f in files:
    m = imageNamePattern.match(os.path.basename(f))
    if m is None:
        # File name is not "<digits>.jpg"; there is no image id to record.
        continue
    imageId = int(m.group(1))
    # scipy.ndimage.imread was removed in SciPy 1.2; use it when present and
    # otherwise fall back to matplotlib's reader, which the notebook imports.
    img = ndimage.imread(f) if hasattr(ndimage, 'imread') else plt.imread(f)
    # NOTE(review): 'width' receives shape[0] and 'height' shape[1]; for a
    # NumPy image shape[0] is the row count. The labels are kept as they are
    # because the training table in this notebook uses the same convention.
    imageSizes.append([imageId, img.shape[0], img.shape[1]])

imageSizes = pd.DataFrame(imageSizes, columns=['id', 'width', 'height']).set_index('id')

In [143]:
if not os.path.exists('trainingData.p'):
    # No cached pickle on disk yet: rebuild the training table from the CSV.
    # dtype='object' asks pandas not to convert the column values to numbers.
    train = pd.read_csv('data/train.csv', index_col=0, dtype='object')

    trainData = []

    # Column names of the three feature families, numbered 1 through 64.
    marginCols = ['margin' + str(i) for i in range(1, 65)]
    textureCols = ['texture' + str(i) for i in range(1, 65)]
    shapeCols = ['shape' + str(i) for i in range(1, 65)]

    ## Map each species name to its position in the sorted list of classes
    # (np.unique already returns its result in sorted order).
    classNames = list(np.unique(train['species']))
    numClasses = len(classNames)
    labelMap = dict(zip(classNames, range(numClasses)))

    def downsampleByTwo(img):
        """Halve a 2-D image along both axes by averaging each 2x2 pixel block.

        An odd number of rows and/or columns is first padded with one trailing
        row/column of zeros, so the last output row/column of an odd-sized
        image is averaged together with that zero padding.

        Parameters
        ----------
        img : array-like, 2-D
            Single-channel image.

        Returns
        -------
        numpy.ndarray
            float64 array of shape (ceil(rows / 2), ceil(cols / 2)).

        Raises
        ------
        ValueError
            If `img` is not two-dimensional.
        """
        tmp = np.asarray(img, dtype=float)
        if tmp.ndim != 2:
            raise ValueError(
                'downsampleByTwo expects a 2-D image, got {0} dimension(s)'.format(tmp.ndim))

        # Pad odd dimensions with a single trailing row/column of zeros
        rows, cols = tmp.shape
        tmp = np.pad(tmp, ((0, rows % 2), (0, cols % 2)), mode='constant')

        # View the image as a grid of 2x2 blocks and average inside each block.
        # Every pixel of a block contributes a quarter of the output value;
        # integer division keeps the shape integral on Python 3 as well.
        blocks = tmp.reshape(tmp.shape[0] // 2, 2, tmp.shape[1] // 2, 2)
        return blocks.mean(axis=(1, 3))

    def preprocessImage(img, sigma=3, levels=3):
        """Gaussian-blur an image, then shrink it by repeated halving.

        Parameters
        ----------
        img : array-like
            Image handed to ``ndimage.gaussian_filter`` and then to
            ``downsampleByTwo``.
        sigma : scalar or sequence of scalars, optional
            Standard deviation passed to ``ndimage.gaussian_filter``.
            Defaults to 3.
        levels : int, optional
            Number of times ``downsampleByTwo`` is applied. Defaults to 3,
            i.e. each axis is reduced by roughly a factor of 2**3 = 8.

        Returns
        -------
        The result of the last ``downsampleByTwo`` pass, or the blurred image
        itself when ``levels`` is 0.

        Raises
        ------
        ValueError
            If `levels` is negative.
        """
        if levels < 0:
            raise ValueError('levels must be non-negative, got {0}'.format(levels))

        # Blur before decimating so fine detail is smoothed out first
        ds = ndimage.gaussian_filter(img, sigma)
        for _ in range(levels):
            ds = downsampleByTwo(ds)
        return ds

    columnGroups = [marginCols, textureCols, shapeCols]
    for r in train.iterrows():
        caseId = r[0]
        trainData.append([])
        # Generate one hot encoding for species
        oneHot = np.zeros((numClasses,))
        oneHot[labelMap[r[1]['species']]] = 1.0
        trainData[-1].append(oneHot)

        for col in columnGroups:
            curr = r[1][col]
            trainData[-1].append(np.array(curr))

        img = ndimage.imread('data/images/{0}.jpg'.format(caseId)) / 255.0
        img = preprocessImage(img)
        trainData[-1].append(img)
        trainData[-1].append(img.shape[0])
        trainData[-1].append(img.shape[1])
    trainData = pd.DataFrame(trainData, columns=['oneHot', 'margin', 'textureCols', 'shape', 'img', 'width', 'height'])
    pd.to_pickle(trainData, 'trainingData.p')
else:
    trainData = pd.read_pickle('trainingData.p')

In [149]:
# Largest values found in the 'width' and 'height' columns of the training
# table (those columns hold img.shape[0] and img.shape[1] respectively).
maxWidth = trainData['width'].max()
maxHeight = trainData['height'].max()

# Normalized image size: twice the largest recorded extent in each dimension.
normImgSize = (2 * maxWidth, 2 * maxHeight)


In [150]:
# Display the normalized image size computed in the previous cell.
normImgSize

(274, 410)