In [1]:
import os
import pandas as pd
import glob
import menpo.io as mio
import numpy as np
#np.random.seed(1337)  # for reproducibility
from keras.utils import np_utils
from keras import backend as K

Using TensorFlow backend.


In [2]:
# returns (X_train, y_train), (X_test, y_test)
def randomSplit(X, y, propTest):
    assert X.shape[0] == y.shape[0]
    p = np.random.permutation(len(y))
    sX, sy = X[p], y[p]
    cutoff = round(propTest * len(y))
    return (sX[cutoff:], sy[cutoff:]), (sX[0:cutoff], sy[0:cutoff])

def randomOrder(X, y):
    p = np.random.permutation(len(y))
    return X[p], y[p]

In [10]:
data_file = '/home/dane/data/imdb_crop/meta.csv'
image_folder = '/home/dane/data/imdb_crop/' # must end with '/'
#image_folder = '/data/nlp/corpora/imdb_crop/' # must end with '/'
variable = 0 # relevant column of data_file
height = 128
width = 128
channels = 3

In [8]:
annos = pd.DataFrame.from_csv(data_file, header=None, index_col=None)

In [15]:
ids = list(set(annos[2].get_values()))
ids = np.random.permutation(ids)
boundary = round(0.9 * len(ids))
train_ids = set(ids[0:boundary])
test_ids = set(ids[boundary:])


414651

In [10]:
ids = list(set(annos['subject_id'].get_values()))
ids = np.random.permutation(ids)
boundary = round(0.9 * len(ids))
train_ids = set(ids[0:boundary])
test_ids = set(ids[boundary:])

train_images = []
train_labels = []
test_images = []
test_labels = []
for fp in train_ids:
    fn = image_folder + fp
    image = mio.import_image(fn)
    

for fp in glob.glob(image_folder + '*.jpg'):
    fn = os.path.basename(fp)
    image = mio.import_image(fp)
    assert(image.shape == (height, width))
    assert(image.n_channels == channels)
    if bool(np.random.randint(0,1)):
        image = image.mirror()
    if annos.loc[annos['file_name'] == fn]['subject_id'].sum() in train_ids:
        train_images.append(image)
        train_labels.append(annos.loc[annos['file_name'] == fn][variable].sum())
    else:
        test_images.append(image)
        test_labels.append(annos.loc[annos['file_name'] == fn][variable].sum())

In [11]:
nb_classes = 2

# input image dimensions
img_rows, img_cols, img_channels = height, width, channels

X_train = np.stack([img.pixels for img in train_images])
X_test = np.stack([img.pixels for img in test_images])
if K.image_dim_ordering() == 'tf':
    X_train = X_train.reshape(X_train.shape[0], img_rows, img_cols, channels)
    X_test = X_test.reshape(X_test.shape[0], img_rows, img_cols, channels)
    input_shape = input_shape = (img_rows, img_cols, channels)
else:
    input_shape = (channels, img_rows, img_cols)
    
y_train = np.array(train_labels)
y_test = np.array(test_labels)
#(X_train, y_train), (X_test, y_test) = randomSplit(Xs, ys, 0.1)

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')
print('X_train shape:', X_train.shape)
print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

# convert class vectors to trinary class matrices
y_train = np_utils.to_categorical(y_train, nb_classes)
y_test = np_utils.to_categorical(y_test, nb_classes)

print('y_train shape:', y_train.shape)

X_train, y_train = randomOrder(X_train, y_train)
X_test, y_test = randomOrder(X_test, y_test)

X_train shape: (6632, 128, 128, 3)
6632 train samples
778 test samples
y_train shape: (6632, 2)


In [15]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D

batch_size = 50
nb_epoch = 2

# number of convolutional filters to use
nb_filters1 = 48
# convolution kernel size
kernel1 = (7, 7)
# size of pooling area for max pooling
pool1 = (3, 3)

# number of convolutional filters to use
nb_filters2 = 128
# convolution kernel size
kernel2 = (5, 5)
# size of pooling area for max pooling
pool2 = (3, 3)

# number of convolutional filters to use
nb_filters3 = 192
# convolution kernel size
kernel3 = (3, 3)
# size of pooling area for max pooling
pool3 = (3, 3)

dense1 = 256
dense2 = 256

model = Sequential()

model.add(Convolution2D(nb_filters1, kernel1[0], kernel1[1],
                        border_mode='valid',
                        input_shape=input_shape))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=pool1))
model.add(Dropout(0.2))
model.add(Convolution2D(nb_filters2, kernel2[0], kernel2[1]))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=pool2))
model.add(Dropout(0.2))
model.add(Convolution2D(nb_filters3, kernel3[0], kernel3[1]))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=pool3))
model.add(Dropout(0.2))

model.add(Flatten())
model.add(Dense(dense1))
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(Dense(dense2))
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adadelta',
              metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          verbose=1, validation_data=(X_test, y_test))
score = model.evaluate(X_test, y_test, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Train on 6632 samples, validate on 778 samples
Epoch 1/2
Epoch 2/2
Test score: 0.637498436366
Test accuracy: 0.632390745655


In [None]:
model.save("gender_classifier.h5")