In [1]:
# -*- coding: utf-8 -*-
import os
import pandas as pd
import numpy as np
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import gc, math
import pickle

from keras.models import Sequential
from keras.optimizers import SGD
from keras.optimizers import Adam
from keras.utils import np_utils
from keras.models import Model, load_model, model_from_json

from keras.layers import Input, Dense, Convolution2D, MaxPooling2D, AveragePooling2D, ZeroPadding2D, Dropout, Flatten, merge, Reshape, Activation
from keras.layers.advanced_activations import LeakyReLU, PReLU
from keras.layers.normalization import BatchNormalization
from keras import regularizers
from keras import backend as K
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard

from sklearn.metrics import log_loss, accuracy_score, confusion_matrix

from cnnmodels import vgg_std16_model, preprocess_input, create_rect5, load_img, train_generator, test_generator
from cnnmodels import identity_block, testcv_generator, conv_block, resnet50_model

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 1080 (CNMeM is enabled with initial size: 85.0% of memory, cuDNN 5110)


In [2]:
# Params
img_rows, img_cols = 224, 224 # Resolution of inputs
channel = 3
ROWS, COLS = 224, 224
CHECKPOINT_DIR = 'log/checkpoint09/'
BATCHSIZE = 32
CERV_CLASSES = ['Type_1', 'Type_2', 'Type_3']
nb_perClass = int(BATCHSIZE / len(CERV_CLASSES))
TRAIN_DIR = '../data/gmm/train'
TEST_DIR = '../data/gmm/test'
DATA_DIR = '../data/gmm'
num_class = len(CERV_CLASSES)
full = True
bags = 5

In [3]:
def train_generator(datagen, df):
    while 1:
        batch_x = np.zeros((BATCHSIZE, ROWS, COLS, 3), dtype=K.floatx())
        batch_y = np.zeros((BATCHSIZE, len(CERV_CLASSES)), dtype=K.floatx())
        fn = lambda obj: obj.loc[np.random.choice(obj.index, size=nb_perClass, replace=False),:]
        batch_df = df.groupby('class', as_index=True).apply(fn)
        i = 0
        for index,row in batch_df.iterrows():
            row = row.tolist()
            image_file = row[0]
            typ_class = row[1]
            img = Image.open(image_file).resize((ROWS, COLS))
            img = img.convert('RGB')
            x = np.asarray(img, dtype=K.floatx())
            #x = datagen.random_transform(x)
            x = preprocess_input(x)
            batch_x[i] = x
            batch_y[i,CERV_CLASSES.index(typ_class)] = 1
            i += 1
        #return (batch_x, batch_y)
        yield (batch_x.transpose(0, 3, 1, 2), batch_y)

In [4]:
def test_generator(df, datagen, batch_size = BATCHSIZE):
    n = df.shape[0]
    batch_index = 0
    while 1:
        current_index = batch_index * batch_size
        if n >= current_index + batch_size:
            current_batch_size = batch_size
            batch_index += 1    
        else:
            current_batch_size = n - current_index
            batch_index = 0        
        batch_df = df[current_index:current_index+current_batch_size]
        batch_x = np.zeros((batch_df.shape[0], ROWS, COLS, 3), dtype=K.floatx())
        i = 0
        for index,row in batch_df.iterrows():
            row = row.tolist()
            image_file = row[0]
            # typ_class = row[1]
            img = Image.open(image_file).resize((ROWS, COLS))
            img = img.convert('RGB')
            x = np.asarray(img, dtype=K.floatx())
            # x = datagen.random_transform(x)
            x = preprocess_input(x)
            batch_x[i] = x
            i += 1
        if batch_index%100 == 0: print(batch_index)
        # return (batch_x.transpose(0, 3, 1, 2))
        yield(batch_x.transpose(0, 3, 1, 2))

In [5]:
train_datagen = ImageDataGenerator(
    rotation_range=180,
    shear_range=0.2,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    vertical_flip=True)

In [6]:
img_ls = []
y_ls = []
imgadd_ls = []
yadd_ls = []
for typ in CERV_CLASSES:
    for img in os.listdir(os.path.join(TRAIN_DIR, typ)):
        if img != '.DS_Store':
            img_ls.append(os.path.join(TRAIN_DIR, typ, img))
            y_ls.append(typ)
for typ in CERV_CLASSES:
    for img in os.listdir(os.path.join(DATA_DIR, typ)):
        if img != '.DS_Store':
            imgadd_ls.append(os.path.join(DATA_DIR, typ, img))
            yadd_ls.append(typ)
train_orig_all  = pd.DataFrame({'class': y_ls, 'img': img_ls, })[['img', 'class']]
train_addl_all  = pd.DataFrame({'class': yadd_ls, 'img': imgadd_ls, })[['img', 'class']]
print(train_orig_all.shape)
print(train_addl_all.shape)

(1480, 2)
(6726, 2)


In [7]:
train_all = train_orig_all
test_df =   train_addl_all

In [8]:
# Split into train and valid
valid_set = pd.read_csv("../val_images.csv", header = None, names = ['img']).img.tolist()
valid_set[-4:]

['Type_2/3498.jpg', 'Type_2/1341.jpg', 'Type_3/6017.jpg', 'Type_2/5629.jpg']

In [9]:
valid_df = train_all[train_all['img'].str.replace('../data/original/', '').isin(valid_set)]
if full == True:
    train_df = train_all
else:
    train_df = train_all[~train_all['img'].str.replace('../data/original/', '').isin(valid_set)]
samples_per_epoch=BATCHSIZE*math.ceil(train_df.groupby('class').size()['Type_2']/nb_perClass)
print(train_df.shape)
print(valid_df.shape)

(1480, 2)
(0, 2)


In [10]:
test_sub = pd.read_csv('../feat/additional_pred.csv')
test_sub.head(3)

Unnamed: 0,image_name,Type_1,Type_2,Type_3
0,../data/original/Type_1/6461.jpg,0.704089,0.080506,0.215405
1,../data/original/Type_1/1081.jpg,0.055063,0.780277,0.16466
2,../data/original/Type_1/6333.jpg,0.131375,0.349753,0.518872


In [11]:
good_additional = []
bad_additional = []
for c, row in test_sub.iterrows():
    typ = int(row[0].split('/')[3][-1])
    if row[typ] >= 0.2:
        good_additional.append(row[0].replace('/original', '/gmm'))
    else:
        bad_additional.append(row[0].replace('/original', '/gmm'))

In [12]:
print(len(good_additional), len(bad_additional))

(5252, 1477)


### Now that we have the good and bad additionals, lets do a full run

In [13]:
train_all = pd.concat([train_orig_all, train_addl_all])

In [14]:
train_all = train_all[~train_all['img'].isin(bad_additional)]

In [15]:
img_ls = []
for img in os.listdir(TEST_DIR):
    if img != '.DS_Store':
        img_ls.append(os.path.join(TEST_DIR, img))
test_df  = pd.DataFrame({'img': img_ls}) 

In [16]:
train_all.tail(3)

Unnamed: 0,img,class
6723,../data/gmm/Type_3/4534.jpg,Type_3
6724,../data/gmm/Type_3/4780.jpg,Type_3
6725,../data/gmm/Type_3/4696.jpg,Type_3


In [17]:
# Split into train and valid
valid_set = pd.read_csv("../val_images.csv", header = None, names = ['img']).img.tolist()
valid_set[-4:]

['Type_2/3498.jpg', 'Type_2/1341.jpg', 'Type_3/6017.jpg', 'Type_2/5629.jpg']

In [18]:
valid_df = train_all[train_all['img'].str.replace('../data/gmm/', '').isin(valid_set)]
if full == True:
    train_df = train_all
else:
    train_df = train_all[~train_all['img'].str.replace('../data/gmm/', '').isin(valid_set)]
samples_per_epoch=BATCHSIZE*math.ceil(train_df.groupby('class').size()['Type_2']/nb_perClass)
print(train_df.shape)
print(valid_df.shape)

(6732, 2)
(1344, 2)


In [19]:
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [20]:
# Make our validation set
l = valid_df.groupby('class').size()
valid_x = np.zeros((valid_df.shape[0], ROWS, COLS, 3), dtype=K.floatx())
valid_y = np.zeros((valid_df.shape[0], len(CERV_CLASSES)), dtype=K.floatx())
i = 0
for index,row in valid_df.iterrows():
    row = row.tolist()
    image_file = row[0]
    typ_class = row[1]
    img = Image.open(image_file).resize((ROWS, COLS))
    img = img.convert('RGB')
    x = np.asarray(img, dtype=K.floatx())
    # x = datagen.random_transform(x)
    x = preprocess_input(x)
    valid_x[i] = x
    valid_y[i,CERV_CLASSES.index(typ_class)] = 1
    i += 1
valid_x = valid_x.transpose(0, 3, 1, 2)

In [21]:
# Make our validation set
l = valid_df.groupby('class').size()
train_x = np.zeros((train_df.shape[0], ROWS, COLS, 3), dtype=K.floatx())
train_y = np.zeros((train_df.shape[0], len(CERV_CLASSES)), dtype=K.floatx())
i = 0
for index,row in train_df.iterrows():
    row = row.tolist()
    image_file = row[0]
    typ_class = row[1]
    img = Image.open(image_file).resize((ROWS, COLS))
    img = img.convert('RGB')
    x = np.asarray(img, dtype=K.floatx())
    # x = datagen.random_transform(x)
    x = preprocess_input(x)
    train_x[i] = x
    train_y[i,CERV_CLASSES.index(typ_class)] = 1
    i += 1

In [22]:
def train_generator(datagen, df):
    while 1:
        batch_x = np.zeros((BATCHSIZE, ROWS, COLS, 3), dtype=K.floatx())
        batch_y = np.zeros((BATCHSIZE, len(CERV_CLASSES)), dtype=K.floatx())
        fn = lambda obj: obj.loc[np.random.choice(obj.index, size=nb_perClass, replace=False),:]
        batch_df = df.groupby('class', as_index=True).apply(fn)
        i = 0
        for index in batch_df.index.levels[1].values:
            batch_x[i] = train_x[index] 
            batch_y[i,train_y[index].argmax()] = 1
            i += 1
        yield (batch_x.transpose(0, 3, 1, 2), batch_y)
        #return (batch_x.transpose(0, 3, 1, 2), batch_y)

In [23]:
print "Model creation... "
nb_epoch = 5
model = resnet50_model(ROWS, COLS, channel, num_class)
for layer in model.layers:
    layer.trainable = False
for layer in model.layers[-3:]:
    layer.trainable = True

# Start Fine-tuning
print "Fine tune part 1"
model.fit_generator(train_generator(train_datagen, train_df),
          nb_epoch=nb_epoch,
          samples_per_epoch=samples_per_epoch, #50000,
          verbose=1,
          validation_data=(valid_x, valid_y),
          #callbacks=[early_stopping, model_checkpoint],
          )

Model creation... 
Fine tune part 1
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f14ce4ff6d0>

In [24]:
samples_per_epoch = 32*300
for layer in model.layers[38:]:
    layer.trainable = True
#model.optimizer.lr = 1e-5
nb_epoch = 1
print "Fine tune part 2"
model.fit_generator(train_generator(train_datagen, df=train_df),
          nb_epoch=nb_epoch,
          samples_per_epoch=samples_per_epoch,
          verbose=1,
          validation_data=(valid_x, valid_y),
          #callbacks=[model_checkpoint, early_stopping], # , 
          )

Fine tune part 2
Epoch 1/1


<keras.callbacks.History at 0x7f16292c8dd0>

In [25]:
test_preds_ls = []
for ii in range(10):
    test_preds_ls.append(model.predict_generator(test_generator(test_df, train_datagen), 
                                         val_samples = test_df.shape[0])) 
    model.fit_generator(train_generator(train_datagen, df=train_df),
              nb_epoch=1,
              samples_per_epoch=samples_per_epoch,
              verbose=1,
              validation_data=(valid_x, valid_y),
              #callbacks=[model_checkpoint, early_stopping], # , 
              )
test_preds_ls.append(model.predict_generator(test_generator(test_df, train_datagen), 
                                        val_samples = test_df.shape[0])) 

0
Epoch 1/1
0
Epoch 1/1
0
Epoch 1/1
0
Epoch 1/1
0
Epoch 1/1
0
Epoch 1/1
0
Epoch 1/1
0
Epoch 1/1
0
Epoch 1/1
0
Epoch 1/1
0


In [26]:
test_preds = sum(test_preds_ls)/len(test_preds_ls)


In [27]:
test_sub = pd.DataFrame(test_preds, columns=CERV_CLASSES)
test_sub['image_name'] = test_df['img'].str.split('/').apply(lambda x: x[-1])
test_sub = test_sub[['image_name'] + CERV_CLASSES ]
test_sub.head(3)

Unnamed: 0,image_name,Type_1,Type_2,Type_3
0,78.jpg,0.000111,0.003504,0.996385
1,504.jpg,0.032931,0.184447,0.782622
2,31.jpg,0.413796,0.558904,0.027299


In [28]:
import time
timestr = time.strftime("%Y%m%d")
if full:
    subm_name = '../sub/sub_dara_full_gmm_remove_addl_10xbag.csv' #'.csv.gz'
else:
    subm_name = '../sub/sub_dara_part_gmm_remove_addl_10xbag.csv' #'.csv.gz'
    
test_sub.to_csv(subm_name, index=False)

In [29]:
test_preds.shape

(512, 3)