In [1]:
from keras.preprocessing import image
from keras.preprocessing.image import load_img, img_to_array

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Lambda, Flatten, Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras import regularizers
from keras import applications
from keras.callbacks import EarlyStopping, ModelCheckpoint

import numpy as np
import pandas as pd
import glob
import skimage.io as io
from matplotlib import pyplot as plt
from sklearn.metrics import fbeta_score
from tqdm import tqdm

#import utils; reload(utils)
#from utils import plots

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 1060 6GB (CNMeM is enabled with initial size: 75.0% of memory, cuDNN not available)


In [2]:
import bcolz
def save_array(fname, arr): c=bcolz.carray(arr, rootdir=fname, mode='w'); c.flush()
def load_array(fname): return bcolz.open(fname)[:]

In [3]:
data_path = '/cinc/data/planet_amazon/train-jpg/'
save_path = '/cinc/data/planet_amazon/save_data/train-jpg/'

# labels

In [10]:
labels_train = pd.read_csv(data_path + '../labels/train_v2_10000.csv')
print labels_train.shape
labels_train.head()

(10000, 2)


Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [11]:
labels_1 = labels_train['tags'].apply(lambda x: x.split(' '))
labels_1.head()

0                                    [haze, primary]
1               [agriculture, clear, primary, water]
2                                   [clear, primary]
3                                   [clear, primary]
4    [agriculture, clear, habitation, primary, road]
Name: tags, dtype: object

In [12]:
tag_count = {}
for line in labels_1:
    for l in line:
        if (tag_count.has_key(l)):
            tag_count[l] = tag_count[l] + 1
        else:
            tag_count[l]= 1
        
tags = tag_count.keys()
n_tags = len(tags)
tags

['slash_burn',
 'clear',
 'blooming',
 'primary',
 'cloudy',
 'conventional_mine',
 'water',
 'haze',
 'cultivation',
 'partly_cloudy',
 'artisinal_mine',
 'habitation',
 'bare_ground',
 'blow_down',
 'agriculture',
 'road',
 'selective_logging']

In [13]:
tag_id = dict([(v, i) for i, v in enumerate(tags)])
id_tag = dict([(i, v) for i, v in enumerate(tags)])
tag_id
#tag_id['primary'], tag_id['clear'], id_tag[4], id_tag[2]

{'agriculture': 14,
 'artisinal_mine': 10,
 'bare_ground': 12,
 'blooming': 2,
 'blow_down': 13,
 'clear': 1,
 'cloudy': 4,
 'conventional_mine': 5,
 'cultivation': 8,
 'habitation': 11,
 'haze': 7,
 'partly_cloudy': 9,
 'primary': 3,
 'road': 15,
 'selective_logging': 16,
 'slash_burn': 0,
 'water': 6}

In [14]:
save_array(save_path + 'id_tag', id_tag)
save_array(save_path + 'tag_id', tag_id)

In [15]:
id_tag = load_array(save_path + 'id_tag')[0]
tag_id = load_array(save_path + 'tag_id')[0]
type(id_tag), type(tag_id)
tag_id

{'agriculture': 14,
 'artisinal_mine': 10,
 'bare_ground': 12,
 'blooming': 2,
 'blow_down': 13,
 'clear': 1,
 'cloudy': 4,
 'conventional_mine': 5,
 'cultivation': 8,
 'habitation': 11,
 'haze': 7,
 'partly_cloudy': 9,
 'primary': 3,
 'road': 15,
 'selective_logging': 16,
 'slash_burn': 0,
 'water': 6}

In [16]:
def tagsToIds(tags):
    ids = []
    for tag in tags.split(' '):
        ids.append(tag_id[tag])
    return ids
tagsToIds('primary clear')

[3, 1]

In [9]:
labels_train['tagIds'] = labels_train['tags'].apply(tagsToIds)
labels_train.head()

Unnamed: 0,image_name,tags,tagIds
0,train_0,haze primary,"[7, 3]"
1,train_1,agriculture clear primary water,"[14, 1, 3, 6]"
2,train_2,clear primary,"[1, 3]"
3,train_3,clear primary,"[1, 3]"
4,train_4,agriculture clear habitation primary road,"[14, 1, 11, 3, 15]"


In [10]:
# [1,3] => [0,1,0,1,0]
def n_hot(labels, size):
    y = np.zeros(size)
    for i in range(len(labels)):
        y[labels[i]] = 1
    return y

# [0, 1, 0, 1, 0] => [1,3]
def reverse_n_hot(n_hot_array):
    indexes = []
    for i in range(len(n_hot_array)):
        if (n_hot_array[i] == 1):
            indexes.append(i)
    return indexes

labels = [1,3]
n_hot_result = n_hot(labels, 5)

print (labels)
print (n_hot_result)
print (reverse_n_hot(n_hot_result))

[1, 3]
[ 0.  1.  0.  1.  0.]
[1, 3]


In [11]:
labels_train['nhot'] = labels_train['tagIds'].apply(lambda x: n_hot(x, len(tag_id)))
labels_train.head()

Unnamed: 0,image_name,tags,tagIds,nhot
0,train_0,haze primary,"[7, 3]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
1,train_1,agriculture clear primary water,"[14, 1, 3, 6]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
2,train_2,clear primary,"[1, 3]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,train_3,clear primary,"[1, 3]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,train_4,agriculture clear habitation primary road,"[14, 1, 11, 3, 15]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [12]:
labels_train['nhot'][1]

array([ 0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.])

In [13]:
# verify new label, train_26 should be cloudy only
labels_train[labels_train['image_name'] == 'train_26']

Unnamed: 0,image_name,tags,tagIds,nhot
26,train_26,cloudy,[4],"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."


# read image

In [14]:
n_samples = labels_train.shape[0]
#n_samples = 100
n_samples

10000

In [15]:
x_all = np.empty([n_samples, 224, 224, 3])
y_all = np.empty([n_samples, n_tags])

for i in range(n_samples):
    #print i, labels_train['image_name'][i]
    img = load_img(data_path + labels_train['image_name'][i] + '.jpg')
    #img = img.resize((224, 224))
    x = img_to_array(img.resize((224, 224)))
    x_all[i] = x
    y_all[i] = labels_train['nhot'][i]

In [16]:
x_all[0].shape, y_all[0].shape

((224, 224, 3), (17,))

In [17]:
# save for later
save_array(save_path + 'x_all', x_all)
save_array(save_path + 'y_all', y_all)

In [18]:
x_all = load_array(save_path + 'x_all')
y_all = load_array(save_path + 'y_all')

In [19]:
x_all.shape, y_all.shape

((100, 224, 224, 3), (100, 17))

# random split

In [20]:
msk = np.random.rand(n_samples) < 0.8
msk[:10]

array([ True,  True,  True,  True,  True,  True,  True,  True, False,  True], dtype=bool)

In [21]:
x_train = x_all[msk]
x_valid = x_all[~msk]
y_train = y_all[msk]
y_valid = y_all[~msk]
(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape)

((82, 224, 224, 3), (82, 17), (18, 224, 224, 3), (18, 17))

In [22]:
# save for later
save_array(save_path + 'x_train', x_train)
save_array(save_path + 'x_valid', x_valid)
save_array(save_path + 'y_train', y_train)
save_array(save_path + 'y_valid', y_valid)

In [15]:
# load back
x_train = load_array(save_path + 'x_train')
x_valid = load_array(save_path + 'x_valid')
y_train = load_array(save_path + 'y_train')
y_valid = load_array(save_path + 'y_valid')
(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape)

# reuse vgg16 model

In [16]:
# build the VGG16 network
base_model = applications.VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
print('Model loaded.')

for layer in base_model.layers:
    layer.trainable = False

Model loaded.


In [17]:
x = base_model.output
x = Flatten(input_shape=(512, 8, 8))(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
pred_layer = Dense(17, activation='sigmoid')(x)

In [18]:
model = Model(inputs=base_model.input, outputs=pred_layer)

In [19]:
model.compile(Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [23]:
batch_size=128
gen = image.ImageDataGenerator()
train_batches = gen.flow(x_train, y_train, batch_size=batch_size)
valid_batches = gen.flow(x_valid, y_valid, batch_size=batch_size)

In [22]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=0),
    ModelCheckpoint(save_path + 'model/vgg16_pretrained_keras2.h5', monitor='val_loss', save_best_only=True, verbose=0)
]

In [23]:
#model.fit_generator(train_batches, 
#                    samples_per_epoch=train_batches.n,
#                    validation_data=valid_batches,
#                    nb_val_samples=valid_batches.n,
#                    epochs=1)

model.fit_generator(train_batches, 
                    steps_per_epoch=train_batches.n//batch_size+1,
                    validation_data=valid_batches,
                    validation_steps=valid_batches.n//batch_size+1,
                    epochs=10,
                   callbacks=callbacks)

#model.fit(x_train, y_train, validation_data=(x_valid, y_valid), callbacks=callbacks, batch_size=32, epochs=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f163157d150>

In [30]:
model.fit_generator(train_batches, 
                    steps_per_epoch=train_batches.n//batch_size+1,
                    validation_data=valid_batches,
                    validation_steps=valid_batches.n//batch_size+1,
                    epochs=100,
                   callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


<keras.callbacks.History at 0x7f161f995450>

In [20]:
model.load_weights(save_path + 'model/vgg16_pretrained_keras2.h5', by_name=True)

In [24]:
valid_pred = model.predict(x_valid, verbose=2, batch_size=batch_size)

In [25]:
valid_pred[0] > 0.2

array([False, False, False,  True, False, False,  True,  True, False,
       False, False, False, False, False, False,  True, False], dtype=bool)

In [26]:
y_valid[0]

array([ 0.,  0.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.])

In [27]:
fbeta_score(y_valid, np.array(valid_pred) > 0.2, beta=2, average='samples')

0.88984582660242684

# predict test data

In [25]:
#files = glob.glob(data_path + '../test-jpg/*.jpg')
files = glob.glob(data_path + '../test-jpg-all/*.jpg')
n_test = len(files)
n_test, files[0]

(61191, '/cinc/data/planet_amazon/train-jpg/../test-jpg-all/test_1347.jpg')

In [29]:
def predict_batch(files, start, end):
    files_batch = files[start:end]
    n_batch = len(files_batch)

    x_test = np.empty([n_batch, 224, 224, 3])
    for i in range(n_batch):
        #print i, labels_train['image_name'][i]
        img = load_img(files_batch[i])
        #img = img.resize((224, 224))
        x = img_to_array(img.resize((224, 224)))
        x_test[i] = x

    return model.predict(x_test, batch_size=128, verbose=2)


In [30]:
batch_size = 10240

for i in tqdm(range(len(files) / batch_size + 1)):
    #print i
    start = i*batch_size
    end   = min((i+1)*batch_size, len(files))
    batch_pred = predict_batch(files, start, end)
    if (i == 0):
        test_pred = batch_pred
    else:
        test_pred = np.vstack((test_pred, batch_pred))

100%|██████████| 6/6 [23:38<00:00, 227.21s/it]


In [18]:
#save_array(save_path + 'test_pred_vgg16', test_pred)
test_pred = load_array(save_path + 'test_pred_vgg16')

In [41]:
#test_pred = model.predict_proba(x_test, batch_size=128, verbose=2)
#id_tag

In [19]:
test_pred[0], test_pred[0] > 0.1
#1 in [1,4,7,9]

(array([  3.97341182e-07,   9.98804927e-01,   3.02211731e-03,
          9.99992251e-01,   4.64103589e-10,   2.79540930e-12,
          9.95468814e-03,   3.09968309e-05,   1.20625561e-02,
          8.10933532e-04,   1.28470861e-08,   5.13541345e-05,
          9.87595831e-06,   1.24116792e-04,   1.43487593e-02,
          1.27202459e-03,   1.07647502e-03], dtype=float32),
 array([False,  True, False,  True, False, False, False, False, False,
        False, False, False, False, False, False, False, False], dtype=bool))

In [20]:
# [False, True, True] {0: clear, 1: primary, 2: cloudy} => [primary, cloudy]
def boolsToTags(bools, id_tag_map):
    tags = []
    for i in range(len(bools)):
        #print bools[i]
        if bools[i]:
            tags.append(id_tag_map[i])
    return tags

# only choose the biggest probability for atomespheric tags
def probsToTagsAtom(probs, threshold, id_tag_map):
    tags = []
    atomIndex = -1
    atomProb = 0.0
    for i in range(len(probs)):
        #print bools[i]
        if (i in [1,4,7,9]):
            #print i, probs[i]
            if (atomProb < probs[i]):
                #print 'assign atom'
                atomProb = probs[i]
                atomIndex = i;
        else:
            if probs[i] > threshold:
                tags.append(id_tag_map[i])
    tags.append(id_tag_map[atomIndex])
    return tags

#id_tag[0]
#boolsToTags(test_pred[0] > 0.2, id_tag)
print ' '.join(boolsToTags(test_pred[0] > 0.1, id_tag))
print ' '.join(probsToTagsAtom(test_pred[0], 0.1, id_tag))

clear primary
primary clear


In [21]:
test_tags = []
for i in range(len(test_pred)):
    test_tags.append( ' '.join(probsToTagsAtom(test_pred[i], 0.2, id_tag))  )
test_tags[:5]

['primary clear',
 'primary water cultivation agriculture clear',
 'primary habitation agriculture road clear',
 'primary cultivation habitation agriculture road clear',
 'primary clear']

In [22]:
#df_test = pd.read_csv('/cinc/data/planet_amazon/submission/sample_submission.csv')
df_test = pd.read_csv('/cinc/data/planet_amazon/submission/sample_submission_v2.csv')
#df_test = df_test[0:20522]

In [26]:
df_test['image_name'] = files
df_test['image_name'] = df_test['image_name'].apply(lambda x: x.split('/')[7].split('.')[0])
df_test['tags'] = test_tags

In [27]:
df_test.head()

Unnamed: 0,image_name,tags
0,test_1347,primary clear
1,file_1426,primary water cultivation agriculture clear
2,file_18643,primary habitation agriculture road clear
3,test_8038,primary cultivation habitation agriculture roa...
4,test_17609,primary clear


In [28]:
df_test.to_csv(data_path + '../submission/vgg16_pretrain_patient10_atom_20170521.csv', index=False)