In [37]:
from keras.preprocessing import image
from keras.preprocessing.image import load_img, img_to_array

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Lambda, Flatten, Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import Adam, RMSprop
from keras import regularizers
from keras import applications
from keras.callbacks import EarlyStopping, ModelCheckpoint

import numpy as np
import pandas as pd
import glob
import skimage.io as io
from matplotlib import pyplot as plt
from sklearn.metrics import fbeta_score
from tqdm import tqdm
import time

#import utils; reload(utils)
#from utils import plots

In [2]:
import bcolz
def save_array(fname, arr): c=bcolz.carray(arr, rootdir=fname, mode='w'); c.flush()
def load_array(fname): return bcolz.open(fname)[:]

In [3]:
data_path = '/cinc/data/planet_amazon/train-jpg/'
save_path = '/cinc/data/planet_amazon/save_data/train-jpg/'

# labels

In [4]:
labels_all = pd.read_csv(data_path + '../labels/train_v2.csv')
print labels_all.shape
labels_all.head()

(40479, 2)


Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [5]:
labels_1 = labels_all['tags'].apply(lambda x: x.split(' '))
labels_1.head()

0                                    [haze, primary]
1               [agriculture, clear, primary, water]
2                                   [clear, primary]
3                                   [clear, primary]
4    [agriculture, clear, habitation, primary, road]
Name: tags, dtype: object

In [6]:
tag_count = {}
for line in labels_1:
    for l in line:
        if (tag_count.has_key(l)):
            tag_count[l] = tag_count[l] + 1
        else:
            tag_count[l]= 1
        
tags = tag_count.keys()
n_tags = len(tags)
tags

['slash_burn',
 'clear',
 'blooming',
 'primary',
 'cloudy',
 'conventional_mine',
 'water',
 'haze',
 'cultivation',
 'partly_cloudy',
 'artisinal_mine',
 'habitation',
 'bare_ground',
 'blow_down',
 'agriculture',
 'road',
 'selective_logging']

In [7]:
tag_id = dict([(v, i) for i, v in enumerate(tags)])
id_tag = dict([(i, v) for i, v in enumerate(tags)])
tag_id
#tag_id['primary'], tag_id['clear'], id_tag[4], id_tag[2]

{'agriculture': 14,
 'artisinal_mine': 10,
 'bare_ground': 12,
 'blooming': 2,
 'blow_down': 13,
 'clear': 1,
 'cloudy': 4,
 'conventional_mine': 5,
 'cultivation': 8,
 'habitation': 11,
 'haze': 7,
 'partly_cloudy': 9,
 'primary': 3,
 'road': 15,
 'selective_logging': 16,
 'slash_burn': 0,
 'water': 6}

In [8]:
def tagsToIds(tags):
    ids = []
    for tag in tags.split(' '):
        ids.append(tag_id[tag])
    return ids
tagsToIds('primary clear')

[3, 1]

In [9]:
labels_all['tagIds'] = labels_all['tags'].apply(tagsToIds)
labels_all.head()

Unnamed: 0,image_name,tags,tagIds
0,train_0,haze primary,"[7, 3]"
1,train_1,agriculture clear primary water,"[14, 1, 3, 6]"
2,train_2,clear primary,"[1, 3]"
3,train_3,clear primary,"[1, 3]"
4,train_4,agriculture clear habitation primary road,"[14, 1, 11, 3, 15]"


In [10]:
# [1,3] => [0,1,0,1,0]
def n_hot(labels, size):
    y = np.zeros(size)
    for i in range(len(labels)):
        y[labels[i]] = 1
    return y

# [0, 1, 0, 1, 0] => [1,3]
def reverse_n_hot(n_hot_array):
    indexes = []
    for i in range(len(n_hot_array)):
        if (n_hot_array[i] == 1):
            indexes.append(i)
    return indexes

labels = [1,3]
n_hot_result = n_hot(labels, 5)

print (labels)
print (n_hot_result)
print (reverse_n_hot(n_hot_result))

[1, 3]
[ 0.  1.  0.  1.  0.]
[1, 3]


In [11]:
labels_all['nhot'] = labels_all['tagIds'].apply(lambda x: n_hot(x, len(tag_id)))
labels_all.head()

Unnamed: 0,image_name,tags,tagIds,nhot
0,train_0,haze primary,"[7, 3]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
1,train_1,agriculture clear primary water,"[14, 1, 3, 6]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
2,train_2,clear primary,"[1, 3]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,train_3,clear primary,"[1, 3]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,train_4,agriculture clear habitation primary road,"[14, 1, 11, 3, 15]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [12]:
labels_all['nhot'][1]

array([ 0.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  0.,  0.])

In [13]:
# verify new label, train_26 should be cloudy only
labels_all[labels_all['image_name'] == 'train_26']

Unnamed: 0,image_name,tags,tagIds,nhot
26,train_26,cloudy,[4],"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."


In [14]:
n_samples = labels_all.shape[0]
#n_samples = 100
n_samples

40479

# random split

In [15]:
msk = np.random.rand(n_samples) < 0.9
len(msk), msk[:10]

(40479,
 array([ True,  True,  True,  True,  True,  True, False, False,  True,  True], dtype=bool))

In [17]:
save_array(save_path + 'msk', msk)

In [16]:
msk = load_array(save_path + 'msk')
len(msk), msk[:10]

(40479,
 array([ True,  True, False,  True,  True,  True,  True, False,  True,  True], dtype=bool))

In [17]:
labels_train = labels_all[0:n_samples][msk]
labels_train = labels_train.reset_index()
n_train = labels_train.shape[0]
print labels_train.shape, n_train
labels_train.head()

(36431, 5) 36431


Unnamed: 0,index,image_name,tags,tagIds,nhot
0,0,train_0,haze primary,"[7, 3]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
1,1,train_1,agriculture clear primary water,"[14, 1, 3, 6]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
2,3,train_3,clear primary,"[1, 3]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,4,train_4,agriculture clear habitation primary road,"[14, 1, 11, 3, 15]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,5,train_5,haze primary water,"[7, 3, 6]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, ..."


In [18]:
labels_valid = labels_all[0:n_samples][~msk]
labels_valid = labels_valid.reset_index()
n_valid = labels_valid.shape[0]
print labels_valid.shape, n_valid
labels_valid.head()

(4048, 5) 4048


Unnamed: 0,index,image_name,tags,tagIds,nhot
0,2,train_2,clear primary,"[1, 3]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,7,train_7,haze primary,"[7, 3]","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
2,20,train_20,agriculture clear primary water,"[14, 1, 3, 6]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
3,26,train_26,cloudy,[4],"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
4,31,train_31,clear cultivation primary,"[1, 8, 3]","[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."


In [21]:
x_valid = np.empty([n_valid, 224, 224, 3])
y_valid = np.empty([n_valid, n_tags])

for i in range(n_valid):
    #print i, labels_valid['image_name'][i]
    img = load_img(data_path + labels_valid['image_name'][i] + '.jpg')
    #img = img.resize((224, 224))
    x = img_to_array(img.resize((224, 224)))
    x_valid[i] = x
    y_valid[i] = labels_valid['nhot'][i]
x_valid.shape, y_valid.shape

((3986, 224, 224, 3), (3986, 17))

In [22]:
# save for later
#labels_train.to_csv(save_path + 'labels_train.csv', index=False)
#labels_valid.to_csv(save_path + 'labels_valid.csv', index=False)
save_array(save_path + 'x_valid', x_valid)
save_array(save_path + 'y_valid', y_valid)

In [19]:
# load back
#labels_train = pd.read_csv(save_path + 'labels_train.csv')
#labels_valid = pd.read_csv(save_path + 'labels_valid.csv')
x_valid = load_array(save_path + 'x_valid')
y_valid = load_array(save_path + 'y_valid')
#(labels_train.shape, labels_valid.shape, y_valid.shape, y_valid.shape)
(x_valid.shape, y_valid.shape)

((4048, 224, 224, 3), (4048, 17))

# reuse vgg16 model

In [20]:
# build the VGG16 network
base_model = applications.VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
print('Model loaded.')

for layer in base_model.layers:
    layer.trainable = False

Model loaded.


In [21]:
x = base_model.output
x = Flatten(input_shape=(512, 8, 8))(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
pred_layer = Dense(17, activation='sigmoid')(x)

In [22]:
model = Model(inputs=base_model.input, outputs=pred_layer)

In [23]:
model.compile(Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [24]:
batch_size=128
gen = image.ImageDataGenerator()
#train_batches = gen.flow(x_train, y_train, batch_size=batch_size)
valid_batches = gen.flow(x_valid, y_valid, batch_size=batch_size)

In [35]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, verbose=0),
    ModelCheckpoint(save_path + 'model/vgg16_pretrained_keras2_generator.h5', monitor='val_loss', save_best_only=True, verbose=0)
]

# simple generator

In [41]:
steps_per_epoch=10
def my_generator():
    while(1):
        for i in range(steps_per_epoch):
            #print i
            yield x_train[i*10:(i+1)*10], y_train[i*10:(i+1)*10]

In [43]:
model.fit_generator(my_generator(), 
                    steps_per_epoch=steps_per_epoch,
                    validation_data=valid_batches,
                    validation_steps=valid_batches.n//batch_size+1,
                    epochs=1,
                   callbacks=callbacks)


Epoch 1/1


<keras.callbacks.History at 0x7fdcea564890>

# randomized generator

In [26]:
#x = np.random.shuffle(range(10))
x = [i for i in range(n_train)]
np.random.shuffle(x)
type(x), len(x), x[0:10], np.sort(x)[0:10]

(list,
 36431,
 [31327, 10205, 575, 13628, 17171, 18384, 16619, 9618, 24877, 21794],
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

In [27]:
print labels_train['image_name'][x][0:10]

31327    train_34822
10205    train_11362
575        train_656
13628    train_15143
17171    train_19071
18384    train_20419
16619    train_18445
9618     train_10713
24877    train_27676
21794    train_24233
Name: image_name, dtype: object


In [47]:
#batch_size = 10
def my_generator(n_train):
    while(1):
        shuffle_index = [i for i in range(n_train)]
        np.random.shuffle(shuffle_index)
        #print shuffle_index[0:10]

        for i in range(n_train // batch_size):
            start = i*batch_size
            end   = min((i+1)*batch_size, n_train)
            if (start == end):
                continue
            #print i, start, end

            x_batch = np.empty([end-start, 224, 224, 3])
            y_batch = np.empty([end-start, n_tags])

            for i in range(end-start):
                #print start+i, shuffle_index[start+i]
                index = shuffle_index[start+i]
                img = load_img(data_path + labels_train['image_name'][index] + '.jpg')
                x = img_to_array(img.resize((224, 224)))
                x_batch[i] = x
                y_batch[i] = labels_train['nhot'][index]
            #print x_batch.shape, y_batch.shape
            time.sleep(2)
            yield x_batch, y_batch

In [48]:
#my_generator(n_train)

In [49]:
model.fit_generator(my_generator(n_train), 
                    steps_per_epoch=n_train // batch_size,
                    validation_data=valid_batches,
                    validation_steps=valid_batches.n//batch_size+1,
                    epochs=100,
                   callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

KeyboardInterrupt: 

In [32]:
model.optimizer = Adam(lr=0.0001)
#model.optimizer.lr.get_value()

In [29]:
model.fit_generator(my_generator(n_train), 
                    steps_per_epoch=n_train // batch_size,
                    validation_data=valid_batches,
                    validation_steps=valid_batches.n//batch_size+1,
                    epochs=100,
                   callbacks=callbacks)

In [50]:
model.load_weights(save_path + 'model/vgg16_pretrained_keras2_generator.h5', by_name=True)

In [51]:
valid_pred = model.predict(x_valid, verbose=2, batch_size=batch_size)

In [52]:
valid_pred[0] > 0.2

array([False,  True, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False], dtype=bool)

In [53]:
y_valid[0]

array([ 0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.])

In [54]:
fbeta_score(y_valid, np.array(valid_pred) > 0.2, beta=2, average='samples')

0.89914780535998629

# predict test data

In [55]:
#files = glob.glob(data_path + '../test-jpg/*.jpg')
files = glob.glob(data_path + '../test-jpg-all/*.jpg')
n_test = len(files)
n_test, files[0]

(61191, '/cinc/data/planet_amazon/train-jpg/../test-jpg-all/test_1347.jpg')

In [56]:
def predict_batch(files, start, end):
    files_batch = files[start:end]
    n_batch = len(files_batch)

    x_test = np.empty([n_batch, 224, 224, 3])
    for i in range(n_batch):
        #print i, labels_train['image_name'][i]
        img = load_img(files_batch[i])
        #img = img.resize((224, 224))
        x = img_to_array(img.resize((224, 224)))
        x_test[i] = x

    return model.predict(x_test, batch_size=128, verbose=2)


In [57]:
batch_size = 10240

for i in tqdm(range(len(files) / batch_size + 1)):
    #print i
    start = i*batch_size
    end   = min((i+1)*batch_size, len(files))
    batch_pred = predict_batch(files, start, end)
    if (i == 0):
        test_pred = batch_pred
    else:
        test_pred = np.vstack((test_pred, batch_pred))

100%|██████████| 6/6 [24:07<00:00, 240.35s/it]


In [40]:
#save_array(save_path + 'test_pred', test_pred)
#test_pred = load_array(save_path + 'test_pred')

In [41]:
#test_pred = model.predict_proba(x_test, batch_size=128, verbose=2)
#id_tag

In [58]:
test_pred[0], test_pred[0] > 0.1
#1 in [1,4,7,9]

(array([  6.01787994e-11,   9.99803722e-01,   3.65002011e-03,
          1.00000000e+00,   1.96093117e-16,   3.97735387e-27,
          7.55721296e-04,   1.65932452e-05,   3.37057543e-04,
          5.67284042e-06,   1.73850018e-18,   5.59940281e-06,
          1.47458323e-07,   1.10273286e-05,   3.33831471e-04,
          1.08822991e-04,   1.76604808e-04], dtype=float32),
 array([False,  True, False,  True, False, False, False, False, False,
        False, False, False, False, False, False, False, False], dtype=bool))

In [59]:
# [False, True, True] {0: clear, 1: primary, 2: cloudy} => [primary, cloudy]
def boolsToTags(bools, id_tag_map):
    tags = []
    for i in range(len(bools)):
        #print bools[i]
        if bools[i]:
            tags.append(id_tag_map[i])
    return tags

# only choose the biggest probability for atomespheric tags
def probsToTagsAtom(probs, threshold, id_tag_map):
    tags = []
    atomIndex = -1
    atomProb = 0.0
    for i in range(len(probs)):
        #print bools[i]
        if (i in [1,4,7,9]):
            #print i, probs[i]
            if (atomProb < probs[i]):
                #print 'assign atom'
                atomProb = probs[i]
                atomIndex = i;
        else:
            if probs[i] > threshold:
                tags.append(id_tag_map[i])
    tags.append(id_tag_map[atomIndex])
    return tags

#id_tag[0]
#boolsToTags(test_pred[0] > 0.2, id_tag)
print ' '.join(boolsToTags(test_pred[0] > 0.1, id_tag))
print ' '.join(probsToTagsAtom(test_pred[0], 0.1, id_tag))

clear primary
primary clear


In [60]:
test_tags = []
for i in range(len(test_pred)):
    test_tags.append( ' '.join(probsToTagsAtom(test_pred[i], 0.2, id_tag))  )
test_tags[:5]

['primary clear',
 'primary agriculture haze',
 'primary habitation agriculture road clear',
 'primary cultivation agriculture clear',
 'primary clear']

In [61]:
#df_test = pd.read_csv('/cinc/data/planet_amazon/submission/sample_submission.csv')
df_test = pd.read_csv('/cinc/data/planet_amazon/submission/sample_submission_v2.csv')
#df_test = df_test[0:20522]

In [62]:
df_test['image_name'] = files
df_test['image_name'] = df_test['image_name'].apply(lambda x: x.split('/')[7].split('.')[0])
df_test['tags'] = test_tags

In [63]:
df_test.head()

Unnamed: 0,image_name,tags
0,test_1347,primary clear
1,file_1426,primary agriculture haze
2,file_18643,primary habitation agriculture road clear
3,test_8038,primary cultivation agriculture clear
4,test_17609,primary clear


In [64]:
df_test.to_csv(data_path + '../submission/vgg16_pretrain_generator_20170618.csv', index=False)