In [7]:
%matplotlib inline

In [8]:
from __future__ import division,print_function

import os, json
import shutil
from glob import glob
import random
import numpy as np
np.set_printoptions(precision=4, linewidth=100)
import pandas as pd
from matplotlib import pyplot as plt

In [9]:
from importlib import reload
import utils; reload(utils)
from utils import plots

In [10]:
from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.utils.data_utils import get_file
from keras.preprocessing import image
from keras.optimizers import SGD, RMSprop, Adam, Nadam


## Create validation set and sample

In [11]:
NB_ROOT = os.getcwd()

In [12]:
DATA_HOME_DIR = os.path.join(NB_ROOT, "data/invasive-species-monitoring")
results_path = os.path.join('/mnt/data/invasive-species-monitoring', 'results/')

In [13]:
%cd $DATA_HOME_DIR
%mkdir -p valid
%mkdir -p results
%mkdir -p sample/train
%mkdir -p sample/test/unknown
%mkdir -p sample/valid
%mkdir -p sample/results
%mkdir -p test/unknown
%mkdir -p /mnt/data/invasive-species-monitoring/results

/mnt/ml/working/fastai-courses/deeplearning1/nbs/data/invasive-species-monitoring


In [14]:
def create_label_dirs(base_dir):
    """
    """
    # Create label directories thant can be recognized by Keras ImageDataGenerator.flow_from_directory    
    labels = ['invasive', 'not_invasive']
    for label in labels:
        try:
            os.makedirs(os.path.join(base_dir, label))
        except OSError as e:
            if e.errno != os.errno.EEXIST:
                raise

create_label_dirs('train')
create_label_dirs('valid')
create_label_dirs('sample/train')
create_label_dirs('sample/valid')

In [15]:
train_labels_csv = pd.read_csv("train_labels.csv")

In [16]:
def label_images_by_dir(base_dir="train"):
    """
    Move images into their label directory so that they can be recognized by
    ImageDataGenerator.flow_from_directory
    """
    for _, row in train_labels_csv.iterrows():
        image_name = "{}.jpg".format(row['name'])
        src_path = os.path.join(base_dir, image_name)
        if row['invasive'] == 1:
            dst_path = os.path.join(base_dir, 'invasive', image_name)
        else:
            dst_path = "train/not_invasive/{}".format(image_name)
        if os.path.exists(src_path):
            shutil.move(src_path, dst_path)
# label_images_by_dir()

In [17]:
def create_valid_set(train_root='train', valid_root="valid", valid_rate=0.1):
    labels = ['invasive', 'not_invasive']
    for label in labels:
        train_label_dir = os.path.join(train_root, label)
        valid_label_dir = os.path.join(valid_root, label)
        files = os.listdir(train_label_dir)
        for file in random.sample(files, k=int(len(files) * valid_rate)):
            shutil.move(os.path.join(train_label_dir, file), valid_label_dir)
# create_valid_set()

In [18]:
def create_sample_set(rate=0.01):
    labels = ['invasive', 'not_invasive', 'unknown']
    for dataset in ['train', 'valid', 'test']:
        if dataset == "valid":
            # use a higher smple rate for the validation dataset
            # because they have fewer items
            sample_rate = rate*5
        else:
            sample_rate = rate
        for label in labels:
            src_dir = os.path.join(dataset, label)
            dst_dir = os.path.join('sample', dataset, label)
            try:
                files = os.listdir(src_dir)
            except FileNotFoundError:
                continue
            for file in random.sample(files, k=int(len(files) * sample_rate)):
                shutil.copy(os.path.join(src_dir, file), dst_dir)

# create_sample_set()

# Build Model

In [19]:
from keras.models import Sequential, Model, load_model
from keras import applications
from keras import optimizers
from keras.layers import Dropout, Flatten, Dense

In [53]:
img_rows, img_cols, img_channel = 224, 224, 3
base_model = applications.VGG16(weights='imagenet', include_top=False, input_shape=(img_channel,img_rows, img_cols))


In [54]:
add_model = Sequential()
add_model.add(Flatten(input_shape=base_model.output_shape[1:]))
add_model.add(Dense(256, activation='relu'))
add_model.add(Dense(2, activation='sigmoid'))

model = Model(inputs=base_model.input, outputs=add_model(base_model.output))
model.compile(loss='binary_crossentropy', optimizer=optimizers.SGD(lr=1e-4, momentum=0.9),
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 3, 224, 224)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 64, 224, 224)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 64, 224, 224)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 64, 112, 112)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 128, 112, 112)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 128, 112, 112)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 128, 56, 56)       0         
__________

# Train

In [23]:
_path = DATA_HOME_DIR
# _path = DATA_HOME_DIR + '/sample' # Only for sample tests!
test_path = os.path.join(DATA_HOME_DIR, 'test')
train_path = os.path.join(_path, 'train')
valid_path = os.path.join(_path, 'valid')
test_path = os.path.join(_path, 'test')

In [22]:
def get_batches(path, gen=image.ImageDataGenerator(), shuffle=True, batch_size=8, target_size=(224,224), class_mode='categorical'):
        """
            Takes the path to a directory, and generates batches of augmented/normalized data. Yields batches indefinitely, in an infinite loop.

            See Keras documentation: https://keras.io/preprocessing/image/
        """
        # 224x224 is the image size used by ImageNet
        return gen.flow_from_directory(path, target_size=target_size,
                class_mode=class_mode, shuffle=shuffle, batch_size=batch_size)

In [30]:
! cd $NB_ROOT
BATCH_SIZE = 32

trans_gen = image.ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1, height_shift_range=0.1,
    horizontal_flip=True,
)
# target_size = (600, 450)
target_size = (224, 224) 
train_batches = get_batches(train_path, batch_size=BATCH_SIZE, target_size=target_size)
valid_batches = get_batches(valid_path, batch_size=BATCH_SIZE*2, target_size=target_size)

Found 2067 images belonging to 2 classes.
Found 228 images belonging to 2 classes.


In [55]:
def fit_model(model, no_of_epochs = 10):
    latest_weights_filename = None
    for epoch in range(no_of_epochs):
        print("Running epoch: %d" % epoch)
        model.fit_generator(
            train_batches, steps_per_epoch=train_batches.samples,
            epochs=no_of_epochs,
            validation_data=valid_batches, validation_steps=valid_batches.samples
        )
        latest_weights_filename = 'ft%d.h5' % epoch
        model.save_weights(os.path.join(results_path, latest_weights_filename))
        print("Completed %s fit operations" % no_of_epochs)

In [56]:
fit_model(model, 10)

Running epoch: 0
Epoch 1/10
  65/2067 [..............................] - ETA: 43:27 - loss: 0.4667 - acc: 0.8459 - val_loss: 0.0000e+00 - val_acc: 0.0000e+00Epoch 2/10
  65/2067 [..............................] - ETA: 42:57 - loss: 0.1538 - acc: 0.9406 - val_loss: 0.0000e+00 - val_acc: 0.0000e+00Epoch 3/10
  65/2067 [..............................] - ETA: 42:56 - loss: 0.0995 - acc: 0.9700 - val_loss: 0.0000e+00 - val_acc: 0.0000e+00Epoch 4/10
  43/2067 [..............................] - ETA: 36:21 - loss: 0.0629 - acc: 0.9767

KeyboardInterrupt: 

# Predict

In [69]:
test_batches = get_batches(
    test_path, batch_size=BATCH_SIZE * 2, target_size=target_size, shuffle=False, class_mode=None)

Found 1531 images belonging to 1 classes.


In [70]:
def predict(model, test_batches):
    return model.predict_generator(test_batches, test_batches.nb_sample)
    

In [71]:
preds = predict(model, test_batches)

In [72]:
sbm = pd.DataFrame(preds, columns=["invasive","not invasive"])
sbm['name'] = [int(f.replace('unknown/', '').replace('.jpg', '')) for f in test_batches.filenames]
sbm = sbm.set_index(['name'])
sbm = sbm.sort_index()

In [73]:
sbm.to_csv('submission.csv', columns=['invasive'])

In [74]:
!kg submit -c invasive-species-monitoring submission.csv

0.95541
