In [None]:
%pwd

In [None]:
HOME = '/home/stanislav/Dev/fast-ai-homework/seedling-classification'

In [None]:
DATA_HOME = HOME + '/data'

In [None]:
import pandas as pd
import os
from glob import glob
import numpy as np
from shutil import copyfile

# Prepare data

In [None]:
%cd $DATA_HOME

In [None]:
%rm -rf test/ train/ valid/ results/ sample/ sample_submission.csv

In [None]:
!unzip test.zip && unzip train.zip && unzip sample_submission.csv.zip

In [None]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission.head()

In [None]:
categories = os.listdir(DATA_HOME + '/train')

In [None]:
for category in categories: 
    %mkdir -p "$DATA_HOME/valid/$category"
    %mkdir -p "$DATA_HOME/sample/valid/$category"
    %mkdir -p "$DATA_HOME/sample/train/$category"

In [None]:
%mkdir -p "$DATA_HOME/results"
%mkdir -p "$DATA_HOME/sample/results"
%mkdir -p "$DATA_HOME/test/unknown"
%mkdir -p "$DATA_HOME/sample/test/unknown"

In [None]:
% cd $DATA_HOME/test
!echo *.png | xargs mv -t unknown/

In [None]:
%cd $DATA_HOME/train
training_files = glob('*/*.png')

In [None]:
validation_set_size = int(len(training_files) * .3)

shuffled_training_files = np.random.permutation(training_files).tolist()
for i in range(validation_set_size): 
    random_file = shuffled_training_files.pop()
    os.rename(random_file, DATA_HOME + '/valid/' + random_file)

sample_training_set_size = 1000
for i in range(sample_training_set_size): 
    random_file = shuffled_training_files.pop()
    copyfile(random_file, DATA_HOME + '/sample/train/' + random_file)

sample_validation_set_size = 100
for i in range(sample_validation_set_size): 
    random_file = shuffled_training_files.pop()
    copyfile(random_file, DATA_HOME + '/sample/valid/' + random_file)

In [None]:
%cd $DATA_HOME/test
test_files = glob('*/*.png')

In [None]:
sample_test_set_size = 100
for filename in test_files[:sample_test_set_size]: copyfile(filename, DATA_HOME + '/sample/test/' + filename)

In [None]:
%cd $DATA_HOME
for dirpath, dirnames, filenames in os.walk(DATA_HOME):
    relative = dirpath[len(os.getcwd()):]
    if (len(filenames) and len(relative)): print("Files in {}: {}".format(relative, len(filenames)))

# Create model

In [None]:
import tensorflow as tf
sess = tf.Session()

from keras import backend as K
K.set_session(sess)

In [None]:
import numpy as np
np.set_printoptions(precision=4, linewidth=100)

from keras.models import Model, Sequential
from keras.layers import Input, GlobalAveragePooling2D, Flatten, Dense, Dropout, Conv2D, MaxPooling2D, ZeroPadding2D, BatchNormalization
from keras.applications.vgg16 import VGG16
from keras.applications.xception import Xception
from keras.optimizers import SGD, RMSprop, Adam
from keras.utils.data_utils import get_file

def addConvBlock(x, layers, filters):
    for i in range(layers):
        x = Conv2D(filters, (3, 3), padding='same', activation='relu')(x)
        x = BatchNormalization(axis=1)(x)
    return MaxPooling2D(strides=(2, 2))(x)

def add_top(x):
    x = Flatten()(x)
    x = Dense(4096, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    x = Dense(4096, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    return Dense(12, activation='softmax')(x)

def vgg_16_from_scatch():
    inputs = Input(shape=(224,224,3))
    x = BatchNormalization(axis=1)(inputs)
    x = addConvBlock(x, 2, 64)
    x = addConvBlock(x, 2, 128)
    x = addConvBlock(x, 2, 256)
    x = addConvBlock(x, 2, 512)
    x = addConvBlock(x, 2, 512)
    x = add_top(x)
    return Model(inputs, x)
    
# model = vgg_16_from_scatch()  
# vgg16 = VGG16(include_top=False, weights="imagenet", input_shape = (224,224,3))
# for layer in vgg16.layers: layer.trainable = False

# Xception().summary()
xception = Xception(include_top=False, weights="imagenet", input_shape = (299,299,3))
# for layer in xception.layers: layer.trainable = False
x = xception.output
x = GlobalAveragePooling2D(name='avg_pool')(x)
x = Dropout(0.5)(x)
x = Dense(12, activation='softmax', name='predictions')(x)

model = Model(xception.input, x)
model.compile(Adam(decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
model.summary()

# Train model

In [None]:
# path = DATA_HOME + '/sample'
path = DATA_HOME
batch_size = 16

In [None]:
from keras.preprocessing import image

def get_batches(path, gen=image.ImageDataGenerator(), shuffle=True, batch_size=8, class_mode='categorical'):
        return gen.flow_from_directory(path, 
                                       target_size=(299,299),
                                       class_mode=class_mode, 
                                       shuffle=shuffle, 
                                       batch_size=batch_size)

In [None]:
gen = image.ImageDataGenerator(rotation_range=180, horizontal_flip=True, vertical_flip=True)

In [None]:
batches = get_batches(path + '/train', batch_size=batch_size, gen=gen)
val_batches = get_batches(path + '/valid', batch_size=batch_size, gen=gen)

In [19]:
model.load_weights("seedlings-xception.h5")

In [None]:
from keras.callbacks import ModelCheckpoint

modelsave = ModelCheckpoint(filepath='seedlings-xception.h5', save_best_only=True)
model.fit_generator(batches, epochs=600, validation_data=val_batches, callbacks=[modelsave])

In [20]:
val_batches = get_batches(path + '/valid', batch_size=batch_size, shuffle=False)

Found 1425 images belonging to 12 classes.


In [21]:
predictions = model.predict_generator(val_batches)

In [22]:
predicted_classes = np.argmax(predictions, axis = 1)
actual_classes = val_batches.classes

In [23]:
from sklearn.metrics import precision_score, log_loss, recall_score, f1_score

precision = precision_score(actual_classes, predicted_classes, average='micro');
recall = recall_score(actual_classes, predicted_classes, average='micro')
f1 = (2 * precision * recall) / (precision + recall)
f1_calculated = f1_score(actual_classes, predicted_classes, average='micro')

print('Precision score: {:.4f}%'.format(precision))
print('Recall score: {:.4f}%'.format(recall))
print('F1: {:.4f}%'.format(f1))
print('F1 calculated: {:.4f}%'.format(f1_calculated))

Precision score: 0.9796%
Recall score: 0.9796%
F1: 0.9796%
F1 calculated: 0.9796%


In [24]:
idxs = np.where(actual_classes == predicted_classes)[0]
print('{} correct predictions found'.format(len(idxs)))


1396 correct predictions found


# Run prediction and prepare submission

In [25]:
test_batches = get_batches(path + '/test', batch_size=batch_size, shuffle=False)

Found 794 images belonging to 1 classes.


In [26]:
predictions = model.predict_generator(test_batches)

In [27]:
import csv

results_file = path + '/results/submission.csv'
file = open(results_file, "w")
writer = csv.writer(file, delimiter=',', quoting=csv.QUOTE_NONE)
header = ['file','species']
writer.writerow(header)

predicted_classes = np.argmax(predictions, axis = 1)


for filename, prediction in zip(test_batches.filenames, predicted_classes):
    writer.writerow([filename[len('unknown/'):]] + [sorted(categories)[prediction]])

file.close()

In [28]:
from IPython.display import FileLink
%cd $HOME
relative_path_to_results = results_file.replace(HOME, '')[1:]
FileLink(relative_path_to_results)

/home/stanislav/Dev/fast-ai-homework/seedling-classification
