In [1]:
%pwd

'/home/chekalin/Dev/fast-ai-homework/seedling-classification'

In [2]:
HOME = '/home/chekalin/Dev/fast-ai-homework/seedling-classification'

In [3]:
DATA_HOME = HOME + '/data'

In [4]:
import pandas as pd
import os
from glob import glob
import numpy as np
from shutil import copyfile

# Prepare data

In [5]:
%cd $DATA_HOME

/home/chekalin/Dev/fast-ai-homework/seedling-classification/data


In [6]:
%rm -rf test/ train/ valid/ results/ sample/ sample_submission.csv

In [7]:
!unzip -q '*.zip' 


3 archives were successfully processed.


In [8]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission.head()

Unnamed: 0,file,species
0,0021e90e4.png,Sugar beet
1,003d61042.png,Sugar beet
2,007b3da8b.png,Sugar beet
3,0086a6340.png,Sugar beet
4,00c47e980.png,Sugar beet


In [9]:
categories = os.listdir(DATA_HOME + '/train')

In [10]:
for category in categories: 
    %mkdir -p "$DATA_HOME/valid/$category"
    %mkdir -p "$DATA_HOME/sample/valid/$category"
    %mkdir -p "$DATA_HOME/sample/train/$category"

In [11]:
%mkdir -p "$DATA_HOME/results"
%mkdir -p "$DATA_HOME/sample/results"
%mkdir -p "$DATA_HOME/test/unknown"
%mkdir -p "$DATA_HOME/sample/test/unknown"

In [12]:
% cd $DATA_HOME/test
!echo *.png | xargs mv -t unknown/

/home/chekalin/Dev/fast-ai-homework/seedling-classification/data/test


In [13]:
%cd $DATA_HOME/train
training_files = glob('*/*.png')

/home/chekalin/Dev/fast-ai-homework/seedling-classification/data/train


In [14]:
validation_set_size = int(len(training_files) * .3)

shuffled_training_files = np.random.permutation(training_files).tolist()
for i in range(validation_set_size): 
    random_file = shuffled_training_files.pop()
    os.rename(random_file, DATA_HOME + '/valid/' + random_file)

sample_training_set_size = 1000
for i in range(sample_training_set_size): 
    random_file = shuffled_training_files.pop()
    copyfile(random_file, DATA_HOME + '/sample/train/' + random_file)

sample_validation_set_size = 100
for i in range(sample_validation_set_size): 
    random_file = shuffled_training_files.pop()
    copyfile(random_file, DATA_HOME + '/sample/valid/' + random_file)

In [15]:
%cd $DATA_HOME/test
test_files = glob('*/*.png')

/home/chekalin/Dev/fast-ai-homework/seedling-classification/data/test


In [16]:
sample_test_set_size = 100
for filename in test_files[:sample_test_set_size]: copyfile(filename, DATA_HOME + '/sample/test/' + filename)

In [17]:
%cd $DATA_HOME
for dirpath, dirnames, filenames in os.walk(DATA_HOME):
    relative = dirpath[len(os.getcwd()):]
    if (len(filenames) and len(relative)): print("Files in {}: {}".format(relative, len(filenames)))

/home/chekalin/Dev/fast-ai-homework/seedling-classification/data
Files in /sample/test/unknown: 100
Files in /sample/valid/Fat Hen: 14
Files in /sample/valid/Black-grass: 10
Files in /sample/valid/Charlock: 3
Files in /sample/valid/Common Chickweed: 13
Files in /sample/valid/Loose Silky-bent: 18
Files in /sample/valid/Maize: 4
Files in /sample/valid/Scentless Mayweed: 12
Files in /sample/valid/Small-flowered Cranesbill: 7
Files in /sample/valid/Sugar beet: 11
Files in /sample/valid/Common wheat: 2
Files in /sample/valid/Cleavers: 2
Files in /sample/valid/Shepherds Purse: 4
Files in /sample/train/Fat Hen: 102
Files in /sample/train/Black-grass: 56
Files in /sample/train/Charlock: 78
Files in /sample/train/Common Chickweed: 121
Files in /sample/train/Loose Silky-bent: 138
Files in /sample/train/Maize: 43
Files in /sample/train/Scentless Mayweed: 106
Files in /sample/train/Small-flowered Cranesbill: 107
Files in /sample/train/Sugar beet: 94
Files in /sample/train/Common wheat: 52
Files in

# Create model

In [18]:
import tensorflow as tf
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

from keras import backend as K
K.set_session(sess)

Using TensorFlow backend.


In [19]:
import numpy as np
np.set_printoptions(precision=4, linewidth=100)

from keras.models import Model, Sequential
from keras.layers import Input, GlobalAveragePooling2D, Flatten, Dense, Dropout, Conv2D, MaxPooling2D, ZeroPadding2D, BatchNormalization
from keras.applications.vgg16 import VGG16
from keras.applications.xception import Xception
from keras.optimizers import SGD, RMSprop, Adam
from keras.utils.data_utils import get_file

def addConvBlock(x, layers, filters):
    for i in range(layers):
        x = Conv2D(filters, (3, 3), padding='same', activation='relu')(x)
        x = BatchNormalization(axis=1)(x)
    return MaxPooling2D(strides=(2, 2))(x)

def add_top(x):
    x = Flatten()(x)
    x = Dense(4096, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    x = Dense(4096, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    return Dense(12, activation='softmax')(x)

def vgg_16_from_scatch():
    inputs = Input(shape=(224,224,3))
    x = BatchNormalization(axis=1)(inputs)
    x = addConvBlock(x, 2, 64)
    x = addConvBlock(x, 2, 128)
    x = addConvBlock(x, 2, 256)
    x = addConvBlock(x, 2, 512)
    x = addConvBlock(x, 2, 512)
    x = add_top(x)
    return Model(inputs, x)
    
# model = vgg_16_from_scatch()  
# vgg16 = VGG16(include_top=False, weights="imagenet", input_shape = (224,224,3))
# for layer in vgg16.layers: layer.trainable = False

# Xception().summary()
xception = Xception(include_top=False, weights="imagenet", input_shape = (299,299,3))
# for layer in xception.layers: layer.trainable = False
x = xception.output
x = GlobalAveragePooling2D(name='avg_pool')(x)
x = Dropout(0.5)(x)
x = Dense(12, activation='softmax', name='predictions')(x)

model = Model(xception.input, x)
model.compile(Adam(decay=1e-6), loss='categorical_crossentropy', metrics=['accuracy'])

In [20]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 299, 299, 3)  0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 149, 149, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
block1_conv1_bn (BatchNormaliza (None, 149, 149, 32) 128         block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_conv1_act (Activation)   (None, 149, 149, 32) 0           block1_conv1_bn[0][0]            
__________________________________________________________________________________________________
block1_con

# Train model

In [21]:
# path = DATA_HOME + '/sample'
path = DATA_HOME
batch_size = 16

In [22]:
from keras.preprocessing import image

def get_batches(path, gen=image.ImageDataGenerator(), shuffle=True, batch_size=8, class_mode='categorical'):
        return gen.flow_from_directory(path, 
                                       target_size=(299,299),
                                       class_mode=class_mode, 
                                       shuffle=shuffle, 
                                       batch_size=batch_size)

In [23]:
gen = image.ImageDataGenerator(rotation_range=180, horizontal_flip=True, vertical_flip=True)

In [24]:
batches = get_batches(path + '/train', batch_size=batch_size, gen=gen)
val_batches = get_batches(path + '/valid', batch_size=batch_size, gen=gen)

Found 3325 images belonging to 12 classes.
Found 1425 images belonging to 12 classes.


In [25]:
from keras.callbacks import ModelCheckpoint

modelsave = ModelCheckpoint(filepath='seedlings-xception.h5', save_best_only=True)
model.fit_generator(batches, epochs=1, validation_data=val_batches, callbacks=[modelsave])

Epoch 1/1


<keras.callbacks.History at 0x7fccc11c22b0>

In [26]:
val_batches = get_batches(path + '/valid', batch_size=batch_size, shuffle=False)

Found 1425 images belonging to 12 classes.


In [27]:
predictions = model.predict_generator(val_batches)

In [28]:
predicted_classes = np.argmax(predictions, axis = 1)
actual_classes = val_batches.classes

In [29]:
from sklearn.metrics import precision_score, log_loss, recall_score, f1_score

precision = precision_score(actual_classes, predicted_classes, average='micro');
recall = recall_score(actual_classes, predicted_classes, average='micro')
f1 = (2 * precision * recall) / (precision + recall)
f1_calculated = f1_score(actual_classes, predicted_classes, average='micro')

print('Precision score: {:.4f}%'.format(precision))
print('Recall score: {:.4f}%'.format(recall))
print('F1: {:.4f}%'.format(f1))
print('F1 calculated: {:.4f}%'.format(f1_calculated))

Precision score: 0.6821%
Recall score: 0.6821%
F1: 0.6821%
F1 calculated: 0.6821%


In [30]:
idxs = np.where(actual_classes == predicted_classes)[0]
print('{} correct predictions found'.format(len(idxs)))


972 correct predictions found


# Run prediction and prepare submission

In [31]:
test_batches = get_batches(path + '/test', batch_size=batch_size, shuffle=False)

Found 794 images belonging to 1 classes.


In [32]:
predictions = model.predict_generator(test_batches)

In [33]:
import csv

results_file = path + '/results/submission.csv'
file = open(results_file, "w")
writer = csv.writer(file, delimiter=',', quoting=csv.QUOTE_NONE)
header = ['file','species']
writer.writerow(header)

predicted_classes = np.argmax(predictions, axis = 1)


for filename, prediction in zip(test_batches.filenames, predicted_classes):
    writer.writerow([filename[len('unknown/'):]] + [sorted(categories)[prediction]])

file.close()

In [34]:
from IPython.display import FileLink
%cd $HOME
relative_path_to_results = results_file.replace(HOME, '')[1:]
FileLink(relative_path_to_results)

/home/chekalin/Dev/fast-ai-homework/seedling-classification
