In [1]:
%pwd

u'/home/chekalin/Dev/fast-ai-homework/seedling-classification'

In [2]:
HOME = '/home/chekalin/Dev/fast-ai-homework/seedling-classification'

In [3]:
DATA_HOME = HOME + '/data'

In [4]:
import pandas as pd
import os
from glob import glob
import numpy as np
from shutil import copyfile

# Prepare data

In [None]:
%cd $DATA_HOME

In [None]:
%rm -rf test/ train/ valid/ results/ sample/ sample_submission.csv

In [None]:
!unzip test.zip && unzip train.zip && unzip sample_submission.csv.zip

In [None]:
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission.head()

In [123]:
categories = os.listdir(DATA_HOME + '/train')

In [None]:
for category in categories: 
    %mkdir -p "$DATA_HOME/valid/$category"
    %mkdir -p "$DATA_HOME/sample/valid/$category"
    %mkdir -p "$DATA_HOME/sample/train/$category"

In [None]:
%mkdir -p "$DATA_HOME/results"
%mkdir -p "$DATA_HOME/sample/results"
%mkdir -p "$DATA_HOME/test/unknown"
%mkdir -p "$DATA_HOME/sample/test/unknown"

In [None]:
% cd $DATA_HOME/test
!echo *.png | xargs mv -t unknown/

In [None]:
% cd $DATA_HOME
!tree -d

In [None]:
%cd $DATA_HOME/train
training_files = glob('*/*.png')

In [None]:
validation_set_size = int(len(training_files) * .1)

shuffled_training_files = np.random.permutation(training_files).tolist()
for i in range(validation_set_size): 
    random_file = shuffled_training_files.pop()
    os.rename(random_file, DATA_HOME + '/valid/' + random_file)

sample_training_set_size = 1000
for i in range(sample_training_set_size): 
    random_file = shuffled_training_files.pop()
    copyfile(random_file, DATA_HOME + '/sample/train/' + random_file)

sample_validation_set_size = 1000
for i in range(sample_validation_set_size): 
    random_file = shuffled_training_files.pop()
    copyfile(random_file, DATA_HOME + '/sample/valid/' + random_file)

In [None]:
%cd $DATA_HOME/test
test_files = glob('*/*.png')

In [None]:
sample_test_set_size = 100
for filename in test_files[:sample_test_set_size]: copyfile(filename, DATA_HOME + '/sample/test/' + filename)

In [None]:
%cd $DATA_HOME
for dirpath, dirnames, filenames in os.walk(DATA_HOME):
    relative = dirpath[len(os.getcwd()):]
    if (len(filenames) and len(relative)): print "Files in {}: {}".format(relative, len(filenames))

# Create model

In [52]:
import numpy as np
np.set_printoptions(precision=4, linewidth=100)

from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
from keras.utils.data_utils import get_file

def addConvBlock(model, layers, filters):
    for i in range(layers):
        model.add(ZeroPadding2D((1, 1)))
        model.add(Convolution2D(filters, 3, 3, activation='relu'))
        model.add(BatchNormalization(axis=1))
    model.add(MaxPooling2D((2, 2), strides=(2, 2)))
    
model = Sequential()
model.add(BatchNormalization(axis=1, input_shape=(3,224,224)))
addConvBlock(model, 2, 64)
addConvBlock(model, 2, 128)
addConvBlock(model, 3, 256)
addConvBlock(model, 3, 512)
addConvBlock(model, 3, 512)
model.add(Flatten())
model.add(Dense(4096, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(4096, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(12, activation='softmax'))

model.compile(Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

In [53]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
batchnormalization_33 (BatchNorm (None, 3, 224, 224)   12          batchnormalization_input_3[0][0] 
____________________________________________________________________________________________________
zeropadding2d_27 (ZeroPadding2D) (None, 3, 226, 226)   0           batchnormalization_33[0][0]      
____________________________________________________________________________________________________
convolution2d_27 (Convolution2D) (None, 64, 224, 224)  1792        zeropadding2d_27[0][0]           
____________________________________________________________________________________________________
batchnormalization_34 (BatchNorm (None, 64, 224, 224)  256         convolution2d_27[0][0]           
___________________________________________________________________________________________

batchnormalization_45 (BatchNorm (None, 512, 14, 14)   2048        convolution2d_38[0][0]           
____________________________________________________________________________________________________
zeropadding2d_39 (ZeroPadding2D) (None, 512, 16, 16)   0           batchnormalization_45[0][0]      
____________________________________________________________________________________________________
convolution2d_39 (Convolution2D) (None, 512, 14, 14)   2359808     zeropadding2d_39[0][0]           
____________________________________________________________________________________________________
batchnormalization_46 (BatchNorm (None, 512, 14, 14)   2048        convolution2d_39[0][0]           
____________________________________________________________________________________________________
maxpooling2d_15 (MaxPooling2D)   (None, 512, 7, 7)     0           batchnormalization_46[0][0]      
___________________________________________________________________________________________

# Train model

In [54]:
# path = DATA_HOME + '/sample'
path = DATA_HOME
batch_size = 16

In [55]:
from keras.preprocessing import image

def get_batches(path, gen=image.ImageDataGenerator(), shuffle=True, batch_size=8, class_mode='categorical'):
        return gen.flow_from_directory(path, 
                                       target_size=(224,224),
                                       class_mode=class_mode, 
                                       shuffle=shuffle, 
                                       batch_size=batch_size)

In [56]:
gen = image.ImageDataGenerator(rotation_range=15, width_shift_range=0.1, 
                               height_shift_range=0.1, zoom_range=0.1, horizontal_flip=True)

In [57]:
batches = get_batches(path + '/train', batch_size=batch_size, gen=gen)
val_batches = get_batches(path + '/valid', batch_size=batch_size)

Found 4275 images belonging to 12 classes.
Found 475 images belonging to 12 classes.


In [58]:
import time

fitting_start_time = time.time()
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=50, 
                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample)
model.save_weights('weights-50-epochs.h5')
print "fitting 50 epochs took {} seconds".format(int(time.time() - fitting_start_time))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
fitting 50 epochs took 13956 seconds


In [59]:
fitting_start_time = time.time()
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=50, 
                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample)
model.save_weights('weights-100-epochs.h5')
print "fitting 50 took {} seconds".format(int(time.time() - fitting_start_time))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
fitting 50 took 13957 seconds


In [63]:
model.optimizer.lr=0.00001

In [64]:
model.load_weights('weights-100-epochs.h5')
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=10, 
                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample)
model.save_weights('weights-110-epochs.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [65]:
model.optimizer.lr=0.0001

In [66]:
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=10, 
                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample)
model.save_weights('weights-120-epochs.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [73]:
model.optimizer.lr=0.00001

In [74]:
model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=100, 
                    validation_data=val_batches, nb_val_samples=val_batches.nb_sample)
model.save_weights('weights-220-epochs.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100


Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [113]:
val_batches = get_batches(path + '/valid', batch_size=batch_size, shuffle=False)

Found 475 images belonging to 12 classes.


In [114]:
predictions = model.predict_generator(val_batches, val_batches.nb_sample)

In [115]:
predicted_classes = np.argmax(predictions, axis = 1)
actual_classes = val_batches.classes

In [116]:
from sklearn.metrics import precision_score, log_loss, recall_score, f1_score

precision = precision_score(actual_classes, predicted_classes, average='micro');
recall = recall_score(actual_classes, predicted_classes, average='micro')
f1 = (2 * precision * recall) / (precision + recall)
f1_calculated = f1_score(actual_classes, predicted_classes, average='micro')

print 'Precision score: {:.4f}%'.format(precision)
print 'Recall score: {:.4f}%'.format(recall)
print 'F1: {:.4f}%'.format(f1)
print 'F1 calculated: {:.4f}%'.format(f1_calculated)

Precision score: 0.8800%
Recall score: 0.8800%
F1: 0.8800%
F1 calculated: 0.8800%


In [117]:
idxs = np.where(actual_classes == predicted_classes)[0]
print '{} correct predictions found'.format(len(idxs))


418 correct predictions found


# Run prediction and prepare submission

In [118]:
test_batches = get_batches(path + '/test', batch_size=batch_size, shuffle=False)

Found 794 images belonging to 1 classes.


In [119]:
predictions = model.predict_generator(test_batches, test_batches.nb_sample)

12

In [127]:
import csv

results_file = path + '/results/submission.csv'
file = open(results_file, "wb")
writer = csv.writer(file, delimiter=',', quoting=csv.QUOTE_NONE)
header = ['file','species']
writer.writerow(header)

predicted_classes = np.argmax(predictions, axis = 1)


for filename, prediction in zip(test_batches.filenames, predicted_classes):
    writer.writerow([filename[len('unknown/'):]] + [sorted(categories)[prediction]])

file.close()

In [128]:
from IPython.display import FileLink
%cd $HOME
relative_path_to_results = results_file.replace(HOME, '')[1:]
FileLink(relative_path_to_results)

/home/chekalin/Dev/fast-ai-homework/seedling-classification
