# Fish Monitoring

In [1]:
%matplotlib inline
import utils; reload(utils)
from utils import *

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
Using Theano backend.


In [5]:
# home directory = wherever the notebook kernel was started;
# data directory = the fish-monitoring dataset folder beneath it (trailing slash kept)
LESSON_HOME_DIR = os.getcwd()
DATA_HOME_DIR = '{}{}'.format(LESSON_HOME_DIR, '/data/fish-monitoring/')

In [10]:
# `path` is the dataset root; `test_path` is its test_stg1/ sub-directory
path = DATA_HOME_DIR
test_path = path + 'test_stg1/'

In [2]:
# number of images per batch; passed as batch_size= to vgg.get_batches and vgg.test
batch_size = 64

**Action Plan:**

1. Create validation set and sample
2. Move to separate dirs for each set
3. Finetune and train
4. Generate Predictions
5. Submit results to Kaggle

## Create validation set and sample

In [93]:
from __future__ import division,print_function

import os, json
from glob import glob
from shutil import copyfile
from sklearn.metrics import log_loss
import numpy as np
import pandas as pd
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt

In [4]:
# check present directory (prints the kernel's current working directory)
%pwd

u'/home/ubuntu/nbs'

In [6]:
# make root directories for train, test, valid and result
%cd $DATA_HOME_DIR
%mkdir valid
%mkdir results
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/results

/home/ubuntu/nbs/data/fish-monitoring
mkdir: cannot create directory ‘results’: File exists


In [7]:
# make the train directory under the data root the current working directory
%cd $DATA_HOME_DIR/train

/home/ubuntu/nbs/data/fish-monitoring/train


## Move to separate dirs for each set

In [8]:
# grab jpeg files from train class directories and move 20% to validation class directories
# (train/<class>/x.jpg -> valid/<class>/x.jpg).
# Paths are built from DATA_HOME_DIR, so the cell does not depend on the
# kernel's current working directory.
# NOTE: not idempotent -- every run moves a further 20% of whatever is still in train/.
train_home = DATA_HOME_DIR + 'train/'
valid_home = DATA_HOME_DIR + 'valid/'

# only the immediate sub-directories of train/ are class directories, so list
# them directly instead of walking the whole tree
class_names = sorted(d for d in os.listdir(train_home)
                     if os.path.isdir(train_home + d))

for class_name in class_names:

    # grab all jpgs from the class directory
    jpg_paths = glob(train_home + class_name + '/*.jpg')
    num_images = int(len(jpg_paths)*0.2)

    # create the validation class directory once, and only if something will be moved
    valid_class_dir = valid_home + class_name + '/'
    if num_images and not os.path.exists(valid_class_dir):
        os.mkdir(valid_class_dir)

    # shuffle files and move the first 20% from train to the validation directory
    for src in np.random.permutation(jpg_paths)[:num_images]:
        os.rename(src, valid_class_dir + os.path.basename(src))

In [9]:
# grab jpeg files from train class directories and copy 10% to sample class directories
# (train/<class>/x.jpg -> sample/train/<class>/x.jpg).
# Paths are built from DATA_HOME_DIR, so the cell does not depend on the
# kernel's current working directory.
train_home = DATA_HOME_DIR + 'train/'
sample_home = DATA_HOME_DIR + 'sample/train/'

# only the immediate sub-directories of train/ are class directories, so list
# them directly instead of walking the whole tree
class_names = sorted(d for d in os.listdir(train_home)
                     if os.path.isdir(train_home + d))

for class_name in class_names:

    # grab all jpgs from the class directory
    jpg_paths = glob(train_home + class_name + '/*.jpg')
    num_images = int(len(jpg_paths)*0.1)

    # create the sample class directory once, and only if something will be copied
    sample_class_dir = sample_home + class_name + '/'
    if num_images and not os.path.exists(sample_class_dir):
        os.mkdir(sample_class_dir)

    # shuffle files and copy the first 10% into the sample directory
    for src in np.random.permutation(jpg_paths)[:num_images]:
        copyfile(src, sample_class_dir + os.path.basename(src))

In [10]:
# grab jpeg files from valid class directories and copy 10% to sample class directories
# (valid/<class>/x.jpg -> sample/test/<class>/x.jpg).
# The source is addressed explicitly as DATA_HOME_DIR + 'valid/': the previous
# version globbed relative to the current working directory, which was still
# train/, so it sampled training images instead of validation images.
# The destination (sample/test/) is unchanged.
valid_home = DATA_HOME_DIR + 'valid/'
test_home = DATA_HOME_DIR + 'sample/test/'

# only the immediate sub-directories of valid/ are class directories, so list
# them directly instead of walking the whole tree
class_names = sorted(d for d in os.listdir(valid_home)
                     if os.path.isdir(valid_home + d))

for class_name in class_names:

    # grab all jpgs from the class directory
    jpg_paths = glob(valid_home + class_name + '/*.jpg')
    num_images = int(len(jpg_paths)*0.1)

    # create the destination class directory once, and only if something will be copied
    test_class_dir = test_home + class_name + '/'
    if num_images and not os.path.exists(test_class_dir):
        os.mkdir(test_class_dir)

    # shuffle files and copy the first 10% into the sample directory
    for src in np.random.permutation(jpg_paths)[:num_images]:
        copyfile(src, test_class_dir + os.path.basename(src))

## Finetune and train

In [94]:
from vgg16 import Vgg16

In [95]:
# build the Vgg16 model wrapper (class imported from the vgg16 module)
vgg = Vgg16()

In [96]:
# batches read from the train/ and valid/ directories, batch_size images at a time
batches = vgg.get_batches(path+'train', batch_size=batch_size)
val_batches = vgg.get_batches(path+'valid', batch_size=batch_size)
# adapt the model to the training batches before fitting
# NOTE(review): presumably swaps the final layer to match the classes found in
# `batches` -- confirm against vgg16.py
vgg.finetune(batches)

Found 3025 images belonging to 8 classes.
Found 752 images belonging to 8 classes.


In [97]:
# train for 10 epochs on the training batches, passing val_batches as the validation data
vgg.fit(batches, val_batches, nb_epoch=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [98]:
# save the current weights under results/; the commented-out lines are the
# checkpoints written by earlier runs, kept as a log
# NOTE(review): ft2 and ft3 record the same val_acc (0.8351) -- confirm the
# 10-epoch figure was not copied from the 4-epoch line
#vgg.model.save_weights(path+'results/ft1.h5')
#vgg.model.save_weights(path+'results/ft2.h5') #4 epochs, val_acc: 0.8351
vgg.model.save_weights(path+'results/ft3.h5') #10 epochs, val_acc: 0.8351

## Generate Predictions

In [99]:
#load previous weights (the results/ft3.h5 checkpoint) into the model before predicting
vgg.model.load_weights(path+'results/ft3.h5')

In [100]:
# run the model over the images under test_path; returns the test batches and the predictions
# NOTE: this rebinds `batches`, which until now held the training batches
batches, pred = vgg.test(path=test_path, batch_size=batch_size)

Found 1000 images belonging to 1 classes.


In [101]:
# show the first 5 prediction rows; floats are printed without scientific
# notation and with up to 17 digits of precision
np.set_printoptions(precision=17, suppress=True)
pred[:5]

array([[ 0.61406898498535156,  0.00695174979045987,  0.00061145419022068,  0.00006396313256118,
         0.04437973350286484,  0.04894832521677017,  0.00002677444354049,  0.28494897484779358],
       [ 0.09914875775575638,  0.00048282110947184,  0.00019609663286246,  0.00000006480980375,
         0.00299828499555588,  0.54311037063598633,  0.02692178823053837,  0.32714182138442993],
       [ 0.02267983928322792,  0.0000068579361141 ,  0.0000012685652564 ,  0.00000000141916345,
         0.86604654788970947,  0.0006162611534819 ,  0.10809539258480072,  0.0025538713671267 ],
       [ 0.29357615113258362,  0.00005596435221378,  0.00000098536008863,  0.00000001117241943,
         0.00003259267032263,  0.04420936852693558,  0.02144675329327583,  0.6406782865524292 ],
       [ 0.77352499961853027,  0.00004588506999426,  0.00267445854842663,  0.00000002269828592,
         0.00539935892447829,  0.00001411127141182,  0.00005464505011332,  0.21828649938106537]], dtype=float32)

In [102]:
#extract file names from test directory: keep only the base name of each path
# reported by the test batches, instead of slicing off a fixed 8 characters
# (which is only right when the directory prefix is exactly 8 characters long)
file_names = [os.path.basename(x) for x in batches.filenames]

In [103]:
# one probability column per class, in the order of the prediction columns,
# with the image file name placed in front as the first column
FISH_CLASSES = ['ALB','BET','DOL','LAG','NoF','OTHER','SHARK','YFT']
submission = pd.DataFrame(pred, columns=FISH_CLASSES)
submission.insert(loc=0, column='image', value=file_names)
submission.head()

Unnamed: 0,image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT
0,img_06237.jpg,0.614069,0.006952,0.0006114542,6.396313e-05,0.04438,0.048948,2.7e-05,0.284949
1,img_06893.jpg,0.099149,0.000483,0.0001960966,6.48098e-08,0.002998,0.54311,0.026922,0.327142
2,img_02082.jpg,0.02268,7e-06,1.268565e-06,1.419163e-09,0.866047,0.000616,0.108095,0.002554
3,img_06261.jpg,0.293576,5.6e-05,9.853601e-07,1.117242e-08,3.3e-05,0.044209,0.021447,0.640678
4,img_03628.jpg,0.773525,4.6e-05,0.002674459,2.269829e-08,0.005399,1.4e-05,5.5e-05,0.218286


In [104]:
#save file for submission
# cd into the data directory so the relative file name below is written there
%cd $DATA_HOME_DIR
submission_file_name = 'submission3.csv'
# index=False: do not write the DataFrame's row index as an extra column
submission.to_csv(submission_file_name, index=False)

/home/ubuntu/nbs/data/fish-monitoring


In [105]:
# go back to the lesson directory and display a link to the submission CSV,
# addressed relative to that directory
from IPython.display import FileLink
%cd $LESSON_HOME_DIR
FileLink('data/fish-monitoring/' + submission_file_name)

/home/ubuntu/nbs
