# Export lion patches to training dataset

In [1]:
# Where the competition data is read from and where this notebook writes to.
INPUT_DIR = '../../input/kaggle-sea-lion/'
OUTPUT_DIR = '../../output/kaggle-sea-lion/02/'

# Debug switches:
#   SHOW_IMAGES is forwarded as the `debug` flag of lions.export_lions.
#   FORCE_IMAGE, when not None, restricts processing to that single train_id.
SHOW_IMAGES = False
FORCE_IMAGE = None

# Dimensions of each exported patch; also used as the trailing dims of the 'X' dataset.
IMAGE_DIMS = (148, 148, 3)

# Notebook magics kept disabled; uncomment for profiling / post-mortem debugging.
#%prun print('test')
#%pdb

In [2]:
%matplotlib inline
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import h5py
import os

import modules.logging
from modules.logging import logger
import modules.lions as lions
from modules.utils import Timer
import modules.utils as utils

Using TensorFlow backend.


## Prepare output dataset

In [3]:
# Set up the output directory and a file logger inside it.
# NOTE(review): recreate=True presumably wipes any previous output -- confirm in modules.utils.
utils.mkdirs(OUTPUT_DIR, recreate=True)
modules.logging.setup_file_logger(OUTPUT_DIR + 'out.log')
logger.info('Dir ' + OUTPUT_DIR + ' created')

# Dataset handles; filled in below and appended to by the photo-processing cell.
x_ds = y_ds = None

logger.info('creating dataset')
dataset_path = OUTPUT_DIR + utils.dataset_name('lion-patches', IMAGE_DIMS)

# The file is intentionally left open: later cells write through x_ds / y_ds
# and are responsible for closing h5file.
h5file = h5py.File(dataset_path, 'w')

# Both datasets start empty and are resizable along the first axis only.
patch_shape = (IMAGE_DIMS[0], IMAGE_DIMS[1], IMAGE_DIMS[2])
x_ds = h5file.create_dataset(
    'X',
    shape=(0,) + patch_shape,
    maxshape=(None,) + patch_shape,
    chunks=(1,) + patch_shape,  # one patch per chunk
    dtype='f'
)
y_ds = h5file.create_dataset('Y', shape=(0, 5), maxshape=(None, 5), dtype='f')
logger.info('done')

2017-04-03 03:41:27,868 INFO Dir ../../output/kaggle-sea-lion/02/ created
2017-04-03 03:41:27,870 INFO creating dataset
2017-04-03 03:41:27,873 INFO done


## Process input photos and extract lion patches to dataset

### Open csv files

In [None]:
# Train ids listed in MismatchedTrainImages.txt; the processing loop skips them.
MISMATCHED = [
    3, 7, 9, 21, 30, 34, 71, 81, 89, 97,
    151, 184, 215, 234, 242, 268, 290, 311, 331, 344,
    380, 384, 406, 421, 469, 475, 490, 499, 507, 530,
    531, 605, 607, 614, 621, 638, 644, 687, 712, 721,
    767, 779, 781, 794, 800, 811, 839, 840, 869, 882,
    901, 903, 905, 909, 913, 927, 946
]

# Load the per-photo expected class counts.
logger.info('loading train.csv')
train_csv_path = INPUT_DIR + "Train/train.csv"
train = pd.read_csv(train_csv_path)

2017-04-03 03:41:27,881 INFO loading train.csv


### Process each photo

In [None]:
# Walk every row of train.csv, export lion patches from the matching raw/dotted
# photo pair into the HDF5 datasets (x_ds, y_ds) and accumulate per-class statistics.
logger.info('process all photos from train file')
count = 0
total_errors = np.zeros(5)
total_classes = np.zeros(5)
total_classes_added = np.zeros(5)

# Build the skip-list lookup once so each per-row membership test is O(1).
mismatched_ids = set(MISMATCHED)

# try/finally guarantees the HDF5 file is closed even when a photo fails midway,
# so the handle is never leaked.
try:
    for row in train.itertuples():
        # row[0] is the DataFrame index, row[1] the train_id and row[2:] the
        # expected count for each of the five classes.
        train_id = row[1]

        # Debug aid: when FORCE_IMAGE is set, process only that photo.
        if FORCE_IMAGE is not None and train_id != FORCE_IMAGE:
            continue

        image_raw_file = INPUT_DIR + 'Train/' + str(train_id) + '.jpg'
        image_dotted_file = INPUT_DIR + 'TrainDotted/' + str(train_id) + '.jpg'

        if train_id in mismatched_ids:
            logger.warning('skipping mismatched train_id ' + str(train_id))
            continue

        if not os.path.isfile(image_raw_file):
            logger.warning('file not found. skipping. file=' + image_raw_file)
            continue

        # The dotted photo is just as necessary as the raw one; skip when absent.
        if not os.path.isfile(image_dotted_file):
            logger.warning('file not found. skipping. file=' + image_dotted_file)
            continue

        t = Timer('processing photo ' + image_raw_file)
        image_raw = cv2.imread(image_raw_file)
        image_dotted = cv2.imread(image_dotted_file)

        # cv2.imread signals an unreadable/corrupt file by returning None
        # instead of raising, so check before handing the images on.
        if image_raw is None or image_dotted is None:
            logger.warning('could not read image pair. skipping. train_id=' + str(train_id))
            t.stop()
            continue

        classes_count_total, classes_count_added = lions.export_lions(image_raw, image_dotted, x_ds, y_ds, IMAGE_DIMS, debug=SHOW_IMAGES, min_distance_others=40)
        logger.info('detected classes count')
        logger.info(str(classes_count_total))
        logger.info('-----')
        logger.info('expected classes count')
        logger.info(str(row))
        logger.info('-----')
        logger.info('errors per class')
        # Expected (from train.csv) minus detected, per class.
        error = np.subtract(row[2:], classes_count_total)
        logger.info(str(error))
        logger.info('acum errors per class')
        # Accumulate magnitudes so over- and under-counts do not cancel out.
        total_errors = np.add(total_errors, np.absolute(error))
        logger.info(str(total_errors))
        logger.info('-----')
        logger.info('acum detected classes count total')
        total_classes = np.add(total_classes, classes_count_total)
        logger.info(str(total_classes))
        logger.info('-----')
        logger.info('acum detected classes count added to dataset')
        total_classes_added = np.add(total_classes_added, classes_count_added)
        logger.info(str(total_classes_added))
        t.stop()

        count = count + 1

    logger.info('ERRORS PER CLASS')
    logger.info(str(total_errors))
    logger.info('photos processed: ' + str(count))
finally:
    h5file.close()

2017-04-03 03:41:27,926 INFO process all photos from train file
2017-04-03 03:41:27,931 INFO > [started] processing photo ../../input/kaggle-sea-lion/Train/0.jpg...
2017-04-03 03:41:29,809 INFO detected classes count
2017-04-03 03:41:29,811 INFO [  62.   12.  485.   42.  344.]
2017-04-03 03:41:29,812 INFO -----
2017-04-03 03:41:29,813 INFO expected classes count
2017-04-03 03:41:29,814 INFO Pandas(Index=0, train_id=0, adult_males=62, subadult_males=12, adult_females=486, juveniles=42, pups=344)
2017-04-03 03:41:29,816 INFO -----
2017-04-03 03:41:29,817 INFO errors per class
2017-04-03 03:41:29,818 INFO [ 0.  0.  1.  0.  0.]
2017-04-03 03:41:29,820 INFO acum errors per class
2017-04-03 03:41:29,822 INFO [ 0.  0.  1.  0.  0.]
2017-04-03 03:41:29,823 INFO -----
2017-04-03 03:41:29,824 INFO acum detected classes count total
2017-04-03 03:41:29,827 INFO [  62.   12.  485.   42.  344.]
2017-04-03 03:41:29,828 INFO -----
2017-04-03 03:41:29,829 INFO acum detected classes count added to datase

### Validate dataset