# Export lion patches to training dataset

In [1]:
# --- Notebook configuration -------------------------------------------------
# Root folder the input data is read from, and the folder this step writes to.
INPUT_DIR = '../../input/kaggle-sea-lion/'
OUTPUT_DIR = '../../output/kaggle-sea-lion/02/'

# Shape of one exported patch; also used to build the dataset file name.
IMAGE_DIMS = (148, 148, 3)

# Upper bound on the number of photos processed; None removes the bound.
MAX_IMAGES = 1

# NOTE(review): not referenced in the visible cells - presumably toggles
# debug image display further down; confirm before removing.
SHOW_IMAGES = True

#%prun print('test')
#%pdb

In [2]:
%matplotlib inline
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import h5py
import os

import modules.logging
from modules.logging import logger
import modules.lions as lions
from modules.utils import Timer
import modules.utils as utils

## Prepare output dataset

In [3]:
# Set up the output directory (requested with recreate=True) and send log
# output to a file inside it.
utils.mkdirs(OUTPUT_DIR, recreate=True)
modules.logging.setup_file_logger(OUTPUT_DIR + 'out.log')
logger.info('Dir ' + OUTPUT_DIR + ' created')

# Bind both dataset handles up front so the names exist even if the
# creation steps below fail part-way.
x_ds, y_ds = None, None

logger.info('creating dataset')
dataset_path = OUTPUT_DIR + utils.dataset_name('lion-patches', IMAGE_DIMS)
h5file = h5py.File(dataset_path, 'w')

# 'X' starts empty and is resizable along axis 0 only; each chunk holds
# exactly one patch of shape IMAGE_DIMS.
_patch_shape = (IMAGE_DIMS[0], IMAGE_DIMS[1], IMAGE_DIMS[2])
x_ds = h5file.create_dataset(
    'X',
    (0,) + _patch_shape,
    maxshape=(None,) + _patch_shape,
    chunks=(1,) + _patch_shape,
    dtype='f',
)

# 'Y' starts empty with 5 columns per row and is resizable along axis 0.
y_ds = h5file.create_dataset('Y', (0, 5), maxshape=(None, 5), dtype='f')
logger.info('done')

2017-04-02 19:12:20,687 INFO Dir ../../output/sea-lion/02/ created
2017-04-02 19:12:20,688 INFO creating dataset
2017-04-02 19:12:20,693 INFO done


## Process input photos and extract lion patches to the dataset

### Open CSV files

In [4]:
# Train ids to exclude from processing (taken from MismatchedTrainImages.txt).
MISMATCHED = [
    3, 7, 9, 21, 30, 34, 71, 81, 89, 97,
    151, 184, 215, 234, 242, 268, 290, 311, 331, 344,
    380, 384, 406, 421, 469, 475, 490, 499, 507, 530,
    531, 605, 607, 614, 621, 638, 644, 687, 712, 721,
    767, 779, 781, 794, 800, 811, 839, 840, 869, 882,
    901, 903, 905, 909, 913, 927, 946,
]

# Load the training CSV that drives the per-photo loop.
logger.info('loading train.csv')
train = pd.read_csv(INPUT_DIR + "Train/train.csv")

2017-04-02 19:12:20,701 INFO loading train.csv


FileNotFoundError: File b'../../input/sea-lion/Train/train.csv' does not exist

### Process each photo

In [None]:
# Walk the rows of the training CSV, export the lion patches of each photo
# into the HDF5 datasets and accumulate, per class, the absolute difference
# between the counts in the CSV row and the counts returned by
# lions.export_lions.
#
# The HDF5 file is closed in a `finally` clause so it is released even when
# processing a photo raises.
logger.info('process all photos from train file')
count = 0
total_errors = np.zeros(5)
total_classes = np.zeros(5)
try:
    for row in train.itertuples():
        # load images
        # itertuples() puts the DataFrame index at row[0], so row[1] is the
        # first CSV column.
        train_id = row[1]
        image_raw_file = INPUT_DIR + 'Train/' + str(train_id) + '.jpg'
        image_dotted_file = INPUT_DIR + 'TrainDotted/' + str(train_id) + '.jpg'

        if train_id in MISMATCHED:
            logger.warning('skipping mismatched train_id ' + str(train_id))
            continue

        if not os.path.isfile(image_raw_file):
            logger.warning('file not found. skipping. file=' + image_raw_file)
            continue

        # The dotted counterpart is required as well; treat a missing one the
        # same way as a missing raw photo.
        if not os.path.isfile(image_dotted_file):
            logger.warning('file not found. skipping. file=' + image_dotted_file)
            continue

        t = Timer('processing photo ' + image_raw_file)
        image_raw = cv2.imread(image_raw_file)
        image_dotted = cv2.imread(image_dotted_file)

        # cv2.imread signals an unreadable/corrupt file by returning None
        # instead of raising, so check before handing the images on.
        if image_raw is None or image_dotted is None:
            logger.warning('could not read image. skipping. file=' + image_raw_file)
            t.stop()
            continue

        classes_count = lions.export_lions(image_raw, image_dotted, x_ds, y_ds, IMAGE_DIMS, debug=True)
        logger.info('detected classes count')
        logger.info(str(classes_count))
        logger.info('-----')
        logger.info('expected classes count')
        logger.info(str(row))
        logger.info('-----')
        logger.info('errors per class')
        # row[2:] are the remaining CSV columns - assumed to be the five
        # expected per-class counts (TODO confirm against train.csv).
        error = np.subtract(row[2:], classes_count)
        logger.info(str(error))
        logger.info('acum errors per class')
        total_errors = np.add(total_errors, np.absolute(error))
        logger.info(str(total_errors))
        logger.info('-----')
        logger.info('acum detected classes count')
        total_classes = np.add(total_classes, classes_count)
        logger.info(str(total_classes))
        t.stop()

        # Only photos that were actually processed count towards MAX_IMAGES.
        count = count + 1
        if MAX_IMAGES is not None and count >= MAX_IMAGES:
            break

    logger.info('ERRORS PER CLASS')
    logger.info(str(total_errors))
finally:
    h5file.close()

### Validate dataset