# Analyse and balance the classes of the dataset

In [1]:
# --- Locations ---------------------------------------------------------------
# Directory the source dataset is read from.
INPUT_DIR = "../../input/kaggle-sea-lion/02/"
# Directory this notebook writes to (balanced dataset and 'out.log').
# NOTE: it is passed to utils.mkdirs(..., recreate=True) further down.
OUTPUT_DIR = "../../output/kaggle-sea-lion/03/"
# Base dataset name; passed together with IMAGE_DIMS to utils.dataset_name().
INPUT_NAME = "lion-patches-30px"

# --- Image geometry ----------------------------------------------------------
# Alternative setting kept for reference: (148, 148, 3)
IMAGE_DIMS = (84, 84, 3)

# --- Display options ---------------------------------------------------------
# NOTE(review): neither flag is referenced by the cells visible in this
# notebook -- confirm whether a helper module reads them before removing.
SHOW_IMAGES = True
MAX_IMAGES = 1

# --- Reproducibility ---------------------------------------------------------
# Seed handed to the random generator(s) in the imports cell.
RANDOM_SEED = 1

# Debugging aids (uncomment when profiling / post-mortem debugging is needed):
#%prun print('test')
#%pdb

In [2]:
%matplotlib inline
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import h5py
import os
import random
import keras
from keras import backend as K
from keras.preprocessing.image import ImageDataGenerator

import modules.logging
from modules.logging import logger
import modules.lions as lions
from modules.utils import Timer
import modules.utils as utils

# Seed every random generator already in scope so that a "Restart & Run All"
# reproduces the same result. Seeding only the stdlib `random` module leaves
# NumPy's global generator unseeded.
# NOTE(review): presumably the balancing/augmentation helpers draw from one of
# these generators -- confirm against modules.utils.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

Using TensorFlow backend.


## Prepare output dir

In [None]:
# Create the output directory (recreate=True is passed, so presumably any
# previous content is discarded -- verify in utils.mkdirs), then attach a file
# logger that writes to 'out.log' inside it.
utils.mkdirs(OUTPUT_DIR, recreate=True)
modules.logging.setup_file_logger('%sout.log' % OUTPUT_DIR)
logger.info('Dir %s created' % OUTPUT_DIR)

2017-04-09 19:56:06,042 INFO Dir ../../output/kaggle-sea-lion/03/ created


## Rebalance dataset classes

In [None]:
# Paths of the source dataset and of the balanced copy written by this cell.
# Both names are reused by the validation cells below.
input_h5file_path = INPUT_DIR + utils.dataset_name(INPUT_NAME, IMAGE_DIMS)
output_h5file_path = OUTPUT_DIR + utils.dataset_name(INPUT_NAME + '-balanced', IMAGE_DIMS)

# The source is opened read-only and the destination in 'w' mode; both handles
# are closed by their context managers even if balancing fails.
logger.info('loading input dataset %s' % input_h5file_path)
with h5py.File(input_h5file_path, 'r') as input_h5file:
    logger.info('creating output dataset %s' % output_h5file_path)
    with h5py.File(output_h5file_path, 'w') as output_h5file:
        logger.info('balancing classes')
        # Equal weight for each of the six classes, with both ratios set to 1
        # and enforced.
        utils.dataset_xy_balance_classes_image(
            input_h5file,
            output_h5file,
            max_augmentation_ratio=1,
            max_undersampling_ratio=1,
            classes_distribution_weight=(1, 1, 1, 1, 1, 1),
            enforce_max_ratios=True,
        )

2017-04-09 19:56:06,053 INFO loading input dataset ../../input/kaggle-sea-lion/02/lion-patches-30px-84-84.h5
2017-04-09 19:56:06,055 INFO creating output dataset ../../output/kaggle-sea-lion/03/lion-patches-30px-balanced-84-84.h5
2017-04-09 19:56:06,057 INFO balancing classes
2017-04-09 19:56:06,059 INFO > [started] traversing entire dataset in order to extract population classes distribution...
2017-04-09 19:56:06,099 INFO > [done]    traversing entire dataset in order to extract population classes distribution (40.017 ms)
2017-04-09 19:56:06,100 INFO population distribution
2017-04-09 19:56:06,102 INFO 0: 4680.0
2017-04-09 19:56:06,103 INFO 1: 3534.0
2017-04-09 19:56:06,105 INFO 2: 17903.0
2017-04-09 19:56:06,106 INFO 3: 8577.0
2017-04-09 19:56:06,107 INFO 4: 6597.0
2017-04-09 19:56:06,109 INFO 5: 33037.0
2017-04-09 19:56:06,110 INFO targeting items per class: 7068.0
2017-04-09 19:56:06,111 INFO augmentation/undersampling ratio per class
2017-04-09 19:56:06,113 INFO 0: 1.51025641026


|#######################--| 68560/74328 92% 183s

### Validate dataset

In [None]:
# Baseline: log the per-class counts of the source dataset.
logger.info('class distribution BEFORE balancing')
with h5py.File(input_h5file_path, 'r') as input_h5:
    # [()] reads the whole 'Y' dataset into memory, so the file can be closed
    # before the counts are computed.
    labels_before = input_h5['Y'][()]
distribution_before = utils.class_distribution(labels_before)
logger.info(str(distribution_before))

In [None]:
# Result: log the per-class counts of the balanced dataset for comparison.
logger.info('class distribution AFTER balancing')
with h5py.File(output_h5file_path, 'r') as output_h5:
    # [()] reads the whole 'Y' dataset into memory, so the file can be closed
    # before the counts are computed.
    labels_after = output_h5['Y'][()]
distribution_after = utils.class_distribution(labels_after)
logger.info(str(distribution_after))

In [None]:
# Visual sanity check: display the first ten entries of 'X' from the source dataset.
with h5py.File(input_h5file_path, 'r') as input_h5:
    input_samples = input_h5['X'][:10]
    utils.show_images(input_samples, cols=10, is_bgr=True, size=2)

In [None]:
# Visual sanity check: display the first ten entries of 'X' from the balanced
# dataset, labelled from the matching 'Y' rows.
with h5py.File(output_h5file_path, 'r') as output_h5:
    balanced_samples = output_h5['X'][:10]
    balanced_labels = utils.categorical_to_label(output_h5['Y'][:10])
    utils.show_images(
        balanced_samples,
        image_labels=balanced_labels,
        cols=10,
        is_bgr=True,
        size=2,
    )