# Analyse and balance classes from dataset

In [6]:
INPUT_DIR='../../input/kaggle-sea-lion/02/'
OUTPUT_DIR='../../output/kaggle-sea-lion/03/'

SHOW_IMAGES = True
MAX_IMAGES = 3

IMAGE_DIMS = (148,148,3)

#%prun print('test')
#%pdb

In [7]:
%matplotlib inline
import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import h5py
import os

import modules.logging
from modules.logging import logger
import modules.lions as lions
from modules.utils import Timer
import modules.utils as utils

## Prepare output dir

In [13]:
utils.mkdirs(OUTPUT_DIR, recreate=True)
modules.logging.setup_file_logger(OUTPUT_DIR + 'out.log')
logger.info('Dir ' + OUTPUT_DIR + ' created')

2017-04-03 03:48:13,719 INFO Dir ../../output/kaggle-sea-lion/03/ created


## Rebalance dataset classes

In [14]:
from sklearn import preprocessing

#max_augmentation_rotation=20, max_augmentation_shift=0, max_augmentation_scale=1, augmentation_flip_leftright=True, augmentation_flip_updown=True
def dataset_xy_balance_classes_image(input_h5file_path, output_h5file_path, max_augmentation_ratio=3, max_undersampling_ratio=0):
    if(os.path.isfile(output_h5file_path)):
        raise Exception('Output file already exists. file=' + output_h5file_path)
    
    logger.info('loading input dataset ' + input_h5file_path)
    input_h5 = h5py.File(input_h5file_path, 'r')
    input_x_ds = input_h5['X'][0:5]
    input_y_ds = input_h5['Y'][0:5]
    x_dims = input_x_ds.shape
    y_dims = input_y_ds.shape

    nr_classes = input_y_ds.shape[1]

    t = Timer('traversing entire dataset in order to extract population classes distribution')
    count_classes = np.zeros(nr_classes)
    for y in input_y_ds:
        #convert from categorical to label
        lb = preprocessing.LabelBinarizer()
        lb.fit(np.array(range(nr_classes)))
        Y_label = lb.inverse_transform(Y)
        
        count_classes[Y_label] = count_classes[Y_label] + 1
    t.stop()

    logger.info('population distribution')
    smallest_class = None
    smallest_qtty = 999999999
    largest_class = None
    largest_qtty = 0
    for i,c in enumerate(count_classes):
        logger.info(str(i) + ': ' + str(c))
        if(c<smallest_qtty):
            smallest_qtty = c
            smallest_class = i
        if(c>largest_qtty):
            largest_qtty = c
            largest_class = i
    
    qtty_per_class = min(smallest_qtty*max_augmentation_ratio, largest_qtty)
    logger.info('items per class: ' + str(qtty_per_class))
    
    logger.info('augmentation/undersampling ratio per class')
    ratio_classes = np.zeros(nr_classes)
    for i,c in enumerate(count_classes):
        ratio_classes[i] = qtty_per_class/c
        logger.info(str(i) + ': ' + str(ratio_classes[i]))
    
    logger.info('creating output dataset ' + output_h5file_path)
    output_h5 = h5py.File(output_h5file_path, 'w')
    x_dims_zero = x_dims.copy()
    x_dims_zero[0] = 0
    x_dims_one = x_dims.copy()
    x_dims_one[0] = 1
    x_dims_none = x_dims.copy()
    x_dims_none[0] = None
    output_x_ds = output_h5.create_dataset('X', x_dims_zero, maxshape=x_dims_none, chunks=x_dims_one, dtype='f')
    output_y_ds = output_h5.create_dataset('Y', y_dims_zero, maxshape=y_dims_none, dtype='f')
    
    #TODO: NOW ITERATE OVER INPUT AND AUGMENT/UNDERSAMPLE ITEMS ACCORDING TO RATIO_CLASSES!
    parei aqui
    
    logger.info('done')
    
    

In [15]:
input_h5file_path = INPUT_DIR + utils.dataset_name('lion-patches-full', IMAGE_DIMS)
output_h5file_path = OUTPUT_DIR + utils.dataset_name('lion-patches', IMAGE_DIMS)

dataset_xy_balance_classes_image(input_h5file_path, output_h5file_path, max_augmentation_ratio=3, max_undersampling_ratio=0)
    

2017-04-03 03:48:16,126 INFO loading input dataset ../../input/kaggle-sea-lion/02/lion-patches-full-148-148.h5
2017-04-03 03:48:16,130 INFO > [started] traversing entire dataset in order to extract population classes distribution...


5


NameError: name 'item' is not defined

### Validate dataset