# Introduction

Notebook para explicar o processo de seleção dos dados das partições Train, Validation e Test usadas nos treinos.

Cada dataset é especificado em uma subseção do notebook.

As redes Single Task foram treinadas seguindo as proporções:
* train_prop = 0.9
* validation_prop = 0.05
* test_prop = 0.05
* validation_split = 0.1

Como inicialmente se usou um _validation_split_ de 0.1 sobre os dados de treino (95%), foi preciso manter os mesmos dados em cada partição para não precisar reexecutar os treinamentos das redes Single Task.

No setup inicial usamos a classe utilitária _ImageDataGenerator_ do Keras para fazer o split dos dados e data augmentation para as partições de treino e validação. 

Contudo, ao final do setup feito neste notebook, carregaremos os dados diretamente dos arquivos .csv para um dataframe e, em seguida, para o _ImageDataGenerator_, sem usar data augmentation na partição de validação.

Sendo assim, rodar o ExperimentRunner não precisará mais dos argumentos train_prop, validation_prop e test_prop para fins de mudar os dados de entrada da rede. Esses valores não serão mais usados.

# Import Libraries and Dependencies

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import os
import sys
import pandas as pd

if '../../../notebooks/' not in sys.path:
    sys.path.append('../../../notebooks/')
if 'src' not in sys.path:
    sys.path.insert(0, 'src')

import utils.constants as cts

from data_loaders.data_loader import DLName
from gt_loaders.gt_names import GTName
from exp_runner import ExperimentRunner
from model_trainer import BaseModel, Optimizer

# Data Selection

## FVC - Aligned

In [15]:
# Export the train / validation / test partitions of the aligned FVC dataset
# to CSV files, using the generators that ExperimentRunner sets up.
ds, aligned = GTName.FVC, True

# Experiment metadata: every field is left blank for this run.
exp_params = {'name': '', 'description': '', 'tags': [], 'src_files': []}

properties = {
    'reqs': list(cts.ICAO_REQ),
    'aligned': aligned,
    'use_gt_data': True,
    # The dataset is listed only under the combined train/validation/test key.
    'gt_names': {'train_validation': [], 'test': [], 'train_validation_test': [ds]},
    'balance_input_data': False,
    'train_model': False,
    'save_trained_model': False,
    'model_name': '',
    'orig_model_experiment_id': '-',
    'sample_training_data': False,
    'sample_prop': 1.0,
}

# Same proportions and validation split as listed in the notebook introduction.
net_train_params = dict(
    base_model=BaseModel.VGG16,
    batch_size=64,
    n_epochs=10,
    early_stopping=10,
    learning_rate=1e-3,
    optimizer=Optimizer.ADAMAX,
    train_prop=0.9,
    validation_prop=0.05,
    test_prop=0.05,
    validation_split=0.1,
    dropout=0.3,
)

kwargs = dict(
    use_neptune=False,
    exp_params=exp_params,
    properties=properties,
    net_train_params=net_train_params,
)

runner = ExperimentRunner(**kwargs)
runner.load_training_data()
runner.setup_data_generators()

# Partition generators, always handled in train / validation / test order.
data_gens = (
    runner.data_processor.train_gen,
    runner.data_processor.validation_gen,
    runner.data_processor.test_gen,
)

train_imgs, validation_imgs, test_imgs = (gen.filenames for gen in data_gens)
print(*map(len, (train_imgs, validation_imgs, test_imgs)))

# One table per partition: the image path followed by one column per requirement.
req_names = cts.ICAO_REQ.list_reqs_names()
train_df, validation_df, test_df = (
    pd.DataFrame(columns=['img_path'] + req_names) for _ in data_gens
)
partition_dfs = (train_df, validation_df, test_df)

for frame, img_paths in zip(partition_dfs, (train_imgs, validation_imgs, test_imgs)):
    frame['img_path'] = img_paths

# NOTE(review): assumes gen.labels[idx] holds the labels of the idx-th
# requirement in list_reqs_names() order -- confirm against the data processor.
for idx, req in enumerate(req_names):
    for frame, gen in zip(partition_dfs, data_gens):
        frame[req] = gen.labels[idx].astype(int)

# Output file name -> partition table; files are written without the index column.
csv_targets = {
    'fvc_aligned_train.csv': train_df,
    'fvc_aligned_validation.csv': validation_df,
    'fvc_aligned_test.csv': test_df,
}
for csv_name, frame in csv_targets.items():
    frame.to_csv(os.path.join(cts.ICAO_DATASET_PATH, csv_name), index=False)

-------------------- Init ExperimentRunner -------------------
---------------------------
Parent Process ID: 6145
Process ID: 53600
---------------------------
-----
Use Neptune:  False
-----
-------------------
Args: 
{'exp_params': {'description': '', 'name': '', 'src_files': [], 'tags': []},
 'net_train_params': {'base_model': <BaseModel.VGG16: {'target_size': (224, 224), 'prep_function': <function preprocess_input at 0x7fa223748e50>}>,
                      'batch_size': 64,
                      'dropout': 0.3,
                      'early_stopping': 10,
                      'learning_rate': 0.001,
                      'n_epochs': 10,
                      'optimizer': <Optimizer.ADAMAX: 'Adamax'>,
                      'test_prop': 0.05,
                      'train_prop': 0.9,
                      'validation_prop': 0.05,
                      'validation_split': 0.1},
 'properties': {'aligned': True,
                'balance_input_data': False,
                'gt_names': {

## FVC - Not Aligned

In [14]:
# Export the train / validation / test partitions of the non-aligned FVC
# dataset to CSV files, using the generators that ExperimentRunner sets up.
ds, aligned = GTName.FVC, False

# Experiment metadata: every field is left blank for this run.
exp_params = {'name': '', 'description': '', 'tags': [], 'src_files': []}

properties = {
    'reqs': list(cts.ICAO_REQ),
    'aligned': aligned,
    'use_gt_data': True,
    # The dataset is listed only under the combined train/validation/test key.
    'gt_names': {'train_validation': [], 'test': [], 'train_validation_test': [ds]},
    'balance_input_data': False,
    'train_model': False,
    'save_trained_model': False,
    'model_name': '',
    'orig_model_experiment_id': '-',
    'sample_training_data': False,
    'sample_prop': 1.0,
}

# Same proportions and validation split as listed in the notebook introduction.
net_train_params = dict(
    base_model=BaseModel.VGG16,
    batch_size=64,
    n_epochs=10,
    early_stopping=10,
    learning_rate=1e-3,
    optimizer=Optimizer.ADAMAX,
    train_prop=0.9,
    validation_prop=0.05,
    test_prop=0.05,
    validation_split=0.1,
    dropout=0.3,
)

kwargs = dict(
    use_neptune=False,
    exp_params=exp_params,
    properties=properties,
    net_train_params=net_train_params,
)

runner = ExperimentRunner(**kwargs)
runner.load_training_data()
runner.setup_data_generators()

# Partition generators, always handled in train / validation / test order.
data_gens = (
    runner.data_processor.train_gen,
    runner.data_processor.validation_gen,
    runner.data_processor.test_gen,
)

train_imgs, validation_imgs, test_imgs = (gen.filenames for gen in data_gens)
print(*map(len, (train_imgs, validation_imgs, test_imgs)))

# One table per partition: the image path followed by one column per requirement.
req_names = cts.ICAO_REQ.list_reqs_names()
train_df, validation_df, test_df = (
    pd.DataFrame(columns=['img_path'] + req_names) for _ in data_gens
)
partition_dfs = (train_df, validation_df, test_df)

for frame, img_paths in zip(partition_dfs, (train_imgs, validation_imgs, test_imgs)):
    frame['img_path'] = img_paths

# NOTE(review): assumes gen.labels[idx] holds the labels of the idx-th
# requirement in list_reqs_names() order -- confirm against the data processor.
for idx, req in enumerate(req_names):
    for frame, gen in zip(partition_dfs, data_gens):
        frame[req] = gen.labels[idx].astype(int)

# Output file name -> partition table; files are written without the index column.
csv_targets = {
    'fvc_not_aligned_train.csv': train_df,
    'fvc_not_aligned_validation.csv': validation_df,
    'fvc_not_aligned_test.csv': test_df,
}
for csv_name, frame in csv_targets.items():
    frame.to_csv(os.path.join(cts.ICAO_DATASET_PATH, csv_name), index=False)

-------------------- Init ExperimentRunner -------------------
---------------------------
Parent Process ID: 6145
Process ID: 53600
---------------------------
-----
Use Neptune:  False
-----
-------------------
Args: 
{'exp_params': {'description': '', 'name': '', 'src_files': [], 'tags': []},
 'net_train_params': {'base_model': <BaseModel.VGG16: {'target_size': (224, 224), 'prep_function': <function preprocess_input at 0x7fa223748e50>}>,
                      'batch_size': 64,
                      'dropout': 0.3,
                      'early_stopping': 10,
                      'learning_rate': 0.001,
                      'n_epochs': 10,
                      'optimizer': <Optimizer.ADAMAX: 'Adamax'>,
                      'test_prop': 0.05,
                      'train_prop': 0.9,
                      'validation_prop': 0.05,
                      'validation_split': 0.1},
 'properties': {'aligned': False,
                'balance_input_data': False,
                'gt_names': 

## PYBOSSA - Aligned

In [13]:
# Export the train / validation / test partitions of the aligned PYBOSSA
# dataset to CSV files, using the generators that ExperimentRunner sets up.
ds, aligned = GTName.PYBOSSA, True

# Experiment metadata: every field is left blank for this run.
exp_params = {'name': '', 'description': '', 'tags': [], 'src_files': []}

properties = {
    'reqs': list(cts.ICAO_REQ),
    'aligned': aligned,
    'use_gt_data': True,
    # The dataset is listed only under the combined train/validation/test key.
    'gt_names': {'train_validation': [], 'test': [], 'train_validation_test': [ds]},
    'balance_input_data': False,
    'train_model': False,
    'save_trained_model': False,
    'model_name': '',
    'orig_model_experiment_id': '-',
    'sample_training_data': False,
    'sample_prop': 1.0,
}

# Same proportions and validation split as listed in the notebook introduction.
net_train_params = dict(
    base_model=BaseModel.VGG16,
    batch_size=64,
    n_epochs=10,
    early_stopping=10,
    learning_rate=1e-3,
    optimizer=Optimizer.ADAMAX,
    train_prop=0.9,
    validation_prop=0.05,
    test_prop=0.05,
    validation_split=0.1,
    dropout=0.3,
)

kwargs = dict(
    use_neptune=False,
    exp_params=exp_params,
    properties=properties,
    net_train_params=net_train_params,
)

runner = ExperimentRunner(**kwargs)
runner.load_training_data()
runner.setup_data_generators()

# Partition generators, always handled in train / validation / test order.
data_gens = (
    runner.data_processor.train_gen,
    runner.data_processor.validation_gen,
    runner.data_processor.test_gen,
)

train_imgs, validation_imgs, test_imgs = (gen.filenames for gen in data_gens)
print(*map(len, (train_imgs, validation_imgs, test_imgs)))

# One table per partition: the image path followed by one column per requirement.
req_names = cts.ICAO_REQ.list_reqs_names()
train_df, validation_df, test_df = (
    pd.DataFrame(columns=['img_path'] + req_names) for _ in data_gens
)
partition_dfs = (train_df, validation_df, test_df)

for frame, img_paths in zip(partition_dfs, (train_imgs, validation_imgs, test_imgs)):
    frame['img_path'] = img_paths

# NOTE(review): assumes gen.labels[idx] holds the labels of the idx-th
# requirement in list_reqs_names() order -- confirm against the data processor.
for idx, req in enumerate(req_names):
    for frame, gen in zip(partition_dfs, data_gens):
        frame[req] = gen.labels[idx].astype(int)

# Output file name -> partition table; files are written without the index column.
csv_targets = {
    'pybossa_aligned_train.csv': train_df,
    'pybossa_aligned_validation.csv': validation_df,
    'pybossa_aligned_test.csv': test_df,
}
for csv_name, frame in csv_targets.items():
    frame.to_csv(os.path.join(cts.ICAO_DATASET_PATH, csv_name), index=False)

-------------------- Init ExperimentRunner -------------------
---------------------------
Parent Process ID: 6145
Process ID: 53600
---------------------------
-----
Use Neptune:  False
-----
-------------------
Args: 
{'exp_params': {'description': '', 'name': '', 'src_files': [], 'tags': []},
 'net_train_params': {'base_model': <BaseModel.VGG16: {'target_size': (224, 224), 'prep_function': <function preprocess_input at 0x7fa223748e50>}>,
                      'batch_size': 64,
                      'dropout': 0.3,
                      'early_stopping': 10,
                      'learning_rate': 0.001,
                      'n_epochs': 10,
                      'optimizer': <Optimizer.ADAMAX: 'Adamax'>,
                      'test_prop': 0.05,
                      'train_prop': 0.9,
                      'validation_prop': 0.05,
                      'validation_split': 0.1},
 'properties': {'aligned': True,
                'balance_input_data': False,
                'gt_names': {

## PYBOSSA - Not Aligned

In [16]:
# Export the train / validation / test partitions of the non-aligned PYBOSSA
# dataset to CSV files, using the generators that ExperimentRunner sets up.
ds, aligned = GTName.PYBOSSA, False

# Experiment metadata: every field is left blank for this run.
exp_params = {'name': '', 'description': '', 'tags': [], 'src_files': []}

properties = {
    'reqs': list(cts.ICAO_REQ),
    'aligned': aligned,
    'use_gt_data': True,
    # The dataset is listed only under the combined train/validation/test key.
    'gt_names': {'train_validation': [], 'test': [], 'train_validation_test': [ds]},
    'balance_input_data': False,
    'train_model': False,
    'save_trained_model': False,
    'model_name': '',
    'orig_model_experiment_id': '-',
    'sample_training_data': False,
    'sample_prop': 1.0,
}

# Same proportions and validation split as listed in the notebook introduction.
net_train_params = dict(
    base_model=BaseModel.VGG16,
    batch_size=64,
    n_epochs=10,
    early_stopping=10,
    learning_rate=1e-3,
    optimizer=Optimizer.ADAMAX,
    train_prop=0.9,
    validation_prop=0.05,
    test_prop=0.05,
    validation_split=0.1,
    dropout=0.3,
)

kwargs = dict(
    use_neptune=False,
    exp_params=exp_params,
    properties=properties,
    net_train_params=net_train_params,
)

runner = ExperimentRunner(**kwargs)
runner.load_training_data()
runner.setup_data_generators()

# Partition generators, always handled in train / validation / test order.
data_gens = (
    runner.data_processor.train_gen,
    runner.data_processor.validation_gen,
    runner.data_processor.test_gen,
)

train_imgs, validation_imgs, test_imgs = (gen.filenames for gen in data_gens)
print(*map(len, (train_imgs, validation_imgs, test_imgs)))

# One table per partition: the image path followed by one column per requirement.
req_names = cts.ICAO_REQ.list_reqs_names()
train_df, validation_df, test_df = (
    pd.DataFrame(columns=['img_path'] + req_names) for _ in data_gens
)
partition_dfs = (train_df, validation_df, test_df)

for frame, img_paths in zip(partition_dfs, (train_imgs, validation_imgs, test_imgs)):
    frame['img_path'] = img_paths

# NOTE(review): assumes gen.labels[idx] holds the labels of the idx-th
# requirement in list_reqs_names() order -- confirm against the data processor.
for idx, req in enumerate(req_names):
    for frame, gen in zip(partition_dfs, data_gens):
        frame[req] = gen.labels[idx].astype(int)

# Output file name -> partition table; files are written without the index column.
csv_targets = {
    'pybossa_not_aligned_train.csv': train_df,
    'pybossa_not_aligned_validation.csv': validation_df,
    'pybossa_not_aligned_test.csv': test_df,
}
for csv_name, frame in csv_targets.items():
    frame.to_csv(os.path.join(cts.ICAO_DATASET_PATH, csv_name), index=False)

-------------------- Init ExperimentRunner -------------------
---------------------------
Parent Process ID: 6145
Process ID: 53600
---------------------------
-----
Use Neptune:  False
-----
-------------------
Args: 
{'exp_params': {'description': '', 'name': '', 'src_files': [], 'tags': []},
 'net_train_params': {'base_model': <BaseModel.VGG16: {'target_size': (224, 224), 'prep_function': <function preprocess_input at 0x7fa223748e50>}>,
                      'batch_size': 64,
                      'dropout': 0.3,
                      'early_stopping': 10,
                      'learning_rate': 0.001,
                      'n_epochs': 10,
                      'optimizer': <Optimizer.ADAMAX: 'Adamax'>,
                      'test_prop': 0.05,
                      'train_prop': 0.9,
                      'validation_prop': 0.05,
                      'validation_split': 0.1},
 'properties': {'aligned': False,
                'balance_input_data': False,
                'gt_names': 