In [1]:
import h5py
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split

In [2]:
def clean_images(images):
    """Sanitise an image array: drop non-finite values and oversized readings.

    ``np.nan_to_num`` is applied with ``copy=False`` (NaN becomes 0, infinities
    become the largest/smallest finite value of the dtype), then every element
    greater than 1000 is set to 0. For an ndarray input the work happens in
    place; the cleaned array is returned.
    """
    cleaned = np.nan_to_num(images, copy=False)
    # Positive infinities were just turned into huge finite numbers, so this
    # mask zeroes them as well; large negative values are left untouched.
    too_large = cleaned > 1000
    cleaned[too_large] = 0
    return cleaned

In [3]:
def cut_images(images, width):
    """Crop a centred ``width`` x ``width`` window out of axes 1 and 2.

    Parameters
    ----------
    images : 4-D array
        Indexed as ``images[sample, row, column, channel]``; only axes 1 and 2
        are cropped.
    width : int
        Side length of the window.

    Returns
    -------
    The slice ``images[:, r0:r0 + width, c0:c0 + width, :]`` (a view when
    ``images`` is a NumPy array). For square inputs and even ``width`` this is
    the same window the previous implementation returned.

    Raises
    ------
    ValueError
        If ``width`` is negative or larger than either cropped axis. Such
        values used to produce a negative start index and a silently wrong
        window.
    """
    n_rows, n_cols = images.shape[1], images.shape[2]
    if width < 0 or width > min(n_rows, n_cols):
        raise ValueError(
            'width must be between 0 and %d, got %r' % (min(n_rows, n_cols), width)
        )

    # Centre each axis on its own midpoint so non-square images are handled.
    row_start = n_rows // 2 - width // 2
    col_start = n_cols // 2 - width // 2

    # Using start + width (not centre + width // 2) keeps odd widths exact.
    return images[:, row_start:row_start + width, col_start:col_start + width, :]

In [4]:
# Output files for the three data partitions.
train_path = 'train.h5'
valid_path = 'valid.h5'
test_path = 'test.h5'

# Indices along the last image axis that are kept when splitting the data.
channels = [0, 3]

In [5]:
def _append_rows(h5file, name, rows):
    """Append ``rows`` along axis 0 of dataset ``name``, creating it if needed.

    The dataset is created empty and unlimited along axis 0 the first time it
    is requested, even when ``rows`` is empty, so it always exists afterwards.
    """
    rows = np.asarray(rows)
    if name not in h5file:
        row_shape = rows.shape[1:]
        h5file.create_dataset(name, shape=(0,) + row_shape, maxshape=(None,) + row_shape)
    if rows.shape[0] == 0:
        return
    dataset = h5file[name]
    first_new = dataset.shape[0]
    dataset.resize(first_new + rows.shape[0], axis=0)
    dataset[first_new:] = rows


def split_data(path, train_dest, valid_dest, test_dest, channels=(0, 1, 2, 3), batch=1024, seed=None):
    """Split an HDF5 image dataset into train / validation / test files.

    The source file is read in chunks of ``batch`` rows. Each chunk is split
    70 % / 15 % / 15 % with ``train_test_split`` and appended to the
    ``'matrix'`` (images) and ``'info'`` (labels) datasets of the destination
    files, which are created or truncated.

    Parameters
    ----------
    path : str
        Source file holding an image dataset ``'matrix'`` and a pandas table
        under key ``'info'`` with a ``'Vmax'`` column used as the label.
    train_dest, valid_dest, test_dest : str
        Destination files, opened in write mode.
    channels : sequence of int
        Indices along the last image axis to keep.
    batch : int
        Number of source rows processed at a time.
    seed : int or None
        Seed for the shuffling. ``None`` (the default) keeps the previous
        behaviour of drawing from NumPy's global random state.

    Notes
    -----
    A chunk with fewer than 4 rows cannot be split three ways (the 30 %
    hold-out would contain a single row, which cannot be halved), so such a
    tail chunk is written entirely to the training file instead of raising
    part-way through the export.
    """
    # Smallest chunk for which both train_test_split calls below succeed.
    min_split_rows = 4

    channel_idx = list(channels)
    rng = None if seed is None else np.random.RandomState(seed)

    # Open the source file and the three destination files.
    with h5py.File(path, 'r') as srcf, \
            h5py.File(train_dest, 'w') as train_w, \
            h5py.File(valid_dest, 'w') as valid_w, \
            h5py.File(test_dest, 'w') as test_w:

        images = srcf['matrix']
        # Only the 'Vmax' column is exported, so extract it once up front.
        labels = pd.read_hdf(path, key='info', mode='r')['Vmax'].to_numpy()
        data_len = labels.shape[0]

        # Walk over the data chunk by chunk.
        for start in range(0, data_len, batch):
            stop = min(start + batch, data_len)

            # Read the chunk and keep only the requested channels.
            img_chunk = images[start:stop][:, :, :, channel_idx]
            label_chunk = labels[start:stop]

            if img_chunk.shape[0] < min_split_rows:
                # Too small to split: everything goes to training; the empty
                # slices only make sure the other datasets exist.
                parts = (
                    (train_w, img_chunk, label_chunk),
                    (valid_w, img_chunk[:0], label_chunk[:0]),
                    (test_w, img_chunk[:0], label_chunk[:0]),
                )
            else:
                # 70 % train, then the remaining 30 % halved into test / validation.
                train_img, held_img, train_lbl, held_lbl = train_test_split(
                    img_chunk, label_chunk, test_size=.3, random_state=rng)
                test_img, valid_img, test_lbl, valid_lbl = train_test_split(
                    held_img, held_lbl, test_size=.5, random_state=rng)
                parts = (
                    (train_w, train_img, train_lbl),
                    (valid_w, valid_img, valid_lbl),
                    (test_w, test_img, test_lbl),
                )

            for dest, part_img, part_lbl in parts:
                _append_rows(dest, 'matrix', part_img)
                _append_rows(dest, 'info', part_lbl)

# Build the three partition files from the raw dataset, keeping only the configured channels.
split_data(
    path='TCIR-ATLN_EPAC_WPAC.h5',
    train_dest=train_path,
    valid_dest=valid_path,
    test_dest=test_path,
    channels=channels,
)

In [None]:
# Read the shape of each partition; the sample counts are reused by later cells.
with h5py.File(train_path, mode='r') as trainsrc, \
        h5py.File(valid_path, mode='r') as validsrc, \
        h5py.File(test_path, mode='r') as testsrc:
    train_shape = trainsrc['matrix'].shape
    valid_shape = validsrc['matrix'].shape
    test_shape = testsrc['matrix'].shape

data_len, valid_data_len, test_data_len = train_shape[0], valid_shape[0], test_shape[0]

print('Dataset de treino: ', train_shape)
print('Dataset de validação: ', valid_shape)
print('Dataset de teste: ', test_shape)

In [None]:
# Display channel index 1 of a randomly chosen image from the training partition.
with h5py.File(train_path, mode='r') as f:
    sample_idx = np.random.randint(0, data_len)
    img = f['matrix'][sample_idx]
    plt.imshow(img[:, :, 1])

In [None]:
def get_mean(file, batch=1024):
    """Compute the per-channel mean of the ``'matrix'`` dataset in an HDF5 file.

    The dataset is read ``batch`` rows at a time, each chunk is passed through
    ``clean_images``, and pixel sums are accumulated per channel (last axis).

    Parameters
    ----------
    file : str
        HDF5 file containing a 4-D ``'matrix'`` dataset.
    batch : int
        Number of rows loaded per read.

    Returns
    -------
    numpy.ndarray
        One float64 mean per channel. An empty dataset yields NaNs.
    """
    with h5py.File(file, mode='r') as src:
        images = src['matrix']
        data_len = images.shape[0]
        img_h, img_w = images.shape[1], images.shape[2]
        totals = np.zeros(images.shape[-1], dtype=np.float64)
        for start in range(0, data_len, batch):
            chunk = clean_images(images[start:min(start + batch, data_len)])
            # Reduce in float64: summing a whole chunk of pixels in the
            # dataset's own (possibly float32) dtype loses precision.
            totals += chunk.sum(axis=(0, 1, 2), dtype=np.float64)
    return totals / (data_len * img_h * img_w)

# Per-channel mean of the training partition (used later for normalisation).
mean = get_mean(file=train_path)
print(mean)

In [None]:
def get_std(file, mean, batch=1024):
    """Compute the per-channel standard deviation of the ``'matrix'`` dataset.

    The dataset is read ``batch`` rows at a time, each chunk is passed through
    ``clean_images``, and squared deviations from ``mean`` are accumulated per
    channel (last axis). The result is the population standard deviation,
    i.e. the divisor is the total number of pixels per channel.

    Parameters
    ----------
    file : str
        HDF5 file containing a 4-D ``'matrix'`` dataset.
    mean : sequence of float
        Per-channel means, indexed like the last axis of the dataset.
    batch : int
        Number of rows loaded per read.

    Returns
    -------
    numpy.ndarray
        One float64 standard deviation per channel. An empty dataset yields NaNs.
    """
    with h5py.File(file, mode='r') as src:
        images = src['matrix']
        data_len = images.shape[0]
        img_h, img_w = images.shape[1], images.shape[2]
        n_channels = images.shape[-1]
        squared_dev = np.zeros(n_channels, dtype=np.float64)
        for start in range(0, data_len, batch):
            chunk = clean_images(images[start:min(start + batch, data_len)])
            for j in range(n_channels):
                # Promote one channel at a time to float64 so neither the
                # subtraction nor the reduction runs in reduced precision,
                # without duplicating the whole chunk in memory.
                deviation = chunk[:, :, :, j].astype(np.float64) - mean[j]
                squared_dev[j] += np.sum(np.square(deviation))
    return np.sqrt(squared_dev / (data_len * img_h * img_w))

# Per-channel standard deviation of the training partition around the training mean.
std = get_std(file=train_path, mean=mean)
print(std)

In [10]:
# Training schedule (read as globals by the batch generators below).
epochs = 500
load_batch = 2048  # rows fetched from disk at once by `generator`
batch = 32         # rows per yielded batch

# Number of whole batches in the training and validation partitions.
iter_train = data_len // batch
iter_valid = valid_data_len // batch

# Side length of the centred crop applied during pre-processing.
img_w = 64

# Output files for the pre-processed partitions.
processed_train = '_train.h5'
processed_valid = '_valid.h5'
processed_test = '_test.h5'

In [11]:

def pre_process(file, dest, width, means, stds, batch=1024):
    """Crop, clean and normalise a partition file into a new HDF5 file.

    ``file`` is read ``batch`` rows at a time. Each image chunk is cropped
    with ``cut_images(..., width)``, sanitised with ``clean_images`` and then,
    channel by channel, has the matching entry of ``means`` subtracted and is
    divided by the matching entry of ``stds``. The result is appended to the
    ``'matrix'`` dataset of ``dest``; the corresponding ``'info'`` rows are
    copied over unchanged. ``dest`` is created or truncated.
    """
    with h5py.File(file, mode='r') as src, h5py.File(dest, mode='w') as destf:
        src_imgs = src['matrix']
        total = src_imgs.shape[0]

        for first in range(0, total, batch):
            rows = slice(first, min(first + batch, total))

            pixels = clean_images(cut_images(src_imgs[rows], width))
            # Normalise in place, one channel at a time.
            for ch, (m, s) in enumerate(zip(means, stds)):
                pixels[..., ch] -= m
                pixels[..., ch] /= s

            labels = src['info'][rows]

            # Append to the destination, creating each dataset on first use.
            for name, block, row_shape in (
                    ('matrix', pixels, pixels.shape[1:]),
                    ('info', labels, ())):
                if name not in destf:
                    destf.create_dataset(name, shape=(0,) + row_shape, maxshape=(None,) + row_shape)
                target = destf[name]
                target.resize(target.shape[0] + block.shape[0], axis=0)
                target[-block.shape[0]:] = block


# Pre-process every partition with the statistics computed on the training set.
for raw_file, processed_file in (
        (train_path, processed_train),
        (valid_path, processed_valid),
        (test_path, processed_test)):
    pre_process(raw_file, processed_file, img_w, mean, std)


In [None]:
# Display channel index 1 of a randomly chosen image from the processed training file.
with h5py.File(processed_train, mode='r') as f:
    sample_idx = np.random.randint(0, data_len)
    img = f['matrix'][sample_idx]
    plt.imshow(img[:, :, 1])

In [13]:
def generator(file):
    """Yield shuffled ``(images, info)`` batches streamed from an HDF5 file.

    Reads the module-level settings ``epochs``, ``load_batch`` and ``batch``.
    For each epoch the row order is reshuffled; rows are then fetched from
    disk in groups of up to ``load_batch``, shuffled again in memory and
    yielded in slices of up to ``batch`` rows. The file stays open for as
    long as the generator is alive.

    Parameters
    ----------
    file : str
        HDF5 file containing ``'matrix'`` and ``'info'`` datasets with the
        same number of rows.

    Yields
    ------
    tuple of numpy.ndarray
        ``(images, info)`` with matching, non-zero first dimension.
    """
    with h5py.File(file, mode='r') as src:
        data_len = src['matrix'].shape[0]
        indexes = list(range(data_len))
        for _ in range(epochs):
            random.shuffle(indexes)
            for i in range(0, data_len, load_batch):
                # h5py point selections must be given in increasing order, so
                # sort for the read and restore randomness afterwards.
                load_indexes = sorted(indexes[i:i + load_batch])
                images = src['matrix'][load_indexes]
                info = src['info'][load_indexes]

                shuffled_idx = np.random.permutation(images.shape[0])
                images = images[shuffled_idx]
                info = info[shuffled_idx]

                # Iterate over the rows actually loaded. The last group of an
                # epoch is usually shorter than load_batch, and stepping up to
                # load_batch would yield empty batches.
                for j in range(0, images.shape[0], batch):
                    yield images[j:j + batch], info[j:j + batch]


In [14]:
def full_gen(file):
    """Yield shuffled ``(images, info)`` batches from a file loaded fully into memory.

    Both the ``'matrix'`` and ``'info'`` datasets are read completely before
    the first batch is produced. Reads the module-level settings ``epochs``
    and ``batch``: for each epoch the row order is reshuffled and the data is
    yielded in slices of up to ``batch`` rows (the last one may be shorter).
    """
    with h5py.File(file, mode='r') as src:
        images, info = src['matrix'][:], src['info'][:]

    n_samples = images.shape[0]
    order = list(range(n_samples))

    for _ in range(epochs):
        random.shuffle(order)
        for start in range(0, n_samples, batch):
            picked = order[start:start + batch]
            yield images[picked], info[picked]

In [15]:
def get_data(file):
    """Load the ``'matrix'`` and ``'info'`` datasets of an HDF5 file into memory.

    Returns a tuple ``(images, info)`` of arrays read in full from ``file``.
    """
    with h5py.File(file, mode='r') as src:
        # tuple() consumes the generator while the file is still open.
        loaded = tuple(src[name][:] for name in ('matrix', 'info'))
    return loaded

In [16]:
# One in-memory batch generator per processed partition.
train_generator = full_gen(processed_train)
valid_generator = full_gen(processed_valid)
test_generator = full_gen(processed_test)

# Each element is (image batch, label batch); the batch dimension is left open.
output_signature = (
    tf.TensorSpec(shape=(None, img_w, img_w, len(channels)), dtype=tf.float32),
    tf.TensorSpec(shape=(None,), dtype=tf.float32)
)


def _dataset_from(batch_generator):
    """Wrap an already-created batch generator in a ``tf.data.Dataset``.

    The callable handed to TensorFlow returns the same generator object every
    time, so iterating the dataset again continues that generator instead of
    restarting it.
    """
    return tf.data.Dataset.from_generator(
        lambda: batch_generator,
        output_signature=output_signature
    )


train_ds = _dataset_from(train_generator)
valid_ds = _dataset_from(valid_generator)
test_ds = _dataset_from(test_generator)

In [17]:

from tensorflow import keras

def build_model(input_shape, strides=(2, 2)):
    """Build and compile the convolutional regression network.

    The network is four strided ReLU convolutions (16, 32, 64 and 128
    filters), a flatten layer, two ReLU dense layers (256 and 64 units) and a
    single linear output unit. Every kernel and bias shares one
    ``RandomNormal(0, 0.01)`` initializer and one ``L1(1e-5)`` regularizer.
    The model is compiled with Adam (learning rate 5e-5), MSE loss and
    MSE / MAE metrics.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single input sample, passed to ``keras.layers.Input``.
    strides : tuple
        Stride used by every convolution.

    Returns
    -------
    The compiled ``keras`` Sequential model.
    """
    initializer = keras.initializers.RandomNormal(mean=0.0, stddev=0.01)
    # Alternatives left disabled: keras.regularizers.L2(1e-5), keras.regularizers.L1L2(1e-5)
    reg = keras.regularizers.L1(1e-5)

    # Keyword arguments common to every trainable layer.
    shared = dict(
        kernel_initializer=initializer,
        kernel_regularizer=reg,
        bias_initializer=initializer,
        bias_regularizer=reg,
    )

    model = keras.models.Sequential()
    model.add(keras.layers.Input(input_shape))

    # Convolutional feature extractor: (filters, kernel size) per layer.
    for filters, kernel_size in ((16, (4, 4)), (32, (3, 3)), (64, (3, 3)), (128, (3, 3))):
        model.add(keras.layers.Conv2D(filters, kernel_size, strides=strides, activation='relu', **shared))

    model.add(keras.layers.Flatten())

    # Fully connected head ending in a single linear output.
    for units, activation in ((256, 'relu'), (64, 'relu'), (1, 'linear')):
        model.add(keras.layers.Dense(units, activation=activation, **shared))

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=.5e-4),
        loss='mse',
        metrics=['mse', 'mae'],
    )
    return model

In [None]:
# Instantiate the network for img_w x img_w inputs with one plane per selected channel.
model = build_model(input_shape=(img_w, img_w, len(channels)))
model.summary()

In [None]:
# Load the processed training and validation partitions fully into memory.
X, Y = get_data(processed_train)
print(X.shape)
valid_X, valid_y = get_data(processed_valid)
print(valid_X.shape)

# Disabled variant that streams batches from the tf.data pipelines instead:
# with tf.device('/GPU:0'):
# model.fit(
#     train_ds,
#     validation_data=valid_ds,
#     epochs=epochs,
#     steps_per_epoch=iter_train,
#     validation_steps=iter_valid
# )

# Train on the in-memory arrays, evaluating on the validation set every third epoch.
fit_options = dict(
    epochs=epochs,
    validation_data=(valid_X, valid_y),
    validation_freq=3,
)
with tf.device('/GPU:0'):
    model.fit(X, Y, **fit_options)

model.save('result.h5')