In [None]:
import os
import h5py
import time
import datetime
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import json
from tensorflow import keras
from keras.callbacks import ModelCheckpoint

In [None]:
# Presumably a seed kept from an earlier run so that run can be reproduced — uncomment to reuse it.
# seed = 1747265027
seed = int(time.time()) % (2**32 - 1)  # or: random.randint(0, 999999)
print(f"Usando seed: {seed}")

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it here
# presumably only affects processes started afterwards — confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
# Seed the Python, NumPy and TensorFlow random generators with the same value.
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)
# Restrict TensorFlow to one inter-op and one intra-op thread
# (presumably to make runs more repeatable — confirm).
tf.config.threading.set_inter_op_parallelism_threads(1)
tf.config.threading.set_intra_op_parallelism_threads(1)

In [None]:
# Output HDF5 files, one per split (test_path is not used by the cells visible here).
train_path, valid_path, test_path = '/kaggle/working/train.h5', '/kaggle/working/valid.h5', '/kaggle/working/test.h5'
# Raw input HDF5 files; each is read through its 'matrix' dataset and its 'info' table.
ds = [f'/kaggle/input/tcir-atln-epac-wpac-h5/{i}' for i in ['TCIR-ATLN_EPAC_WPAC.h5', 'TCIR-CPAC_IO_SH.h5']]
# Indices selected on the last axis of the image array
# (presumably the IR1 and PMW channels of TCIR — confirm).
channels = [0, 3]
# Side length, in pixels, of the square images fed to the model.
img_w = 64
# Number of samples read from disk per chunk during the statistics/preprocessing passes.
load_batch = 4096
# Number of training epochs and the training mini-batch size.
epochs = 500
batch = 8

In [None]:
def get_images_slice(images_shape, width):
    """Return the slice that selects a centred window of `width` pixels.

    The window is computed from axis 1 of `images_shape`; callers apply the
    same slice to axis 2 as well, so the images are assumed to be square
    (TODO confirm against the dataset).

    Args:
        images_shape: Shape of the image array; only index 1 is read.
        width: Number of pixels the window must contain.

    Returns:
        A `slice` covering exactly `width` indices around `images_shape[1] // 2`.
        For even widths this is the same slice as `centre - width // 2` to
        `centre + width // 2`.

    Raises:
        ValueError: If `width` is negative or larger than the image side. A
            negative start index would otherwise silently select the wrong pixels.
    """
    size = images_shape[1]
    if width < 0 or width > size:
        raise ValueError(
            f"width must be between 0 and the image side ({size}), got {width}"
        )
    start = size // 2 - width // 2
    # Derive the end from the start so odd widths also yield exactly `width` pixels.
    return slice(start, start + width)

In [None]:
def cut_images(images, width):
    """Crop a batch of images to a centred window.

    Args:
        images: Array indexed as (sample, row, column, channel).
        width: Requested side length of the window, passed to `get_images_slice`.

    Returns:
        `images` with the row and column axes restricted to the centred window;
        the sample and channel axes are kept whole.
    """
    window = get_images_slice(images.shape, width)
    # The same window is applied to both spatial axes.
    cropped = images[:, window, window, :]
    return cropped

In [None]:
def clean_images(images):
    """Replace invalid pixel values with 0, modifying `images` in place.

    NaN, +inf and -inf become 0, and so does every value greater than 1000.
    Negative infinity is zeroed explicitly: by default it would become the most
    negative finite float, pass the `> 1000` filter and distort any statistic
    computed afterwards.

    Args:
        images: NumPy array of pixel values. It is mutated (`copy=False`).

    Returns:
        The cleaned array.
    """
    images = np.nan_to_num(images, copy=False, nan=0.0, posinf=0.0, neginf=0.0)
    # Values above 1000 are treated as invalid readings and zeroed as well.
    images[images > 1000] = 0
    return images

In [None]:
def split_data(images, info):
    """Split a chunk into training and validation subsets by observation year.

    Rows whose `time` (format ``YYYYMMDDHH``) falls in 2003-2014 go to the
    training subset and rows in 2015-2016 go to the validation subset; all
    other rows are dropped.

    Args:
        images: Array whose first axis is aligned with the rows of `info`.
        info: DataFrame with a string `time` column and a `Vmax` column.

    Returns:
        ``((train_images, train_vmax), (valid_images, valid_vmax))``.
    """
    parsed_years = np.array(
        [datetime.datetime.strptime(stamp, "%Y%m%d%H").year for stamp in list(info['time'])]
    )
    vmax = info['Vmax'].to_numpy()

    def _subset(first_year, last_year):
        # Positional indices of the rows whose year lies in the inclusive range.
        in_range = (parsed_years >= first_year) & (parsed_years <= last_year)
        rows = np.where(in_range)[0]
        return images[rows], vmax[rows]

    training = _subset(2003, 2014)
    validation = _subset(2015, 2016)
    return training, validation

In [None]:
def get_mean(files, batch=1024, width=64):
    """Compute the per-channel mean pixel value of the training split.

    Each file is read in chunks of `batch` samples. Every chunk is cropped to
    the centred window, cleaned with `clean_images` and reduced to its
    training rows by `split_data` before being accumulated.

    Args:
        files: Paths of HDF5 files holding a `matrix` dataset and an `info` table.
        batch: Number of samples read from disk per chunk.
        width: Side length of the centred window the statistics are taken over.

    Returns:
        A float64 array with one mean per entry of the module-level `channels`.

    Raises:
        ValueError: If no training sample is found, since the mean would be 0/0.
    """
    channel_sums = np.zeros(len(channels))
    pixels_per_channel = 0
    for file in files:
        with h5py.File(file, mode='r') as src:
            images = src['matrix']
            info = pd.read_hdf(file, key='info', mode='r')
            slc = get_images_slice(images.shape, width)
            file_len = images.shape[0]
            for start in range(0, file_len, batch):
                stop = min(start + batch, file_len)
                image_chunk = clean_images(images[start:stop, slc, slc, channels])
                (train_image, _), _ = split_data(image_chunk, info[start:stop])
                if train_image.shape[0] == 0:
                    continue
                # Count the pixels that were really read rather than assuming
                # width * width, so the mean stays exact whatever the crop size.
                pixels_per_channel += int(np.prod(train_image.shape[:3]))
                # Accumulate in float64 to limit rounding error over many pixels.
                channel_sums += train_image.sum(axis=(0, 1, 2), dtype=np.float64)
    if pixels_per_channel == 0:
        raise ValueError("no training samples found; cannot compute the mean")
    return channel_sums / pixels_per_channel

# Per-channel mean of the training split, later passed to get_std and pre_process.
mean = get_mean(ds, batch=load_batch, width=img_w)
print(mean)

In [None]:
def get_std(files, mean, batch=1024, width=64):
    """Compute the per-channel population standard deviation of the training split.

    Each file is read in chunks of `batch` samples. Every chunk is cropped to
    the centred window, cleaned with `clean_images` and reduced to its
    training rows by `split_data`; squared deviations from `mean` are then
    accumulated.

    Args:
        files: Paths of HDF5 files holding a `matrix` dataset and an `info` table.
        mean: Per-channel means, indexable by channel position.
        batch: Number of samples read from disk per chunk.
        width: Side length of the centred window the statistics are taken over.

    Returns:
        A float64 array with one standard deviation (divisor N, not N - 1) per
        entry of the module-level `channels`.

    Raises:
        ValueError: If no training sample is found, since the result would be 0/0.
    """
    n_channels = len(channels)
    squared_dev_sums = np.zeros(n_channels)
    pixels_per_channel = 0
    for file in files:
        with h5py.File(file, mode='r') as src:
            images = src['matrix']
            info = pd.read_hdf(file, key='info', mode='r')
            slc = get_images_slice(images.shape, width)
            file_len = images.shape[0]
            for start in range(0, file_len, batch):
                stop = min(start + batch, file_len)
                image_chunk = clean_images(images[start:stop, slc, slc, channels])
                (train_image, _), _ = split_data(image_chunk, info[start:stop])
                if train_image.shape[0] == 0:
                    continue
                # Count the pixels that were really read rather than assuming
                # width * width, so the variance stays exact whatever the crop size.
                pixels_per_channel += int(np.prod(train_image.shape[:3]))
                for j in range(n_channels):
                    # One channel at a time keeps the float64 temporary small.
                    centered = train_image[:, :, :, j].astype(np.float64) - mean[j]
                    squared_dev_sums[j] += np.sum(centered * centered)
    if pixels_per_channel == 0:
        raise ValueError("no training samples found; cannot compute the standard deviation")
    return np.sqrt(squared_dev_sums / pixels_per_channel)

# Per-channel standard deviation of the training split, later passed to pre_process.
std = get_std(ds, mean, batch=load_batch, width=img_w)
print(std)

In [None]:

def pre_process(files, width, means, stds, batch=1024):
    """Write the normalised training and validation sets to `train_path` / `valid_path`.

    Each source file is read in chunks of `batch` samples. A chunk is cropped
    to a centred window, cleaned, normalised per channel with `means` and
    `stds`, split by year with `split_data`, and appended to the resizable
    `matrix` (images) and `info` (Vmax) datasets of the matching output file.
    Both output files are opened in write mode, so existing content is replaced.

    Args:
        files: Paths of HDF5 files holding a `matrix` dataset and an `info` table.
        width: Final image side length; a larger window is stored (see below).
        means: Per-channel values subtracted from the pixels.
        stds: Per-channel values the centred pixels are divided by.
        batch: Number of samples read from disk per chunk.
    """
    # Crop a window large enough for the image to be rotated: the diagonal of a
    # width x width square, rounded up to the next even number.
    rotation_width = int(np.ceil(np.sqrt((width ** 2) * 2)))
    rotation_width += rotation_width % 2

    def _append(target, name, rows, row_shape):
        # Create the growable dataset on first use, then extend it by the new rows.
        if name not in target:
            target.create_dataset(name, shape=(0,) + row_shape, maxshape=(None,) + row_shape)
        count = rows.shape[0]
        target[name].resize(target[name].shape[0] + count, axis=0)
        target[name][-count:] = rows

    with h5py.File(train_path, 'w') as train, h5py.File(valid_path, 'w') as valid:
        for file in files:
            with h5py.File(file, mode='r') as src:
                images = src['matrix']
                info = pd.read_hdf(file, key='info', mode='r')[['Vmax', 'time']]
                slc = get_images_slice(images.shape, rotation_width)
                file_len = images.shape[0]
                for first in range(0, file_len, batch):
                    rows = slice(first, min(first + batch, file_len))
                    img_chunk = clean_images(images[rows, slc, slc, channels])
                    info_chunk = info[rows]

                    # Standardise each selected channel in place.
                    for j, (m, s) in enumerate(zip(means, stds)):
                        img_chunk[:, :, :, j] -= m
                        img_chunk[:, :, :, j] /= s

                    sample_shape = img_chunk.shape[1:]
                    (train_img, train_info), (valid_img, valid_info) = split_data(img_chunk, info_chunk)

                    for target, split_img, split_info in (
                        (train, train_img, train_info),
                        (valid, valid_img, valid_info),
                    ):
                        if split_img.shape[0] > 0:
                            _append(target, 'matrix', split_img, sample_shape)
                            _append(target, 'info', split_info, ())


# Build the normalised train/valid HDF5 files from the raw input files.
pre_process(ds, img_w, mean, std, batch=load_batch)


In [None]:
# Open the two split files read-only just to find out how many samples each one holds.
with h5py.File(train_path, mode='r') as trainsrc, h5py.File(valid_path, mode='r') as validsrc:
    data_len = trainsrc['matrix'].shape[0]
    valid_data_len = validsrc['matrix'].shape[0]

    print('Dataset de treino: ', trainsrc['matrix'].shape)
    print('Dataset de validação: ', validsrc['matrix'].shape)

# Number of whole batches in each split; a trailing partial batch is not counted.
iter_train = data_len // batch
iter_valid = valid_data_len // batch

In [None]:
def parse_example(image, label):
    """Turn one stored (image, label) pair into an augmented training example.

    The image is cast to float32 and passed through `preprocess_image_tf`;
    the label is returned untouched.
    """
    augmented = preprocess_image_tf(tf.cast(image, tf.float32))
    return augmented, label

def preprocess_image_tf(image):
    """Rotate an image by a random angle about its centre, then crop/pad it to img_w x img_w.

    Assumes `image` is a single (height, width, channels) tensor — TODO confirm;
    a batch axis is added before the transform and removed afterwards.

    Returns:
        The rotated image, centrally cropped or padded to `img_w` x `img_w`.
    """
    # Rotation angle drawn uniformly from [0, 2*pi).
    angle_rad = tf.random.uniform([], 0, 2 * np.pi)
    image_shape = tf.shape(image)[0:2]
    # Rotation centre: half the width (x) and half the height (y).
    cx = tf.cast(image_shape[1] / 2, tf.float32)
    cy = tf.cast(image_shape[0] / 2, tf.float32)
    cos_a = tf.math.cos(angle_rad)
    sin_a = tf.math.sin(angle_rad)
    # Eight transform parameters: two affine rows describing a rotation about
    # (cx, cy), followed by two zero projective terms.
    transform = tf.stack([
        cos_a, -sin_a, (1 - cos_a) * cx + sin_a * cy,
        sin_a,  cos_a, (1 - cos_a) * cy - sin_a * cx,
        0.0,    0.0
    ])
    transform = tf.reshape(transform, [8])
    # Add the leading batch axis to both the transform and the image.
    transform = tf.expand_dims(transform, 0)
    image = tf.expand_dims(image, 0)
    # Output keeps the input's spatial size; pixels outside the source are
    # filled by reflection, with bilinear interpolation elsewhere.
    rotated = tf.raw_ops.ImageProjectiveTransformV3(
        images=image,
        transforms=transform,
        output_shape=image_shape,
        interpolation="BILINEAR",
        fill_mode="REFLECT",
        fill_value=0.0
    )
    rotated = tf.squeeze(rotated, 0)
    # Reduce (or pad) to the model's input size around the centre.
    return tf.image.resize_with_crop_or_pad(rotated, img_w, img_w)

def load_dataset(file, batch_size):
    """Build the endless, shuffled and augmented training pipeline for one HDF5 file.

    The whole `matrix` and `info` datasets are loaded into memory, shuffled,
    repeated indefinitely, augmented with `parse_example`, batched and
    prefetched.

    Args:
        file: Path of an HDF5 file with `matrix` (images) and `info` (labels) datasets.
        batch_size: Number of examples per batch.

    Returns:
        A `tf.data.Dataset` yielding `(images, labels)` batches forever, so the
        caller must bound each epoch (e.g. with `steps_per_epoch`).
    """
    with h5py.File(file, 'r') as f:
        images = f['matrix'][:]
        labels = f['info'][:]

    dataset = tf.data.Dataset.from_tensor_slices((images, labels))
    # Shuffle before repeating: each pass then sees every sample exactly once,
    # in a new order. Repeating first would let the shuffle buffer mix samples
    # from consecutive passes, so some samples could appear twice within one
    # epoch and others not at all.
    dataset = dataset.shuffle(buffer_size=len(images), reshuffle_each_iteration=True)
    dataset = dataset.repeat()
    dataset = dataset.map(parse_example, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset


In [None]:
def get_data(file):
    """Load a whole split as a pair of constant tensors, without augmentation.

    Args:
        file: Path of an HDF5 file with `matrix` (images) and `info` (labels) datasets.

    Returns:
        `(images, labels)` as `tf.constant` tensors, the images centre-cropped
        to `img_w` x `img_w` by `cut_images`.
    """
    with h5py.File(file, mode='r') as src:
        # `[:]` copies both datasets into memory, so nothing below needs the open file.
        stored_images = src['matrix'][:]
        targets = src['info'][:]
    center_crops = cut_images(stored_images, img_w)
    return tf.constant(center_crops), tf.constant(targets)

In [None]:
# Training data: an endlessly repeating, shuffled and augmented tf.data pipeline.
train_ds = load_dataset(train_path, batch)
# Validation data: an (images, labels) pair of in-memory tensors, centre-cropped and not augmented.
valid_ds = get_data(valid_path)

In [None]:

def build_model(input_shape, strides=(2, 2)):
    """Build and compile the convolutional regression network.

    The network has four strided ReLU convolutions (16, 32, 64 and 128
    filters), a flatten layer, two ReLU dense layers (256 and 64 units) and a
    single linear output unit. Every kernel and bias is drawn from
    N(0, 0.01^2) and carries an L2 penalty of 1e-5. The model is compiled with
    Adam (learning rate 5e-5) and mean-squared-error loss.

    Args:
        input_shape: Shape of one input image, without the batch axis.
        strides: Stride shared by all convolution layers.

    Returns:
        The compiled `keras.Sequential` model.
    """
    reg = keras.regularizers.L2(1e-5)

    def _weights():
        # Fresh initializer objects for every layer: Keras documents that a
        # single initializer instance created with an integer or None seed
        # returns the same random values on repeated calls, so sharing one
        # instance would tie the initial weights of different layers together.
        return dict(
            kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.01),
            kernel_regularizer=reg,
            bias_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.01),
            bias_regularizer=reg,
        )

    model = keras.models.Sequential()
    model.add(keras.layers.Input(input_shape))

    # Convolutional feature extractor.
    for filters, kernel_size in ((16, (4, 4)), (32, (3, 3)), (64, (3, 3)), (128, (3, 3))):
        model.add(keras.layers.Conv2D(filters, kernel_size, strides=strides, activation='relu', **_weights()))

    model.add(keras.layers.Flatten())

    # Dense regression head ending in one linear output.
    for units, activation in ((256, 'relu'), (64, 'relu'), (1, 'linear')):
        model.add(keras.layers.Dense(units, activation=activation, **_weights()))

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=.5e-4), loss='mse', metrics=['mse'])
    return model

In [None]:
def training_number():
    """Yield 0, 1, 2, ... without end; each value labels one training run."""
    run_index = -1
    while True:
        run_index += 1
        yield run_index

# One shared generator, so each `next()` call hands out a new run number.
training_n_gen = training_number()

In [None]:
# Number of this training run; used in the name of the metadata file written afterwards.
training_n = next(training_n_gen)

model = build_model((img_w, img_w, len(channels)),)
model.summary()

# Keep a checkpoint whenever the validation loss improves; the epoch number and
# the loss value are part of the file name.
best_model_path = '{epoch:02d}-{val_loss:.2f}.keras'
callback = ModelCheckpoint(filepath=best_model_path,
                           monitor='val_loss',
                           verbose=0,
                           save_best_only=True,
                           mode='min')

with tf.device('/GPU:0'):
    history = model.fit(
        train_ds,
        validation_data=valid_ds,
        # valid_ds is a pair of in-memory tensors, not a tf.data pipeline, so
        # Keras batches it itself. Give the batch size explicitly and let Keras
        # derive the number of validation steps: every validation sample is then
        # evaluated once per epoch, instead of pairing a step count computed for
        # batches of `batch` samples with Keras' default validation batch size.
        validation_batch_size=batch,
        epochs=epochs,
        # The training pipeline repeats forever, so an epoch must be bounded here.
        steps_per_epoch=iter_train,
        callbacks=[callback],
    )

# Metadata describing this run: seed, input shape, data sources, normalisation
# parameters and the per-epoch loss curves recorded by `fit`.
model_info_dict = {
    "seed": seed,
    "shape": [
        img_w,
        img_w,
        len(channels)
    ],
    "channels": channels,
    "dataset": ds,
    "batch": batch,
    # One {"mean", "std"} entry per selected channel, in the order of `channels`.
    "normparams": [{"mean": mean[i], "std": std[i]} for i in range(len(channels))],
    # The compiled loss is MSE, so these are the validation/training MSE per epoch
    # (presumably including the regularisation penalties — confirm).
    "validmse": list(history.history['val_loss']),
    "trainingmse": list(history.history['loss'])
}

# Serialise first, then write, so the file is only created once the JSON is valid.
json_info = json.dumps(model_info_dict, indent=4)
with open(f'n{training_n}-model_info.json', 'w') as outfile:
    outfile.write(json_info)