In [1]:
import pandas as pd
import numpy as np
import polars as pl
import pyarrow.parquet as pq

import tensorflow as tf
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Lambda
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam


import gc


from utils import reduce_memory, config

# Project settings object from utils.config; later cells read CONFIG.main
# (used as the root of every data/model path) and CONFIG.N_fold.
CONFIG = config.CONFIG

# Bare expression at the end of the cell: the notebook displays the list of
# GPUs that TensorFlow can see.
tf.config.experimental.list_physical_devices("GPU")
# Disabled: per-GPU memory-growth setup. NOTE(review): `gpus` is not bound
# anywhere in this cell; assign the list returned above to `gpus` before
# re-enabling these lines.
# for gpu in gpus:
#     tf.config.experimental.set_memory_growth(gpu, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
def ANN(shape):
    """Build and compile the feed-forward regression network.

    Architecture: three hidden stages of BatchNormalization -> Dense -> SiLU
    with 512, 512 and 256 units, with Dropout(0.1) in front of the second and
    third stage. The head is a single tanh unit whose output is multiplied by
    5, so predictions stay within [-5, 5].

    Args:
        shape: Number of input features; the input is a flat vector of this
            length.

    Returns:
        A compiled Keras ``Model`` using Adam, mean-squared-error loss and
        ``R2Metric`` as a weighted metric.
    """
    inputs = layers.Input(shape=(shape,), name="input_layer")

    # Layers are created in the same order as a hand-unrolled version would
    # create them, so auto-generated layer names are unaffected by the loop.
    hidden = inputs
    for stage, units in enumerate((512, 512, 256)):
        if stage > 0:
            hidden = layers.Dropout(0.1)(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dense(units, activation=None)(hidden)
        hidden = layers.Activation("silu")(hidden)

    outputs = layers.Dense(1, activation="tanh")(hidden)
    # Scale the tanh output from [-1, 1] to [-5, 5].
    outputs = Lambda(lambda t: t * 5)(outputs)

    model = models.Model(inputs=inputs, outputs=outputs)

    # The legacy `decay=5e-4` optimizer argument is rejected or silently
    # ignored by current Keras optimizers. InverseTimeDecay with decay_steps=1
    # reproduces the same rule, lr = 1e-4 / (1 + 5e-4 * iteration), on every
    # Keras version.
    lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate=1e-4,
        decay_steps=1,
        decay_rate=5e-4,
    )

    model.compile(
        optimizer=Adam(learning_rate=lr_schedule),
        loss="mean_squared_error",
        weighted_metrics=[R2Metric()],
    )
    return model


class R2Metric(tf.keras.metrics.Metric):
    """Sample-weighted R² accumulated over batches.

    The value reported is
    ``1 - sum(w * (y_pred - y_true)**2) / sum(w * y_true**2)``.
    The denominator uses the raw ``y_true**2`` (the target mean is not
    subtracted).
    """

    def __init__(self, name="r2", **kwargs):
        super().__init__(name=name, **kwargs)
        # Running weighted sum of squared residuals (numerator).
        self.squared_error = self.add_weight(name="squared_error", initializer="zeros")
        # Running weighted sum of squared targets (denominator).
        self.total_error = self.add_weight(name="total_error", initializer="zeros")

    @staticmethod
    def _match_rank(tensor, reference):
        """Append trailing size-1 axes until ``tensor`` has ``reference``'s rank.

        Tensors whose static rank is unknown, or already at least as large as
        the reference rank, are returned unchanged.
        """
        tensor_rank = tensor.shape.rank
        reference_rank = reference.shape.rank
        if tensor_rank is None or reference_rank is None:
            return tensor
        for _ in range(reference_rank - tensor_rank):
            tensor = tf.expand_dims(tensor, axis=-1)
        return tensor

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Accumulate one batch into the running sums.

        Args:
            y_true: Targets, e.g. shape ``(batch,)`` or ``(batch, 1)``.
            y_pred: Predictions, e.g. shape ``(batch, 1)``.
            sample_weight: Optional per-sample weights; when omitted every
                sample is weighted 1.
        """
        y_pred = tf.cast(y_pred, dtype=tf.float32)
        # Without rank alignment, (batch, 1) - (batch,) would broadcast to a
        # (batch, batch) matrix and silently corrupt the sums.
        y_true = self._match_rank(tf.cast(y_true, dtype=tf.float32), y_pred)

        if sample_weight is None:
            # tf.cast(None, ...) raises, so fall back to unit weights.
            sample_weight = tf.ones_like(y_pred)
        else:
            sample_weight = self._match_rank(
                tf.cast(sample_weight, dtype=tf.float32), y_pred
            )

        squared_error = (y_pred - y_true) ** 2
        total_error = y_true**2

        # Update the weighted numerator and denominator sums.
        self.squared_error.assign_add(tf.reduce_sum(squared_error * sample_weight))
        self.total_error.assign_add(tf.reduce_sum(total_error * sample_weight))

    def result(self):
        """Return ``1 - squared_error / total_error`` from the running sums."""
        # The tiny constant keeps the denominator from being exactly zero.
        return 1 - (self.squared_error / (self.total_error + 1e-38))

    def reset_state(self):
        """Zero both running sums."""
        self.squared_error.assign(0)
        self.total_error.assign(0)

In [3]:
# Parquet files of the validation split under CONFIG.main: features (X),
# labels (y) and weights (w). A later cell loads all three into memory.
valid_features_file_path = f"{CONFIG.main}/data/training_data_impt/X_valid.parquet"
valid_labels_file_path = f"{CONFIG.main}/data/training_data_impt/y_valid.parquet"
valid_weights_file_path = f"{CONFIG.main}/data/training_data_impt/w_valid.parquet"

In [4]:
# Training hyper-parameters shared with the training loop.
batch_size = 8192
features_shape = 159
epochs = 2_000

# Ends a fit() call once val_loss has not improved for 25 consecutive epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=25)


def _load_validation_array(parquet_path):
    """Read a parquet file, replace NaNs with 0, run it through
    reduce_memory.reduce_mem_usage and return the result's ``.values``."""
    frame = pd.read_parquet(parquet_path).fillna(0)
    return reduce_memory.reduce_mem_usage(frame).values


# Features keep the array as returned; labels and weights are squeezed to
# drop length-1 axes.
features_batch = _load_validation_array(valid_features_file_path)
labels_batch = _load_validation_array(valid_labels_file_path).squeeze()
weights_batch = _load_validation_array(valid_weights_file_path).squeeze()

# Build the (features, labels, weights) validation pipeline on the CPU.
with tf.device("/CPU:0"):
    valid_dataset = tf.data.Dataset.from_tensor_slices(
        (features_batch, labels_batch, weights_batch)
    ).batch(batch_size)

Memory usage of dataframe is 2055.52 MB
Memory usage after optimization is: 1183.70 MB
Decreased by 42.41%
Memory usage of dataframe is 7.09 MB
Memory usage after optimization is: 7.09 MB
Decreased by 0.00%
Memory usage of dataframe is 7.09 MB
Memory usage after optimization is: 7.09 MB
Decreased by 0.00%


In [5]:
def _read_row_groups(parquet_file, row_groups, squeeze=False):
    """Load the given parquet row groups as an array.

    NaNs are replaced with 0 and the frame is passed through
    ``reduce_memory.reduce_mem_usage`` before ``.values`` is taken.

    Args:
        parquet_file: An open ``pq.ParquetFile``.
        row_groups: List of row-group indices to read.
        squeeze: When True, drop length-1 axes from the result (used for the
            labels and weights).

    Returns:
        The ``.values`` array, squeezed when requested.
    """
    frame = parquet_file.read_row_groups(row_groups).to_pandas().fillna(0)
    values = reduce_memory.reduce_mem_usage(frame).values
    return values.squeeze() if squeeze else values


# NOTE(review): the range starts at fold index 4, so folds 0-3 are skipped,
# presumably because they were trained in an earlier run; confirm.
for fold in range(4, CONFIG.N_fold):
    # A new, freshly compiled network for every fold.
    model = ANN(features_shape)

    train_features_file_path = (
        f"{CONFIG.main}/data/training_data_impt/X_train_{fold}.parquet"
    )
    train_labels_file_path = (
        f"{CONFIG.main}/data/training_data_impt/y_train_{fold}.parquet"
    )
    train_weights_file_path = (
        f"{CONFIG.main}/data/training_data_impt/w_train_{fold}.parquet"
    )

    # Open the parquet files lazily; data is read chunk by chunk below.
    train_features_file = pq.ParquetFile(train_features_file_path)
    train_labels_file = pq.ParquetFile(train_labels_file_path)
    train_weights_file = pq.ParquetFile(train_weights_file_path)

    train_row_group = train_features_file.num_row_groups
    if train_row_group == 0:
        raise ValueError(f"{train_features_file_path} contains no row groups")

    # Read about a third of the row groups at a time. max(1, ...) keeps the
    # range() step valid when the file has fewer than three row groups
    # (a step of 0 would make range() raise).
    step = max(1, train_row_group // 3)

    for i in range(0, train_row_group, step):
        print(i)
        batch_end = min(i + step, train_row_group)
        # The same indices are used for all three files.
        # NOTE(review): assumes the label and weight files share the feature
        # file's row-group layout; confirm.
        row_groups = list(range(i, batch_end))

        features_batch = _read_row_groups(train_features_file, row_groups)
        labels_batch = _read_row_groups(train_labels_file, row_groups, squeeze=True)
        weights_batch = _read_row_groups(train_weights_file, row_groups, squeeze=True)

        with tf.device("/CPU:0"):
            train_dataset = tf.data.Dataset.from_tensor_slices(
                (features_batch, labels_batch, weights_batch)
            )
            train_dataset = (
                train_dataset.shuffle(buffer_size=batch_size)
                .batch(batch_size)
                .prefetch(tf.data.experimental.AUTOTUNE)
            )

        # The same model keeps training on each successive chunk; every fit()
        # call runs until `epochs` is reached or the early-stopping callback
        # fires.
        with tf.device("/GPU:0"):
            model.fit(
                train_dataset,
                epochs=epochs,
                validation_data=valid_dataset,
                callbacks=[callback],
            )

        del train_dataset

    # Saved under fold + 1, i.e. file names are numbered from 1.
    tf.keras.models.save_model(model, f"{CONFIG.main}/Models_impt/NN/NN_{fold+1}.h5")

0
Memory usage of dataframe is 3960.00 MB
Memory usage after optimization is: 3696.00 MB
Decreased by 6.67%
Memory usage of dataframe is 22.00 MB
Memory usage after optimization is: 22.00 MB
Decreased by 0.00%
Memory usage of dataframe is 22.00 MB
Memory usage after optimization is: 22.00 MB
Decreased by 0.00%
Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
11
Memory usage of dataframe is 3960.00 MB
Memory usage after optimization is: 3696.00 MB
Decreased by 6.67%
Memory usage of dataframe is 22.00 MB
Memory usage after optimization is: 22.00 MB
Decreased by 0.00%
Memory usage of datafram