In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

from models.tensorflow.deepwide import DeepWide
from models.tensorflow.autoint import AutoInt
from models.tensorflow.dcn import DCN


def set_seed(seed=0):
    """Seed the random number generators and switch TensorFlow to deterministic ops.

    Args:
        seed: Value forwarded to ``tf.keras.utils.set_random_seed``.
    """
    seed_everything = tf.keras.utils.set_random_seed
    seed_everything(seed)

    # Seeding alone is not enough for repeatable runs; op determinism is
    # requested as well (it may make some ops slower).
    make_ops_deterministic = tf.config.experimental.enable_op_determinism
    make_ops_deterministic()

In [None]:
def df_to_dataset(X, y, shuffle=True, batch_size=32, drop_remainder=True):
    """Wrap features and labels in a batched, prefetched ``tf.data.Dataset``.

    Args:
        X: Feature container; must support ``len()`` and be accepted by
            ``tf.data.Dataset.from_tensor_slices``.
        y: Labels aligned row-for-row with ``X``.
        shuffle: When true, shuffle with a buffer covering all ``len(X)`` rows
            before batching.
        batch_size: Number of rows per batch.
        drop_remainder: When true (the default, matching the previous
            hard-coded behaviour) a trailing partial batch is discarded, so up
            to ``batch_size - 1`` rows are never yielded and an input shorter
            than ``batch_size`` yields nothing at all. Pass ``False`` for
            validation/test data when every row must be scored.

    Returns:
        The dataset yielding ``(features, labels)`` batches.
    """
    ds = tf.data.Dataset.from_tensor_slices((X, y))

    if shuffle:
        # A buffer as large as the data gives a full shuffle of the rows.
        ds = ds.shuffle(buffer_size=len(X))

    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    ds = ds.prefetch(tf.data.AUTOTUNE)

    return ds


# Hyperparameters used when building the datasets below.
batch_size = 4096
learning_rate = 1e-3

# prepare splits
# shuffle=False keeps the original row order: the last 20% of rows become the
# test set, and the last 10% of the remaining rows become the validation set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, shuffle=False)

# The splits must be pairwise disjoint. Intersecting all three index sets at
# once would pass even if two of the splits shared rows, so check each pair.
assert set(X_train.index).isdisjoint(X_val.index), "train and val splits overlap"
assert set(X_train.index).isdisjoint(X_test.index), "train and test splits overlap"
assert set(X_val.index).isdisjoint(X_test.index), "val and test splits overlap"

# Reduce over the underlying array so the result is a single scalar regardless
# of the pandas version (np.max on a DataFrame returned a per-column Series
# before pandas 2.0).
# NOTE(review): this is the largest value seen in the training rows only; if
# the values are 0-based embedding indices the table may need `max + 1` rows,
# and val/test rows could hold larger values -- confirm against the models.
num_embedding = X_train.to_numpy().max()
num_features = X_train.shape[1]

ds_train = df_to_dataset(X_train, y_train, batch_size=batch_size)
# Evaluation data is not shuffled, so the same rows are scored every epoch and
# val_loss is comparable from one epoch to the next.
ds_val = df_to_dataset(X_val, y_val, shuffle=False, batch_size=batch_size)
ds_test = df_to_dataset(X_test, y_test, shuffle=False, batch_size=batch_size)

In [None]:
def train(
    model,
    ds_train,
    ds_val,
    max_epochs=10,
    learning_rate=1e-3,
    lr_patience=3,
    early_stopping_patience=3,
):
    """Compile ``model`` and fit it with LR reduction and early stopping.

    The model is compiled with the legacy Adam optimizer, binary cross-entropy
    loss and an MSE metric, then fitted on ``ds_train`` while monitoring
    ``val_loss`` on ``ds_val``.

    Args:
        model: Object exposing Keras-style ``compile`` and ``fit`` methods.
        ds_train: Training data passed as the first argument to ``model.fit``.
        ds_val: Validation data passed as ``validation_data``.
        max_epochs: Upper bound on the number of epochs.
        learning_rate: Initial learning rate for Adam.
        lr_patience: ``patience`` of the ``ReduceLROnPlateau`` callback.
        early_stopping_patience: ``patience`` of the ``EarlyStopping`` callback.

    Returns:
        Whatever ``model.fit`` returns (a ``History`` object for Keras models).

    NOTE(review): with both patience values at their default of 3, the learning
    rate reduction typically fires on the same epoch that early stopping ends
    training, so the lower rate never gets used. Pass a smaller ``lr_patience``
    (or a larger ``early_stopping_patience``) to let the scheduler take effect.
    """
    # Reset Keras global state before compiling.
    tf.keras.backend.clear_session()
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=["mse"],
    )
    callbacks = [
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor="val_loss", patience=lr_patience, mode="min"
        ),
        tf.keras.callbacks.EarlyStopping(
            monitor="val_loss", patience=early_stopping_patience
        ),
    ]

    history = model.fit(
        ds_train,
        validation_data=ds_val,
        epochs=max_epochs,
        callbacks=callbacks,
    )

    return history