# In [None]:
import os, random
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Reproducibility: seed every random number generator this script touches
# (Python's `random`, NumPy, TensorFlow) with one shared value.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here presumably only reaches child processes -- confirm.
SEED = 42
os.environ['PYTHONHASHSEED'] = f'{SEED}'
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# load data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
train_new = pd.read_csv('./data/train_new.csv')
test_new = pd.read_csv('./data/test_new.csv')

# Append the extra columns side by side (axis=1 joins on the row index).
# NOTE(review): assumes the *_new files are row-aligned with the base files
# and do not repeat any column names -- confirm.
train = pd.concat([train, train_new], axis=1)
test = pd.concat([test, test_new], axis=1)

# fill NaNs with per-column means (each frame uses its own means)
# fillna(..., inplace=True) returns None, so assigning its result replaced
# both frames with None; use the returned copy instead.
train = train.fillna(train.mean(numeric_only=True))
test = test.fillna(test.mean(numeric_only=True))

# Every column except the two targets is used as a model input.
# NOTE(review): if train also carries the 'ID' column used for the
# submission, it is fed to the model as a feature -- confirm that is intended.
features = [col for col in train.columns if col not in ['Y1', 'Y2']]
X_full = train[features].values
X_test = test[features].values
y2_full = train['Y2'].values.astype(float)

# model making
def make_model(input_dim):
    """Build and compile the feed-forward regression network.

    The stack is: batch normalisation on the raw inputs, a 256-unit ReLU
    layer, a 512-unit ReLU layer (each followed by 25% dropout) and a
    single-unit output layer with no activation.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled Keras ``Sequential`` model using Adam (learning rate
        0.001) and mean-squared-error loss.
    """
    layers = [
        Input(shape=(input_dim,)),
        BatchNormalization(),
        Dense(256, activation='relu'),
        Dropout(0.25),
        Dense(512, activation='relu'),
        Dropout(0.25),
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
    return network

# 5-fold cv
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
# Out-of-fold predictions: one slot per training row, written by the fold
# that held that row out.
oof = np.zeros(len(train))
# One column of test predictions per fold. The shape has to be passed as a
# single tuple: a second positional argument to np.zeros is the dtype, so
# np.zeros(rows, cols) raises a TypeError.
test_preds = np.zeros((len(test), kf.get_n_splits()))
fold_scores = []

# Train one model per fold; each fold fits its own scalers on the training
# split only, then predicts its validation split and the full test set.
for fold, (train_idx, val_idx) in enumerate(kf.split(X_full), 1):
    X_tr, X_va = X_full[train_idx], X_full[val_idx]
    y2_tr, y2_va = y2_full[train_idx], y2_full[val_idx]

    # feature scaling (statistics come from the training split only)
    x_scaler = StandardScaler().fit(X_tr)
    X_tr_s = x_scaler.transform(X_tr)
    X_va_s = x_scaler.transform(X_va)
    X_test_s = x_scaler.transform(X_test)

    # target scaling (StandardScaler needs a 2-D column, hence the reshape)
    y_scaler = StandardScaler().fit(y2_tr.reshape(-1, 1))
    y2_tr_s = y_scaler.transform(y2_tr.reshape(-1, 1)).ravel()
    y2_va_s = y_scaler.transform(y2_va.reshape(-1, 1)).ravel()

    # callbacks: halve the learning rate after 6 epochs without val_loss
    # improvement, stop after 50 and roll back to the best weights
    callbacks = [
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=6, verbose=1),
        EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True, verbose=1)
    ]

    model = make_model(X_tr_s.shape[1])
    model.fit(
        X_tr_s, y2_tr_s,
        validation_data=(X_va_s, y2_va_s),
        epochs=500,
        batch_size=256,
        callbacks=callbacks,
        verbose=1
    )

    # fold validation preds (invert target scaling)
    va_pred_s = model.predict(X_va_s, verbose=0).ravel()
    va_pred = y_scaler.inverse_transform(va_pred_s.reshape(-1, 1)).ravel()
    oof[val_idx] = va_pred

    # Score on the original target scale and report it; previously the
    # score was stored but never shown.
    fold_r2 = r2_score(y2_va, va_pred)
    fold_scores.append(fold_r2)
    print(f"Fold {fold} R2: {fold_r2:.5f}")

    # test preds for this fold go into column fold-1 (fold is 1-based)
    te_pred_s = model.predict(X_test_s, verbose=0).ravel()
    te_pred = y_scaler.inverse_transform(te_pred_s.reshape(-1, 1)).ravel()
    test_preds[:, fold-1] = te_pred

# Summarise cross-validation: spread of the per-fold scores plus the R2 of
# the stitched out-of-fold predictions over the whole training set.
print(
    f"CV R2: mean={np.mean(fold_scores):.5f}, "
    f"std={np.std(fold_scores):.5f}, "
    f"OOF={r2_score(y2_full, oof):.5f}"
)

# Final Y2 estimate: average the per-fold test predictions row by row.
y2_test_pred = np.mean(test_preds, axis=1)
# Y1 is not modelled here; every test row gets the training-set mean of Y1.
y1_base_pred = np.full(shape=len(test), fill_value=train['Y1'].mean())

# Assemble the submission columns in the order ID, Y1, Y2 and write them out
# without the DataFrame index.
submission_columns = {
    'ID': test['ID'],
    'Y1': y1_base_pred,
    'Y2': y2_test_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('./submission.csv', index=False)
print("Saved submission.csv")