In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from torcheval.metrics import R2Score
import torch
import optuna
from time import time
from optuna.visualization import plot_optimization_history


In [None]:
# Environment report: print the PyTorch build and the GPU stack it can see.
print(f'PyTorch version: {torch.__version__}')
print('*'*10)
# torch.version.cuda is the CUDA version this PyTorch build was compiled
# against (None on CPU-only builds). The label used to be printed with no
# value after it.
print(f'_CUDA version: {torch.version.cuda}')
print('*'*10)
print(f'CUDNN version: {torch.backends.cudnn.version()}')
print(f'Available GPU devices: {torch.cuda.device_count()}')

In [None]:
# Load the precomputed feature table as a plain NumPy array; a later cell
# treats its last six columns as the regression targets.
feature_data = pd.read_csv("./feature_data/features_big.csv").to_numpy()

In [None]:
# Column-wise mean and standard deviation of the raw training table, with the
# first column dropped from both vectors. np.std uses its default ddof=0
# (population standard deviation).
# NOTE(review): presumably column 0 is the sample id and these statistics match
# the standardisation applied when the feature file was built — confirm.
auxiliary_data = pd.read_csv("./data/train.csv").to_numpy()
means, stds = (
    np.mean(auxiliary_data, axis=0)[1:],
    np.std(auxiliary_data, axis=0)[1:],
)

In [None]:
# Column-wise split of the feature table: the trailing six columns become the
# targets `y`, everything before them is the input matrix `X`.
X, y = feature_data[:, :-6], feature_data[:, -6:]

In [None]:
# Sanity check: both arrays must have the same number of rows, and y six columns.
print(X.shape, y.shape)

In [None]:
# Hold out 5% of the rows for validation. The fixed random_state makes the
# split identical on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.95,
    test_size=0.05,
    random_state=42,
)

In [None]:
def train_validate(X_train, y_train, X_val, y_val, params):
    """Fit an XGBoost regressor and score it on the validation split.

    Args:
        X_train: Training features passed to ``XGBRegressor.fit``.
        y_train: Training targets passed to ``XGBRegressor.fit``.
        X_val: Validation features to predict on.
        y_val: Validation targets as a NumPy array (it is wrapped with
            ``torch.from_numpy``).
        params: Keyword arguments forwarded to ``xgb.XGBRegressor``.

    Returns:
        A ``(model, r2)`` tuple: the fitted regressor and the value returned
        by torcheval's ``R2Score.compute()`` for the validation predictions.
        The score is also printed.
    """
    regressor = xgb.XGBRegressor(**params)
    regressor.fit(X_train, y_train, verbose=True)

    # The torcheval metric is fed tensors, so wrap both NumPy arrays.
    val_predictions = torch.from_numpy(regressor.predict(X_val))
    val_targets = torch.from_numpy(y_val)

    r2_metric = R2Score()
    r2_metric.update(val_predictions, val_targets)
    r2 = r2_metric.compute()
    print(r2)

    return regressor, r2

In [None]:
def objective(trial):
    """Optuna objective: sample XGBoost hyperparameters and return validation R2.

    The sampled values are combined with fixed settings (squared-error
    objective, 1000 estimators, silent logging, CUDA device) and passed to
    ``train_validate`` together with the module-level train/validation arrays.

    Args:
        trial: The Optuna trial used to draw hyperparameter suggestions.

    Returns:
        The R2 score reported by ``train_validate`` for this configuration.
    """
    # Draw the tunable values first, in a fixed order.
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.1, log=True)
    max_depth = trial.suggest_int("max_depth", 1, 5)
    subsample = trial.suggest_float("subsample", 0.05, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.05, 1.0)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 20)

    params = dict(
        objective="reg:squarederror",
        n_estimators=1000,
        verbosity=0,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        min_child_weight=min_child_weight,
        device="cuda",
    )

    # Only the score matters here; the fitted model is discarded.
    score = train_validate(X_train, y_train, X_val, y_val, params)[1]
    return score


In [None]:
# Search for the hyperparameters that maximise the validation R2 returned by
# `objective`, using 8 trials. No sampler or seed is passed, so Optuna's
# defaults apply.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=8)

In [None]:
# Plot the optimization history of the study run above.
plot_optimization_history(study)

In [None]:
# Hand-pinned XGBoost configuration used for the final fit. It carries the same
# keys the Optuna objective passes to the regressor, with verbosity set to 2
# instead of 0.
# NOTE(review): max_depth=9 lies outside the 1-5 range explored by `objective`;
# presumably it was tuned separately — confirm.
best_params = dict(
    learning_rate=0.018497092338319334,
    max_depth=9,  # Should be around 7-9
    subsample=0.5786198769876968,
    colsample_bytree=0.12123574373238373,
    min_child_weight=9,
    objective="reg:squarederror",
    n_estimators=1000,
    device="cuda",
    verbosity=2,
)

In [None]:
# Final fit with the pinned parameters; `model` is reused for the test-set
# predictions further down.
model, r2 = train_validate(X_train, y_train, X_val, y_val, best_params)

In [None]:
def predict_test(model, ids, X_test):
    """Predict on the test set and write a timestamped submission CSV.

    Predictions are rescaled with the last six entries of the module-level
    ``stds`` and ``means`` and written to
    ``./submission/submission_<time()>.csv`` with an ``id`` column followed
    by the six target columns.

    Args:
        model: Fitted estimator exposing ``predict``.
        ids: Sample identifiers, one per test row; shape ``(n, 1)`` or ``(n,)``.
        X_test: Test feature matrix passed to ``model.predict``.

    Returns:
        None. The CSV file is the only output.
    """
    y_test = model.predict(X_test)
    # NOTE(review): assumes the targets were standardised with exactly these
    # statistics and that the model's six outputs follow the column order
    # used below — confirm against the feature-building code.
    y_test_scaled = y_test * stds[-6:] + means[-6:]

    DF = pd.DataFrame(y_test_scaled, columns=["X4","X11","X18","X50","X26","X3112"])

    # Stacking ids and predictions into one array (as was done before) promotes
    # the ids to float, so the file contained ids such as "123.0". Insert the
    # ids as their own column and keep integral-valued ids as integers.
    id_column = np.asarray(ids).reshape(-1)
    if (
        np.issubdtype(id_column.dtype, np.floating)
        and np.isfinite(id_column).all()
        and (id_column == np.round(id_column)).all()
    ):
        id_column = id_column.astype(np.int64)
    DF.insert(0, "id", id_column)

    ts = time()
    DF.to_csv(f"./submission/submission_{ts}.csv", index=False)

In [None]:
# Load the test feature table as a NumPy array; the next cell takes its first
# column as the ids and the rest as model inputs.
test_data = pd.read_csv("./feature_data/features_test.csv").to_numpy()

In [None]:
# The first column is kept as a 2-D (n, 1) array of row ids; the remaining
# columns are the model inputs.
ids, X_test = test_data[:, :1], test_data[:, 1:]

In [None]:
# Write the submission file for the test set using the final model.
predict_test(model, ids, X_test)