In [None]:
from pathlib import Path
import pickle
import pandas as pd
import numpy as np
import torch
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

from experiment_2 import utils
from data.benchmark_selection import ksg_selection, hsic_selection, boruta_selection
from data.minerva_selection import minerva_selection_1

In [None]:
# CSV file handed to `utils.load_data` below.
# Alternative data set: 'data/exp2filtered.csv'
dataset_path = "data/exp2.csv"

In [None]:
# Load the feature frame and target frame, plus the metadata returned by
# `utils.load_data` (presumably the float/categorical feature names and the
# target names -- confirm against `experiment_2.utils`).
xdf, ydf, float_features, cat_features, targets = utils.load_data(dataset_path)

# Seed shared by both splits so that every run (Restart & Run All) evaluates
# the feature selections on the same train/validation/test partition.
# Set to None to draw a fresh random partition on each run.
SPLIT_SEED = 15

# Step 1: hold out 20% of all rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    xdf,
    ydf,
    test_size=0.2,
    random_state=SPLIT_SEED,
)

# Step 2: carve the validation set out of the REMAINING rows only.
# Re-splitting the full `xdf`/`ydf` here (as was done before) puts test rows
# into the training and validation sets, which leaks the test data into model
# fitting / early stopping and inflates the out-of-sample R2.
# With test_size=0.3 of the remaining 80%, the final proportions are roughly
# 56% train / 24% validation / 20% test.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.3,
    random_state=SPLIT_SEED,
)

# The three subsets must partition the data set (no row dropped or reused).
assert len(X_train) + len(X_val) + len(X_test) == len(xdf)

print(f'X_train.shape: {X_train.shape}')
print(f'X_val.shape: {X_val.shape}')
print(f'X_test.shape: {X_test.shape}')
print(f'y_train.shape: {y_train.shape}')
print(f'y_val.shape: {y_val.shape}')
print(f'y_test.shape: {y_test.shape}')

### CatBoost parameters

In [None]:
# Keyword arguments unpacked into `CatBoostRegressor(**params)`.
params = dict(
    iterations=5000,   # upper bound; `fit` is called with early stopping
    depth=8,
    random_state=15,   # fixed seed for the model
    verbose=False,     # suppress per-iteration training output
)

In [None]:
def train_and_evaluate(X_train, y_train, X_val, y_val, X_test, y_test, selection=None):
    """Fit a CatBoost regressor on a subset of columns and score it with R2.

    The regressor is configured from the module-level ``params`` dict, fitted
    on the training split, and given the validation split as ``eval_set`` with
    ``early_stopping_rounds=20``.

    Parameters
    ----------
    X_train, X_val, X_test
        Feature frames of the three splits; each is indexed with
        ``.loc[:, selection]``, so they must all contain the selected columns.
    y_train, y_val, y_test
        Targets belonging to the corresponding feature frames.
    selection
        Column labels to keep. ``None`` (the default) keeps every column of
        ``X_train``.

    Returns
    -------
    tuple
        ``(r2_insample, r2_outsample)``: the R2 score on the training split
        and the R2 score on the test split.
    """
    columns = X_train.columns.tolist() if selection is None else selection

    # Work on copies restricted to the selected columns so the caller's frames
    # are never handed to the model directly.
    features_train, features_val, features_test = (
        frame.loc[:, columns].copy() for frame in (X_train, X_val, X_test)
    )

    regressor = CatBoostRegressor(**params)
    regressor.fit(
        features_train,
        y_train,
        eval_set=(features_val, y_val),
        early_stopping_rounds=20,
    )

    fitted_train = regressor.predict(features_train)
    fitted_test = regressor.predict(features_test)
    return (
        r2_score(y_true=y_train, y_pred=fitted_train),
        r2_score(y_true=y_test, y_pred=fitted_test),
    )

## Accuracy of prediction based on all features

In [None]:
# Baseline: `selection` is left at its default (None), so the model is trained
# on every column of X_train.
r2_insample, r2_outsample = train_and_evaluate(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    y_test=y_test,
)
print(f'In-sample R2 score: {r2_insample}')
print(f'Out-sample R2 score: {r2_outsample}')

## Accuracy of prediction based on KSG selection

In [None]:
# Same pipeline as the baseline, restricted to the columns listed in
# `ksg_selection` (imported from data.benchmark_selection).
r2_insample_ksg, r2_outsample_ksg = train_and_evaluate(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    y_test=y_test,
    selection=ksg_selection,
)
print(f'In-sample R2 score with KSG selection: {r2_insample_ksg}')
print(f'Out-sample R2 score with KSG selection: {r2_outsample_ksg}')

## Accuracy of prediction based on HSIC selection

In [None]:
# Train and score the model on the columns listed in `hsic_selection`
# (imported from data.benchmark_selection).
r2_insample_hsic, r2_outsample_hsic = train_and_evaluate(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    selection=hsic_selection,
)
# Name the selection method in the output, as the KSG cell does, so the
# printed scores can be told apart when the notebook is skimmed.
print(f'In-sample R2 score with HSIC selection: {r2_insample_hsic}')
print(f'Out-sample R2 score with HSIC selection: {r2_outsample_hsic}')

## Accuracy of prediction based on Boruta selection

In [None]:
# Train and score the model on the columns listed in `boruta_selection`
# (imported from data.benchmark_selection).
r2_insample_boruta, r2_outsample_boruta = train_and_evaluate(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    selection=boruta_selection,
)
# Name the selection method in the output, as the KSG cell does, so the
# printed scores can be told apart when the notebook is skimmed.
print(f'In-sample R2 score with Boruta selection: {r2_insample_boruta}')
print(f'Out-sample R2 score with Boruta selection: {r2_outsample_boruta}')

## Accuracy of prediction based on Minerva selection

In [None]:
# Train and score the model on the columns listed in `minerva_selection_1`
# (imported from data.minerva_selection).
r2_insample_minerva, r2_outsample_minerva = train_and_evaluate(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    selection=minerva_selection_1,
)
# Name the selection method in the output, as the KSG cell does, so the
# printed scores can be told apart when the notebook is skimmed.
print(f'In-sample R2 score with Minerva selection: {r2_insample_minerva}')
print(f'Out-sample R2 score with Minerva selection: {r2_outsample_minerva}')