In [None]:
from pathlib import Path
import pickle
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

import tools

In [None]:
# Dataset dimensions. `n` is presumably the row count of the dataset; it is
# not referenced by any of the visible cells — TODO confirm it is still needed.
n = 100_000
dy = 1
num_cat_features = 10
num_cont_features = 30

# Column names are 'x0', 'x1', ...: the first `num_cat_features` of them form
# the categorical group, the remaining ones the continuous group.
cat_features = [f'x{col}' for col in range(num_cat_features)]
float_features = [
    f'x{col}'
    for col in range(num_cat_features, num_cat_features + num_cont_features)
]
feature_cols = cat_features + float_features

# Target column names: 'y0', ..., one per output dimension.
targets = [f'y{col}' for col in range(dy)]

In [None]:
# Load the dataset and expose features / targets both as DataFrames and as
# plain NumPy arrays.
data = pd.read_csv('data/large.csv')
xdf = data.loc[:, feature_cols]
x = xdf.values
ydf = data.loc[:, targets]
y = ydf.values

# Load the pickled store that accompanies the dataset (the notebook reads the
# 'expected_*' index lists and the 't0' / 't1' entries from it).
# The context manager guarantees the file handle is closed after loading.
# NOTE(security): unpickling executes arbitrary code embedded in the file —
# only load store files that come from a trusted source.
with open('data/store.exp2', 'rb') as store_file:
    store = pickle.load(store_file)

5. Train-test split

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    xdf, ydf, test_size=0.3, random_state=40
)

In [None]:
# Report the dimensions of every split, one per line.
print(
    f'X_train.shape: {X_train.shape}',
    f'X_test.shape: {X_test.shape}',
    f'y_train.shape: {y_train.shape}',
    f'y_test.shape: {y_test.shape}',
    sep='\n',
)

In [None]:
# Column index lists read from the store; they are used as positional
# (.iloc) column selectors elsewhere in the notebook. Presumably these are the
# features that truly drive the target — TODO confirm against the generator.
expected_features = store['expected_features']
expected_cat = store['expected_cat']
expected_cont = store['expected_cont']
expected_cont0 = store['expected_cont0']
expected_cont1 = store['expected_cont1']

In [None]:
# Positional feature indices attributed to the KSG selection (hard-coded here;
# presumably copied from an earlier experiment — TODO confirm provenance).
ksg_selection = np.array([14, 18, 19, 20, 23, 25, 28, 31, 34, 38])

In [None]:
# Number of features in the KSG selection.
ksg_selection.shape[0]

In [None]:
# Positional feature indices attributed to the HSIC selection (hard-coded here;
# presumably copied from an earlier experiment — TODO confirm provenance).
hsic_selection = np.array([4, 11, 14, 18, 19, 20, 23, 24, 28, 31])

In [None]:
# Positional feature indices attributed to the Boruta selection (hard-coded
# here; presumably copied from an earlier experiment — TODO confirm provenance).
boruta_selection = np.array([14, 18, 19, 20, 23, 24, 28, 31])

In [None]:
# Positional feature indices attributed to the minerva selection (hard-coded
# here; presumably copied from an earlier experiment — TODO confirm provenance).
minerva_selection = np.array([3, 5, 6, 8, 14, 18, 19, 20, 24, 28, 31])

### Uncover relation between features and data

In [None]:
# Rebuild the target from the raw data to verify how it depends on the
# expected features.
# NOTE(review): columns are addressed by position in `data` here, whereas the
# model cells index into the feature-only frame — this assumes the feature
# columns come first in the CSV, in x0..xN order. TODO confirm.

# Element-wise flag: True where the two columns listed in expected_cat agree.
_chooser = data.iloc[:, expected_cat[1]] == data.iloc[:, expected_cat[0]]
idx0 = _chooser == 0  # rows where the two columns differ
idx1 = _chooser == 1  # rows where the two columns are equal
y_ = np.zeros(shape=(len(data), dy))
# Differing rows: sin(2*pi*x) of the expected_cont0 columns gets a trailing
# axis, is left-multiplied by store['t0'], and the trailing axis is dropped.
# presumably t0 has shape (dy, len(expected_cont0)) so the broadcast matmul
# yields (rows, dy, 1) — TODO confirm
y_[idx0, :] = (
    store['t0'] @ np.expand_dims(
        np.sin(2 * np.pi * data.loc[idx0].iloc[:, expected_cont0]),
axis=2))[:, :, 0]
# Equal rows: same construction with cos(2*pi*x) of the expected_cont1
# columns and store['t1'].
y_[idx1, :] = (
    store['t1'] @ np.expand_dims(
        np.cos(2 * np.pi * data.loc[idx1].iloc[:, expected_cont1]),
axis=2))[:, :, 0]

In [None]:
# Sanity check: the reconstructed target must match the stored 'y0' column
# within the given absolute / relative tolerances.
assert np.isclose(
    np.squeeze(y_), data['y0'].values, rtol=1e-4, atol=1e-6
).all()

### CatBoost parameters

In [None]:
# Hyper-parameters passed to every CatBoostRegressor in this notebook, so all
# feature subsets are compared under identical model settings.
params = {
    'iterations': 150,
    'depth': 8,
    'random_state': 40,
    # Training output is silenced. 'verbose' used to appear twice in this
    # literal (10, then False); the later entry always won, so only the
    # effective value is kept.
    'verbose': False,
}

## Accuracy of prediction based on all features

In [None]:
# Baseline: fit CatBoost on the complete feature set.
# NOTE(review): `cat_features` is not passed to the model, so the categorical
# columns are presumably treated as numeric — confirm this is intended.
model = CatBoostRegressor(**params)
model.fit(X=X_train, y=y_train)

In [None]:
# Predict on the training split first, then on the held-out split.
train_predictions, test_predictions = (
    model.predict(split) for split in (X_train, X_test)
)

#### Performance assessment

In [None]:
# R2 of the all-features model on the training and the held-out split.
r2_insample = r2_score(y_train, train_predictions)
r2_outsample = r2_score(y_test, test_predictions)

print(f'In-sample R2 score: {r2_insample}')
print(f'Out-sample R2 score: {r2_outsample}')

## Accuracy of prediction based on expected features

In [None]:
# Restrict both splits to the expected feature columns (positional indices).
X_train_expected, X_test_expected = (
    X_train.iloc[:, expected_features],
    X_test.iloc[:, expected_features],
)

In [None]:
# Fit CatBoost on the expected-features subset with the shared parameters.
model_expected = CatBoostRegressor(**params)
model_expected.fit(X=X_train_expected, y=y_train)

In [None]:
# Predict on the training split first, then on the held-out split.
train_predictions_expected, test_predictions_expected = (
    model_expected.predict(split)
    for split in (X_train_expected, X_test_expected)
)

#### Performance assessment

In [None]:
# R2 of the expected-features model on the training and the held-out split.
r2_insample_expected = r2_score(y_train, train_predictions_expected)
r2_outsample_expected = r2_score(y_test, test_predictions_expected)

print(f'In-sample R2 score with expected features: {r2_insample_expected}')
print(f'Out-sample R2 score with expected features: {r2_outsample_expected}')

## Accuracy of prediction based on KSG selection

In [None]:
# Restrict both splits to the KSG-selected columns (positional indices).
X_train_ksg, X_test_ksg = (
    X_train.iloc[:, ksg_selection],
    X_test.iloc[:, ksg_selection],
)

In [None]:
# Fit CatBoost on the KSG-selected subset with the shared parameters.
model_ksg = CatBoostRegressor(**params)
model_ksg.fit(X=X_train_ksg, y=y_train)

In [None]:
# Predict on the training split first, then on the held-out split.
train_predictions_ksg, test_predictions_ksg = (
    model_ksg.predict(split) for split in (X_train_ksg, X_test_ksg)
)

#### Performance assessment

In [None]:
# R2 of the KSG-subset model on the training and the held-out split.
r2_insample_ksg = r2_score(y_train, train_predictions_ksg)
r2_outsample_ksg = r2_score(y_test, test_predictions_ksg)

print(f'In-sample R2 score with KSG selection: {r2_insample_ksg}')
print(f'Out-sample R2 score with KSG selection: {r2_outsample_ksg}')

## Accuracy of prediction based on HSIC selection

In [None]:
# Restrict both splits to the HSIC-selected columns (positional indices).
X_train_hsic, X_test_hsic = (
    X_train.iloc[:, hsic_selection],
    X_test.iloc[:, hsic_selection],
)

In [None]:
# Fit CatBoost on the HSIC-selected subset with the shared parameters.
model_hsic = CatBoostRegressor(**params)
model_hsic.fit(X=X_train_hsic, y=y_train)

In [None]:
# Predict on the training split first, then on the held-out split.
train_predictions_hsic, test_predictions_hsic = (
    model_hsic.predict(split) for split in (X_train_hsic, X_test_hsic)
)

#### Performance assessment

In [None]:
# R2 of the HSIC-subset model on the training and the held-out split.
r2_insample_hsic = r2_score(y_true=y_train, y_pred=train_predictions_hsic)
r2_outsample_hsic = r2_score(y_true=y_test, y_pred=test_predictions_hsic)
# The label names the selector (as the expected / KSG / minerva reports do) so
# this output cannot be mistaken for the all-features baseline scores.
print(f'In-sample R2 score with HSIC selection: {r2_insample_hsic}')
print(f'Out-sample R2 score with HSIC selection: {r2_outsample_hsic}')

## Accuracy of prediction based on boruta selection

In [None]:
# Restrict both splits to the Boruta-selected columns (positional indices).
X_train_boruta, X_test_boruta = (
    X_train.iloc[:, boruta_selection],
    X_test.iloc[:, boruta_selection],
)

In [None]:
# Fit CatBoost on the Boruta-selected subset with the shared parameters.
model_boruta = CatBoostRegressor(**params)
model_boruta.fit(X=X_train_boruta, y=y_train)

In [None]:
# Predict on the training split first, then on the held-out split.
train_predictions_boruta, test_predictions_boruta = (
    model_boruta.predict(split) for split in (X_train_boruta, X_test_boruta)
)

#### Performance assessment

In [None]:
# R2 of the Boruta-subset model on the training and the held-out split.
r2_insample_boruta = r2_score(y_true=y_train, y_pred=train_predictions_boruta)
r2_outsample_boruta = r2_score(y_true=y_test, y_pred=test_predictions_boruta)
# The label names the selector (as the expected / KSG / minerva reports do) so
# this output cannot be mistaken for the all-features baseline scores.
print(f'In-sample R2 score with boruta selection: {r2_insample_boruta}')
print(f'Out-sample R2 score with boruta selection: {r2_outsample_boruta}')

## Accuracy of prediction based on minerva selection

In [None]:
# Restrict both splits to the minerva-selected columns (positional indices).
X_train_minerva, X_test_minerva = (
    X_train.iloc[:, minerva_selection],
    X_test.iloc[:, minerva_selection],
)

In [None]:
# Fit CatBoost on the minerva-selected subset with the shared parameters.
model_minerva = CatBoostRegressor(**params)
model_minerva.fit(X=X_train_minerva, y=y_train)

In [None]:
# Predict on the training split first, then on the held-out split.
train_predictions_minerva, test_predictions_minerva = (
    model_minerva.predict(split) for split in (X_train_minerva, X_test_minerva)
)

#### Performance assessment

In [None]:
# R2 of the minerva-subset model on the training and the held-out split.
r2_insample_minerva = r2_score(y_train, train_predictions_minerva)
r2_outsample_minerva = r2_score(y_test, test_predictions_minerva)

print(f'In-sample R2 score with minerva selection: {r2_insample_minerva}')
print(f'Out-sample R2 score with minerva selection: {r2_outsample_minerva}')