In [1]:
from pathlib import Path
import pickle
import pandas as pd
import numpy as np
import torch

import tools

In [2]:
# Problem dimensions: sample count, target width, and how many categorical
# and continuous feature columns the dataset carries.
n = 100000
dy = 1
num_cat_features = 10
num_cont_features = 30

# Column names are 'x0', 'x1', ...; the categorical block comes first and the
# continuous block follows it.
feature_cols = [
    f'x{col}' for col in range(num_cat_features + num_cont_features)
]
cat_features, float_features = (
    feature_cols[:num_cat_features],
    feature_cols[num_cat_features:],
)

# Target column names are 'y0', 'y1', ... (one per output dimension).
targets = [f'y{out}' for out in range(dy)]

In [3]:
# Load the dataset and split it into a feature frame / target frame, keeping
# both the DataFrame views (xdf, ydf) and their plain ndarray values (x, y).
data = pd.read_csv('data/large.csv')
xdf = data.loc[:, feature_cols]
x = xdf.values
ydf = data.loc[:, targets]
y = ydf.values

# Load the pickled store that accompanies the dataset. The context manager
# guarantees the file handle is closed even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only run this on a
# store file that comes from a trusted source.
with open('data/store.exp2', 'rb') as _store_file:
    store = pickle.load(_store_file)

In [4]:
# Pull the reference entries out of the store in one go. Several of them are
# used further down as positional column selectors (via DataFrame.iloc).
(
    expected_cat,
    expected_cont0,
    expected_cont1,
    expected_cont,
    expected_features,
) = [
    store[_key]
    for _key in (
        'expected_cat',
        'expected_cont0',
        'expected_cont1',
        'expected_cont',
        'expected_features',
    )
]

### Uncover the relation between the features and the target

In [5]:
# Rebuild the target from the features. Rows are split by whether the two
# columns selected through expected_cat hold equal values; each group then
# gets its own weights from the store, its own feature columns, and its own
# trigonometric function.
_chooser = data.iloc[:, expected_cat[1]] == data.iloc[:, expected_cat[0]]
idx0 = ~_chooser  # rows where the two columns differ
idx1 = ~idx0      # rows where the two columns agree
y_ = np.zeros((len(data), dy))


def _group_target(weights, rows, columns, wave):
    """Return the reconstructed target values for one group of rows.

    `rows` is a boolean row mask over `data`, `columns` the positional
    feature columns to read, `wave` the elementwise function (np.sin or
    np.cos) applied to 2*pi*feature, and `weights` the matrix that is
    multiplied onto the transformed features.
    """
    angles = 2 * np.pi * data.loc[rows].iloc[:, columns]
    # Append a trailing axis so each row becomes a column vector for the
    # matrix product, then drop that axis again.
    return (weights @ np.expand_dims(wave(angles), axis=2))[:, :, 0]


y_[idx0, :] = _group_target(store['t0'], idx0, expected_cont0, np.sin)
y_[idx1, :] = _group_target(store['t1'], idx1, expected_cont1, np.cos)

In [6]:
# Check that the reconstructed target matches the 'y0' column of the dataset.
# np.testing.assert_allclose is used instead of a bare `assert np.allclose`
# so that a failure reports the number of mismatching elements and the
# maximum absolute/relative error, and so the check is not stripped when
# Python runs with -O. equal_nan=False keeps NaNs counting as mismatches,
# as they did with np.allclose.
np.testing.assert_allclose(
    np.squeeze(y_),
    data['y0'].values,
    rtol=1e-4,
    atol=1e-6,
    equal_nan=False,
    err_msg="reconstructed target y_ does not match data['y0']",
)

### Selection with marginal 1D KSG mutual information

In [7]:
# Run the project's KSG mutual-information selector on the feature frame and
# the target frame with a threshold of 0.02, and unpack its two results.
# NOTE(review): presumably `ksgselection` holds the indices of the retained
# features and `mis` the per-feature mutual-information estimates that were
# compared against the threshold -- confirm against tools.ksgmi.
ksgselection, mis = tools.ksgmi(xdf, ydf, threshold=0.02)

ksg-mi preprocessing: 10 features have been selected


In [8]:
# Print the reference feature set from the store next to the KSG selection,
# both sorted so the two lists can be compared by eye.
print(f'Expected features: {sorted(expected_features)}')
print(f'Marginal KSG selection: {sorted(ksgselection)}')

Expected features: [6, 8, 14, 18, 19, 20, 23, 24, 28, 31]
Marginal KSG selection: [14, 18, 19, 20, 23, 25, 28, 31, 34, 38]


### Selection with HSIC Lasso

In [9]:
# Run the project's HSIC Lasso wrapper, asking for 10 features with a block
# (batch) size of 400. Only the first 50000 rows of x and y are passed in.
# NOTE(review): every feature is declared FLOAT here, although the first
# num_cat_features columns are named as categorical in the configuration --
# confirm this is intended and that tools.pyhsiclasso expects one type for
# all columns.
# NOTE(review): the 50000-row cut-off is presumably a runtime/memory
# trade-off -- confirm, and consider naming it as a constant.
xfeattype = tools.FeatureType.FLOAT
yfeattype = tools.FeatureType.FLOAT
hsiclasso_selection = tools.pyhsiclasso(
    x[:50000, :], y[:50000, :], xfeattype=xfeattype, yfeattype=yfeattype, n_features=10, batch_size=400)

Block HSIC Lasso B = 400.
M set to 3.
Using Gaussian kernel for the features, Gaussian kernel for the outcomes.


In [10]:
# Print the reference feature set from the store next to the HSIC Lasso
# selection, both sorted so the two lists can be compared by eye.
print(f'Expected features: {sorted(expected_features)}')
print(f'HSIC Lasso selection: {sorted(hsiclasso_selection)}')

Expected features: [6, 8, 14, 18, 19, 20, 23, 24, 28, 31]
HSIC Lasso selection: [4, 11, 14, 18, 19, 20, 23, 24, 28, 31]
