In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

import minerva

In [None]:
# Working directory for this example. It is the same location the train
# controls further down pass as `data_path`. Missing parents are created and
# an already-existing directory is accepted, so re-running the cell is safe.
pth = Path('data') / 'linear'
pth.mkdir(parents=True, exist_ok=True)

In [None]:
# Parameters for the synthesis of data
n = 50_000          # total number of rows to generate
dx = 10             # number of feature columns in x
num_relevant = 2    # how many of the dx columns enter the target
dy = 1              # number of target columns

# Sizes of the sequential train / validation / test split.
# The test split takes whatever is left after the first two.
train_size = int(0.66 * n)
val_size = int(0.15 * n)
test_size = n - (train_size + val_size)

In [None]:
# Set metaparameters
# NOTE(review): `batch_size` and `max_epochs` are assigned again in the
# "Batches and epochs" cell further down, so `num_batches` computed here
# reflects the batch size set in this cell — confirm which one is intended.

num_samples = n

# Size of the residual network. These settings make things quite slow;
# 256 and 3 seem to perform almost as well, but way faster.
dimension_of_residual_block = 512
num_res_layers = 4

# Common multiplier for the batch size and the epoch count.
# scaler = 4 did the best so far, scaler = 8 diverged.
scaler = 2
batch_size = 2048 * scaler
num_batches = num_samples // batch_size
# Epochs grow with the scaler to keep the number of batches constant.
max_epochs = int(scaler * 2000)

# Scaling the learning rate as sqrt(scaler) didn't seem to work.
# NOTE(review): `lr` is not referenced by the cells visible here.
lr = 1e-5

In [None]:
# Synthesize the data: uniform random features, of which only `num_relevant`
# randomly chosen columns enter the targets through a random linear map.

# Dedicated, seeded generator so that Restart & Run All reproduces the same
# dataset and the same set of relevant features.
rng = np.random.default_rng(0)

x = rng.uniform(size=(n, dx))
# Column indices of the features that drive y (sampled without replacement).
expected = rng.choice(dx, replace=False, size=num_relevant)
# One random weight per (target, relevant feature); y has shape (n, dy).
weights = rng.uniform(size=(dy, num_relevant))
y = x[:, expected] @ weights.T

# Column names. All features are continuous; there are no categorical ones.
feature_cols = [f'f{i}' for i in range(dx)]
float_features = feature_cols
cat_features = []
# Index the plain list so the names stay built-in `str` objects; going through
# a NumPy array would yield NumPy string scalars, which print with a
# `np.str_(...)` wrapper on NumPy >= 2.
expected_features = [feature_cols[i] for i in expected]
targets = [f'y{j}' for j in range(dy)]

xdf = pd.DataFrame(x, columns=feature_cols)
ydf = pd.DataFrame(y, columns=targets)
data = pd.concat((xdf, ydf), axis=1)

# Sequential split by row position: train first, then validation, then test.
train_data = data.iloc[:train_size]
val_data = data.iloc[train_size: train_size + val_size]
test_data = data.iloc[train_size + val_size:]

num_cat_features = len(cat_features)
num_cont_features = len(float_features)

In [None]:
# Design architecture
# NOTE(review): the first three names are also assigned, with the same values,
# in the "Set metaparameters" cell; when cells run top to bottom the values
# set here are the ones in effect.
dimension_of_residual_block, num_res_layers = 512, 4
scaler = 2
# Passed on to the selector through `selector_params`.
emb_dim = 4
# NOTE(review): the train controls visible in this notebook pass their own
# literal `reg_coef` values — confirm whether this one is still needed.
reg_coef = 1e5

In [None]:
# Batches and epochs: both are proportional to `scaler`.
max_epochs = int(scaler * 2000)
# This is the batch size that is handed to `minerva.feature_selection.run`.
batch_size = 1200 * scaler

In [None]:
# Pack hyperparameters: keyword arguments for the selector, collected in one
# dict that is later passed to `minerva.feature_selection.run`.
selector_params = {
    # Column roles
    'cat_features': cat_features,
    'float_features': float_features,
    'targets': targets,
    # Network size
    'dim1_max': dimension_of_residual_block,
    'num_res_layers': num_res_layers,
    'eps': 0.001,
    # Left empty here, like the `cat_features` list above.
    'cat_feat_sizes': [],
    'emb_dim': emb_dim,
}

In [None]:
# Training settings for the 'noreg' model: zero regularisation coefficient and
# the projection disabled. Passed to `minerva.feature_selection.run` ahead of
# `select_train_control`.
noreg_train_control = minerva.feature_selection.TrainControl(
    # Use the shared epoch budget rather than repeating its value as a literal,
    # so that changing `scaler` / `max_epochs` actually takes effect here.
    # With the current scaler = 2 this is the previously hard-coded 4000.
    number_of_epochs=max_epochs,
    number_of_segments=1,
    # Same directory that is created as `pth` at the top of the notebook.
    data_path='./data/linear',
    model_name='noreg',
    # NOTE(review): equal to `lr` from the metaparameter cell — confirm whether
    # it should reference that name instead of a literal.
    learning_rate=1e-5,
    reg_coef=0.0,
    projection_init=.3,
    disable_projection=True
)

In [None]:
# Training settings for the 'selection' model: non-zero regularisation
# coefficient and the projection enabled. Passed to
# `minerva.feature_selection.run` after `noreg_train_control`.
select_train_control = minerva.feature_selection.TrainControl(
    # Use the shared epoch budget rather than repeating its value as a literal,
    # so that changing `scaler` / `max_epochs` actually takes effect here.
    # With the current scaler = 2 this is the previously hard-coded 4000.
    number_of_epochs=max_epochs,
    number_of_segments=2,
    # Same directory that is created as `pth` at the top of the notebook.
    data_path='./data/linear',
    model_name='selection',
    learning_rate=1e-6,
    # NOTE(review): differs from the module-level `reg_coef` (1e5) set in the
    # "Design architecture" cell — confirm which value is intended.
    reg_coef=1e3,
    projection_init=None,
    disable_projection=False,
)

In [None]:
# Logger settings handed to `minerva.feature_selection.run`; only the name is set.
logger_params = {"name": "example_linear"}

In [None]:
# Run the feature selection. Arguments are positional, in this order:
# the three data splits, the column roles, the selector and logger settings,
# the two train controls, and the batch size.
selector = minerva.feature_selection.run(
    train_data, val_data, test_data,
    float_features, cat_features, targets,
    selector_params,
    logger_params,
    noreg_train_control, select_train_control,
    batch_size,
)

# Quick check: features used to build y vs. features the selector kept.
print(f'Expected features: {expected_features}')
selected = selector.selected_feature_names()
print(f'Selected features: {selected}')

In [None]:
# Print results: the normalised projection coefficients, then the selected
# feature names next to the ones used to generate the targets.
proj_coefficients = selector.normalized_proj()
print(f'Normalised coefficients of the projection matrix:\n{proj_coefficients}\n')

chosen_features = selector.selected_feature_names()
print(f'Selected features:\n{chosen_features}\n')

print(f'Expected features:\n{expected_features}\n')

In [None]:
# Report the selector's mutual information for each split, in the order
# train, val, test. Each value is computed right before it is printed.
mi_by_split = (
    ('train', selector.train_mutual_information),
    ('val', selector.val_mutual_information),
    ('test', selector.test_mutual_information),
)
for split, compute_mi in mi_by_split:
    print(f'Mutual information on {split} dataset: {float(compute_mi())}')