In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

import minerva

In [None]:
# Working directory for this example. It is created up front, together with
# any missing parent directories, and left alone if it already exists.
pth = Path('./data/categorical')
pth.mkdir(parents=True, exist_ok=True)

In [None]:
# Parameters for the synthesis of data
SEED = 0  # fixed so that "Restart & Run All" regenerates the same dataset
# Seed NumPy's legacy global generator, which the np.random.* calls of this
# notebook draw from, before the first random draw below.
np.random.seed(SEED)

n = 50000  # total number of synthetic samples
num_samples = n
dx = 10  # number of feature columns
num_relevant = 2  # number of features the target is built from
# Cardinality of each feature, drawn uniformly from {7, 8, 9}
# (the upper bound of randint is exclusive).
feat_sizes = np.random.randint(low=7, high=10, size=dx)
dy = 1  # number of target columns

# Sizes of the sequential train / validation / test splits; the test split
# takes whatever remains after the integer truncation of the other two.
train_size = int(.66 * n)
val_size = int(.15 * n)
test_size = n - train_size - val_size

In [None]:
# Synthesize the data
# One integer-coded column per feature: column j takes values in
# {0, ..., feat_sizes[j] - 1}.
xs = [
    np.random.randint(low=0, high=size, size=(n, 1))
    for size in feat_sizes
]
x = np.concatenate(xs, axis=1)

# Indices of the features that actually determine the target.
expected = np.random.choice(dx, replace=False, size=num_relevant)

# For every pair of consecutive relevant features, add 1 to the target when
# the first feature (rescaled by its cardinality) exceeds the second.
y = np.zeros(shape=(n,), dtype=int)
for f0, f1 in zip(expected[:-1], expected[1:]):
    x0 = x[:, f0] / feat_sizes[f0]
    x1 = x[:, f1] / feat_sizes[f1]
    y += np.array(x0 > x1, dtype=int)

# Column names. Every feature is treated as categorical; there are no
# continuous (float) features in this example.
feature_cols = [f'f{i}' for i in range(dx)]
float_features = []
cat_features = feature_cols
# Index the Python list directly so the names stay plain `str` objects; going
# through a NumPy array would yield np.str_ scalars, which print as
# np.str_('f3') under NumPy >= 2.
expected_features = [feature_cols[i] for i in expected]
targets = [f'y{i}' for i in range(dy)]

xdf = pd.DataFrame(
    x,
    columns=feature_cols
)
ydf = pd.DataFrame(
    y,
    columns=targets
)
data = pd.concat((xdf, ydf), axis=1)

# Sequential (unshuffled) split into train / validation / test.
train_data = data.iloc[:train_size]
val_data = data.iloc[train_size: train_size + val_size]
test_data = data.iloc[train_size + val_size:]
assert (len(train_data), len(val_data), len(test_data)) == (
    train_size, val_size, test_size
), 'split sizes do not match the configured sizes'

num_cat_features = len(cat_features)
num_cont_features = 0

In [None]:
# Set hyperparameters
# Initial projection weights: 0.10 for every categorical and every
# continuous feature.
projection_init = np.full(num_cat_features + num_cont_features, .10)

# Design architecture
scaler = 2
emb_dim = 4
num_res_layers = 4
dimension_of_residual_block = 512
reg_coef = 1e5

In [None]:
# Batches and epochs
# Both the batch size and the epoch budget are multiples of `scaler`.
batch_size = 1200 * scaler
max_epochs = int(scaler * 2000)

In [None]:
# Pack hyperparameters
# Keyword settings for the selector; this dict is passed to
# minerva.feature_selection.run further down.
selector_params = {
    'cat_features': cat_features,
    'float_features': float_features,
    'targets': targets,
    'dim1_max': dimension_of_residual_block,
    'num_res_layers': num_res_layers,
    'eps': 1e-3,
    'cat_feat_sizes': feat_sizes,
    'emb_dim': emb_dim,
}

In [None]:
# Training settings for the 'noreg' run: regularisation coefficient 0.0 and
# the projection disabled.
noreg_train_control = minerva.feature_selection.TrainControl(
    # Use the epoch budget from the "Batches and epochs" cell (currently 4000)
    # instead of a duplicated literal, so that tuning `scaler` or `max_epochs`
    # actually changes the training length.
    number_of_epochs=max_epochs,
    number_of_segments=1,
    # Same directory that the notebook creates as `pth`.
    data_path='./data/categorical',
    model_name='noreg',
    learning_rate=1e-5,
    reg_coef=0.0,
    projection_init=projection_init,
    disable_projection=True,
)

In [None]:
# Training settings for the 'selection' run: projection enabled and a
# non-zero regularisation coefficient.
select_train_control = minerva.feature_selection.TrainControl(
    # Use the values from the configuration cells (currently 4000 epochs and
    # reg_coef = 1e5) instead of duplicated literals, so that tuning the
    # configuration actually affects this run.
    number_of_epochs=max_epochs,
    number_of_segments=1,
    # Same directory that the notebook creates as `pth`.
    data_path='./data/categorical',
    model_name='selection',
    learning_rate=1e-5,
    reg_coef=reg_coef,
    # NOTE(review): None is passed here although `projection_init` is defined
    # earlier and used by the 'noreg' run -- confirm this is intended.
    projection_init=None,
    disable_projection=False,
)

In [None]:
# Logger settings; passed to minerva.feature_selection.run below.
logger_params = {"name": "example_cat"}


In [None]:
# Run minerva's feature selection on the three data splits, using the
# 'noreg' and 'selection' training settings defined above.
selector = minerva.feature_selection.run(
    train_data,
    val_data,
    test_data,
    float_features,
    cat_features,
    targets,
    selector_params,
    logger_params,
    noreg_train_control,
    select_train_control,
    batch_size,
)

# Report feature *names* on both lines so they can be compared directly;
# `expected` holds column indices, `expected_features` the matching names.
print(f'Expected features: {expected_features}')
print(f'Selected features: {selector.selected_feature_names()}')

In [None]:
# Print results: the normalised projection coefficients, followed by the
# selected and the expected feature names.
normalized_coefficients = selector.normalized_proj()
print(f'Normalised coefficients of the projection matrix:\n{normalized_coefficients}\n')

selected_features = selector.selected_feature_names()
print(f'Selected features:\n{selected_features}\n')

print(f'Expected features:\n{expected_features}\n')


In [None]:
print(f'Mutual information on train dataset: {float(selector.train_mutual_information())}')
print(f'Mutual information on val dataset: {float(selector.val_mutual_information())}')
print(f'Mutual information on test dataset: {float(selector.test_mutual_information())}')