In [None]:
import pickle
import numpy as np
import pandas as pd
import torch
from minerva.select import Selector
import minerva

In [None]:
# Dataset dimensions.
n = 100000
n_samples = n
dy = 1
num_cat_features = 10
num_cont_features = 30

# Feature columns are named x0, x1, ...; the categorical ones come first,
# the continuous ones follow.
feature_cols = [
    f'x{idx}' for idx in range(num_cat_features + num_cont_features)
]
cat_features, float_features = (
    feature_cols[:num_cat_features],
    feature_cols[num_cat_features:],
)

# Target columns are named y0, y1, ...
targets = [f'y{idx}' for idx in range(dy)]

# Load the dataset and expose feature / target views as frames and arrays.
data = pd.read_csv('data/large.csv')
xdf = data.loc[:, feature_cols]
x = xdf.values
ydf = data.loc[:, targets]
y = ydf.values
# One size per categorical column: the largest value observed in the column
# plus one (assumes the categories are encoded as 0-based integers -- TODO
# confirm against the data generator).
cat_feat_sizes = 1+data.loc[:, cat_features].max().values

# Read the pickled store inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE(security): pickle.load can execute arbitrary code; only load store
# files that come from a trusted source.
with open('data/store.exp2', 'rb') as store_file:
    store = pickle.load(store_file)

# Entries of the store that are compared with the selector's output later on.
expected_cat = store['expected_cat']
expected_cont0 = store['expected_cont0']
expected_cont1 = store['expected_cont1']
expected_cont = store['expected_cont']
expected_features = store['expected_features']


In [None]:
# Set hyperparameters
scaler = 2
dimension_of_residual_block = 512
num_res_layers = 4
emb_dim = 4
lr = 5e-6
reg_coef = 1e4

# Batch size and epoch budget are both proportional to `scaler`.
batch_size = 1200 * scaler
max_epochs = int(scaler * 2000)
# NOTE(review): n_samples here is the value assigned before the data was
# loaded; it is reassigned from len(data) in the split cell further down.
num_batches = n_samples // batch_size

# Pack hyperparameters (unpacked into the Selector constructor)
selector_params = {
    'cat_features': cat_features,
    'float_features': float_features,
    'targets': targets,
    'dim1_max': dimension_of_residual_block,
    'lr': lr,
    'num_res_layers': num_res_layers,
    'eps': 0.001,
    'cat_feat_sizes': cat_feat_sizes,
    'emb_dim': emb_dim,
}


In [None]:
# Split train, validation, and test.
# The rows are partitioned contiguously, in file order: the first 70% for
# training, the next 25% for validation, and whatever remains for testing.
n_samples = len(data)
train_size = int(.70 * n_samples)
val_size = int(.25 * n_samples)
test_size = n_samples - train_size - val_size
train_data = data.iloc[:train_size]
val_data = data.iloc[train_size: train_size + val_size]
# The test split is the tail that follows the validation rows.  The former
# `data.iloc[:-test_size]` selected everything *except* that tail (i.e. the
# train + validation rows), and an empty frame when test_size was 0.
test_data = data.iloc[train_size + val_size:]

# The three splits must partition the data without overlap.
assert len(test_data) == test_size
assert len(train_data) + len(val_data) + len(test_data) == n_samples


In [None]:
# Set dataloaders
# Collect the arguments first, then build the three loaders in one call.
dataloader_kwargs = {
    'train_data': train_data,
    'val_data': val_data,
    'test_data': test_data,
    'float_features': float_features,
    'categorical_features': cat_features,
    'targets': targets,
    'batch_size': batch_size,
}
train_dataloader, val_dataloader, test_dataloader = (
    minerva.feature_selection.dataloaders(**dataloader_kwargs)
)

In [None]:
# Path of the saved state dict that is read with torch.load and loaded into
# the selector.
load_path = 'data/run4/trained.model.7.0.0'

In [None]:
# Build the selector from the hyperparameters packed in selector_params.
selector = Selector(**selector_params)

In [None]:
# Restore the saved parameters into the selector.
# map_location='cpu' lets a checkpoint that was saved from a GPU be
# deserialised on a machine without CUDA; load_state_dict then copies the
# tensors into the selector's own parameters.
selector.load_state_dict(torch.load(load_path, map_location='cpu'))

In [None]:
# Projection weights reported by the loaded selector (printed in the next cell).
weights = selector.projection_weights()

In [None]:
# Print the selector's weights next to the expected_* entries read from the
# store, followed by the names of the features the selector picked.
report = {
    'weights': weights,
    'expected_cat': expected_cat,
    'expected_cont0': expected_cont0,
    'expected_cont1': expected_cont1,
    'expected_features': expected_features,
}
for label, value in report.items():
    print(f'{label}:\n{value}\n')
print(f'Selected features:\n{selector.selected_feature_names()}')

In [None]:
# Hand the train / validation / test dataloaders to the selector.
selector.set_loaders(train_dataloader, val_dataloader, test_dataloader)

In [None]:
# Display the value returned by val_mutual_information() (presumably a
# mutual-information estimate on the validation loader -- confirm against
# the minerva documentation).
selector.val_mutual_information()

In [None]:
# Read every saved segment of the weight history (a single one here).
weight_histories = [
    pd.read_csv(f'data/run4/weight_history_7_segment{segment}.csv')
    for segment in range(1)
]

# Stack the segments row-wise and prefix each column label with 'x'.
weight_history = pd.concat(
    weight_histories, axis=0, sort=True, ignore_index=True
).rename(columns=lambda label: 'x' + label)

# Put the columns in numeric order: x0, x1, x2, ...
sorted_cols = [f'x{idx}' for idx in range(weight_history.shape[1])]
weight_history = weight_history[sorted_cols]

In [None]:
# The categorical features are the first num_cat_features columns (x0, x1, ...),
# so slice by that constant rather than a hard-coded 10.
weight_history.iloc[:, :num_cat_features].plot(title='Weights of categorical features')

In [None]:
# Weight trajectories of the columns listed in expected_cont.
features = expected_cont
expected_cont_history = weight_history.iloc[:, features]
expected_cont_history.plot(
    title='Weights of expected cont features',
    figsize=(10, 7),
)

In [None]:
# Continuous features occupy positions num_cat_features up to the total
# feature count; keep those that are not listed in expected_cont.
# The bounds come from the configuration constants instead of a hard-coded
# range(10, 40), and sorted() keeps the column (and legend) order
# deterministic, which iterating a set does not guarantee.
features = sorted(
    set(range(num_cat_features, num_cat_features + num_cont_features))
    .difference(expected_cont)
)
weight_history.iloc[:, features].plot(title='Weights of irrelevant cont features', figsize=(8, 6))