In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

import torch

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger


from minerva.select import Selector
from minerva.iterable_dataset import MyDataset, MyIterableDataset
from minerva import normalize

In [None]:
# Select PyTorch's "medium" float32 matmul precision setting. Per the PyTorch
# docs this permits lower-precision internal computation in exchange for speed.
torch.set_float32_matmul_precision("medium")

In [None]:
# Directory that holds this experiment's files (model snapshot, results CSV).
# Created up front, including any missing parents; a no-op if it already exists.
pth = Path('./data/categorical')
pth.mkdir(parents=True, exist_ok=True)

In [None]:
# Parameters for the synthesis of data

# Fixed seed: this cell and the synthesis cell draw from NumPy's global RNG
# (np.random.*), so seeding it here makes the generated dataset and the
# choice of relevant features identical across "Restart & Run All".
seed = 0
np.random.seed(seed)

n = 50000  # total number of samples
dx = 10  # number of categorical input features
num_relevant = 2  # number of features the target is computed from
# Number of categories of each feature, drawn from {7, 8, 9} (high is exclusive)
feat_sizes = np.random.randint(low=7, high=10, size=dx)
dy = 1  # number of target columns

# Split sizes; the test split takes whatever is left after train and val
train_size = int(.66 * n)
val_size = int(.15 * n)
test_size = n - train_size - val_size

In [None]:
# Training / model metaparameters

# -- Network size. These settings make things quite slow; 256 and 3 seem to
#    perform almost as well, but way faster.
dimension_of_residual_block, num_res_layers = 512, 4
emb_dim = 3

# -- Batch scaling. scaler = 4 did the best so far, scaler = 8 diverged.
scaler = 2
batch_size = 2048 * scaler
# The epoch count is scaled together with the batch size
# to keep the number of batches constant.
max_epochs = int(scaler * 2000)

# -- Optimisation. Scaling lr as sqrt(scaler) didn't seem to work.
lr = 1e-5

# -- Bookkeeping derived from the dataset size
num_samples = n
num_batches = num_samples // batch_size

In [None]:
# Synthesize the data

# One integer column per feature, with values drawn from [0, size).
xs = []
for size in feat_sizes:
    xs.append(np.random.randint(low=0, high=size, size=(n, 1)))
x = np.hstack(xs)

# Pick the relevant columns (without repetition). The label counts, over
# consecutive pairs of them, how often the first value exceeds the second
# once both are rescaled by their feature size.
expected = np.random.choice(dx, replace=False, size=num_relevant)
y = np.zeros(n, dtype=int)
for f0, f1 in zip(expected[:-1], expected[1:]):
    x0 = x[:, f0] / feat_sizes[f0]
    x1 = x[:, f1] / feat_sizes[f1]
    y += (x0 > x1).astype(int)

# Column naming: every input feature is categorical, there are no float ones.
feature_cols = [f'f{i}' for i in range(dx)]
float_features = []
cat_features = feature_cols
targets = [f'y{j}' for j in range(dy)]

# Assemble features and targets side by side in a single frame.
xdf = pd.DataFrame(x, columns=feature_cols)
ydf = pd.DataFrame(y, columns=targets)
data = pd.concat([xdf, ydf], axis=1)

# Sequential train / validation / test split.
val_end = train_size + val_size
train_data = data.iloc[:train_size]
val_data = data.iloc[train_size:val_end]
test_data = data.iloc[val_end:]

In [None]:
# Prepare the data for the training
dn = normalize.DatasetNormalizer(
    float_cols=[], categorical_cols=cat_features + targets)

# Always start from the raw `data` frame so that this cell is idempotent:
# re-running it must not feed already-normalized train/val/test frames back
# into the normalizer. The slices are copied so `data` itself stays untouched
# in case the normalizer modifies its input (not visible from this notebook).
# The normalizer is fitted on the training split only and then applied to the
# validation and test splits.
val_end = train_size + val_size
train_data = dn.fit_transform(data.iloc[:train_size].copy())
val_data = dn.transform(data.iloc[train_size:val_end].copy())
test_data = dn.transform(data.iloc[val_end:].copy())

# Wrap each normalized split in a dataset ...
train_dataset = MyDataset(train_data, float_features, cat_features, targets)
val_dataset = MyDataset(val_data, float_features, cat_features, targets)
test_dataset = MyDataset(test_data, float_features, cat_features, targets)

# ... and then in the batching wrappers that are later handed to the
# selector and the trainer as dataloaders.
train_dataloader = MyIterableDataset(train_dataset, batch_size=batch_size)
val_dataloader = MyIterableDataset(val_dataset, batch_size=batch_size)
test_dataloader = MyIterableDataset(test_dataset, batch_size=batch_size)

In [None]:
def run_this(reg_coef: float, load_path=None, wgt_mult=None):
    """Build a Selector, optionally warm-start it, then train and test it.

    The model configuration (feature/target names, network size, learning
    rate, embedding size, ``max_epochs``) and the three dataloaders are read
    from the notebook-level variables defined in earlier cells.

    Args:
        reg_coef: Passed to the Selector as ``regularization_coef``.
        load_path: Optional path to a state dict written with ``torch.save``.
            When given, it is loaded into the freshly built Selector before
            training starts.
        wgt_mult: Forwarded unchanged to ``selector.enable_projection``.

    Returns:
        A tuple ``(out, selector)``. ``out`` is the first entry of the list
        returned by ``trainer.test`` with an extra ``"selected_features"``
        key holding ``selector.selected_feature_names()``; ``selector`` is
        the trained model.
    """
    selector = Selector(
        cat_features=cat_features,
        float_features=float_features,
        targets=targets,
        dim1_max=dimension_of_residual_block,
        lr=lr,
        num_res_layers=num_res_layers,
        regularization_coef=reg_coef,
        eps=.001,
        cat_feat_sizes=feat_sizes,
        emb_dim=emb_dim,
    )
    if load_path is not None:
        # Deserialize onto the CPU so that a snapshot written on a GPU machine
        # can also be restored where no GPU is available. load_state_dict
        # copies the values into the module's existing parameters, so this
        # does not change where the model itself lives.
        state_dict = torch.load(load_path, map_location="cpu")
        selector.load_state_dict(state_dict)

    # Set dataloaders
    selector.set_loaders(train_dataloader, val_dataloader, test_dataloader)

    selector.enable_projection(wgt_mult=wgt_mult)

    # Train the model
    logger = TensorBoardLogger("tb_logs", name="categorical")
    trainer = pl.Trainer(
        gradient_clip_val=0.5,
        accelerator="auto",
        log_every_n_steps=50,
        max_epochs=max_epochs,
        logger=logger,
    )

    trainer.fit(
        selector,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader
    )

    # No dataloader is passed to test(); the loaders were registered on the
    # selector via set_loaders above.
    final_test_loss = trainer.test(selector)
    out = final_test_loss[0]
    out["selected_features"] = selector.selected_feature_names()
    return out, selector


In [None]:
# Snapshot file for the un-regularised model: written with torch.save after
# the reg_coef=0 run and passed as load_path to the regularised runs.
noreg_path = "./data/categorical/noreg.pth"

In [None]:
# Long run with the regularisation switched off (reg_coef = 0), to get the MI
# network right. Its weights are saved as the starting point for later runs.
out, selector = run_this(0.0, load_path=None, wgt_mult=None)
torch.save(selector.state_dict(), noreg_path)

In [None]:
# Now add regularisation, warm-starting every run from the saved snapshot.
# Regularization level appears to have almost no effect as long as it's > 100
reg_coefs = [1e5]
results = []
for reg_coef in reg_coefs:
    out, selector = run_this(
        reg_coef=reg_coef,
        load_path=noreg_path,
        wgt_mult=.25,
    )
    out["reg_coef"] = reg_coef
    results.append(out)
    # Rewrite the CSV after every run so finished results are already on disk
    # while later runs are still training.
    df = pd.DataFrame(results)
    df.to_csv("./data/categorical/results.csv")

In [None]:
# Print the learned projection and the selected features next to the
# features the target was actually generated from.
# NOTE(review): `expected` holds column indices into feature_cols, whereas
# selected_feature_names() presumably returns column names - confirm before
# comparing the two printouts.
proj_coefs = selector.normalized_proj()
print(f'Normalised coefficients of the projection matrix:\n{proj_coefs}\n')
selected = selector.selected_feature_names()
print(f'Selected features:\n{selected}\n')
print(f'Expected features:\n{expected}\n')


In [None]:
# Mutual-information values reported by the selector, one per data split.
# Each value is computed and printed before the next one is requested.
train_mi = float(selector.train_mutual_information())
print(f'Mutual information on train dataset: {train_mi}')
val_mi = float(selector.val_mutual_information())
print(f'Mutual information on val dataset: {val_mi}')
test_mi = float(selector.test_mutual_information())
print(f'Mutual information on test dataset: {test_mi}')