In [None]:
!pip install optuna rtdl category_encoders ruamel.yaml einops

In [4]:
from scipy.optimize import fmin
import random
import os
import sys
import pandas as pd
import pickle
import pathlib
import argparse
import numpy as np
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from tqdm import tqdm, trange
from typing import Optional, Sequence, Tuple, Union, Any, Dict, List
from copy import deepcopy
import enum
import optuna
import rtdl
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import category_encoders as ce
import ruamel.yaml
import math
from collections import OrderedDict, defaultdict
from sklearn.metrics import roc_auc_score, f1_score, precision_score, accuracy_score, recall_score, roc_auc_score, balanced_accuracy_score, log_loss, mean_absolute_error, mean_squared_error, r2_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import make_pipeline
from scipy.spatial import distance_matrix
from scipy.linalg import qr
from torch.autograd import Function
from einops import rearrange
from torch import nn, einsum
from einops import rearrange

In [5]:
def set_global_seed(seed: int) -> None:
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_global_seed(42)

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## Preprocessings

In [7]:
def cos_sin(x: torch.Tensor) -> torch.Tensor:
    return torch.cat([torch.cos(x), torch.sin(x)], -1)

def positional(d, pos):
    return torch.Tensor([np.sin(1/10000 ** (2 * int(i / 2) / d) * pos) if i % 2 == 0 else 
                       np.cos(1/10000 ** (2 * int(i / 2) / d) * pos) for i in range(d)])

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, n_features: int = 5000, dropout: float = 0.1, tf: bool=False):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.tf = tf
        self.d_model = d_model
        position = torch.arange(n_features).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(n_features, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
        self.pe = pe

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.tf:
            x = x + self.pe.unsqueeze(0)
        else:
            x = x + self.pe.view(1, -1)
        return self.dropout(x)
    
class Periodic(nn.Module):
    def __init__(self, n_features: int, n: int, sigma: float, trainable: bool, initialization: str, tf: bool) -> None:
        super().__init__()
        self.tf = tf
        if initialization == 'log-linear':
            coefficients = sigma ** (torch.arange(n) / n)
            coefficients = coefficients[None].repeat(n_features, 1)
        else:
            assert initialization == 'normal'
            coefficients = torch.normal(0.0, sigma, (n_features, n))
        if trainable:
            self.coefficients = nn.Parameter(coefficients)  
        else:
            self.register_buffer('coefficients', coefficients)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if not self.tf:
            return cos_sin(2 * np.pi * self.coefficients[None] * x[..., None]).view(-1, 2 * x.shape[1] * self.coefficients.shape[1])
        else:
            return cos_sin(2 * np.pi * self.coefficients[None] * x[..., None]).view(x.shape[0], x.shape[1], 2 * self.coefficients.shape[1])
    
    
class NLinear(nn.Module):
    def __init__(self, n: int, d_in: int, d_out: int, bias: bool = True) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.Tensor(n, d_in, d_out))
        self.bias = nn.Parameter(torch.Tensor(n, d_out)) if bias else None
        with torch.no_grad():
            for i in range(n):
                layer = nn.Linear(d_in, d_out)
                self.weight[i] = layer.weight.T
                if self.bias is not None:
                    self.bias[i] = layer.bias

    def forward(self, x):
        if x.ndim == 3:
            x = x[..., None] * self.weight[None]
            x = x.sum(-2)
            if self.bias is not None:
                x = x + self.bias[None]
            return x
    

class FeaturesTokenizer(nn.Module):
    def __init__(self, n_features: int, d_embedding: int, tf: bool) -> None:
        super().__init__()
        self.tf = tf
        self.first_layer = rtdl.NumericalFeatureTokenizer(n_features, d_embedding, True, 'uniform')
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.first_layer(x)
        if not self.tf:
            return x.view(x.shape[0], -1)
        else:
            return x
    
class LinearEmbeddings(nn.Module):
    def __init__(self, n_layers, n_features, d_embeddings, tf):
        super().__init__()
        self.n_layers = n_layers
        self.tf = tf
        self.layers = nn.ModuleList()
        for i in range(n_layers):
            if i == 0:
                self.layers.append(rtdl.NumericalFeatureTokenizer(n_features, d_embeddings[i], False, 'uniform'))
            else:
                self.layers.append(NLinear(n_features, d_embeddings[i-1], d_embeddings[i], False))
        self.leaky_relu = nn.LeakyReLU()
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for i in range(self.n_layers):
            x = self.layers[i](x)
            x = self.leaky_relu(x)
        if not self.tf:
            return x.view(x.shape[0], -1)
        else:
            return x    
    
    
class AutoDis(nn.Module):
    def __init__(
        self, n_features: int, d_embedding: int, n_meta_embeddings: int, temperature: float, tf: bool
    ) -> None:
        super().__init__()
        self.first_layer = rtdl.NumericalFeatureTokenizer(
            n_features,
            n_meta_embeddings,
            False,
            'uniform',
        )
        self.tf = tf
        self.leaky_relu = nn.LeakyReLU()
        self.second_layer = NLinear(
            n_features, n_meta_embeddings, n_meta_embeddings, False
        )
        self.softmax = nn.Softmax(-1)
        self.temperature = temperature
        self.third_layer = NLinear(
            n_features, n_meta_embeddings, d_embedding, False
        )
        nn.init.uniform_(self.third_layer.weight, 0.01)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.first_layer(x)
        x = self.leaky_relu(x)
        x = self.second_layer(x)
        x = self.softmax(x / self.temperature)
        x = self.third_layer(x)
        if not self.tf:
            return x.view(x.shape[0], -1)
        else:
            return x
    
    
class SoftEmbedding(torch.nn.Module):
    def __init__(self, num_embeddings, embeddings_dim, emb_initializer=None, tf=True):
        super(SoftEmbedding, self).__init__()
        self.embedding_table = torch.nn.Embedding(num_embeddings, embeddings_dim)
        if emb_initializer:
            emb_initializer(self.embedding_table.weight)
        self.projection_layer = torch.nn.Linear(1, num_embeddings, bias=True)
        self.softmax = torch.nn.Softmax(dim=-1)
        self.tf = tf

    def forward(self, input_numeric):
        input_numeric = input_numeric.unsqueeze(-1)
        weights = self.softmax(self.projection_layer(input_numeric)) 
        soft_one_hot_embeddings = (weights.unsqueeze(-1) * self.embedding_table.weight).sum(-2)
        if not self.tf:
            return soft_one_hot_embeddings.view(soft_one_hot_embeddings.shape[0], -1)
        else:
            return soft_one_hot_embeddings
        
        
class EntmaxBisectFunction(Function):
    @classmethod
    def _gp(cls, x, alpha):
        return x ** (alpha - 1)

    @classmethod
    def _gp_inv(cls, y, alpha):
        return y ** (1 / (alpha - 1))

    @classmethod
    def _p(cls, X, alpha):
        return cls._gp_inv(torch.clamp(X, min=0), alpha)

    @classmethod
    def forward(cls, ctx, X, alpha=1.5, dim=-1, n_iter=50, ensure_sum_one=True):

        if not isinstance(alpha, torch.Tensor):
            alpha = torch.tensor(alpha, dtype=X.dtype, device=X.device)

        alpha_shape = list(X.shape)
        alpha_shape[dim] = 1
        alpha = alpha.expand(*alpha_shape)

        ctx.alpha = alpha
        ctx.dim = dim
        d = X.shape[dim]

        X = X * (alpha - 1)

        max_val, _ = X.max(dim=dim, keepdim=True)

        tau_lo = max_val - cls._gp(1, alpha)
        tau_hi = max_val - cls._gp(1 / d, alpha)

        f_lo = cls._p(X - tau_lo, alpha).sum(dim) - 1

        dm = tau_hi - tau_lo

        for it in range(n_iter):

            dm /= 2
            tau_m = tau_lo + dm
            p_m = cls._p(X - tau_m, alpha)
            f_m = p_m.sum(dim) - 1

            mask = (f_m * f_lo >= 0).unsqueeze(dim)
            tau_lo = torch.where(mask, tau_m, tau_lo)

        if ensure_sum_one:
            p_m /= p_m.sum(dim=dim).unsqueeze(dim=dim)

        ctx.save_for_backward(p_m)

        return p_m

    @classmethod
    def backward(cls, ctx, dY):
        Y, = ctx.saved_tensors

        gppr = torch.where(Y > 0, Y ** (2 - ctx.alpha), Y.new_zeros(1))

        dX = dY * gppr
        q = dX.sum(ctx.dim) / gppr.sum(ctx.dim)
        q = q.unsqueeze(ctx.dim)
        dX -= q * gppr

        d_alpha = None
        if ctx.needs_input_grad[1]:
            S = torch.where(Y > 0, Y * torch.log(Y), Y.new_zeros(1))
            ent = S.sum(ctx.dim).unsqueeze(ctx.dim)
            Y_skewed = gppr / gppr.sum(ctx.dim).unsqueeze(ctx.dim)

            d_alpha = dY * (Y - Y_skewed) / ((ctx.alpha - 1) ** 2)
            d_alpha -= dY * (S - Y_skewed * ent) / (ctx.alpha - 1)
            d_alpha = d_alpha.sum(ctx.dim).unsqueeze(ctx.dim)

        return dX, d_alpha, None, None, None

        
        
        
def entmax_bisect(X, alpha=1.5, dim=-1, n_iter=50, ensure_sum_one=True):
    return EntmaxBisectFunction.apply(X, alpha, dim, n_iter, ensure_sum_one)

        
        
class EntmaxBisect(nn.Module):
    def __init__(self, alpha=1.5, dim=-1, n_iter=50):
        self.dim = dim
        self.n_iter = n_iter
        self.alpha = alpha
        super().__init__()

    def forward(self, X):
        return entmax_bisect(
            X, alpha=self.alpha, dim=self.dim, n_iter=self.n_iter
        )
  
        
class SparseAttLayer(nn.Module):
    def __init__(self, nhead: int, nfield: int, nemb: int, d_k: int, nhid: int, alpha: float = 1.5):
        super(SparseAttLayer, self).__init__()
        self.sparsemax = nn.Softmax(dim=-1) if alpha == 1. \
            else EntmaxBisect(alpha, dim=-1)

        self.scale = d_k ** -0.5
        self.bilinear_w = nn.Parameter(torch.zeros(nhead, nemb, d_k))                   
        self.query = nn.Parameter(torch.zeros(nhead, nhid, d_k))                        
        self.values = nn.Parameter(torch.zeros(nhead, nhid, nfield))                    
        self.reset_parameters()

    def reset_parameters(self) -> None:
        nn.init.xavier_uniform_(self.bilinear_w, gain=1.414)
        nn.init.xavier_uniform_(self.query, gain=1.414)
        nn.init.xavier_uniform_(self.values, gain=1.414)

    def forward(self, x):
        keys = x                                                                       
        att_gates = torch.einsum('bfx,kxy,koy->bkof',
                                 keys, self.bilinear_w, self.query) * self.scale       
        sparse_gates = self.sparsemax(att_gates)                                        
        return torch.einsum('bkof,kof->bkof', sparse_gates, self.values)

    
class Embedding(nn.Module):

    def __init__(self, nfeat, nemb):
        super().__init__()
        self.embedding = nn.Embedding(nfeat, nemb)
        nn.init.xavier_uniform_(self.embedding.weight)

    def forward(self, x):
        emb = self.embedding(x['id'])                           
        return emb * x['value'].unsqueeze(2)                    


def exists(val):
    return val is not None

def default(val, d):
    return val if exists(val) else d

class Residual(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(x, **kwargs) + x

class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)

class GEGLU(nn.Module):
    def forward(self, x):
        x, gates = x.chunk(2, dim = -1)
        return x * F.gelu(gates)

class FeedForward(nn.Module):
    def __init__(self, dim, mult = 4, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, dim * mult * 2),
            GEGLU(),
            nn.Dropout(dropout),
            nn.Linear(dim * mult, dim)
        )

    def forward(self, x, **kwargs):
        return self.net(x)

class Attention(nn.Module):
    def __init__(
        self,
        dim,
        heads = 8,
        dim_head = 16,
        dropout = 0.
    ):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.scale = dim_head ** -0.5

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Linear(inner_dim, dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        h = self.heads
        q, k, v = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
        sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale

        attn = sim.softmax(dim = -1)
        dropped_attn = self.dropout(attn)

        out = einsum('b h i j, b h j d -> b h i d', dropped_attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)', h = h)
        return self.to_out(out), attn


class Transformer(nn.Module):
    def __init__(self, num_tokens, dim, depth, heads, dim_head, attn_dropout, ff_dropout):
        super().__init__()
        self.embeds = nn.Embedding(num_tokens, dim)
        self.layers = nn.ModuleList([])

        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = attn_dropout)),
                PreNorm(dim, FeedForward(dim, dropout = ff_dropout)),
            ]))

    def forward(self, x, return_attn = False):
        x = self.embeds(x)

        post_softmax_attns = []

        for attn, ff in self.layers:
            attn_out, post_softmax_attn = attn(x)
            post_softmax_attns.append(post_softmax_attn)

            x = x + attn_out
            x = ff(x) + x

        if not return_attn:
            return x

        return x, torch.stack(post_softmax_attns)

class MLP_TT(nn.Module):
    def __init__(self, dims, act = None):
        super().__init__()
        dims_pairs = list(zip(dims[:-1], dims[1:]))
        layers = []
        for ind, (dim_in, dim_out) in enumerate(dims_pairs):
            is_last = ind >= (len(dims_pairs) - 1)
            linear = nn.Linear(dim_in, dim_out)
            layers.append(linear)

            if is_last:
                continue

            act = default(act, nn.ReLU())
            layers.append(act)

        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        return self.mlp(x)

class TabTransformer(nn.Module):
    def __init__(
        self,
        *,
        categories, 
        num_continuous,
        dim, 
        depth, 
        heads, 
        dim_head = 16,
        dim_out = 1,
        mlp_hidden_mults = (4, 2),
        mlp_act = None,
        num_special_tokens = 2,
        continuous_mean_std = None,
        attn_dropout = 0.,
        ff_dropout = 0.
    ):
        super().__init__()
        self.num_categories = len(categories)
        self.num_unique_categories = sum(categories)

        self.num_special_tokens = num_special_tokens
        total_tokens = self.num_unique_categories + num_special_tokens

        if self.num_unique_categories > 0:
            categories_offset = F.pad(torch.tensor(list(categories)), (1, 0), value = num_special_tokens)
            categories_offset = categories_offset.cumsum(dim = -1)[:-1]
            self.register_buffer('categories_offset', categories_offset)

        self.num_continuous = num_continuous

        if self.num_continuous > 0:
            if exists(continuous_mean_std):
                assert continuous_mean_std.shape == (num_continuous, 2), f'continuous_mean_std must have a shape of ({num_continuous}, 2) where the last dimension contains the mean and variance respectively'
            self.register_buffer('continuous_mean_std', continuous_mean_std)

            self.norm = nn.LayerNorm(num_continuous)

        self.transformer = Transformer(
            num_tokens = total_tokens,
            dim = dim,
            depth = depth,
            heads = heads,
            dim_head = dim_head,
            attn_dropout = attn_dropout,
            ff_dropout = ff_dropout
        )

        input_size = (dim * self.num_categories) + num_continuous
        l = input_size // 8

        hidden_dimensions = list(map(lambda t: l * t, mlp_hidden_mults))
        all_dimensions = [input_size, *hidden_dimensions, dim_out]
        
        
        self.mlp = MLP_TT(all_dimensions, act = mlp_act)

    def forward(self, x_categ, x_cont, return_attn = False):
        xs = []

        assert x_categ.shape[-1] == self.num_categories, f'you must pass in {self.num_categories} values for your categories input'

        if self.num_unique_categories > 0:
            x_categ += self.categories_offset

            x, attns = self.transformer(x_categ, return_attn = True)

            xs.append(x)

        if self.num_continuous > 0:
            if exists(self.continuous_mean_std):
                mean, std = self.continuous_mean_std.unbind(dim = -1)
                x_cont = (x_cont - mean) / std

            normed_cont = self.norm(x_cont)
            xs.append(normed_cont)

        x = torch.cat(xs, dim = -1)
        return x


## Models

In [8]:
class _TokenInitialization(enum.Enum):
    UNIFORM = 'uniform'
    NORMAL = 'normal'

    @classmethod
    def from_str(cls, initialization: str) -> '_TokenInitialization':
        try:
            return cls(initialization)
        except ValueError:
            valid_values = [x.value for x in _TokenInitialization]
            raise ValueError(f'initialization must be one of {valid_values}')

    def apply(self, x: torch.Tensor, d: int) -> None:
        d_sqrt_inv = 1 / math.sqrt(d)
        if self == _TokenInitialization.UNIFORM:
            nn.init.uniform_(x, a=-d_sqrt_inv, b=d_sqrt_inv)
        elif self == _TokenInitialization.NORMAL:
            nn.init.normal_(x, std=d_sqrt_inv)

class CategoricalFeatureTokenizer(nn.Module):
    category_offsets: torch.Tensor

    def __init__(
        self,
        cardinalities: List[int],
        d_token: int,
        bias: bool,
        initialization: str,
    ) -> None:
        super().__init__()
        assert cardinalities, 'cardinalities must be non-empty'
        assert d_token > 0, 'd_token must be positive'
        initialization_ = _TokenInitialization.from_str(initialization)

        category_offsets = torch.tensor([0] + cardinalities[:-1]).cumsum(0)
        self.register_buffer('category_offsets', category_offsets, persistent=False)
        self.embeddings = nn.Embedding(sum(cardinalities), d_token)
        self.bias = nn.Parameter(torch.Tensor(len(cardinalities), d_token)) if bias else None

        for parameter in [self.embeddings.weight, self.bias]:
            if parameter is not None:
                initialization_.apply(parameter, d_token)

    @property
    def n_tokens(self) -> int:
        return len(self.category_offsets)

    @property
    def d_token(self) -> int:
        return self.embeddings.embedding_dim

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embeddings(x + self.category_offsets[None])
        if self.bias is not None:
            x = x + self.bias[None]
        return x


class FeatureTokenizer(nn.Module):
    def __init__(
        self,
        n_num_features: int,
        cat_cardinalities: List[int],
        d_token: int,
        preproc_type: str,
        cat_preproc_type: str,
        preproc_args: dict,
        positional: bool,
        tf: bool
    ) -> None:
        super().__init__()
        assert n_num_features >= 0, 'n_num_features must be non-negative'
        assert (
            n_num_features or cat_cardinalities
        ), 'at least one of n_num_features or cat_cardinalities must be positive/non-empty'
        self.initialization = 'uniform'
        self.n_num_features = n_num_features
        self.d = d_token
        self.tf = tf
        self.args = preproc_args
        self.preproc_type = preproc_type
        self.cat_preproc_type = cat_preproc_type
        self.arm_num = None
        self.arm_num_cat = None
        if preproc_type == 'ARM' or preproc_type == 'ARM_Bin':
            self.attn_layer = SparseAttLayer(8, n_num_features, d_token, d_token, 16, 1.7)
            self.arm_num = 8 * self.d * 16 
        if n_num_features:
            if preproc_type == 'Periodic':
                self.num_tokenizer = Periodic(**preproc_args)
                self.d = self.d * 2
            elif preproc_type == 'Linear':
                self.num_tokenizer = LinearEmbeddings(**preproc_args)
            elif preproc_type == 'AutoDis':
                self.num_tokenizer = AutoDis(**preproc_args)
            elif preproc_type == 'Tokens' or preproc_type == 'ARM':
                self.num_tokenizer = FeaturesTokenizer(**preproc_args)
            elif preproc_type == 'SoftEmbedding':
                self.num_tokenizer = SoftEmbedding(**preproc_args)
            elif preproc_type == 'BinEncoding' or preproc_type == 'ARM_Bin':
                self.num_tokenizer = BinEncoding(**preproc_args)
            elif preproc_type == 'None':
                self.num_tokenizer = None
        else:
            self.num_tokenizer = None
        if cat_cardinalities is None:
            self.cat_tokenizer = None
        elif cat_preproc_type == 'Lookup' or cat_preproc_type == 'ARM': 
            self.cat_tokenizer = (
                CategoricalFeatureTokenizer(
                    cat_cardinalities, self.d, True, self.initialization
                )
                if cat_cardinalities
                else None
            )
        elif cat_preproc_type == 'TT':
            self.cat_tokenizer = (
                TabTransformer(
                    num_continuous=0,
                    categories=cat_cardinalities, 
                    dim=self.d, 
                    depth=3,
                    heads=8,
                )
                if cat_cardinalities
                else None
            )
        if cat_preproc_type == 'ARM':
            self.arm_num_cat = (self.n_num_features + 8 * 16) * self.d
            self.cat_attn_layer = SparseAttLayer(8, len(cat_cardinalities), self.d, self.d, 16, 1.7)
        if positional:
            self.positional = PositionalEncoding(self.d, self.n_num_features, 0.1, True)
        else:
            self.positional = None

    @property
    def n_tokens(self) -> int:
        return self.n_num_features + self.cat_tokenizer.n_tokens
        
    @property
    def d_token(self) -> int:
        return self.d

    def forward(self, x_num: Optional[torch.Tensor], x_cat: Optional[torch.Tensor]) -> torch.Tensor:
        if self.num_tokenizer is None:
            return x_num[..., None]
        assert (
            x_num is not None or x_cat is not None
        ), 'At least one of x_num and x_cat must be presented'
        assert _all_or_none(
            [self.num_tokenizer, x_num]
        ), 'If self.num_tokenizer is (not) None, then x_num must (not) be None'
        assert _all_or_none(
            [self.cat_tokenizer, x_cat]
        ), 'If self.cat_tokenizer is (not) None, then x_cat must (not) be None'
        x = []
        if self.num_tokenizer is not None:
            if self.positional is not None:
                x.append(self.positional(self.num_tokenizer(x_num)))
            else:
                x.append(self.num_tokenizer(x_num))
        if self.cat_tokenizer is not None:
            if self.cat_preproc_type == 'TT':
                x.append(self.cat_tokenizer(x_cat, None))
            elif self.cat_preproc_type == 'ARM':
                temp = self.cat_tokenizer(x_cat)
                w = self.cat_attn_layer(temp)
                w = torch.clamp(w, -1e5, 1e5)
                x_cat_arm = torch.exp(torch.einsum('bfe,bkof->bkoe', temp, w))          
                x_cat_arm = rearrange(x_cat_arm, 'b k o e -> b (k o) e')
                x_cat_arm = torch.clamp(x_cat_arm, -1e5, 1e5)
                x.append(x_cat_arm)
            else:
                x.append(self.cat_tokenizer(x_cat))

        if self.preproc_type == 'ARM' or self.preproc_type == 'ARM_Bin':
            e = x[0]
            arm_weight = self.attn_layer(e)
            arm_weight = torch.clamp(arm_weight, -1e5, 1e5)                             
            x_arm = torch.exp(torch.einsum('bfe,bkof->bkoe', e, arm_weight))         
            x_arm = rearrange(x_arm, 'b k o e -> b (k o) e')
            x_arm = torch.clamp(x_arm, -1e5, 1e5)
            x[0] = x_arm
            x = torch.cat(x, dim=1)
            if not self.tf:
                return x.reshape(x.shape[0], -1)
            else:
                return x
        if not self.tf:
            return x[0].view(x[0].shape[0], -1) if len(x) == 1 else torch.cat(x, dim=1).view(x[0].shape[0], -1)
        else:
            return x[0] if len(x) == 1 else torch.cat(x, dim=1)


In [9]:
from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union, cast

def reglu(x: torch.Tensor) -> torch.Tensor:
    assert x.shape[-1] % 2 == 0
    a, b = x.chunk(2, dim=-1)
    return a * F.relu(b)


def geglu(x: torch.Tensor) -> torch.Tensor:
    assert x.shape[-1] % 2 == 0
    a, b = x.chunk(2, dim=-1)
    return a * F.gelu(b)



ModuleType = Union[str, Callable[..., nn.Module]]
_INTERNAL_ERROR_MESSAGE = 'Internal error. Please, open an issue.'


def _is_glu_activation(activation: ModuleType):
    return (
        isinstance(activation, str)
        and activation.endswith('GLU')
        or activation in [ReGLU, GEGLU]
    )


def _all_or_none(values):
    return all(x is None for x in values) or all(x is not None for x in values)


class ReGLU(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return reglu(x)


class GEGLU(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return geglu(x)


In [10]:
def get_baseline_transformer_subconfig() -> Dict[str, Any]:
    return {
        'attention_n_heads': 8,
        'attention_initialization': 'kaiming',
        'ffn_activation': 'ReGLU',
        'attention_normalization': 'LayerNorm',
        'ffn_normalization': 'LayerNorm',
        'prenormalization': True,
        'first_prenormalization': False,
        'last_layer_query_idx': None,
        'n_tokens': None,
        'kv_compression_ratio': None,
        'kv_compression_sharing': None,
        'head_activation': 'ReLU',
        'head_normalization': 'LayerNorm',
    }


def get_default_transformer_config(n_blocks: int = 3) -> Dict[str, Any]:
    assert 1 <= n_blocks <= 6
    grid = {
        'd_token': [96, 128, 192, 256, 320, 384],
        'attention_dropout': [0.1, 0.15, 0.2, 0.25, 0.3, 0.35],
        'ffn_dropout': [0.0, 0.05, 0.1, 0.15, 0.2, 0.25],
    }
    arch_subconfig = {k: v[n_blocks - 1] for k, v in grid.items()}  # type: ignore
    baseline_subconfig = get_baseline_transformer_subconfig()
    ffn_d_hidden_factor = (
        (4 / 3) if _is_glu_activation(baseline_subconfig['ffn_activation']) else 2.0
    )
    return {
        'n_blocks': n_blocks,
        'residual_dropout': 0.0,
        'ffn_d_hidden': int(arch_subconfig['d_token'] * ffn_d_hidden_factor),
        **arch_subconfig,
        **baseline_subconfig,
    }

In [11]:
class GaussianNoise(nn.Module):
    def __init__(self, stddev, device):
        super().__init__()
        self.stddev = stddev
        self.device = device

    def forward(self, din):
        if self.training:
            return din + torch.autograd.Variable(
                torch.randn(din.size()).to(self.device) * self.stddev
            )
        return din

class ResNetBlock(nn.Module):
    def __init__(
        self,
        n_in,
        hid_factor,
        n_out,
        drop_rate=(0.1, 0.1),
        noise_std=0.05,
        act_fun=nn.ReLU,
        use_bn=True,
        use_noise=True,
        use_dropout=True,
        device=torch.device("cuda:0"),
        **kwargs,
    ):
        super(ResNetBlock, self).__init__()
        self.features = nn.Sequential(OrderedDict([]))

        if use_bn:
            self.features.add_module("norm", nn.BatchNorm1d(n_in))
        if use_noise:
            self.features.add_module("noise", GaussianNoise(noise_std, device))

        self.features.add_module("dense1", nn.Linear(n_in, int(hid_factor * n_in)))
        self.features.add_module("act1", act_fun())

        if use_dropout:
            self.features.add_module("drop1", nn.Dropout(p=drop_rate[0]))

        self.features.add_module("dense2", nn.Linear(int(hid_factor * n_in), n_out))

        if use_dropout:
            self.features.add_module("drop2", nn.Dropout(p=drop_rate[1]))

    def forward(self, x):
        x = self.features(x)
        return x


class ResNetModel(nn.Module):
    def __init__(
        self,
        n_in,
        n_out=1,
        hid_factor=[2, 2],
        drop_rate=[[0.1, 0.1], [0.1, 0.1]],
        bias=None,
        noise_std=0.05,
        act_fun=nn.ReLU,
        num_init_features=None,
        use_bn=True,
        use_noise=True,
        use_dropout=True,
        device=torch.device("cuda:0"),
        preproc=False,
        preproc_type=None,
        preproc_args=None,
        positional=False,
        tokenizer=None,
        **kwargs,
    ):
        super(ResNetModel, self).__init__()
        if isinstance(drop_rate, float):
            drop_rate = [[drop_rate, drop_rate]] * len(hid_factor)
        elif isinstance(drop_rate, tuple) and isinstance(drop_rate[0], float) and isinstance(drop_rate[1], float):
            drop_rate = [drop_rate] * len(hid_factor)
        else:
            assert len(drop_rate) == len(hid_factor), "Wrong number hidden_sizes/drop_rates. Must be equal."
        
        
        num_features = n_in if num_init_features is None else num_init_features
        self.features1 = nn.Sequential(OrderedDict([]))
        if tokenizer is not None:
            self.features1.add_module('preproc', tokenizer)
            if tokenizer.arm_num is not None:
                num_features = tokenizer.arm_num
            elif tokenizer.arm_num_cat is not None:
                num_features = tokenizer.arm_num_cat
            else:
                num_features = num_features * tokenizer.d
            self.d_embedding = tokenizer.d
        if preproc:
            if preproc_type == 'Periodic':
                self.features1.add_module('preproc', Periodic(**preproc_args))
                num_features = 2 * num_features * preproc_args['n']
                self.d_embedding = preproc_args['n']
            elif preproc_type == 'Linear':
                self.features1.add_module('preproc', LinearEmbeddings(**preproc_args))
                num_features = num_features * preproc_args['d_embeddings'][-1]
                self.d_embedding = preproc_args['d_embeddings'][-1]
            elif preproc_type == 'AutoDis':
                self.features1.add_module('preproc', AutoDis(**preproc_args))
                num_features = num_features * preproc_args['d_embedding']
                self.d_embedding = preproc_args['d_embedding']
            elif preproc_type == 'Tokens':
                self.features1.add_module('preproc', FeaturesTokenizer(**preproc_args))
                num_features = num_features * preproc_args['d_embedding']
                self.d_embedding = preproc_args['d_embedding']
            elif preproc_type == 'SoftEmbedding':
                self.features1.add_module('preproc', SoftEmbedding(**preproc_args))
                num_features = num_features * preproc_args['embeddings_dim']
                self.d_embedding = preproc_args['embeddings_dim']
        if positional:
            self.features1.add_module('positional', PositionalEncoding(self.d_embedding, n_in, 0.1, False))
        
        for i, hd_factor in enumerate(hid_factor):
            block = ResNetBlock(
                n_in=num_features,
                hid_factor=hd_factor,
                n_out=num_features,
                drop_rate=drop_rate[i] if use_dropout else 0,
                noise_std=noise_std,
                act_fun=act_fun,
                use_bn=use_bn,
                use_noise=use_noise,
                use_dropout=use_dropout,
                device=device,
            )
            self.features1.add_module("resnetblock%d" % (i + 1), block)

        self.features2 = nn.Sequential(OrderedDict([]))
        if use_bn:
            self.features2.add_module("norm", nn.BatchNorm1d(num_features))

        self.features2.add_module("act", act_fun())
        self.fc = nn.Linear(num_features, n_out)

        if bias is not None:
            print("init bias!")
            bias = torch.Tensor(bias)
            self.fc.bias.data = bias
            self.fc.weight.data = torch.zeros(n_out, num_features, requires_grad=True)

    def forward(self, x, x_cat=None):
        identity = None
        for name, layer in self.features1.named_children():
            if name != "preproc" and name != "resnetblock1" and name != 'positional':
                x += identity
                identity = x
            if name == "preproc":
                x = layer(x, x_cat)
                identity = x
            else:
                x = layer(x)

        x = self.features2(x)
        logits = self.fc(x)
        return logits.view(logits.shape[0], -1)


## Metrics

In [12]:
class Metric():
    def __init__(self, metric, higher_is_better=True, name='name', optimize=False, discrete=False, **kwargs):
        self.name = name
        self.higher_is_better = higher_is_better
        self.optimize = optimize
        self.discrete = discrete
        self.metric = metric
        self.best_thr = 0.5

    def __repr__(self):
        return str(self.name)

    def __call__(self, y_true, y_pred, thr=0.5, use_best=False):
        if self.discrete:
            return self.metric(y_true, y_pred, thr=thr if not use_best else self.best_thr)
        else:
            return self.metric(y_true, y_pred)

    def find_threshold(self, y_true, y_pred):
        if self.optimize:
            w0 = [0.5]
            res = fmin(self.opt, w0, args=(y_true, y_pred), disp=0)[0]
            self.best_thr = res
            return res
        else:
            return 0.5

    def opt(self, w, y_true, y_pred):
        return (-1) ** (self.higher_is_better) * self(y_true, y_pred, w[0])


def f1_custom(y_true, y_pred, thr=0.5):
    return f1_score(y_true, y_pred > thr, average='micro')


def f1_macro(y_true, y_pred, thr=0.5):
    return f1_score(y_true, y_pred > thr, average='macro')


def acc_score(y_true, y_pred, thr=0.5):
    return accuracy_score(y_true, y_pred > thr)


def bacc_score(y_true, y_pred, thr=0.5):
    return balanced_accuracy_score(y_true, y_pred > thr)


class MetricFactory:
    def __init__(self, ):
        self.metrics = {
            'auc': Metric(metric=roc_auc_score, higher_is_better=True, name='auc', optimize=False, discrete=False),
            'log-loss': Metric(metric=log_loss, higher_is_better=False, name='log-loss', optimize=False,
                              discrete=False),
            'f1': Metric(metric=f1_custom, higher_is_better=True, name='f1', optimize=True, discrete=True),
            'f1-macro': Metric(metric=f1_macro, higher_is_better=True, name='f1_macro', optimize=True, discrete=True),
            'balanced-acc': Metric(metric=bacc_score, higher_is_better=True, name='balanced-acc', optimize=True,
                                   discrete=True),
            'acc': Metric(metric=acc_score, higher_is_better=True, name='acc', optimize=True, discrete=True),
            'mse': Metric(metric=mean_squared_error, higher_is_better=False, name='mse', optimize=False, discrete=False),
            'r2': Metric(metric=r2_score, higher_is_better=True, name='r2', optimize=False, discrete=False),
            'mae': Metric(metric=mean_absolute_error, higher_is_better=False, name='mae', optimize=False, discrete=False)
        }

    def get_allowed(self):
        return sorted(list(self.metrics.keys()))

    def add(self, metric_name, metric_class):
        self.metrics[metric_name] = metric_class
        return self

    def remove(self, metric_name):
        del self.models[metric_name]
        return self

    def __getitem__(self, metric_name):
        return deepcopy(self.metrics[metric_name])


## Train and evaluation

In [13]:
def compute_metrics(
    outputs,
    targets,
) -> Dict[str, float]:
    metrics = {}
    
    y_true = np.array(targets.cpu())
    y_pred = (np.array(outputs.detach().cpu()))
    
    mse = metric_factory['mse']
    mse_score = mse(y_true, y_pred)
    
    mae = metric_factory['mae']
    mae_score = mae(y_true, y_pred)
    
    r2 = metric_factory['r2']
    r2_score = r2(y_true, y_pred)
    
    metrics['rmse'] = np.sqrt(mse_score)
    metrics['mae'] = mae_score
    metrics['r2'] = r2_score
    
    return metrics

In [14]:
def train_epoch_ftt(
    model: nn.Module,
    dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    criterion: torch.nn.Module,
    device: torch.device,
    epoch: int,
    silent: bool
) -> None:
    model.train()

    epoch_loss = []
    batch_metrics_list = defaultdict(list)
    if not silent:
        for i, (data, targets) in tqdm(
            enumerate(dataloader),
            total=len(dataloader),
        ):

            data, targets = data.to(device), targets.to(device)
            data_num = data
            data_cat = None
            optimizer.zero_grad()
            pred = model(data_num, data_cat)
            loss = criterion(pred, targets)
            loss.backward()
            optimizer.step()
    else:
        for i, (data, targets) in enumerate(dataloader):

            data, targets = data.to(device), targets.to(device)
            data_num = data
            data_cat = None
            optimizer.zero_grad()
            pred = model(data_num, data_cat)
            loss = criterion(pred, targets)
            loss.backward()
            optimizer.step()

In [15]:
def evaluate_epoch_ftt(
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    criterion: torch.nn.Module,
    scheduler: torch.optim.lr_scheduler,
    writer: list,
    device: torch.device,
    epoch: int,
    dataset: str,
    glob_silent: bool,
    silent: bool
) -> None:

    model.eval()

    epoch_loss = []
    batch_metrics_list = defaultdict(list)
    
    true_val = None
    pred_val = None

    with torch.no_grad():
        if dataset == 'train':
            desc = 'loop over train batches'
        else:
            desc = 'loop over test batches'

        if not glob_silent:
            for i, (data, targets) in tqdm(
                enumerate(dataloader),
                total=len(dataloader),
                desc=desc,
            ):

                data, targets = data.to(device), targets.to(device)
                data_num = data
                data_cat = None
                outputs = model(data_num, data_cat)
                loss = criterion(outputs, targets)

                epoch_loss.append(loss.item())

                if true_val is None:
                    true_val = targets
                else:
                    true_val = torch.cat((true_val, targets), 0)
                
                if pred_val is None:
                    pred_val = outputs
                else:
                    pred_val = torch.cat((pred_val, outputs), 0)
        else: 
            for i, (data, targets) in enumerate(dataloader):

                data, targets = data.to(device), targets.to(device)
                data_num = data
                data_cat = None
                outputs = model(data_num, data_cat)
                loss = criterion(outputs, targets)

                epoch_loss.append(loss.item())

                if true_val is None:
                    true_val = targets
                else:
                    true_val = torch.cat((true_val, targets), 0)
                
                if pred_val is None:
                    pred_val = outputs
                else:
                    pred_val = torch.cat((pred_val, outputs), 0)
                    
        batch_metrics = compute_metrics(
            outputs = pred_val,
            targets = true_val
        )
        
        for metric_name, metric_value in batch_metrics.items():
            batch_metrics_list[metric_name].append(metric_value)


        loss = criterion(pred_val, true_val)
        if dataset == 'test' and scheduler is not None:
            scheduler.step(loss)
        val_loss = loss.item()
        
        if not silent:
            if dataset == 'train':
                print(f'Train loss: {val_loss}\n')
            else:
                print(f'Test loss: {val_loss}\n')

        writer.append((batch_metrics_list, val_loss))
        
        for metric_name, metric_value_list in batch_metrics_list.items():
            metric_value = metric_value_list[0]
            if not silent:
                if dataset == 'train':
                    print(f'Train {metric_name}: {metric_value}\n')
                else:
                    print(f'Test {metric_name}: {metric_value}\n')

In [16]:
def train_ftt(
    n_epochs: int,
    model: torch.nn.Module,
    train_dataloader: torch.utils.data.DataLoader,
    val_dataloader: torch.utils.data.DataLoader,
    test_dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    scheduler: torch.optim.lr_scheduler,
    criterion: torch.nn.Module,
    writer_train: list,
    writer_val: list,
    writer_test: list,
    device: torch.device,
    patience=10,
    silent=False,
    glob_silent=False
) -> None:
    best_epoch = -1
    best_metric = np.inf
    best_r2 = np.inf
    best_mae = np.inf
    
    
    for epoch in range(n_epochs):

        if not silent:
            print(f"Epoch [{epoch+1} / {n_epochs}]\n")

        train_epoch_ftt(
            model=model,
            dataloader=train_dataloader,
            optimizer=optimizer,
            criterion=criterion,
            device=device,
            epoch=epoch,
            silent=silent
        )
        evaluate_epoch_ftt(
            model=model,
            dataloader=train_dataloader,
            criterion=criterion,
            scheduler=scheduler,
            writer=writer_train,
            device=device,
            epoch=epoch,
            dataset='train',
            glob_silent=glob_silent,
            silent=silent
        )
        evaluate_epoch_ftt(
            model=model,
            dataloader=val_dataloader,
            criterion=criterion,
            scheduler=scheduler,
            writer=writer_val,
            device=device,
            epoch=epoch,
            dataset='test',
            glob_silent=glob_silent,
            silent=silent
        )
        temp_metric = writer_val[-1][0]['rmse'][0]
        if temp_metric < best_metric:
            best_epoch = epoch
            best_metric = temp_metric
            best_r2 = writer_val[-1][0]['r2'][0]
            best_mae = writer_val[-1][0]['mae'][0]
        elif epoch - best_epoch > patience:
            break
    evaluate_epoch_ftt(
            model=model,
            dataloader=test_dataloader,
            criterion=criterion,
            scheduler=scheduler,
            writer=writer_test,
            device=device,
            epoch=None,
            dataset='test',
            glob_silent=glob_silent,
            silent=silent
    )
    rmse = writer_test[-1][0]['rmse'][0]
    r2 = writer_test[-1][0]['r2'][0]
    mae = writer_test[-1][0]['mae'][0]
    return best_metric, best_r2, best_mae
    

## Data

### NYC Taxi

In [17]:
from sklearn.datasets import fetch_openml
df_nt = fetch_openml(data_id=42729, as_frame=True, parser='auto').frame
df_nt = df_nt[df_nt['tip_amount'] <= 20]
nf_nt = ['PULocationID', 'DOLocationID', 'passenger_count', 'tolls_amount', 'total_amount',
         'lpep_pickup_datetime_day', 'lpep_pickup_datetime_hour', 'lpep_pickup_datetime_minute',
        'lpep_dropoff_datetime_day', 'lpep_dropoff_datetime_hour', 'lpep_dropoff_datetime_minute']
cf_nt = ['VendorID', 'store_and_fwd_flag', 'RatecodeID', 'extra', 'mta_tax', 
        'improvement_surcharge', 'trip_type']
scaler = StandardScaler()
df_nt[nf_nt] = scaler.fit_transform(df_nt[nf_nt])
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transf = ohe.fit_transform(df_nt[cf_nt])
tdf = pd.DataFrame(transf, columns=ohe.get_feature_names_out())
tdf.index = df_nt.index
df_nt = df_nt.drop(cf_nt, axis=1)
df_nt = pd.concat([df_nt, tdf], axis=1)
df_nt_target_name = 'tip_amount'
df_nt.shape

(581006, 34)

### Colleges

In [18]:
df_cl = fetch_openml(data_id=42727, as_frame=True).frame
df_cl.fillna(df_cl.median(), inplace=True)
nf_cl = ['city', 'state', 'zip', 'latitude', 'longitude', 'admission_rate',
        'sat_verbal_midrange', 'sat_math_midrange', 'sat_writing_midrange', 'act_combined_midrange', 
        'act_english_midrange', 'act_math_midrange', 'act_writing_midrange', 'sat_total_average', 'undergrad_size',
        'percent_white', 'percent_black', 'percent_hispanic', 'percent_asian', 'percent_part_time', 
        'average_cost_academic_year', 'average_cost_program_year', 'tuition_(instate)', 'tuition_(out_of_state)',
        'spend_per_student', 'faculty_salary', 'percent_part_time_faculty', 'completion_rate', 'percent_female',
        'agege24', 'faminc', 'mean_earnings_6_years', 'median_earnings_6_years', 'mean_earnings_10_years',
        'median_earnings_10_years', 'carnegie_basic_classification', 'carnegie_undergraduate', 'carnegie_size',
        'religious_affiliation', ]
cf_cl = ['predominant_degree', 'highest_degree', 'ownership', 'region', 'gender']
le = LabelEncoder()
df_cl['city'] = le.fit_transform(df_cl['city'])
df_cl['state'] = le.fit_transform(df_cl['state'])
df_cl['zip'] = le.fit_transform(df_cl['zip'])
df_cl['carnegie_basic_classification'] = le.fit_transform(df_cl['carnegie_basic_classification'])
df_cl['carnegie_undergraduate'] = le.fit_transform(df_cl['carnegie_undergraduate'])
df_cl['carnegie_size'] = le.fit_transform(df_cl['carnegie_size'])
df_cl['religious_affiliation'] = le.fit_transform(df_cl['religious_affiliation'])
scaler = StandardScaler()
df_cl[nf_cl] = scaler.fit_transform(df_cl[nf_cl])
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transf = ohe.fit_transform(df_cl[cf_cl])
tdf = pd.DataFrame(transf, columns=ohe.get_feature_names_out())
tdf.index = df_cl.index
df_cl = df_cl.drop(cf_cl, axis=1)
df_cl = pd.concat([df_cl, tdf], axis=1)
df_cl_target_name = 'percent_pell_grant'
df_cl.shape

  warn(
  df_cl.fillna(df_cl.median(), inplace=True)


(7063, 65)

### House sales

In [19]:
df_hs = fetch_openml(data_id=42731, as_frame=True).frame
df_hs.fillna(df_hs.median(), inplace=True)
df_hs = df_hs[df_hs['price'] <= 3000000]
nf_hs = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built', 
    'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'date_year', 'date_month', 'date_day']
cf_hs = ['floors', 'waterfront', 'view', 'condition', 'grade']
le = LabelEncoder()
df_hs['zipcode'] = le.fit_transform(df_hs['zipcode'])
scaler = StandardScaler()
df_hs[nf_hs] = scaler.fit_transform(df_hs[nf_hs])
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transf = ohe.fit_transform(df_hs[cf_hs])
tdf = pd.DataFrame(transf, columns=ohe.get_feature_names_out())
tdf.index = df_hs.index
df_hs = df_hs.drop(cf_hs, axis=1)
df_hs = pd.concat([df_hs, tdf], axis=1)
df_hs_target_name = 'price'
df_hs.shape

  warn(
  df_hs.fillna(df_hs.median(), inplace=True)


(21568, 47)

### Black friday

In [20]:
df_bf = fetch_openml(data_id=41540, as_frame=True).frame
df_bf.fillna(df_bf.median(), inplace=True)
nf_bf = ['Occupation', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3']
cf_bf = ['Gender', 'Age', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status']
le = LabelEncoder()
scaler = StandardScaler()
df_bf[nf_bf] = scaler.fit_transform(df_bf[nf_bf])
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transf = ohe.fit_transform(df_bf[cf_bf])
tdf = pd.DataFrame(transf, columns=ohe.get_feature_names_out())
tdf.index = df_bf.index
df_bf = df_bf.drop(cf_bf, axis=1)
df_bf = pd.concat([df_bf, tdf], axis=1)
df_bf_target_name = 'Purchase'
df_bf.shape

  warn(
  df_bf.fillna(df_bf.median(), inplace=True)


(166821, 24)

### Beijing PM2.5

In [22]:
df_bp = pd.read_csv('./PRSA_data_2010.1.1-2014.12.31.csv')
df_bp.drop(['No'], axis=1, inplace=True)
df_bp.fillna(df_bp.median(), inplace=True)
df_bp = df_bp[df_bp['pm2.5'] <= 600]
nf_bp = ['month', 'day', 'hour', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir']
cf_bp = ['year', 'cbwd']
le = LabelEncoder()
df_bp['year'] = le.fit_transform(df_bp['year'])
df_bp['cbwd'] = le.fit_transform(df_bp['cbwd'])
scaler = StandardScaler()
df_bp[nf_bp] = scaler.fit_transform(df_bp[nf_bp])
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transf = ohe.fit_transform(df_bp[cf_bp])
tdf = pd.DataFrame(transf, columns=ohe.get_feature_names_out())
tdf.index = df_bp.index
df_bp = df_bp.drop(cf_bp, axis=1)
df_bp = pd.concat([df_bp, tdf], axis=1)
df_bp_target_name = 'pm2.5'
df_bp.shape

  df_bp.fillna(df_bp.median(), inplace=True)


(43792, 19)

### Brazilian houses

In [23]:
df_bh = fetch_openml(data_id=42688, as_frame=True).frame
df_bh.fillna(df_bh.median(), inplace=True)
df_bh = df_bh[df_bh['total_(BRL)'] <= 40000]
nf_bh = ['area', 'rooms', 'bathroom', 'parking_spaces', 'floor', 'hoa_(BRL)', 'rent_amount_(BRL)',
        'property_tax_(BRL)', 'fire_insurance_(BRL)']
cf_bh = ['city', 'animal', 'furniture']
le = LabelEncoder()
scaler = StandardScaler()
df_bh['city'] = le.fit_transform(df_bh['city'])
df_bh['animal'] = le.fit_transform(df_bh['animal'])
df_bh['furniture'] = le.fit_transform(df_bh['furniture'])
df_bh[nf_bh] = scaler.fit_transform(df_bh[nf_bh])
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
transf = ohe.fit_transform(df_bh[cf_bh])
tdf = pd.DataFrame(transf, columns=ohe.get_feature_names_out())
tdf.index = df_bh.index
df_bh = df_bh.drop(cf_bh, axis=1)
df_bh = pd.concat([df_bh, tdf], axis=1)
df_bh_target_name = 'total_(BRL)'
df_bh.shape

  warn(
  df_bh.fillna(df_bh.median(), inplace=True)


(10685, 19)

## Runs

In [24]:
dfs = {'nt': df_nt, 'cl': df_cl, 'hs': df_hs, 'bf': df_bf, 'bp': df_bp, 'bh': df_bh}
dfs_shapes = {'nt': df_nt.shape[1]-1, 'cl': df_cl.shape[1]-1, 'hs': df_hs.shape[1]-1, 'bf': df_bf.shape[1]-1, 'bp': df_bp.shape[1]-1, 'bh': df_bh.shape[1]-1}
dfs_targets = {'nt': df_nt_target_name, 'cl': df_cl_target_name, 'hs': df_hs_target_name, 'bf': df_bf_target_name, 'bp': df_bp_target_name, 'bh': df_bh_target_name}
dfs_names = ['nt', 'cl', 'hs', 'bf', 'bp', 'bh']
preproc_types = ['Periodic', 'Fourier', 'Linear', 'AutoDis', 'Tokens', 'SoftEmbedding']

In [25]:
print(df_nt.shape)
print(df_cl.shape)
print(df_hs.shape)
print(df_bf.shape)
print(df_bp.shape)
print(df_bh.shape)

(581006, 34)
(7063, 65)
(21568, 47)
(166821, 24)
(43792, 19)
(10685, 19)


In [26]:
def get_preproc_params(dataset, preproc_type, tf, d_embedding):
    preproc_params = {}
    if preproc_type == 'Periodic':
        preproc_params = {'n_features': dfs_shapes[dataset], 'n': d_embedding, 'sigma': 0.1, 'trainable': True,
                         'initialization': 'normal', 'tf': tf}
        preproc_name = 'Periodic'
    elif preproc_type == 'Fourier':
        preproc_params = {'n_features': dfs_shapes[dataset], 'n': d_embedding, 'sigma': 0.1, 'trainable': False,
                         'initialization': 'normal', 'tf': tf}
        preproc_name = 'Periodic'
    elif preproc_type == 'Linear':
        preproc_params = {'n_layers': 2, 'n_features': dfs_shapes[dataset], 
                          'd_embeddings': [d_embedding, d_embedding], 'tf': tf}
        preproc_name = 'Linear'
    elif preproc_type == 'AutoDis':
        preproc_params = {'n_features': dfs_shapes[dataset], 'd_embedding': d_embedding, 'n_meta_embeddings': 20,
                         'temperature': 0.5, 'tf': tf}
        preproc_name = 'AutoDis'
    elif preproc_type == 'Tokens' or preproc_type == 'ARM':
        preproc_params = {'n_features': dfs_shapes[dataset], 'd_embedding': d_embedding, 'tf': tf}
        preproc_name = preproc_type
    elif preproc_type == 'SoftEmbedding':
        preproc_params = {'num_embeddings': dfs_shapes[dataset], 'embeddings_dim': d_embedding, 
                          'emb_initializer': None, 'tf': tf}
        preproc_name = 'SoftEmbedding'
    elif preproc_type == 'BinEncoding' or preproc_type == 'ARM_Bin':
        preproc_params = {'bin_edges': bin_edges, 'bins': bins, 'bin_values': bin_values, 'nbins': nbins,
                         'tf': tf, 'd_token': d_embedding, 'bias': True, 'initialization': 'normal', 
                          'device': device}
        preproc_name = 'BinEncoding'
    return preproc_params, preproc_name

In [27]:
path = './output/reg/resnet/'
positional = False
tf = False
d_embedding = 10

In [28]:
torch.cuda.empty_cache()
import gc
gc.collect()

0

In [None]:
%%time
for df_name in tqdm(dfs_names):
    for preproc_type in tqdm(preproc_types):
        temp_path = df_name + '/' + preproc_type + '/'
        X = dfs[df_name].drop([dfs_targets[df_name]], axis=1)
        y = dfs[df_name][dfs_targets[df_name]]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
        X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(X_train, y_train, test_size=0.2, shuffle=False)
        temp = []
        final_res = defaultdict(list)
        n_runs=15
        silent=True,
        glob_silent=True
        best_metric = np.inf
        best_r2 = np.inf
        best_mae = np.inf
        cat_len = 0
        seeds = np.random.randint(1, 100500, 15)
        for i in range(n_runs):
            if not silent:
                print(f'Run {i+1}:')
                print()
            set_global_seed(seeds[i])
            res_metrics = defaultdict(list)

            X_train_t = torch.from_numpy(X_train_t.values).float()
            y_train_t = torch.from_numpy(y_train_t.values.ravel()).float().unsqueeze(1)
            X_test_t = torch.from_numpy(X_test_t.values).float()
            y_test_t = torch.from_numpy(y_test_t.values.ravel()).float().unsqueeze(1)
            X_test = torch.from_numpy(X_test.values).float()
            y_test = torch.from_numpy(y_test.values.ravel()).float().unsqueeze(1)
        
            train_ds = torch.utils.data.TensorDataset(X_train_t, y_train_t)
            val_ds = torch.utils.data.TensorDataset(X_test_t, y_test_t)
            test_ds = torch.utils.data.TensorDataset(X_test, y_test)
            train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=256, shuffle=True)
            val_dataloader = torch.utils.data.DataLoader(val_ds, batch_size=256, shuffle=True)
            test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size=256, shuffle=True)
    
            writer_train = []
            writer_val = []
            writer_test = []
        
        
            metric_factory = MetricFactory()
            preproc_params, preproc_name = get_preproc_params(df_name, preproc_type, tf, d_embedding)
                
                
            preproc_params['tf'] = True
            feature_tokenizer = FeatureTokenizer(X.shape[1], None, d_embedding, 
                                                     preproc_name, 'Lookup', preproc_params, positional, tf)
            model = ResNetModel(n_in=dfs_shapes[df_name], device=device, use_bn=True, use_noise=True, use_dropout=True,
                preproc=False, preproc_type=preproc_name, preproc_args=preproc_params, positional=positional,
                                    tokenizer=feature_tokenizer).to(device)
                
            optimizer = torch.optim.Adam(model.parameters(), lr=5e-3, weight_decay=0)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, patience=5, factor=1e-1, min_lr=1e-6)
            criterion = torch.nn.MSELoss()
            metric_rmse, metric_r2, metric_mae = train_ftt(100, model, train_dataloader, val_dataloader, test_dataloader, optimizer, None, criterion, writer_train,
                               writer_val, writer_test, device, 20, silent, glob_silent)
            test_metrics = writer_test[-1][0]
            test_loss = writer_test[-1][1]
            for key, value in test_metrics.items():
                res_metrics[key].append(value[0])
            res_metrics['loss'].append(test_loss)
            for key, value in res_metrics.items():
                final_res[key].append(np.mean(value))
            s = 'run' + str(i + 1) + '.pickle'
            with open(path + temp_path + s, 'wb') as f:
                pickle.dump(res_metrics, f)
        
        ans = {}
        for key, value in final_res.items():
            ans[key] = float(np.mean(value))
        with open(path + temp_path + 'metrics.yml', 'w') as f:
           ruamel.yaml.round_trip_dump(ans, f)