In [1]:
# Install the third-party packages used below and clone the project repository;
# the import cell adds the cloned 'testing-kan' directory to sys.path.
# NOTE(review): versions are not pinned, so a re-run may pull different releases.
!pip install delu
!pip install ucimlrepo
!pip install gdown
!pip install rtdl_num_embeddings
!git clone https://github.com/gbulgakov/testing-kan.git


Collecting delu
  Downloading delu-0.0.26-py3-none-any.whl.metadata (805 bytes)
Downloading delu-0.0.26-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: delu
Successfully installed delu-0.0.26
Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7
Collecting rtdl_num_embeddings
  Downloading rtdl_num_embeddings-0.0.11-py3-none-any.whl.metadata (882 bytes)
Downloading rtdl_num_embeddings-0.0.11-py3-none-any.whl (13 kB)
Installing collected packages: rtdl_num_embeddings
Successfully installed rtdl_num_embeddings-0.0.11
Cloning into 'testing-kan'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (133/133), done.[K
remote: Compressing objects: 100% (125/125), done.[K
rem

In [5]:
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import optuna
import numpy as np
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo
from typing import Literal, Optional
from torch import Tensor
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import rtdl_num_embeddings
import delu
from IPython.display import FileLink
from tqdm import tqdm


# наши импорты
sys.path.append('/kaggle/working/testing-kan/optimizers')
sys.path.append('/kaggle/working/testing-kan')
from ademamix import AdEMAMix
from mars import MARS
from muon import Muon
from efficient_kan import KAN
import utils

In [3]:
# Download a dataset archive from Google Drive by file id.
# Only the uncommented line is executed; uncomment the line of the dataset you need.
# !gdown 1xvRa_-OEeG6xNRYE5V5iAfTwyWM1NiLl # otto
# !gdown 1tYyhbHdYs_8I9jvXznMoeUAfBzwitaax # house
# !gdown 1hy1dOAL2SE-XZSuMcjLcVgml2CoYkF9q # higgs-small
# !gdown 1hr076cK9QFxH6YZRg5V4av-H7IAve59r # gesture
# !gdown 1ZNScy5fgqtgudT6MZ4EjLt1nwdqirtmX # fb-comments
!gdown 1s0w7gnhiwBCkF49Wdi_cUDpUtXlz2_6q # eye
# !gdown 1T04iP04UGVo95Om84ww1Ed8AFNziOaeY # covtype
# !gdown 1GOkNlinj4zHVSNKbqjN1rR4cvsAf2IgR # churn
# !gdown 11B-l4EasJkclK_Q-RBcxYfGJLSvz-v5c # california
# !gdown 1p8uqDPMfRlFIc69m7iikS6wGkA6JGj1H # adult

Downloading...
From: https://drive.google.com/uc?id=1s0w7gnhiwBCkF49Wdi_cUDpUtXlz2_6q
To: /kaggle/working/eye.zip
100%|█████████████████████████████████████████| 534k/534k [00:00<00:00, 101MB/s]


Я вынес в файл ``utils`` функции ``count_parameters``, ``load_dataset``, ``seed_everything``, ``write_results``.
Модели не меняем.

In [6]:
# Names of all datasets known to the experiments.
DATASETS = [
    'adult',
    'california',
    'churn',
    'covtype',
    'eye',
    'fb-comments',
    'gesture',
    'higgs-small',
    'house',
    'microsoft',
    'otto',
    'santander',
]

# Mini-batch size per dataset; looked up by dataset name in train() and validate().
BATCH_SIZES = {
    'gesture': 128,
    'churn': 128,
    'california': 256,
    'house': 256,
    'adult': 256,
    'otto': 512,
    'higgs-small': 512,
    'fb-comments': 512,
    'santander': 1024,
    'covtype': 1024,
    'microsoft': 1024,
    'eye': 128,
}

# Dataset names grouped by task type.
REGRESSION = [
    'house',
    'fb-comments',
    'microsoft',
    'california',
]
MULTICLASS = [
    'covtype',
    'eye',
    'gesture',
    'otto',
]
BINCLASS = [
    'adult',
    'churn',
    'higgs-small',
    'santander',
]

# Optimizer name -> optimizer class; get_optimizer() instantiates the selected class.
OPTIMIZERS = dict(
    adamw=torch.optim.AdamW,
    mars=MARS,
    ademamix=AdEMAMix,
    muon=Muon,
)

Модели не меняем.

In [7]:
class MLP(nn.Sequential):
    """Fully connected network stored as a single ``classifier`` sub-module.

    Every hidden layer is ``Linear -> SiLU -> Dropout``; the last layer is a
    plain ``Linear`` with no activation.

    Args:
        layers: sequence of layer sizes ``[n_in, hidden_1, ..., n_out]``
            (at least two entries are required).
        dropout: dropout probability applied after every hidden activation.
    """

    def __init__(self, layers, dropout):
        super().__init__()

        modules = []
        n_hidden = len(layers) - 2
        for idx in range(n_hidden):
            modules += [
                nn.Linear(layers[idx], layers[idx + 1]),
                nn.SiLU(inplace=False),
                nn.Dropout(dropout, inplace=False),
            ]
        # Output layer.
        modules.append(nn.Linear(layers[-2], layers[-1]))

        # Registered as the only child of this Sequential, so forward() runs it.
        self.classifier = nn.Sequential(*modules)


In [8]:
class ModelWithEmbedding(nn.Module):
    """Backbone model with optional embeddings for the continuous features.

    Args:
        n_cont_features: number of continuous input features.
        d_embedding: embedding size per continuous feature.
        emb_name: one of ``'none'``, ``'periodic'``, ``'piecewiselinearq'``,
            ``'piecewiselineart'``.
        backbone_model: module applied to the concatenated features.
        bins: bins passed to ``PiecewiseLinearEmbeddings`` (only read for the
            piecewise-linear embeddings).
        sigma: passed as ``frequency_init_scale`` to ``PeriodicEmbeddings``
            (only read for ``'periodic'``).

    Raises:
        ValueError: if ``emb_name`` is not one of the supported names.
    """

    def __init__(
        self,
        n_cont_features,
        d_embedding,
        emb_name,
        backbone_model,
        bins, sigma=None
    ) -> None:
        super().__init__()
        self.d_embedding = d_embedding
        self.emb_name = emb_name

        if emb_name == 'periodic':
            self.cont_embeddings = rtdl_num_embeddings.PeriodicEmbeddings(
                n_cont_features, d_embedding, frequency_init_scale=sigma, lite=True
            )
        elif emb_name in ('piecewiselinearq', 'piecewiselineart'):
            self.cont_embeddings = rtdl_num_embeddings.PiecewiseLinearEmbeddings(
                d_embedding=d_embedding, activation=False, version='B', bins=bins
            )
        elif emb_name != 'none':
            # Fail at construction: forward() would otherwise hit a missing
            # ``cont_embeddings`` attribute on the first batch.
            raise ValueError(
                f"Unknown emb_name {emb_name!r}; expected 'none', 'periodic', "
                "'piecewiselinearq' or 'piecewiselineart'"
            )

        self.backbone = backbone_model

    def forward(
        self,
        x_num: Tensor,
        x_cat: Optional[Tensor] = None
    ) -> Tensor:
        """Embed ``x_num`` (unless ``emb_name == 'none'``), append ``x_cat`` and run the backbone.

        Both inputs are flattened from dim 1 on and concatenated column-wise,
        because the backbone consumes one flat feature vector per sample.
        Categorical features are passed through without embeddings.
        """
        if self.emb_name != 'none':
            x_num = self.cont_embeddings(x_num)
        parts = [x_num.flatten(1)]

        if x_cat is not None:
            parts.append(x_cat.flatten(1))

        return self.backbone(torch.column_stack(parts))

В ``train`` теперь передаем целиком ``optimizer``.  

In [9]:
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.optim import AdamW, Adam
from torch.optim.lr_scheduler import ExponentialLR, StepLR, CosineAnnealingLR
from torch.nn import MSELoss
import torch.nn as nn
import time


def apply_model(batch: dict[str, Tensor], model) -> Tensor:
    """Run ``model`` on a batch dict and drop a trailing size-1 dimension.

    ``batch['X_num']`` is required; ``batch['X_cat']`` is optional and is
    passed as ``None`` when the key is absent.
    """
    x_num = batch['X_num']
    x_cat = batch.get('X_cat')
    prediction = model(x_num, x_cat)
    return prediction.squeeze(-1)


def train(
    epochs, model, model_emb_name,
    device, dataset, loss_fn,
    optimizer,
    optimizer_name=None
):
    """Train ``model`` on ``dataset['train']`` and return the mean wall-clock seconds per epoch.

    A cosine-annealing schedule over ``epochs`` steps is attached to
    ``optimizer`` and stepped once per epoch. The batch size is looked up in
    ``BATCH_SIZES`` by the dataset name, i.e. the part of
    ``dataset['info']['id']`` before ``'--'``.

    ``model_emb_name`` and ``optimizer_name`` are only used in the progress-bar label.
    """
    scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
    model.to(device)

    info = dataset['info']
    dataset_name = info['id'].split('--')[0]
    is_multiclass = info['task_type'] == 'multiclass'
    batch_size = BATCH_SIZES[dataset_name]
    progress_label = f'{model_emb_name}_{optimizer_name} on {dataset_name}'

    epoch_durations = []
    for _ in tqdm(range(epochs), desc=progress_label):
        epoch_started = time.time()
        model.train()

        for batch in delu.iter_batches(dataset['train'], shuffle=True, batch_size=batch_size):
            batch = {key: tensor.to(device) for key, tensor in batch.items()}
            optimizer.zero_grad()
            output = apply_model(batch, model)
            # The multiclass loss needs integer class indices, so the target is cast to long.
            target = batch['y'].long() if is_multiclass else batch['y']
            loss_fn(output, target).backward()
            optimizer.step()

        scheduler.step()
        epoch_durations.append(time.time() - epoch_started)

    # Average duration of one training epoch.
    return sum(epoch_durations) / len(epoch_durations)

def validate(model, device, dataset, loss_fn, part='val'):
    """Evaluate ``model`` on ``dataset[part]``.

    Args:
        model: module called through ``apply_model``.
        device: device the model and every batch tensor are moved to.
        dataset: dict with an ``'info'`` entry (``'id'``, ``'task_type'``) and
            one dict of tensors per split.
        loss_fn: callable ``loss_fn(output, target)`` returning a scalar tensor.
        part: split to evaluate, ``'val'`` by default.

    Returns:
        ``(mean_loss, accuracy, elapsed_seconds)`` where ``mean_loss`` is the
        mean of the per-batch losses and ``elapsed_seconds`` is the wall-clock
        time of the evaluation loop. For regression the accuracy value has no
        meaning; it is kept only so the return shape stays the same.
    """
    model.eval()
    model.to(device)

    dataset_name = dataset['info']['id'].split('--')[0]
    task_type = dataset['info']['task_type']
    batch_size = BATCH_SIZES[dataset_name]

    total_loss = 0.0
    num_batches = 0
    pred = []
    gt = []  # ground-truth targets

    with torch.no_grad():
        start_time = time.time()
        for data in delu.iter_batches(dataset[part], shuffle=False, batch_size=batch_size):
            for key, tensor in data.items():
                data[key] = tensor.to(device)
            output = apply_model(data, model)
            if task_type == 'multiclass':
                data['y'] = data['y'].long()
            total_loss += loss_fn(output, data['y']).item()
            # Count the batches actually seen: `n // batch_size + 1` over-counts
            # by one whenever n is an exact multiple of the batch size.
            num_batches += 1

            if output.dim() > 1:
                pred.append(output.argmax(1))
            elif task_type == 'binclass':
                # The binary loss in this notebook is binary_cross_entropy_with_logits,
                # so `output` holds raw logits: probability 0.5 is logit 0.
                pred.append(output >= 0.0)
            else:
                pred.append(output >= 0.5)
            gt.append(data['y'])
        # Elapsed time must be end - start (it was previously negated).
        val_time = time.time() - start_time

    pred = torch.cat(pred)
    gt = torch.cat(gt)
    val_accuracy = (pred == gt).float().mean().item()

    return total_loss / num_batches, val_accuracy, val_time


Подбор параметров не меняем, тюним архитектуру сеток, ``lr``, ``weight_decay``.

In [10]:
def suggest_params(trial, optuna_params, model_name, emb_name, optim_name):
    """Sample one hyperparameter configuration from an Optuna trial.

    Args:
        trial: Optuna trial used for sampling.
        optuna_params: search-space bounds, as built by ``read_optuna_params``.
        model_name: dropout is sampled only when this is ``'mlp'``.
        emb_name: embedding type; controls ``d_embedding`` and ``sigma``.
        optim_name: ``weight_decay`` is skipped for ``'muon'``.

    Returns:
        dict with ``n_layers``, ``layer_width``, ``lr``, ``d_embedding`` and
        ``sigma``, plus ``weight_decay`` and ``use_dropout``/``dropout`` when
        they apply.
    """
    space = optuna_params
    params = {}

    # Architecture and learning rate are always tuned.
    params['n_layers'] = trial.suggest_int('n_layers', 1, space['max_n_layer'])
    params['layer_width'] = trial.suggest_int(
        'layer_width',
        space['min_layer_width'],
        space['max_layer_width'],
        step=space['layer_width_step'],
    )
    params['lr'] = trial.suggest_float('lr', space['min_lr'], space['max_lr'], log=True)

    if optim_name != 'muon':
        params['weight_decay'] = trial.suggest_float(
            'weight_decay', space['min_weight_decay'], space['max_weight_decay'], log=True
        )

    if emb_name != 'none':
        params['d_embedding'] = trial.suggest_int(
            'd_embedding', space['min_d_embedding'], space['max_d_embedding']
        )
    else:
        params['d_embedding'] = 0

    if model_name == 'mlp':
        use_dropout = trial.suggest_categorical('use_dropout', [True, False])
        params['use_dropout'] = use_dropout
        if use_dropout:
            params['dropout'] = trial.suggest_float('dropout', 0, 0.5)
        else:
            params['dropout'] = 0

    # Initialization scale for the periodic embeddings.
    if emb_name == 'periodic':
        params['sigma'] = trial.suggest_float(
            'sigma', space['min_sigma'], space['max_sigma'], log=True
        )
    else:
        params['sigma'] = None

    return params
    

Поменял размеры (ширину) ``KAN`` и огрубил шаг в ширине до 4.

In [11]:
def read_optuna_params(dataset_name, model_name, emb_name):
    """Return the hyperparameter search-space bounds for one experiment.

    ``dataset_name`` is currently ignored: every dataset gets the same space
    (it is kept as a hook for per-dataset spaces). Only the layer-width bounds
    depend on ``model_name``; the embedding and sigma bounds are added when
    ``emb_name`` requires them.
    """
    is_mlp = model_name == 'mlp'

    params = dict(
        max_n_layer=4,
        min_layer_width=1,
        max_layer_width=1024 if is_mlp else 64,
        layer_width_step=16 if is_mlp else 4,
        min_lr=1e-4,
        max_lr=1e-2,
        # Not used for muon, but the bounds are kept for every optimizer.
        min_weight_decay=5e-4,
        max_weight_decay=5e-2,
    )

    if emb_name != 'none':
        params.update(max_d_embedding=128, min_d_embedding=2)

    if emb_name == 'periodic':
        params.update(min_sigma=0.01, max_sigma=100)

    return params

In [12]:
def model_init_preparation(params, dataset, num_classes, model_name, emb_name):
    """Build everything needed to instantiate a model for one configuration.

    Args:
        params: hyperparameters; reads ``n_layers`` and ``layer_width``, plus
            ``d_embedding`` when embeddings are used and
            ``use_dropout``/``dropout`` for the MLP.
        dataset: dict with ``'info'`` (``n_cat_features``, ``task_type``) and
            ``'train'`` tensors (``X_num``, optionally ``X_cat``, ``y``).
        num_classes: size of the output layer.
        model_name: ``'kan'`` or ``'mlp'``.
        emb_name: embedding type for the continuous features.

    Returns:
        ``(layer_widths, backbone, bins, loss_fn)``. ``bins`` is ``None``
        unless a piecewise-linear embedding is requested.

    Raises:
        ValueError: if ``model_name`` is neither ``'kan'`` nor ``'mlp'``.
    """
    dataset_info = dataset['info']
    num_cont_cols = dataset['train']['X_num'].shape[1]
    num_cat_cols = 0
    if dataset_info['n_cat_features'] > 0:
        num_cat_cols = dataset['train']['X_cat'].shape[1]

    # With embeddings every continuous feature expands to d_embedding columns;
    # categorical columns are passed through as they are.
    if emb_name != 'none':
        input_width = num_cont_cols * params['d_embedding'] + num_cat_cols
    else:
        input_width = num_cont_cols + num_cat_cols
    layer_widths = [input_width] + [params['layer_width']] * params['n_layers'] + [num_classes]

    # Build the backbone.
    if model_name == 'kan':
        backbone = KAN(layer_widths, grid_size=15, batch_norm=True)
    elif model_name == 'mlp':
        dropout = (params['dropout'] if params['use_dropout'] else 0)
        backbone = MLP(layer_widths, dropout)
    else:
        # Previously an unknown name left `backbone` unbound and the function
        # failed at `return` with an UnboundLocalError.
        raise ValueError(f"Unknown model_name {model_name!r}; expected 'kan' or 'mlp'")

    # Compute bins for the piecewise-linear embeddings.
    if emb_name == 'piecewiselinearq':
        bins = rtdl_num_embeddings.compute_bins(dataset['train']['X_num'], n_bins=params['d_embedding'])
    elif emb_name == 'piecewiselineart':  # no longer used
        # NOTE(review): regression=True is hard-coded even for classification
        # datasets — confirm before re-enabling this embedding.
        tree_kwargs = {'min_samples_leaf': 64, 'min_impurity_decrease': 1e-4}  # might be worth tuning
        bins = rtdl_num_embeddings.compute_bins(X=dataset['train']['X_num'], y=dataset['train']['y'], n_bins=params['d_embedding'], regression=True, tree_kwargs=tree_kwargs)
    else:
        bins = None

    task_type = dataset_info['task_type']
    if task_type == 'binclass':
        loss_fn = F.binary_cross_entropy_with_logits
    elif task_type == 'multiclass':
        loss_fn = F.cross_entropy
    else:
        loss_fn = F.mse_loss

    return layer_widths, backbone, bins, loss_fn
    


Функцию, запускающую модель, дополним созданием определенного ``optimizer``.

In [13]:
def get_optimizer(optim_name, model_params, optuna_params):
    """Instantiate the optimizer registered under ``optim_name`` in ``OPTIMIZERS``.

    ``lr`` is always taken from ``optuna_params``; ``weight_decay`` is passed
    for every optimizer except ``'muon'``.
    """
    optimizer_cls = OPTIMIZERS[optim_name]
    if optim_name == 'muon':
        hyperparams = {'lr': optuna_params['lr']}
    else:
        hyperparams = {
            'lr': optuna_params['lr'],
            'weight_decay': optuna_params['weight_decay'],
        }
    return optimizer_cls(model_params, **hyperparams)

In [None]:
def run_single_model(pkl_path, model_name, emb_name, optim_name, dataset, num_epochs,
                     n_trials=70, n_seeds=10):
    """Tune one (model, embedding, optimizer) combination, then evaluate it on the test split.

    Steps:
      1. Run an Optuna study of ``n_trials`` trials. Each trial trains a fresh
         model for ``num_epochs`` and is scored on the validation split
         (loss for regression, accuracy otherwise).
      2. With the best hyperparameters, train ``n_seeds`` freshly initialised
         models (seeds ``0 .. n_seeds - 1``) and evaluate each on the test split.
      3. Pass the collected results to ``utils.write_results``.

    Args:
        pkl_path: results path forwarded to ``utils.write_results``.
        model_name: backbone type, ``'kan'`` or ``'mlp'``.
        emb_name: embedding type for the continuous features.
        optim_name: key into ``OPTIMIZERS``.
        dataset: dataset dict with ``'info'`` and the ``'train'``/``'val'``/``'test'`` splits.
        num_epochs: training epochs per model.
        n_trials: number of Optuna trials (default 70).
        n_seeds: number of seeded test runs (default 10). With 0 no test run
            happens and ``layers`` is written as ``None``.

    Note:
        ``num_params`` and ``training_time_per_epoch`` hold one entry per
        Optuna trial, not per seeded test run.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    dataset_info = dataset['info']
    is_regression = dataset_info['task_type'] == 'regression'

    optuna_params = read_optuna_params(dataset_info['name'], model_name, emb_name)

    num_classes = 1
    if dataset_info['task_type'] == 'multiclass':
        num_classes = dataset_info['n_classes']
    num_cont_cols = dataset['train']['X_num'].shape[1]

    num_params = []
    training_time_per_epoch = []

    def objective(trial):
        # Sample hyperparameters for this trial.
        params = suggest_params(trial,
                                optuna_params=optuna_params,
                                model_name=model_name,
                                emb_name=emb_name,
                                optim_name=optim_name)

        # Build the model and its optimizer.
        _, backbone, bins, loss_fn = model_init_preparation(
            params=params,
            dataset=dataset,
            num_classes=num_classes,
            model_name=model_name,
            emb_name=emb_name
        )
        model = ModelWithEmbedding(
            n_cont_features=num_cont_cols,
            d_embedding=params['d_embedding'],
            emb_name=emb_name,
            backbone_model=backbone,
            bins=bins,
            sigma=params['sigma']
        )
        model.to(device)
        optimizer = get_optimizer(optim_name, model.parameters(), params)

        # Train with the sampled hyperparameters.
        epoch_training_time = train(
            epochs=num_epochs,
            model=model,
            model_emb_name=f'{model_name}_{emb_name}',
            device=device,
            dataset=dataset,
            loss_fn=loss_fn,
            optimizer=optimizer,
            optimizer_name=optim_name
        )
        training_time_per_epoch.append(epoch_training_time)
        num_params.append(utils.count_parameters(model))

        val_loss, val_accuracy, _ = validate(model, device, dataset, loss_fn)

        return val_loss if is_regression else val_accuracy

    study = optuna.create_study(direction='minimize' if is_regression else 'maximize')
    study.optimize(objective, n_trials=n_trials)

    # best_params only contains values that were actually sampled, so
    # d_embedding and sigma may be missing and need fallbacks.
    best_params = study.best_params
    d_embedding = (best_params['d_embedding'] if emb_name != 'none' else 1)
    sigma = (best_params['sigma'] if emb_name == 'periodic' else None)

    layers = None
    test_accuracies = []
    test_losses = []
    test_times = []
    for seed in range(n_seeds):
        utils.seed_everything(seed)
        # Rebuild the backbone for every seed, after seeding. Reusing one
        # backbone made each run continue from the previous run's trained
        # weights, so the runs were neither independent nor trained for
        # num_epochs only.
        layers, backbone, bins, loss_fn = model_init_preparation(
            params=best_params,
            dataset=dataset,
            num_classes=num_classes,
            model_name=model_name,
            emb_name=emb_name
        )
        model = ModelWithEmbedding(num_cont_cols, d_embedding, emb_name, backbone_model=backbone, bins=bins, sigma=sigma)
        model.to(device)
        optimizer = get_optimizer(optim_name, model.parameters(), best_params)
        train(num_epochs, model, f'{model_name}_{emb_name}', device, dataset, loss_fn, optimizer, optim_name)
        test_loss, test_accuracy, test_time = validate(model, device, dataset, loss_fn, part='test')
        test_accuracies.append(test_accuracy)
        test_losses.append(test_loss)
        test_times.append(test_time)

    utils.write_results(pkl_path, model_name, emb_name, optim_name,
                        layers, num_epochs, num_params, best_params,
                        test_accuracies, test_losses, training_time_per_epoch, test_times)


In [15]:
from IPython.display import clear_output
def run_single_dataset(dataset_name, optim_names, emb_names, model_names, num_epochs):
    """Run every (model, optimizer, embedding) combination on one dataset.

    The dataset is loaded once via ``utils.load_dataset`` and all runs write
    to ``'<dataset_name>.pkl'``. The cell output is cleared after each run.
    Combinations are visited model-first, then optimizer, then embedding.
    """
    dataset = utils.load_dataset(dataset_name)
    pkl_path = f'{dataset_name}.pkl'

    experiment_grid = (
        (model_name, optim_name, emb_name)
        for model_name in model_names  # may be restricted, e.g. model_names = ['kan']
        for optim_name in optim_names
        for emb_name in emb_names
    )
    for model_name, optim_name, emb_name in experiment_grid:
        run_single_model(pkl_path, model_name, emb_name, optim_name, dataset, num_epochs)
        clear_output(wait=True)



In [16]:
# Experiment grid: every (model, optimizer, embedding) combination is run on each dataset.
optim_names = ['adamw', 'ademamix', 'muon', 'mars']
model_names = ['kan']
emb_names = ['none', 'periodic']

# NOTE(review): the download cell only fetches the 'eye' archive and the saved
# output below reports runs "on eye", so this dataset list and the output look
# out of sync — confirm which datasets are intended.
for dataset in ['adult', 'gesture']:
    run_single_dataset(dataset, optim_names, emb_names, model_names, 10)  # 10 = num_epochs per trained model

[I 2025-04-09 13:09:36,354] A new study created in memory with name: no-name-31fbccbf-127f-4f31-b3af-e54339368644
kan_periodic_mars on eye: 100%|██████████| 10/10 [00:07<00:00,  1.27it/s]
[I 2025-04-09 13:09:44,326] Trial 0 finished with value: 0.5428571105003357 and parameters: {'n_layers': 3, 'layer_width': 9, 'lr': 0.00081128612546486, 'weight_decay': 0.0008489616168427463, 'd_embedding': 16, 'sigma': 0.017494562285873944}. Best is trial 0 with value: 0.5428571105003357.
kan_periodic_mars on eye: 100%|██████████| 10/10 [00:07<00:00,  1.37it/s]
