In [24]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
from typing import Tuple, Optional

import torch
from torch import optim, nn

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

from TALENT.model.models.tabr import TabR
from ucimlrepo import fetch_ucirepo

In [25]:
device_type = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_type)

In [26]:
def load_phishing():
    heart_disease = fetch_ucirepo(id=327)

    X = heart_disease.data.features
    y = np.array(heart_disease.data.targets.values.ravel(), dtype=np.str_)
    return X, y


def load_tuandromb():
    data = pd.read_csv("data/TUANDROMD.csv")
    data = data[~data["Label"].isna()]

    X = data.drop(columns=["Label"])
    y = data["Label"]
    return X, y

In [27]:
class TabRWrapper:
    def __init__(
            self,
            X: Tuple[torch.Tensor, Optional[torch.Tensor]],
            y: torch.Tensor,
            n_classes: int = 2,
    ):
        # Сохраняем числовые и категориальные признаки, целевые метки и информацию о размере
        self.X_num, self.X_cat = X
        self.y = y
        self.num_features = self.X_num.shape[1]
        self.n_cat_features = self.X_cat.shape[1] if self.X_cat is not None else 0
        self.n_classes = n_classes

        # Инициализируем модель
        self._init_model()

    def _init_model(self):
        # Параметры модели TabR (определяют архитектуру encoder'а и predictor'а), вытащены из библиотеки
        model_params = {
            "n_num_features": self.num_features,
            "n_cat_features": self.n_cat_features,
            "n_classes": self.n_classes,
            "num_embeddings": None,
            "d_main": 265,
            "context_dropout": 0.3892,
            "d_multiplier": 2.0,
            "encoder_n_blocks": 0,
            "predictor_n_blocks": 1,
            "mixer_normalization": "auto",
            "dropout0": 0.3885,
            "dropout1": 0.0,
            "normalization": "LayerNorm",
            "activation": "ReLU",
        }

        # Инстанциируем модель
        self.model = TabR(**model_params).to(device)

    def fit(
            self,
            epochs: int = 10,
            batch_size: int = 64,
            lr: float = 1e-3,
            context_size: int = 5,
    ):
        # Оптимизатор и функция потерь
        optimizer = optim.Adam(self.model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()

        dataset_size = self.y.size(0)
        self.model.train()

        # Проходим по эпохам
        for _ in tqdm(range(epochs)):
            epoch_loss = 0.0
            perm = torch.randperm(dataset_size)  # случайный порядок сэмплов для аугментации

            # Разделяем на батчи
            for i in range(0, dataset_size, batch_size):
                idx = perm[i:i + batch_size]

                # Загружаем батч входов и меток
                x_num_batch = self.X_num[idx].to(device)
                x_cat_batch = self.X_cat[idx].to(device) if self.X_cat is not None else None
                y_batch = self.y[idx].long().to(device)

                # Выбираем случайный контекст (другие точки для сравнения)
                candidate_idx = torch.randperm(dataset_size)[:batch_size]
                cx_num = self.X_num[candidate_idx].to(device)
                cx_cat = self.X_cat[candidate_idx].to(device) if self.X_cat is not None else None
                cy = self.y[candidate_idx].to(device)

                # Прямой проход через модель
                output = self.model(
                    x_num=x_num_batch,
                    x_cat=x_cat_batch,
                    y=y_batch,
                    candidate_x_num=cx_num,
                    candidate_x_cat=cx_cat,
                    candidate_y=cy,
                    context_size=context_size,
                    is_train=True,
                )

                # Вычисляем ошибку, обратный проход и шаг оптимизации
                loss = criterion(output, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()

    def predict(self, X: Tuple[torch.Tensor, Optional[torch.Tensor]], context_size: int = 5) -> torch.Tensor:
        self.model.eval()

        x_num, x_cat = X
        x_num = x_num.to(device)
        x_cat = x_cat.to(device) if x_cat is not None else None

        # Контекст - вся обучающая выборка
        candidate_x_num = self.X_num.to(device)
        candidate_x_cat = self.X_cat.to(device) if self.X_cat is not None else None
        candidate_y = self.y.to(device)

        # Прямой проход (без градиентов)
        with torch.no_grad():
            logits = self.model(
                x_num=x_num,
                x_cat=x_cat,
                y=None,
                candidate_x_num=candidate_x_num,
                candidate_x_cat=candidate_x_cat,
                candidate_y=candidate_y,
                context_size=context_size,
                is_train=False,
            )

            # Выбираем метку с максимальной вероятностью
            return torch.argmax(logits, dim=1)

    def compare_neighbor_spaces(self, k: int):
        # Стандартизация признаков
        scaler = StandardScaler()
        X_std = scaler.fit_transform(self.X_num.numpy())

        # Применяем encode-ing
        with torch.no_grad():
            x_tensor = torch.tensor(X_std, dtype=torch.float32).to(device)
        z, _ = self.model._encode(x_tensor, None)
        Z = z.cpu().detach().numpy()

        # Находим ближайших соседей в исходном пространстве (Евклид)
        nn_euc_std = NearestNeighbors(n_neighbors=k, metric="euclidean").fit(X_std)

        # В скрытом пространстве (Евклид)
        nn_euc_latent = NearestNeighbors(n_neighbors=k, metric="euclidean").fit(Z)

        _, neighbors_std = nn_euc_std.kneighbors(X_std)
        _, neighbors_latent = nn_euc_latent.kneighbors(Z)

        # Используем псевдообратную матрицу (может быть вырожденной) для метрики Махаланобиса
        VI = np.linalg.pinv(np.cov(X_std.T))
        nn_mahal_std = NearestNeighbors(n_neighbors=k, metric="mahalanobis", metric_params={"VI": VI}).fit(X_std)
        _, neighbors_mahal = nn_mahal_std.kneighbors(X_std)

        # Считаем перекрытие соседей
        overlaps_euclidean = []
        overlaps_mahalanobis = []

        for i in range(len(X_std)):
            eu = len(set(neighbors_std[i]) & set(neighbors_latent[i]))
            ma = len(set(neighbors_mahal[i]) & set(neighbors_latent[i]))
            overlaps_euclidean.append(eu)
            overlaps_mahalanobis.append(ma)

        overlaps_euclidean = np.array(overlaps_euclidean)
        overlaps_mahalanobis = np.array(overlaps_mahalanobis)

        print(f"Average Euclidean overlap: {overlaps_euclidean.mean():.2f} / {k}")
        print(f"Average Mahalanobis overlap: {overlaps_mahalanobis.mean():.2f} / {k}")

        low_overlap_idxs = np.where(overlaps_euclidean <= 1)[0]
        print(f"{len(low_overlap_idxs)} samples have ≤1 Euclidean neighbor in common with encoded space")


In [28]:
def apply_method(X, y):
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X=X)

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    n_classes = len(label_encoder.classes_)

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = TabRWrapper((torch.Tensor(X_train), None), torch.Tensor(y_train), n_classes)

    model.fit(epochs=100)
    pred = model.predict((torch.Tensor(X_test), None))
    model.compare_neighbor_spaces(20)

    accuracy = balanced_accuracy_score(y_test, pred.cpu())
    print(f"Balanced accuracy: {accuracy}")

In [29]:
apply_method(*load_breast_cancer(return_X_y=True))

100%|██████████| 100/100 [00:03<00:00, 29.59it/s]


Average Euclidean overlap: 14.44 / 20
Average Mahalanobis overlap: 7.30 / 20
0 samples have ≤1 Euclidean neighbor in common with encoded space
Balanced accuracy: 0.9448795180722891


In [30]:
apply_method(*load_tuandromb())

100%|██████████| 100/100 [00:29<00:00,  3.44it/s]


Average Euclidean overlap: 15.73 / 20
Average Mahalanobis overlap: 5.53 / 20
20 samples have ≤1 Euclidean neighbor in common with encoded space
Balanced accuracy: 0.9824932130016876


In [31]:
apply_method(*load_phishing())

100%|██████████| 100/100 [00:56<00:00,  1.78it/s]


Average Euclidean overlap: 10.16 / 20
Average Mahalanobis overlap: 9.79 / 20
12 samples have ≤1 Euclidean neighbor in common with encoded space
Balanced accuracy: 0.9614295802043163


Наша моделька из TabR (вопреки трудостям с установкой либы) показала хорошие результаты на всех трёх датаасетах.

В во всех трёх случаях количество пересечений в Евклидовом пространстве больше, чем в пространстве Махаланобисоа

В датасетаз 2 и 3 есть сэмплы с низким пересечением, что говорит о наличие выбросов в данных





