In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

import pandas as pd
import numpy as np
np.random.seed(0)

import scipy

import os
from pathlib import Path

import warnings
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

In [3]:
# Name of the dataset; also used later when printing the final scores.
dataset_name = 'census-income'
# CSV location: <current working directory>/data/<dataset_name>.csv
out = Path(os.getcwd()) / 'data' / (dataset_name + '.csv')

In [4]:
# Load the raw table. The label column is addressed by the header text ' <=50K'.
# NOTE(review): the column names printed further down look like data values
# ('39', ' State-gov', ...), so the CSV presumably has no header row and its
# first record is being used as the header -- confirm.
train = pd.read_csv(out)
target = ' <=50K'

# Assign every row to a split at random (80% / 10% / 10%) unless the file
# already provides a "Set" column.
if "Set" not in train.columns:
    n_rows = train.shape[0]
    train["Set"] = np.random.choice(["train", "valid", "test"], p=[.8, .1, .1], size=(n_rows,))

# Row labels belonging to each split.
split_of_row = train["Set"]
train_indices = train.index[(split_of_row == "train").to_numpy()]
valid_indices = train.index[(split_of_row == "valid").to_numpy()]
test_indices = train.index[(split_of_row == "test").to_numpy()]

In [5]:
# Per-column distinct-value counts and dtypes, taken before any column is modified.
nunique = train.nunique()
types = train.dtypes

# Columns treated as categorical, and the number of encoded classes for each.
categorical_columns = []
categorical_dims = {}
for col in train.columns:
    # A column is categorical if it holds objects (strings) or has fewer than
    # 200 distinct values. The loop visits every column of `train`, so the
    # target and the "Set" column are label-encoded here as well.
    if types[col] == 'object' or nunique[col] < 200:
        print(col, train[col].nunique())
        l_enc = LabelEncoder()
        # Missing values become their own category before encoding.
        train[col] = train[col].fillna("VV_likely")
        train[col] = l_enc.fit_transform(train[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        # Impute this numeric column only, using the mean of its training rows.
        # (A frame-wide `train.fillna(..., inplace=True)` would also overwrite
        # the NaNs of every other column with this column's mean.)
        train[col] = train[col].fillna(train.loc[train_indices, col].mean())

39 73
 State-gov 9
 Bachelors 16
 13 16
 Never-married 7
 Adm-clerical 15
 Not-in-family 6
 White 5
 Male 2
 2174 119
 0 92
 40 94
 United-States 42
 <=50K 2
Set 3


In [6]:
# Columns that are never fed to the model as inputs.
unused_feat = ['Set']

# Model inputs: every column except the unused ones and the target.
excluded_cols = set(unused_feat) | {target}
features = [col for col in train.columns if col not in excluded_cols]

# Walk the feature list once, recording for each position whether it is a
# categorical feature (with its number of classes) or a numerical one.
cat_idxs = []
cat_dims = []
num_features = []
for position, name in enumerate(features):
    if name in categorical_columns:
        cat_idxs.append(position)
        cat_dims.append(categorical_dims[name])
    else:
        num_features.append(position)

In [7]:
# Materialise the feature matrix and target vector once, then slice each split
# out of them with the row indices computed when the "Set" column was read.
feature_values = train[features].values
target_values = train[target].values

X_train, y_train = feature_values[train_indices].astype(float), target_values[train_indices]
X_valid, y_valid = feature_values[valid_indices].astype(float), target_values[valid_indices]
X_test, y_test = feature_values[test_indices].astype(float), target_values[test_indices]

In [8]:
# Standardise the numerical feature columns. The statistics are computed on the
# training rows only and then applied to all three splits.
train_numeric = X_train[:, num_features]
mean = train_numeric.mean(axis=0)
std = train_numeric.std(axis=0)

for split_matrix in (X_train, X_valid, X_test):
    # In-place update of the numerical columns of each split.
    split_matrix[:, num_features] = (split_matrix[:, num_features].astype(float) - mean) / std

In [97]:
from tabr.model import TabRClassifier

# Constructor arguments gathered in one place: positions and cardinalities of
# the categorical features, the device to run on, and two size settings.
tabr_kwargs = dict(
    cat_indices=cat_idxs,
    cat_cardinalities=cat_dims,
    device_name="cpu",
    encoder_n_blocks=5,
    d_main=8,
)
clf = TabRClassifier(**tabr_kwargs)

In [101]:
# Fit on the training split, passing the validation split as the evaluation set.
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    max_epochs=10,
    batch_size=1024,
)

0


100%|███████████████████████████████████████████████████████████████████████████████████| 25/25 [00:11<00:00,  2.10it/s]


0.9274766827122848
1


100%|███████████████████████████████████████████████████████████████████████████████████| 25/25 [00:12<00:00,  2.01it/s]


0.9314737037250126
2


100%|███████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00,  1.91it/s]


0.9300822311293517
3


100%|███████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00,  1.92it/s]


0.932450185067986
4


100%|███████████████████████████████████████████████████████████████████████████████████| 25/25 [00:12<00:00,  1.95it/s]


0.9320660917519556
5


100%|███████████████████████████████████████████████████████████████████████████████████| 25/25 [00:12<00:00,  1.93it/s]


0.9304253047708545
6


100%|███████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00,  1.86it/s]


0.931107190269494
7


100%|███████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00,  1.87it/s]


0.9284446404865252
8


100%|███████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00,  1.84it/s]


0.9251188505115208
9


100%|███████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00,  1.89it/s]


0.9272683880013723


In [102]:
# Score the test and validation splits with ROC-AUC, using column 1 of the
# prediction array as the score. The training split is not scored here.
preds = clf.predict(X_test)
test_auc = roc_auc_score(y_test, preds[:, 1])

preds_valid = clf.predict(X_valid)
valid_auc = roc_auc_score(y_valid, preds_valid[:, 1])

print(f"FINAL VALID SCORE FOR {dataset_name} : {valid_auc}")
print(f"FINAL TEST SCORE FOR {dataset_name} : {test_auc}")

FINAL VALID SCORE FOR census-income : 0.9272683880013723
FINAL TEST SCORE FOR census-income : 0.9261141744556247


In [103]:
# Display the prediction array computed for X_test.
preds

array([[2.5838995e-04, 9.9974161e-01],
       [7.5233853e-01, 2.4766146e-01],
       [9.9751991e-01, 2.4800084e-03],
       ...,
       [2.8553172e-03, 9.9714464e-01],
       [8.3227485e-01, 1.6772513e-01],
       [9.9844754e-01, 1.5524174e-03]], dtype=float32)