In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local source files are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score

import pandas as pd
import numpy as np
# Seed NumPy's global RNG so the random train/valid/test assignment made with
# np.random.choice further down is reproducible.
np.random.seed(0)

import scipy

import os
from pathlib import Path

import warnings
# Hide the UserWarning whose message starts with 'TypedStorage is deprecated'.
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

In [None]:
# Dataset identifier; it is also the stem of the CSV file expected in ./data.
dataset_name = 'census-income'
# CSV location: <current working directory>/data/<dataset_name>.csv
out = Path(os.getcwd()) / 'data' / (dataset_name + '.csv')

In [None]:
# Read the whole table and name the label column (the header carries a
# leading space, which is kept as-is).
train = pd.read_csv(out)
target = ' <=50K'

# Unless the file already provides a "Set" column, assign every row to a
# split at random with 80% / 10% / 10% probabilities.
if "Set" not in train.columns:
    split_labels = ["train", "valid", "test"]
    split_probs = [.8, .1, .1]
    train["Set"] = np.random.choice(split_labels, size=(len(train),), p=split_probs)

# Index labels of the rows that belong to each split.
train_indices = train.loc[train["Set"] == "train"].index
valid_indices = train.loc[train["Set"] == "valid"].index
test_indices = train.loc[train["Set"] == "test"].index

In [None]:
# Label-encode categorical columns and mean-impute the remaining numeric ones.
# A column is treated as categorical when it has object dtype or fewer than
# 200 distinct values. The loop runs over every column of `train`, so the
# target column and the "Set" column go through the same test.
nunique = train.nunique()
types = train.dtypes

categorical_columns = []  # names of the columns that were label-encoded
categorical_dims = {}     # column name -> number of classes seen by its encoder
for col in train.columns:
    if types[col] == 'object' or nunique[col] < 200:
        print(col, train[col].nunique())
        l_enc = LabelEncoder()
        # Missing values become their own category before encoding.
        train[col] = train[col].fillna("VV_likely")
        train[col] = l_enc.fit_transform(train[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        # Impute this column only, using its mean over the training rows.
        # A frame-wide fillna here would write this column's mean into the
        # missing cells of every other column, including categorical columns
        # that have not been encoded yet.
        train[col] = train[col].fillna(train.loc[train_indices, col].mean())

In [None]:
# Columns that are never fed to the models.
unused_feat = ['Set']

# Model inputs: every column except the unused ones and the target.
excluded = (*unused_feat, target)
features = [col for col in train.columns if col not in excluded]

# Positions (within `features`) of the categorical columns ...
cat_idxs = [pos for pos, name in enumerate(features) if name in categorical_columns]

# ... their class counts, in the same order ...
cat_dims = [categorical_dims[features[pos]] for pos in cat_idxs]

# ... and the positions of everything else, i.e. the numeric columns.
categorical_positions = set(cat_idxs)
num_features = [pos for pos, _ in enumerate(features) if pos not in categorical_positions]

In [None]:
# Pull the feature matrix and the label vector out of the frame once, then
# slice each split from them. The split indices are index labels used here as
# array positions, which lines up because `train` comes straight from
# pd.read_csv without an index column.
feature_matrix = train[features].values
label_vector = train[target].values

X_train = feature_matrix[train_indices].astype(float)
y_train = label_vector[train_indices]

X_valid = feature_matrix[valid_indices].astype(float)
y_valid = label_vector[valid_indices]

X_test = feature_matrix[test_indices].astype(float)
y_test = label_vector[test_indices]

In [None]:
# Standardise the numeric columns of every split with statistics computed on
# the training split only. Both statistics are taken before any array is
# modified, so X_train is scaled with its own pre-scaling mean and std.
mean = X_train[:, num_features].mean(axis=0)
std = X_train[:, num_features].std(axis=0)

for split_matrix in (X_train, X_valid, X_test):
    split_matrix[:, num_features] = (split_matrix[:, num_features] - mean) / std

In [None]:
from tabr import TabRClassifier

# Constructor arguments for the TabR model, collected in one place.
tabr_params = dict(
    cat_indices=cat_idxs,
    cat_cardinalities=cat_dims,
    type_embeddings="one-hot",
    device_name="cpu",
    optimizer_params={"lr": 2e-4},
    d_main=96,
    context_size=96,
)
# Options that are currently disabled, kept for reference:
#   context_dropout=0.5
#   context_sample_size=2000
#   num_embeddings={"type": "PLREmbeddings", "n_frequencies": 32, "frequency_scale": 32, "d_embedding": 32, "lite": False}
clf = TabRClassifier(**tabr_params)

In [None]:
# Fit TabR on the training split; both held-out splits are passed as eval_set.
# NOTE(review): if tabr follows the pytorch-tabnet convention, the last
# eval_set entry (validation) drives early stopping - confirm against tabr.
tabr_eval_sets = [(X_test, y_test), (X_valid, y_valid)]
clf.fit(X_train, y_train, eval_set=tabr_eval_sets, max_epochs=20, batch_size=2048)

In [None]:
# Score the fitted TabR model with ROC AUC on the held-out splits, using the
# second column of predict_proba as the score. `preds` is kept at top level
# because the ensemble cell at the end of the notebook reuses it.
# (Scoring on the training split is not performed here.)
preds = clf.predict_proba(X_test)
test_auc = roc_auc_score(y_true=y_test, y_score=preds[:, 1])

preds_valid = clf.predict_proba(X_valid)
valid_auc = roc_auc_score(y_true=y_valid, y_score=preds_valid[:, 1])

for split_label, split_auc in (("VALID", valid_auc), ("TEST", test_auc)):
    print(f"FINAL {split_label} SCORE FOR {dataset_name} : {split_auc}")

# CatBoost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Wrap each split's array in a DataFrame (default integer column labels) so
# the categorical columns can be converted to strings for CatBoost.
df_X_train, df_X_valid, df_X_test = (
    pd.DataFrame(split_matrix) for split_matrix in (X_train, X_valid, X_test)
)

In [None]:
# Display the positional indices of the categorical features; the same list
# is passed to CatBoost as cat_features below.
cat_idxs

In [None]:
# Turn the label-encoded categorical columns (stored as floats in the arrays)
# into integer-valued strings in all three frames.
for split_frame in (df_X_train, df_X_valid, df_X_test):
    for cat_id in cat_idxs:
        split_frame[cat_id] = split_frame[cat_id].astype(int).astype(str)

In [None]:
# Train a CatBoost classifier with default hyperparameters. The validation
# split is passed as eval_set together with a 50-round early-stopping setting.
clf_cat = CatBoostClassifier()
catboost_fit_options = dict(
    eval_set=[(df_X_valid, y_valid)],
    cat_features=cat_idxs,
    early_stopping_rounds=50,
)
clf_cat.fit(df_X_train, y_train, **catboost_fit_options)

In [None]:
# CatBoost ROC AUC on the validation split (second predict_proba column as score).
cat_valid_scores = clf_cat.predict_proba(df_X_valid)[:, 1]
roc_auc_score(y_valid, cat_valid_scores)

In [None]:
# CatBoost ROC AUC on the test split (second predict_proba column as score).
cat_test_scores = clf_cat.predict_proba(df_X_test)[:, 1]
roc_auc_score(y_test, cat_test_scores)

In [None]:
# Simple ensemble: add the CatBoost and TabR scores on the test split and
# measure ROC AUC of the sum. AUC depends only on the ranking of the scores,
# so summing gives the same value as averaging.
blended_test_scores = clf_cat.predict_proba(df_X_test)[:, 1] + preds[:, 1]
roc_auc_score(y_test, blended_test_scores)