In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
np.random.seed(0)

import scipy

import os
from pathlib import Path

import warnings
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

In [None]:
from ucimlrepo import fetch_ucirepo

# Fetch UCI ML Repository dataset id=189 (network access required).
# NOTE(review): presumably the Parkinsons Telemonitoring dataset, given the
# `motor_UPDRS` target used later in this notebook -- confirm.
data = fetch_ucirepo(id=189)

# Work on an explicit copy of the feature frame: later cells add columns
# ("target", "Set") and re-encode values, which would otherwise mutate
# `data.data.features` itself and can trigger SettingWithCopyWarning.
train = data.data.features.copy()

In [None]:
# Attach the regression target (the `motor_UPDRS` column of the fetched
# targets frame) to the working frame under the name "target".
motor_updrs = data.data.targets["motor_UPDRS"]
train["target"] = motor_updrs

In [None]:
# Preview the first five rows of the working frame.
train.head(n=5)

In [None]:
# Randomly assign each row to train/valid/test (80/10/10) once; the guard keeps
# an existing "Set" column untouched when the cell is re-run.
if "Set" not in train.columns:
    n_rows = train.shape[0]
    train["Set"] = np.random.choice(
        ["train", "valid", "test"],
        size=(n_rows,),
        p=[.8, .1, .1],
    )

# Index labels of the rows belonging to each split.
train_indices = train.loc[train["Set"] == "train"].index
valid_indices = train.loc[train["Set"] == "valid"].index
test_indices = train.loc[train["Set"] == "test"].index

In [None]:
# Name of the column in `train` that holds the regression target.
target = "target"

In [None]:
# Per-column cardinality and dtype, captured before any column is modified.
nunique = train.nunique()
types = train.dtypes

# Columns treated as categorical, and the number of classes found in each.
categorical_columns = []
categorical_dims = {}
for col in train.columns:
    # "Set" is split bookkeeping, not a feature. Label-encoding it would turn
    # its "train"/"valid"/"test" strings into integers, so re-running the
    # split cell would select no rows.
    if col == "Set":
        continue
    if types[col] == 'object' or nunique[col] < 10:
        # Object-typed or low-cardinality (< 10 distinct values) columns are
        # label-encoded; missing values get their own placeholder category.
        print(col, train[col].nunique())
        l_enc = LabelEncoder()
        train[col] = train[col].fillna("VV_likely")
        train[col] = l_enc.fit_transform(train[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        # Impute only this column, using the mean of its training rows.
        # (A frame-wide `train.fillna(value)` would fill NaNs in every column
        # with this one column's mean.)
        train[col] = train[col].fillna(train.loc[train_indices, col].mean())

In [None]:
# Columns that are neither model inputs nor the target.
unused_feat = ['Set']

# Model input columns, in frame order.
features = [col for col in train.columns if col != target and col not in unused_feat]

# Positions (within `features`) of the categorical columns.
cat_idxs = [idx for idx, name in enumerate(features) if name in categorical_columns]

# Number of classes of each categorical feature, aligned with `cat_idxs`.
cat_dims = [categorical_dims[features[idx]] for idx in cat_idxs]

# Positions (within `features`) of the remaining, numerical columns.
num_features = [idx for idx in range(len(features)) if idx not in cat_idxs]

In [None]:
# Materialize each split as float feature matrices and (n, 1) target columns.
# Rows are selected by label with `.loc`: the *_indices objects hold index
# labels, so using them as positions into `.values` is only correct when the
# frame has a default 0..n-1 index.
X_train = train.loc[train_indices, features].values.astype(float)
y_train = train.loc[train_indices, target].values.reshape(-1, 1)

X_valid = train.loc[valid_indices, features].values.astype(float)
y_valid = train.loc[valid_indices, target].values.reshape(-1, 1)

X_test = train.loc[test_indices, features].values.astype(float)
y_test = train.loc[test_indices, target].values.reshape(-1, 1)

In [None]:
# Standardize the numerical feature columns using statistics computed on the
# training split only, then apply the same shift/scale to valid and test.
mean = X_train[:, num_features].mean(axis=0)
std = X_train[:, num_features].std(axis=0)
# A column that is constant on the training split has std == 0; dividing by it
# would produce NaN/inf. Scale such columns by 1 so they become all zeros.
std[std == 0] = 1.0

X_train[:, num_features] = (X_train[:, num_features] - mean) / std
X_valid[:, num_features] = (X_valid[:, num_features] - mean) / std
X_test[:, num_features] = (X_test[:, num_features] - mean) / std

In [None]:
# Standardize the target with the training split's mean and standard
# deviation; all three splits share the same shift and scale.
# Note: this rebinds y_train/y_valid/y_test, so running the cell twice
# standardizes them twice.
y_mean, y_std = y_train.mean(), y_train.std()

y_train, y_valid, y_test = (
    (split - y_mean) / y_std for split in (y_train, y_valid, y_test)
)

In [None]:
from pytorch_tabr import TabRRegressor

# TabR regressor. The categorical feature positions are passed as
# `bin_indices`.
# NOTE(review): presumably this treats those columns as binary features --
# confirm this is intended for every column in `cat_idxs`.
clf = TabRRegressor(
    optimizer_params={"lr": 2e-3},
    device_name="cuda",
    bin_indices=cat_idxs,
)

In [None]:
# Train on the training split, monitoring the validation split.
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    max_epochs=200,
    batch_size=128,
)

In [None]:
# Evaluate the fitted model with mean squared error. The targets were
# standardized above, so the scores are in standardized units.
# The `*_acc` names hold MSE values (lower is better), not accuracies.
preds = clf.predict(X_test)
test_acc = mean_squared_error(y_test, preds)

preds_valid = clf.predict(X_valid)
valid_acc = mean_squared_error(y_valid, preds_valid)

print(f"FINAL VALID SCORE : {valid_acc}")
print(f"FINAL TEST SCORE : {test_acc}")

# CatBoost

In [None]:
from catboost import CatBoostRegressor

In [None]:
# Wrap each split's feature matrix in a DataFrame (default integer column
# labels, so column i is feature position i).
df_X_train, df_X_valid, df_X_test = (
    pd.DataFrame(matrix) for matrix in (X_train, X_valid, X_test)
)

In [None]:
# Convert every categorical column from float codes to integer strings in
# each split's frame.
for frame in (df_X_train, df_X_valid, df_X_test):
    for cat_id in cat_idxs:
        frame[cat_id] = frame[cat_id].astype(int).astype(str)

In [None]:
# CatBoost baseline trained on the same splits. Note that `clf` is rebound
# here and no longer refers to the TabR model.
# NOTE(review): `devices` is set without a `task_type`; presumably GPU training
# was intended -- confirm.
clf = CatBoostRegressor(iterations=20000, devices=[0])

# Up to 20000 iterations, with early stopping after 50 rounds, monitored on
# the validation split.
clf.fit(
    df_X_train,
    y_train,
    cat_features=cat_idxs,
    eval_set=[(df_X_valid, y_valid)],
    early_stopping_rounds=50,
)

In [None]:
# Test-split MSE of the CatBoost model (standardized target units).
catboost_test_preds = clf.predict(df_X_test)
mean_squared_error(y_test, catboost_test_preds)