In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, accuracy_score

import pandas as pd
import numpy as np
np.random.seed(0)

import scipy

import os
from pathlib import Path

import warnings
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')

In [3]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id 31 through ucimlrepo (requires network access).
covertype = fetch_ucirepo(id=31)

# Feature table (a pandas DataFrame). Later cells add a "Set" column to this
# frame and also read the "Cover_Type" label column from it.
train = covertype.data.features

In [4]:
# Replace the fetched frame's index with a plain 0..n-1 RangeIndex.
# `drop=True` discards the old index levels directly, which gives the same frame
# as materialising them as level_0/level_1/level_2 and dropping those columns,
# but does not raise KeyError when this cell is executed a second time.
# NOTE(review): the old index has three unnamed levels; they may hold real
# feature values shifted out of the columns by the loader - confirm before
# discarding them.
train = train.reset_index(drop=True)

In [5]:
# Name of the label column.
target = "Cover_Type"

# Flag-style column names: Wilderness_Area1..4 followed by Soil_Type1..40.
# NOTE(review): the encoding cell's recorded output lists a single
# "Wilderness_Area" column, so not all of these names may exist in the
# fetched frame - confirm before using this list for selection.
bool_columns = [f"Wilderness_Area{k}" for k in range(1, 5)]
bool_columns += [f"Soil_Type{k}" for k in range(1, 41)]

# Integer-valued measurement column names.
int_columns = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
]

In [6]:
# Draw a random train/valid/test label (80% / 10% / 10%) for every row, unless
# the frame already carries a "Set" column.
if "Set" not in train.columns:
    split_names = ["train", "valid", "test"]
    split_probs = [.8, .1, .1]
    train["Set"] = np.random.choice(split_names, p=split_probs, size=(train.shape[0],))

# Index labels of the rows belonging to each split.
row_split = train["Set"]
train_indices = train.index[row_split.eq("train").to_numpy()]
valid_indices = train.index[row_split.eq("valid").to_numpy()]
test_indices = train.index[row_split.eq("test").to_numpy()]

In [7]:
# Display the index labels assigned to the training split.
train_indices

Index([     0,      1,      2,      3,      4,      5,      6,      9,     10,
           11,
       ...
       580999, 581001, 581002, 581003, 581004, 581005, 581006, 581007, 581008,
       581010],
      dtype='int64', length=464875)

In [8]:
# Per-column cardinality and dtype, captured before the frame is modified.
nunique = train.nunique()
types = train.dtypes

# Names of the label-encoded columns and the number of classes of each.
categorical_columns = []
categorical_dims = {}
for col in train.columns:
    if types[col] == 'object' or nunique[col] < 10:
        # Object-typed or low-cardinality column: treat as categorical and
        # replace its values with integer codes. This loop visits every column,
        # so the target and the "Set" column are encoded here as well; after
        # this cell "Set" no longer holds the strings the split cell compares
        # against.
        print(col, train[col].nunique())
        l_enc = LabelEncoder()
        train[col] = train[col].fillna("VV_likely")
        train[col] = l_enc.fit_transform(train[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        # Numeric column: impute missing values with the mean over the training
        # rows. Only this column is filled - a frame-wide fillna would write
        # this column's mean into the NaNs of every other column too.
        train[col] = train[col].fillna(train.loc[train_indices, col].mean())

Hillshade_Noon 2
Hillshade_3pm 2
Horizontal_Distance_To_Fire_Points 2
Wilderness_Area 2
Soil_Type1 2
Soil_Type2 2
Soil_Type3 2
Soil_Type4 2
Soil_Type5 2
Soil_Type6 2
Soil_Type7 2
Soil_Type8 2
Soil_Type9 2
Soil_Type10 2
Soil_Type11 2
Soil_Type12 2
Soil_Type13 2
Soil_Type14 2
Soil_Type15 2
Soil_Type16 2
Soil_Type17 2
Soil_Type18 2
Soil_Type19 2
Soil_Type20 2
Soil_Type21 2
Soil_Type22 2
Soil_Type23 2
Soil_Type24 2
Soil_Type25 2
Soil_Type26 2
Soil_Type27 2
Soil_Type28 2
Soil_Type29 2
Soil_Type30 2
Soil_Type31 2
Soil_Type32 2
Soil_Type33 2
Soil_Type34 2
Soil_Type35 2
Soil_Type36 2
Soil_Type37 2
Soil_Type38 2
Soil_Type39 2
Soil_Type40 2
Cover_Type 7
Set 3


In [9]:
# Columns that are never fed to the models.
unused_feat = ['Set']

# Model inputs: every column except the unused ones and the target.
non_feature_cols = unused_feat + [target]
features = [column for column in train.columns if column not in non_feature_cols]

# Positions within `features` of the categorical columns, their class counts,
# and the positions of the remaining (numeric) columns.
cat_idxs = []
cat_dims = []
num_features = []
for feat_pos, feat_name in enumerate(features):
    if feat_name in categorical_columns:
        cat_idxs.append(feat_pos)
        cat_dims.append(categorical_dims[feat_name])
    else:
        num_features.append(feat_pos)

In [10]:
# Build float feature matrices and label vectors for each split.
# The *_indices objects hold index labels, so rows are selected by label with
# .loc; indexing the raw `.values` array with them is only correct while the
# frame has a default 0..n-1 index, and it rebuilt the full matrix three times.
X_train = train.loc[train_indices, features].to_numpy(dtype=float)
y_train = train.loc[train_indices, target].to_numpy()

X_valid = train.loc[valid_indices, features].to_numpy(dtype=float)
y_valid = train.loc[valid_indices, target].to_numpy()

X_test = train.loc[test_indices, features].to_numpy(dtype=float)
y_test = train.loc[test_indices, target].to_numpy()

In [11]:
# Standardise the numeric columns using statistics of the training split only.
mean = X_train[:, num_features].mean(axis=0)
std = X_train[:, num_features].std(axis=0)
# A column that is constant in the training split has zero std; divide by 1
# instead so it becomes 0 rather than NaN/inf.
std[std == 0] = 1.0

# The matrices are already float, so they are rescaled in place.
# NOTE: this cell mutates X_train / X_valid / X_test; re-running it without
# rebuilding them applies the scaling twice.
for split_matrix in (X_train, X_valid, X_test):
    split_matrix[:, num_features] = (split_matrix[:, num_features] - mean) / std

In [12]:
from tabr.model import TabRClassifier

# TabR classifier configured for a CUDA device with learning rate 2e-4.
# NOTE(review): `bin_indices` receives the positions of the label-encoded
# columns; presumably it marks binary/categorical inputs - confirm against the
# tabr API.
clf = TabRClassifier(
    bin_indices=cat_idxs,
    device_name="cuda",
    optimizer_params={"lr": 2e-4},
)

In [13]:
# Train for at most 50 epochs with batches of 1024 rows; the validation split
# is passed as the only evaluation set.
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    max_epochs=50,
    batch_size=1024,
)

 epochs:   0%|          | 0/50 [00:00<?, ?it/s]

 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9111970439116611}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9500214832001375}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9579788605310647}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9618114634355933}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9641660221706625}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9651972157772621}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9660393572226519}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9668127524276016}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9675174013921114}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9680158116353011}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9682564234768412}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9678783191544212}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9680845578757412}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.968548594998711}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9686517143593709}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.968411102517831}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9691157514823409}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.968686087479591}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.968617341239151}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9690298186817908}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9688751396408009}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9691157514823409}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9689610724413509}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9689438858812409}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9693735498839907}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9692532439632208}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9689782590014608}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9693391767637707}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9693735498839907}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9693563633238808}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9696141617255306}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9695626020452006}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9693219902036607}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9699063332474005}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9694594826845407}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9694594826845407}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9696313482856407}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9695797886053107}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9696829079659706}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9698204004468506}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9696485348457506}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9697860273266306}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9699235198075106}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9694594826845407}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9699235198075106}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9696141617255306}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9696829079659706}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9696141617255306}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9701641316490505}


 batches:   0%|          | 0/453 [00:00<?, ?it/s]

{'val_0_accuracy': 0.9698719601271806}
Stop training because you reached max_epochs = 50 with best_epoch = 48 and best_val_0_accuracy = 0.97016




In [14]:
def _top1_accuracy(y_true, class_scores):
    """Accuracy of the arg-max column of `class_scores` measured against `y_true`."""
    return accuracy_score(y_true=y_true, y_pred=np.argmax(class_scores, axis=1))


# Only validation and test accuracy are reported; no train-set score is computed.
# The output of `clf.predict` is treated as a 2-D array with one column per class.
preds = clf.predict(X_test)
test_acc = _top1_accuracy(y_test, preds)

preds_valid = clf.predict(X_valid)
valid_acc = _top1_accuracy(y_valid, preds_valid)

print(f"FINAL VALID SCORE : {valid_acc}")
print(f"FINAL TEST SCORE : {test_acc}")

FINAL VALID SCORE : 0.9701641316490505
FINAL TEST SCORE : 0.9703202650469354


# CatBoost

In [15]:
from catboost import CatBoostClassifier

In [16]:
# Wrap each split's feature matrix in a DataFrame (columns are labelled by
# integer position) so individual columns can be re-typed for CatBoost.
df_X_train, df_X_valid, df_X_test = (
    pd.DataFrame(matrix) for matrix in (X_train, X_valid, X_test)
)

In [18]:
# Turn the categorical columns (stored as floats in the matrices) into strings
# of their integer codes, in all three splits.
for split_frame in (df_X_train, df_X_valid, df_X_test):
    for cat_id in cat_idxs:
        split_frame[cat_id] = split_frame[cat_id].astype(int).astype(str)

In [29]:
# NOTE(review): this rebinds `clf`, replacing the TabR model fitted earlier.
# NOTE(review): `devices` is passed without a `task_type`; presumably training
# still runs on CPU in that case - confirm the intended device.
clf = CatBoostClassifier(devices=[0], iterations=20000, learning_rate=0.1)

# Fit with the validation split as evaluation set; training stops early after
# 50 rounds without improvement, otherwise it runs up to 20000 iterations.
clf.fit(
    df_X_train,
    y_train,
    cat_features=cat_idxs,
    eval_set=[(df_X_valid, y_valid)],
    early_stopping_rounds=50,
)

0:	learn: 1.7252910	test: 1.7257908	best: 1.7257908 (0)	total: 137ms	remaining: 45m 30s
1:	learn: 1.5754391	test: 1.5765325	best: 1.5765325 (1)	total: 227ms	remaining: 37m 47s
2:	learn: 1.4630871	test: 1.4647117	best: 1.4647117 (2)	total: 314ms	remaining: 34m 52s
3:	learn: 1.3750158	test: 1.3764089	best: 1.3764089 (3)	total: 394ms	remaining: 32m 50s
4:	learn: 1.2983083	test: 1.2999643	best: 1.2999643 (4)	total: 471ms	remaining: 31m 21s
5:	learn: 1.2384311	test: 1.2401855	best: 1.2401855 (5)	total: 534ms	remaining: 29m 37s
6:	learn: 1.1880065	test: 1.1899658	best: 1.1899658 (6)	total: 598ms	remaining: 28m 28s
7:	learn: 1.1435731	test: 1.1458204	best: 1.1458204 (7)	total: 667ms	remaining: 27m 45s
8:	learn: 1.1049469	test: 1.1073586	best: 1.1073586 (8)	total: 736ms	remaining: 27m 15s
9:	learn: 1.0741784	test: 1.0766709	best: 1.0766709 (9)	total: 796ms	remaining: 26m 30s
10:	learn: 1.0464783	test: 1.0490311	best: 1.0490311 (10)	total: 862ms	remaining: 26m 6s
11:	learn: 1.0192289	test: 1.02

KeyboardInterrupt: 

In [30]:
# Test accuracy of the CatBoost model: pick the highest-probability class per row.
# Requires the fit above to have finished; the recorded output shows a
# CatBoostError because training was interrupted.
catboost_test_proba = clf.predict_proba(df_X_test)
accuracy_score(y_true=y_test, y_pred=np.argmax(catboost_test_proba, axis=1))

CatBoostError: There is no trained model to use predict_proba(). Use fit() to train model. Then use this method.