# Import libraries

In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error

import optuna
from optuna import Trial, visualization

import torch
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.augmentations import RegressionSMOTE

# Load Data

In [None]:
# Load the training frame, the frame to predict on, and the submission frame.
# NOTE(review): pd.read_csv() is called without a path, which raises a TypeError as written;
# fill in the three CSV locations before running this cell.
train = pd.read_csv()
test = pd.read_csv()
submission = pd.read_csv()

# Simple Pre-Processing

In [None]:
def SPP(df):
    """Simple pre-processing: label-encode categorical columns, mean-impute the others.

    A column is treated as categorical when its dtype is ``object`` or when it has
    fewer than 10 distinct non-null values. Categorical columns have their missing
    values replaced by the string "NULL" and are then integer-encoded with a fresh
    ``LabelEncoder``. Every other column has its missing values replaced by that
    column's own mean.

    The frame is modified in place and is also returned.

    Args:
        df: pandas DataFrame to pre-process.

    Returns:
        tuple: ``(cat_columns, cat_dims, df)`` where ``cat_columns`` lists the encoded
        column names in column order, ``cat_dims`` maps each of those names to its
        number of encoded classes, and ``df`` is the processed frame.
    """
    n_unique = df.nunique()  # number of distinct (non-null) values per column
    types = df.dtypes
    threshold = 10  # columns with fewer distinct values than this count as categorical

    cat_columns = []  # names of the label-encoded columns
    cat_dims = {}  # column name -> number of classes found by the encoder

    for col in tqdm(df.columns):
        print(col, n_unique[col])
        if types[col] == 'object' or n_unique[col] < threshold:
            l_enc = LabelEncoder()
            has_missing = df[col].isna().any()
            filled = df[col].fillna("NULL")  # missing values become their own "NULL" class
            if has_missing:
                # A numeric column with "NULL" mixed in cannot be sorted by LabelEncoder;
                # encode such a column on its string form instead.
                filled = filled.astype(str)
            df[col] = l_enc.fit_transform(filled.values)
            cat_columns.append(col)
            cat_dims[col] = len(l_enc.classes_)
        else:
            # Impute this column only. Filling the whole frame here would overwrite
            # NaNs in every other column with this column's mean.
            df[col] = df[col].fillna(df[col].mean())
    return cat_columns, cat_dims, df

In [None]:
# Pre-process the training frame and keep its categorical column names and cardinalities.
cat_columns, cat_dims, train = SPP(train)
# Only the processed frame (third element of the returned tuple) is kept for the test data.
# NOTE(review): SPP creates a fresh LabelEncoder on every call, so the integer codes for `test`
# are fitted on `test` alone and may not correspond to the codes used for `train` - confirm.
test = SPP(test)[2]

# Define Categorical Features for Categorical embeddings

In [None]:
target = ""  # name of the target column (placeholder: fill in before running)
unused_feat = []  # columns excluded from training (e.g. an ID column)

# Model input columns: every column of `train` except the target and the excluded ones.
features = [col for col in train.columns if col not in unused_feat + [target]]

# SPP returns `cat_dims` as a {column: n_classes} dict, while this cell rebinds the name to a
# list. Keep the dict under its own name so that re-running this cell does not try to index
# the list with column names (TypeError).
if isinstance(cat_dims, dict):
    cat_dims_by_col = cat_dims

# Positions (within `features`) of the categorical columns, and their class counts in the same order.
cat_idxs = [i for i, f in enumerate(features) if f in cat_columns]
cat_dims = [cat_dims_by_col[f] for f in features if f in cat_columns]

# Hand-picked embedding sizes (the original note asks whether these were chosen arbitrarily).
# NOTE(review): hard-coded for exactly seven categorical features and not passed to
# TabNetRegressor in the cells visible here - confirm the length matches len(cat_idxs) before use.
cat_emb_dim = [5, 4, 3, 6, 2, 2, 1]

# Split Dataset

In [None]:
# Hold out 30% of the rows as the final test split.
# NOTE(review): inputs/target are selected by position (all-but-last column vs. last column),
# not via the `features` / `target` names defined earlier - confirm the target is the last column.
# NOTE(review): `stratify` uses the target values as class labels; presumably this can fail for
# a continuous regression target with rare values - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    train.iloc[:, :-1],
    train.iloc[:, -1],
    stratify=train.iloc[:, -1],
    shuffle=True,
    random_state=530,
    test_size=0.3,
)

# Carve a validation split (30%) out of the remaining training rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    stratify=y_train,
    shuffle=True,
    random_state=530,
    test_size=0.3,
)

# Convert every split to NumPy; targets become (n, 1) column vectors.
# Training split
X_train, y_train = X_train.to_numpy(), y_train.to_numpy().reshape(-1, 1)

# Validation split (checked while fitting)
X_valid, y_valid = X_valid.to_numpy(), y_valid.to_numpy().reshape(-1, 1)

# Held-out test split (final verification)
X_test, y_test = X_test.to_numpy(), y_test.to_numpy().reshape(-1, 1)

# Optuna Hyperparameter Tuning

In [None]:
def Objective(trial):
    """Optuna objective: fit one TabNetRegressor with hyperparameters sampled from ``trial``.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` arrays and
    evaluated on both the train and validation arrays.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ``regressor.best_cost`` of the fitted model (presumably the best value of the
        early-stopping metric - confirm against pytorch-tabnet).
    """
    # Model hyperparameters (n_d and n_a share one sampled width).
    mask_type = trial.suggest_categorical("mask_type", ["entmax", "sparsemax"])
    width = trial.suggest_int("n_da", 32, 128, step=4)
    n_steps = trial.suggest_int("n_steps", 1, 5, step=1)
    gamma = trial.suggest_float("gamma", 1., 1.4, step=0.2)
    n_shared = trial.suggest_int("n_shared", 1, 3, step=1)
    lambda_sparse = trial.suggest_float("lambda_sparse", 1e-6, 1e-3, log=True)

    # NOTE(review): the original comment wants the scheduler patience to be lower than the
    # early-stopping patience, but both are sampled from the same 30-50 range - confirm.
    scheduler_patience = trial.suggest_int("patienceScheduler", low=30, high=50)

    tabnet_params = {
        "n_d": width,
        "n_a": width,
        "n_steps": n_steps,
        "gamma": gamma,
        "lambda_sparse": lambda_sparse,
        "optimizer_fn": torch.optim.Adam,
        "optimizer_params": {"lr": 2e-2, "weight_decay": 1e-5},
        "mask_type": mask_type,
        "n_shared": n_shared,
        "scheduler_params": {
            "mode": "min",
            "patience": scheduler_patience,
            "min_lr": 1e-5,
            "factor": 0.5,
        },
        "scheduler_fn": torch.optim.lr_scheduler.ReduceLROnPlateau,
    }

    aug = RegressionSMOTE(p=0.2)
    regressor = TabNetRegressor(**tabnet_params)

    # Training-loop settings are sampled too (early-stopping patience, epoch budget).
    early_stop_patience = trial.suggest_int("patience", low=30, high=50)
    max_epochs = trial.suggest_int('epochs', 50, 100)

    regressor.fit(
        X_train=X_train,
        y_train=y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_name=['train', 'valid'],
        eval_metric=['rmsle', 'mae', 'rmse', 'mse'],
        patience=early_stop_patience,
        max_epochs=max_epochs,
        batch_size=1024,
        virtual_batch_size=128,
        drop_last=False,
        augmentations=aug,  # pass None to disable augmentation
    )

    return regressor.best_cost

# Search for the hyperparameters that minimise the value returned by Objective.
study = optuna.create_study(
    study_name='TabNet optimization',
    direction="minimize",
)
# `timeout` is given in seconds: 6*60 = 360 s, i.e. a 6-minute search budget (not hours).
study.optimize(Objective, timeout=6*60)

# Training

In [None]:
# Retrieve the best hyperparameters found by the Optuna study; they are used to train the
# final TabNet model for the submission.
TabNet_params = study.best_params

# Show the selected hyperparameters.
print(TabNet_params)

In [None]:
# Rebuild the TabNet configuration from the best Optuna trial; optimizer and scheduler
# settings repeat the values used inside Objective.
final_params = {
    "n_d": TabNet_params['n_da'],
    "n_a": TabNet_params['n_da'],
    "n_steps": TabNet_params['n_steps'],
    "gamma": TabNet_params['gamma'],
    "lambda_sparse": TabNet_params['lambda_sparse'],
    "optimizer_fn": torch.optim.Adam,
    "optimizer_params": {"lr": 2e-2, "weight_decay": 1e-5},
    "mask_type": TabNet_params['mask_type'],
    "n_shared": TabNet_params['n_shared'],
    "scheduler_params": {
        "mode": "min",
        "patience": TabNet_params['patienceScheduler'],
        "min_lr": 1e-5,
        "factor": 0.5,
    },
    "scheduler_fn": torch.optim.lr_scheduler.ReduceLROnPlateau,
}
epochs = TabNet_params['epochs']

aug = RegressionSMOTE(p=0.2)

# Fit the final model on the training split, evaluating on train and validation each epoch.
regressor = TabNetRegressor(**final_params)
regressor.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    eval_metric=['rmsle', 'mae', 'rmse', 'mse'],
    patience=TabNet_params['patience'],
    max_epochs=epochs,
    batch_size=1024,
    virtual_batch_size=128,
    drop_last=False,
    augmentations=aug,  # pass None to disable augmentation
)

# Visualize Training Results

In [None]:
# Training loss recorded in the fit history.
plt.plot(regressor.history['loss'])
plt.title("Training loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.show()

# RMSE on the train and validation eval sets; the legend tells the two curves apart.
plt.plot(regressor.history['train_rmse'], label="train")
plt.plot(regressor.history['valid_rmse'], label="valid")
plt.title("RMSE")
plt.xlabel("epoch")
plt.ylabel("rmse")
plt.legend()
plt.show()

# Learning rate recorded in the fit history.
plt.plot(regressor.history['lr'])
plt.title("Learning rate")
plt.xlabel("epoch")
plt.ylabel("lr")
plt.show()

In [None]:
# Score the final model on the held-out test split.
preds = regressor.predict(X_test)

test_mae = mean_absolute_error(y_true=y_test, y_pred=preds)
test_mse = mean_squared_error(y_true=y_test, y_pred=preds)
# mean_squared_log_error raises on negative values, and a regressor can predict below zero.
# Clip predictions at 0 for this metric only, mirroring what pytorch-tabnet's built-in
# 'rmsle' eval metric does during training.
test_msle = mean_squared_log_error(
    y_true=y_test,
    y_pred=np.clip(preds, a_min=0, a_max=None),
)

print(f"BEST VALID SCORE : {regressor.best_cost}")
print(f"FINAL TEST MAE : {test_mae}")
print(f"FINAL TEST MSE : {test_mse}")
print(f"FINAL TEST MSLE : {test_msle}")

# Local explainability and masks

In [None]:
# Per-sample explanation matrix and the per-step attention masks for the test split.
explain_matrix, masks = regressor.explain(X_test)

print(f"Mask Lenght : {len(masks)}")

mask_agg = []
for i in tqdm(range(len(masks))):
    # One row per sample, one column per feature.
    mask_df = pd.DataFrame(data=masks[i], columns=features[:])

    # Total importance each feature received in this step's mask.
    col_sums = mask_df.sum(axis=0).tolist()

    plt.title(f"Step {i} Importance Bar Plot")
    plt.bar(mask_df.columns, col_sums)
    plt.xticks(rotation=90)
    plt.show()

    # Collect every row of this step's mask. Rows are taken by position: `mask_df[j]` with an
    # integer j is a column lookup and fails because the columns are feature names.
    # NOTE(review): the original comment says "sum of all masks", but the code only collects
    # rows - confirm whether an element-wise sum across steps was intended.
    mask_agg.extend(mask_df.to_numpy().tolist())

# Global explainability : feature importance summing to 1

In [None]:
# Global feature importances of the fitted model, one value per input feature.
global_exp = regressor.feature_importances_
print(global_exp)

# Bar chart of the importances, labelled so the figure can be read on its own.
plt.figure(figsize=(20, 5))
plt.bar(features[:], global_exp)
plt.title("Global feature importance")
plt.xlabel("feature")
plt.ylabel("importance")
plt.xticks(rotation=90)
plt.show()

# Make Submission

In [None]:
# Predict on the pre-processed test frame and store the result under the target column.
# NOTE(review): every column of `test` is passed to the model, in frame order; confirm these
# match the columns (and order) of the matrix the regressor was trained on.
pred_X_test = test.to_numpy()
pred_y_test = regressor.predict(pred_X_test)
submission[target] = pred_y_test

# Write the submission file without the index column.
submission.to_csv('submission.csv',index=False)