In [1]:
# 把原版 改成groupkfold  做一个小范围的调参

!nvidia-smi
import psutil
psutil.cpu_count()

Fri Sep 17 18:45:23 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 Off |                  N/A |
|  0%   33C    P8    23W / 370W |      1MiB / 24268MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:03:00.0  On |                  N/A |
| 36%   31C    P5    32W / 370W |    161MiB / 24262MiB |     50%      Defaul

20

In [2]:
suff = 917000
ENV = "10900"
n_startup_trials=15
n_trials=20

if ENV == "9700":
    data_dir = 'E:/optiver-realized-volatility-prediction'
    output_dir = "E:/output"
elif ENV == "10900":
    data_dir = '/home/xuming/workspace/optiver-realized-volatility-prediction'
    output_dir = "/home/xuming/workspace/output"
elif ENV == "colab":
    data_dir = '/content/drive/Shareddrives/workspace/optiver-realized-volatility-prediction'
    output_dir = "/content/drive/Shareddrives/workspace/output"
    !pip -q install pytorch_tabnet==3.1.1
    !pip -q install optuna
    !pip -q install -U pandas==1.2.4
    !pip -q install -U scikit-learn==0.23.2
    from google.colab import drive
    drive.mount('/content/drive')

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy.matlib

import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator

from scipy import stats
from scipy.stats import norm
from joblib import Parallel, delayed

import shutil
import glob
import time

from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor
import random
import os
import sys
import optuna
import torch
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau, CosineAnnealingWarmRestarts
from optuna.samplers import TPESampler


import warnings
warnings.filterwarnings("ignore")

In [4]:
def init_logger(log_file=f'{output_dir}/train_tabnet.log'):
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

def get_timediff(time1,time2):
    minute_,second_ = divmod(time2-time1,60)
    return f"{int(minute_):02d}:{int(second_):02d}"

LOGGER = init_logger(f'{output_dir}/train_tabnet_{suff}.log')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False # set True to be faster
seed_everything(42) ####

# Loading Data

In [5]:
def read_train_test():
    # Function to read our base train and test set
    train = pd.read_csv(f'{data_dir}/train.csv')
    test = pd.read_csv(f'{data_dir}/test.csv')
    # Create a key to merge with book and trade data
    train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
    test['row_id'] = test['stock_id'].astype(str) + '-' + test['time_id'].astype(str)
    print(f'Our training set has {train.shape[0]} rows')
    print(f'Our test set has {test.shape[0]} rows')
    print(f'Our training set has {train.isna().sum().sum()} missing values')
    print(f'Our test set has {test.isna().sum().sum()} missing values')
    
    return train, test

In [6]:
train, test = read_train_test()

Our training set has 428932 rows
Our test set has 3 rows
Our training set has 0 missing values
Our test set has 0 missing values


# Preprocessing the Data

In [7]:
# data directory
def calc_wap1(df):
    # Function to calculate first WAP
    wap = (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])
    return wap

def calc_wap2(df):
    # Function to calculate second WAP
    wap = (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])
    return wap

def log_return(series):
    # Function to calculate the log of the return
    return np.log(series).diff()

def realized_volatility(series):
    # Calculate the realized volatility
    return np.sqrt(np.sum(series**2))

def count_unique(series):
    # Function to count unique elements of a series
    return len(np.unique(series))

def book_preprocessor(file_path):
    # Function to preprocess book data (for each stock id)
    
    df = pd.read_parquet(file_path)
    
    # Calculate Wap
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    
    # Calculate log returns
    df['log_return1'] = df.groupby(['time_id'])['wap1'].apply(log_return)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].apply(log_return)
    
    # Calculate wap balance
    df['wap_balance'] = abs(df['wap1'] - df['wap2'])
    
    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))
    
    # Dict for aggregations
    create_feature_dict = {
        'wap1': [np.sum, np.mean, np.std],
        'wap2': [np.sum, np.mean, np.std],
        'log_return1': [np.sum, realized_volatility, np.mean, np.std],
        'log_return2': [np.sum, realized_volatility, np.mean, np.std],
        'wap_balance': [np.sum, np.mean, np.std],
        'price_spread':[np.sum, np.mean, np.std],
        'price_spread2':[np.sum, np.mean, np.std],
        'bid_spread':[np.sum, np.mean, np.std],
        'ask_spread':[np.sum, np.mean, np.std],
        'total_volume':[np.sum, np.mean, np.std],
        'volume_imbalance':[np.sum, np.mean, np.std],
        "bid_ask_spread":[np.sum, np.mean, np.std],
    }
    
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Function to get group stats for different windows (seconds in bucket)
        
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
        
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature
    
    # Get the stats for different windows
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    df_feature_400 = get_stats_window(seconds_in_bucket = 400, add_suffix = True)
    df_feature_300 = get_stats_window(seconds_in_bucket = 300, add_suffix = True)
    df_feature_200 = get_stats_window(seconds_in_bucket = 200, add_suffix = True)
    
    # Merge all
    df_feature = df_feature.merge(df_feature_400, how = 'left', left_on = 'time_id_', right_on = 'time_id__400')
    df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    df_feature = df_feature.merge(df_feature_200, how = 'left', left_on = 'time_id_', right_on = 'time_id__200')

    # Drop unnecesary time_ids
    df_feature.drop(['time_id__400', 'time_id__300', 'time_id__200'], axis = 1, inplace = True)
    
    
    # Create row_id so we can merge
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['time_id_'].apply(lambda x: f'{stock_id}-{x}')
    df_feature.drop(['time_id_'], axis = 1, inplace = True)
    
    return df_feature


def trade_preprocessor(file_path):
    # Function to preprocess trade data (for each stock id)
    
    df = pd.read_parquet(file_path)
    df['log_return'] = df.groupby('time_id')['price'].apply(log_return)
    
    # Dict for aggregations
    create_feature_dict = {
        'log_return':[realized_volatility],
        'seconds_in_bucket':[count_unique],
        'size':[np.sum, realized_volatility, np.mean, np.std, np.max, np.min],
        'order_count':[np.mean,np.sum,np.max],
    }
    
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Function to get group stats for different windows (seconds in bucket)
        
        # Group by the window
        df_feature = df[df['seconds_in_bucket'] >= seconds_in_bucket].groupby(['time_id']).agg(create_feature_dict).reset_index()
        
        # Rename columns joining suffix
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        
        # Add a suffix to differentiate windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature
    
    # Get the stats for different windows
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    df_feature_400 = get_stats_window(seconds_in_bucket = 400, add_suffix = True)
    df_feature_300 = get_stats_window(seconds_in_bucket = 300, add_suffix = True)
    df_feature_200 = get_stats_window(seconds_in_bucket = 200, add_suffix = True)
    
    def tendency(price, vol):    
        df_diff = np.diff(price)
        val = (df_diff/price[1:])*100
        power = np.sum(val*vol[1:])
        return(power)
    
    lis = []
    for n_time_id in df['time_id'].unique():
        df_id = df[df['time_id'] == n_time_id]        
        tendencyV = tendency(df_id['price'].values, df_id['size'].values)      
        f_max = np.sum(df_id['price'].values > np.mean(df_id['price'].values))
        f_min = np.sum(df_id['price'].values < np.mean(df_id['price'].values))
        df_max =  np.sum(np.diff(df_id['price'].values) > 0)
        df_min =  np.sum(np.diff(df_id['price'].values) < 0)
        abs_diff = np.median(np.abs( df_id['price'].values - np.mean(df_id['price'].values)))        
        energy = np.mean(df_id['price'].values**2)
        iqr_p = np.percentile(df_id['price'].values,75) - np.percentile(df_id['price'].values,25)
        abs_diff_v = np.median(np.abs( df_id['size'].values - np.mean(df_id['size'].values)))        
        energy_v = np.sum(df_id['size'].values**2)
        iqr_p_v = np.percentile(df_id['size'].values,75) - np.percentile(df_id['size'].values,25)
        
        lis.append({'time_id':n_time_id,'tendency':tendencyV,'f_max':f_max,'f_min':f_min,'df_max':df_max,'df_min':df_min,
                   'abs_diff':abs_diff,'energy':energy,'iqr_p':iqr_p,'abs_diff_v':abs_diff_v,'energy_v':energy_v,'iqr_p_v':iqr_p_v})
    
    df_lr = pd.DataFrame(lis)
        
   
    df_feature = df_feature.merge(df_lr, how = 'left', left_on = 'time_id_', right_on = 'time_id')
    
    # Merge all
    df_feature = df_feature.merge(df_feature_400, how = 'left', left_on = 'time_id_', right_on = 'time_id__400')
    df_feature = df_feature.merge(df_feature_300, how = 'left', left_on = 'time_id_', right_on = 'time_id__300')
    df_feature = df_feature.merge(df_feature_200, how = 'left', left_on = 'time_id_', right_on = 'time_id__200')

    # Drop unnecesary time_ids
    df_feature.drop(['time_id__400', 'time_id__300', 'time_id__200','time_id'], axis = 1, inplace = True)
    df_feature = df_feature.add_prefix('trade_')
    stock_id = file_path.split('=')[1]
    df_feature['row_id'] = df_feature['trade_time_id_'].apply(lambda x:f'{stock_id}-{x}')
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)
    
    def order_sum(df, sec:str):
        new_col = 'size_tau' + sec
        bucket_col = 'trade_seconds_in_bucket_count_unique' + sec
        df[new_col] = np.sqrt(1/df[bucket_col])
        
        new_col2 = 'size_tau2' + sec
        order_col = 'trade_order_count_sum' + sec
        df[new_col2] = np.sqrt(1/df[order_col])
        
        if sec == '400_':
            df['size_tau2_d'] = df['size_tau2_400'] - df['size_tau2']
        

    
    for sec in ['','_200','_300','_400']:
        order_sum(df_feature, sec)
        
    df_feature['size_tau2_d'] = df_feature['size_tau2_400'] - df_feature['size_tau2']
    
    return df_feature


def get_time_stock(df):
    # Function to get group stats for the stock_id and time_id
    
    # Get realized volatility columns
    vol_cols = ['log_return1_realized_volatility', 'log_return2_realized_volatility', 'log_return1_realized_volatility_400', 'log_return2_realized_volatility_400', 
                'log_return1_realized_volatility_300', 'log_return2_realized_volatility_300', 'log_return1_realized_volatility_200', 'log_return2_realized_volatility_200', 
                'trade_log_return_realized_volatility', 'trade_log_return_realized_volatility_400', 'trade_log_return_realized_volatility_300', 'trade_log_return_realized_volatility_200']

    # Group by the stock id
    df_stock_id = df.groupby(['stock_id'])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
    
    # Rename columns joining suffix
    df_stock_id.columns = ['_'.join(col) for col in df_stock_id.columns]
    df_stock_id = df_stock_id.add_suffix('_' + 'stock')

    # Group by the stock id
    df_time_id = df.groupby(['time_id'])[vol_cols].agg(['mean', 'std', 'max', 'min', ]).reset_index()
    
    # Rename columns joining suffix
    df_time_id.columns = ['_'.join(col) for col in df_time_id.columns]
    df_time_id = df_time_id.add_suffix('_' + 'time')
    
    # Merge with original dataframe
    df = df.merge(df_stock_id, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    df = df.merge(df_time_id, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])
    df.drop(['stock_id__stock', 'time_id__time'], axis = 1, inplace = True)
    
    return df

def create_agg_features(train, test):

    # Making agg features

    train_p = pd.read_csv(f'{data_dir}/train.csv')
    train_p = train_p.pivot(index='time_id', columns='stock_id', values='target')
    corr = train_p.corr()
    ids = corr.index
    kmeans = KMeans(n_clusters=7, random_state=0).fit(corr.values)
    l = []
    for n in range(7):
        l.append ( [ (x-1) for x in ( (ids+1)*(kmeans.labels_ == n)) if x > 0] )

    mat = []
    matTest = []
    n = 0
    for ind in l:
        newDf = train.loc[train['stock_id'].isin(ind) ]
        newDf = newDf.groupby(['time_id']).agg(np.nanmean)
        newDf.loc[:,'stock_id'] = str(n)+'c1'
        mat.append ( newDf )
        newDf = test.loc[test['stock_id'].isin(ind) ]    
        newDf = newDf.groupby(['time_id']).agg(np.nanmean)
        newDf.loc[:,'stock_id'] = str(n)+'c1'
        matTest.append ( newDf )
        n+=1

    mat1 = pd.concat(mat).reset_index()
    mat1.drop(columns=['target'],inplace=True)
    mat2 = pd.concat(matTest).reset_index()
    
    mat2 = pd.concat([mat2,mat1.loc[mat1.time_id==5]])
    
    mat1 = mat1.pivot(index='time_id', columns='stock_id')
    mat1.columns = ["_".join(x) for x in mat1.columns.ravel()]
    mat1.reset_index(inplace=True)
    
    mat2 = mat2.pivot(index='time_id', columns='stock_id')
    mat2.columns = ["_".join(x) for x in mat2.columns.ravel()]
    mat2.reset_index(inplace=True)
    
    prefix = ['log_return1_realized_volatility', 'total_volume_mean', 'trade_size_mean', 'trade_order_count_mean','price_spread_mean','bid_spread_mean','ask_spread_mean',
              'volume_imbalance_mean', 'bid_ask_spread_mean','size_tau2']
    selected_cols=mat1.filter(regex='|'.join(f'^{x}.(0|1|3|4|6)c1' for x in prefix)).columns.tolist()
    selected_cols.append('time_id')
    
    train_m = pd.merge(train,mat1[selected_cols],how='left',on='time_id')
    test_m = pd.merge(test,mat2[selected_cols],how='left',on='time_id')
    
    # filling missing values with train means

    features = [col for col in train_m.columns.tolist() if col not in ['time_id','target','row_id']]
    train_m[features] = train_m[features].fillna(train_m[features].mean())
    test_m[features] = test_m[features].fillna(train_m[features].mean())

    return train_m, test_m
    
    
def preprocessor(list_stock_ids, is_train = True):
    # Funtion to make preprocessing function in parallel (for each stock id)
    
    # Parrallel for loop
    def for_joblib(stock_id):
        # Train
        if is_train:
            file_path_book = data_dir + "/book_train.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "/trade_train.parquet/stock_id=" + str(stock_id)
        # Test
        else:
            file_path_book = data_dir + "/book_test.parquet/stock_id=" + str(stock_id)
            file_path_trade = data_dir + "/trade_test.parquet/stock_id=" + str(stock_id)
    
        # Preprocess book and trade data and merge them
        df_tmp = pd.merge(book_preprocessor(file_path_book), trade_preprocessor(file_path_trade), on = 'row_id', how = 'left')
        
        # Return the merge dataframe
        return df_tmp
    
    # Use parallel api to call paralle for loop
    df = Parallel(n_jobs = -1, verbose = 1)(delayed(for_joblib)(stock_id) for stock_id in list_stock_ids)
    
    # Concatenate all the dataframes that return from Parallel
    df = pd.concat(df, ignore_index = True)
    
    return df

def rmspe(y_true, y_pred):
    # Function to calculate the root mean squared percentage error
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

class RMSPE(Metric):
    def __init__(self):
        self._name = "rmspe"
        self._maximize = False

    def __call__(self, y_true, y_score):

        return np.sqrt(np.mean(np.square((y_true - y_score) / y_true)))


def RMSPELoss(y_pred, y_true):
    return torch.sqrt(torch.mean( ((y_true - y_pred) / y_true) ** 2 )).clone()

## Loding the and doing some feature engineering

In [8]:
# Get unique stock ids 
train_stock_ids = train['stock_id'].unique()

# Preprocess them using Parallel and our single stock id functions
train_ = preprocessor(train_stock_ids, is_train = True)
train = train.merge(train_, on = ['row_id'], how = 'left')

# Get unique stock ids 
test_stock_ids = test['stock_id'].unique()

# Preprocess them using Parallel and our single stock id functions
test_ = preprocessor(test_stock_ids, is_train = False)
test = test.merge(test_, on = ['row_id'], how = 'left')

# Get group stats of time_id and stock_id
train = get_time_stock(train)
test = get_time_stock(test)

# Fill inf values
train.replace([np.inf, -np.inf], np.nan,inplace=True)
test.replace([np.inf, -np.inf], np.nan,inplace=True)

# Aggregating some features
train, test = create_agg_features(train,test)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   31.7s
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed:  3.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


In [9]:
train

Unnamed: 0,stock_id,time_id,target,row_id,wap1_sum,wap1_mean,wap1_std,wap2_sum,wap2_mean,wap2_std,...,trade_order_count_mean_0c1,trade_order_count_mean_1c1,trade_order_count_mean_3c1,trade_order_count_mean_4c1,trade_order_count_mean_6c1,size_tau2_0c1,size_tau2_1c1,size_tau2_3c1,size_tau2_4c1,size_tau2_6c1
0,0,5,0.004136,0-5,303.125061,1.003725,0.000693,303.105530,1.003661,0.000781,...,3.809776,3.838125,4.213097,3.737627,4.477624,0.058550,0.057267,0.078471,0.054691,0.050700
1,0,11,0.001445,0-11,200.047775,1.000239,0.000262,200.041168,1.000206,0.000272,...,3.536027,3.827231,4.151908,4.292891,5.890478,0.081235,0.078955,0.122289,0.078616,0.045740
2,0,16,0.002168,0-16,187.913849,0.999542,0.000864,187.939819,0.999680,0.000862,...,2.934143,2.865731,3.599012,3.359532,2.350000,0.078550,0.087378,0.116278,0.074977,0.080722
3,0,31,0.002195,0-31,119.859779,0.998832,0.000757,119.835945,0.998633,0.000656,...,3.266455,3.702063,3.768233,3.502680,4.665943,0.100382,0.089673,0.105948,0.094684,0.055447
4,0,62,0.001747,0-62,175.932861,0.999619,0.000258,175.934250,0.999626,0.000317,...,3.367228,3.730579,4.435158,3.860256,4.257065,0.087285,0.089068,0.112663,0.086381,0.046358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
428927,126,32751,0.003461,126-32751,309.870453,0.999582,0.000486,309.871368,0.999585,0.000613,...,3.685610,4.051199,4.464154,3.978511,5.506404,0.064380,0.064007,0.089665,0.061596,0.041157
428928,126,32753,0.003113,126-32753,223.552139,1.002476,0.001264,223.580322,1.002602,0.001303,...,4.169403,4.519246,5.345999,5.369023,4.483678,0.073644,0.065909,0.085298,0.062453,0.062479
428929,126,32758,0.004070,126-32758,256.277039,1.001082,0.000466,256.255066,1.000996,0.000599,...,3.131133,3.431899,3.444819,3.262376,2.590000,0.098654,0.092117,0.124247,0.090574,0.120507
428930,126,32763,0.003357,126-32763,399.721741,1.001809,0.000456,399.714325,1.001790,0.000507,...,3.511471,3.562323,4.019130,4.069066,4.440582,0.051648,0.054263,0.067649,0.051358,0.040747


# Training

First we selecting columns for the training and test.

In [10]:
X = train.drop(['row_id', 'target', 'time_id'], axis = 1)
y = train['target']
X_test=test.copy()
X_test.drop(['time_id','row_id'], axis=1,inplace=True)

In [11]:
nunique = X.nunique()
types = X.dtypes

categorical_columns = []
categorical_dims =  {}

for col in X.columns:
    if  col == 'stock_id':
        l_enc = LabelEncoder()
        X[col] = l_enc.fit_transform(X[col].values)
        X_test[col] = l_enc.transform(X_test[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        scaler = StandardScaler()
        X[col] = scaler.fit_transform(X[col].values.reshape(-1, 1))
        X_test[col] = scaler.transform(X_test[col].values.reshape(-1, 1))
        


cat_idxs = [ i for i, f in enumerate(X.columns.tolist()) if f in categorical_columns]

cat_dims = [ categorical_dims[f] for i, f in enumerate(X.columns.tolist()) if f in categorical_columns]

In [12]:
cat_idxs

[0]

In [13]:
cat_dims

[112]

In [14]:
import os
import sys
import numpy as np
import pandas as pd
import random
from collections import Counter, defaultdict
from sklearn import model_selection

# ---- GroupKFold ----
class GroupKFold(object):
    """
    GroupKFold with random shuffle with a sklearn-like structure
    """

    def __init__(self, n_splits=4, shuffle=True, random_state=42):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def get_n_splits(self, X=None, y=None, group=None):
        return self.n_splits

    def split(self, X, y, group):
        kf = model_selection.KFold(n_splits=self.n_splits, shuffle=self.shuffle, random_state=self.random_state)
        unique_ids = X[group].unique()
        for fold, (tr_group_idx, va_group_idx) in enumerate(kf.split(unique_ids)):
            # split group
            tr_group, va_group = unique_ids[tr_group_idx], unique_ids[va_group_idx]
            train_idx = np.where(X[group].isin(tr_group))[0]
            val_idx = np.where(X[group].isin(va_group))[0]
            yield train_idx, val_idx



### TabNet train

In [15]:
def objective(trial):
    start_time = time.time()
    global suff
    class CFG:
        suffix = str(suff)
    suff += 1
    cv = GroupKFold(n_splits=5, shuffle=True, random_state=42)
    kf = cv.split(train, train['target'], 'time_id')
    # kfold = KFold(n_splits = 5, random_state = 42, shuffle = True)
    # Create out of folds array
    oof_predictions = np.zeros((X.shape[0], 1))
    # test_predictions = np.zeros(X_test.shape[0])
    feature_importances = pd.DataFrame()
    feature_importances["feature"] = X.columns.tolist()
    explain_matrices = []

    tabnet_params = dict(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        cat_emb_dim=1,
        n_d = trial.suggest_int("n_d", 14, 18),
        n_a = trial.suggest_int("n_a", 14, 18),
        n_steps = trial.suggest_int("n_steps", 1, 3), #2
        gamma = trial.suggest_uniform("gamma", 1.8, 2.0),
        n_independent = trial.suggest_int("n_independent", 1, 3), # 2
        n_shared = trial.suggest_int("n_shared", 1, 3), #2
        lambda_sparse = trial.suggest_categorical("lambda_sparse",  [0.0, 1e-3]), #0
        optimizer_fn = Adam,
        optimizer_params = dict(lr = (trial.suggest_loguniform("learning_rate", 1e-2, 3e-2))),# dict(lr = (2e-2)), # dict(lr = (trial.suggest_loguniform("learning_rate", 7e-3, 5e-2))),
        mask_type = "entmax",
        scheduler_params = dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1, verbose=False),
        scheduler_fn = CosineAnnealingWarmRestarts,
        seed = 42,
        verbose = 50,
    )

    for fold, (trn_ind, val_ind) in enumerate(kf):
    # for fold, (trn_ind, val_ind) in enumerate(kfold.split(X)):
        print(f'Training fold {fold}')
        X_train, X_val = X.iloc[trn_ind].values, X.iloc[val_ind].values
        y_train, y_val = y.iloc[trn_ind].values.reshape(-1,1), y.iloc[val_ind].values.reshape(-1,1)


        clf =  TabNetRegressor(**tabnet_params)
        clf.fit(
          X_train, y_train,
          eval_set=[(X_val, y_val)],
          max_epochs = 200,
          patience = 50,
          batch_size = 1024*16,
          virtual_batch_size = 128*16,
          num_workers = 10,
          drop_last = False,
          eval_metric=[RMSPE],
          loss_fn=RMSPELoss
          )

        clf.save_model(f"{output_dir}/tabnet_{CFG.suffix}_fold{fold}.model")
        oof_predictions[val_ind] = clf.predict(X_val)
        feature_importances[f"importance_fold{fold}+1"] = clf.feature_importances_
        feature_importances.to_csv(f"{output_dir}/tabnet_feature_imp_{CFG.suffix}_fold{fold}.csv",index=False)


    elapsed_time = get_timediff(start_time, time.time())
    rmspe_score = rmspe(y, oof_predictions.flatten())
    print(f'OOF score across folds: {rmspe_score}')
    LOGGER.info(f' suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time')
    LOGGER.info(f' {CFG.suffix}   {rmspe_score:.7f}      {tabnet_params["cat_emb_dim"]:>3d}      {tabnet_params["n_d"]:>3d}  {tabnet_params["n_a"]:>3d}     {tabnet_params["n_steps"]:>2d}     {tabnet_params["gamma"]:.2f}        {tabnet_params["n_independent"]:>2d}           {tabnet_params["n_shared"]:>2d}          {tabnet_params["lambda_sparse"]:.2f}          {tabnet_params["optimizer_params"]["lr"]:.8f}      {tabnet_params["mask_type"]}  {tabnet_params["scheduler_params"]["T_0"]:>4d}    {tabnet_params["scheduler_params"]["T_mult"]:>2d}   {tabnet_params["scheduler_params"]["eta_min"]:.8f}  {elapsed_time} ')
    return rmspe_score

In [16]:
study = optuna.create_study(sampler=TPESampler(n_startup_trials=n_startup_trials),direction="minimize")
study.optimize(objective, n_trials=n_trials)

[32m[I 2021-09-17 18:49:06,042][0m A new study created in memory with name: no-name-cb0949eb-4997-43f4-a4ad-6e728e8c893e[0m


Training fold 0
Device used : cuda
epoch 0  | loss: 35.77371| val_0_rmspe: 3.26264 |  0:00:02s
epoch 50 | loss: 0.22953 | val_0_rmspe: 0.22243 |  0:01:49s
epoch 100| loss: 0.20859 | val_0_rmspe: 0.2149  |  0:03:36s
epoch 150| loss: 0.20146 | val_0_rmspe: 0.21573 |  0:05:24s

Early stopping occurred at epoch 158 with best_epoch = 108 and best_val_0_rmspe = 0.21283
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917000_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 47.63717| val_0_rmspe: 4.38736 |  0:00:02s
epoch 50 | loss: 0.21934 | val_0_rmspe: 0.22739 |  0:01:50s
epoch 100| loss: 0.20921 | val_0_rmspe: 0.21544 |  0:03:38s

Early stopping occurred at epoch 140 with best_epoch = 90 and best_val_0_rmspe = 0.21402
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917000_fold1.model.zip
Training fold 2
Device used : cuda
epoch 0  | l

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917000   0.2135363        1       14   15      1     1.86         1            3          0.00          0.02770695      entmax   200     1   0.00010000  26:56 
[32m[I 2021-09-17 19:16:02,124][0m Trial 0 finished with value: 0.21353625583126887 and parameters: {'n_d': 14, 'n_a': 15, 'n_steps': 1, 'gamma': 1.855270815491862, 'n_independent': 1, 'n_shared': 3, 'lambda_sparse': 0.001, 'learning_rate': 0.02770695027554611}. Best is trial 0 with value: 0.21353625583126887.[0m


OOF score across folds: 0.21353625583126887
Training fold 0
Device used : cuda
epoch 0  | loss: 273.74466| val_0_rmspe: 69.66301|  0:00:02s
epoch 50 | loss: 0.23167 | val_0_rmspe: 0.2274  |  0:02:07s
epoch 100| loss: 0.21246 | val_0_rmspe: 0.21732 |  0:04:12s
epoch 150| loss: 0.20536 | val_0_rmspe: 0.21182 |  0:06:17s
Stop training because you reached max_epochs = 200 with best_epoch = 150 and best_val_0_rmspe = 0.21182
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917001_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 332.07167| val_0_rmspe: 152.8688|  0:00:02s
epoch 50 | loss: 0.24979 | val_0_rmspe: 0.24021 |  0:02:08s
epoch 100| loss: 0.21672 | val_0_rmspe: 0.22238 |  0:04:13s
epoch 150| loss: 0.20781 | val_0_rmspe: 0.23357 |  0:06:17s

Early stopping occurred at epoch 161 with best_epoch = 111 and best_val_0_rmspe = 0.21861
Best weights from best epoch are automatically used!
Successfully sa

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917001   0.2138404        1       17   17      2     1.85         2            3          0.00          0.01767668      entmax   200     1   0.00010000  39:29 
[32m[I 2021-09-17 19:55:31,888][0m Trial 1 finished with value: 0.2138404018966662 and parameters: {'n_d': 17, 'n_a': 17, 'n_steps': 2, 'gamma': 1.8548744443818146, 'n_independent': 2, 'n_shared': 3, 'lambda_sparse': 0.0, 'learning_rate': 0.017676675348710213}. Best is trial 0 with value: 0.21353625583126887.[0m


OOF score across folds: 0.2138404018966662
Training fold 0
Device used : cuda
epoch 0  | loss: 252.97805| val_0_rmspe: 166.98995|  0:00:02s
epoch 50 | loss: 0.2364  | val_0_rmspe: 0.22887 |  0:02:11s
epoch 100| loss: 0.20789 | val_0_rmspe: 0.21328 |  0:04:20s
epoch 150| loss: 0.20177 | val_0_rmspe: 0.21227 |  0:06:28s

Early stopping occurred at epoch 154 with best_epoch = 104 and best_val_0_rmspe = 0.21164
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917002_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 274.30318| val_0_rmspe: 56.43198|  0:00:02s
epoch 50 | loss: 0.22169 | val_0_rmspe: 0.22426 |  0:02:13s
epoch 100| loss: 0.20823 | val_0_rmspe: 0.21567 |  0:04:21s
epoch 150| loss: 0.20392 | val_0_rmspe: 0.21556 |  0:06:31s

Early stopping occurred at epoch 162 with best_epoch = 112 and best_val_0_rmspe = 0.21475
Best weights from best epoch are automatically used!
Successfully saved model at 

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917002   0.2140049        1       17   16      3     1.87         2            1          0.00          0.02893596      entmax   200     1   0.00010000  38:09 
[32m[I 2021-09-17 20:33:41,603][0m Trial 2 finished with value: 0.21400485488737345 and parameters: {'n_d': 17, 'n_a': 16, 'n_steps': 3, 'gamma': 1.8729158987711878, 'n_independent': 2, 'n_shared': 1, 'lambda_sparse': 0.0, 'learning_rate': 0.028935963899215492}. Best is trial 0 with value: 0.21353625583126887.[0m


OOF score across folds: 0.21400485488737345
Training fold 0
Device used : cuda
epoch 0  | loss: 102.42106| val_0_rmspe: 19.11488|  0:00:02s
epoch 50 | loss: 0.23253 | val_0_rmspe: 0.22536 |  0:01:50s
epoch 100| loss: 0.21508 | val_0_rmspe: 0.21635 |  0:03:39s
epoch 150| loss: 0.20762 | val_0_rmspe: 0.21291 |  0:05:28s
Stop training because you reached max_epochs = 200 with best_epoch = 188 and best_val_0_rmspe = 0.21209
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917003_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 199.51747| val_0_rmspe: 28.63246|  0:00:02s
epoch 50 | loss: 0.24173 | val_0_rmspe: 0.23947 |  0:01:50s
epoch 100| loss: 0.22034 | val_0_rmspe: 0.23113 |  0:03:38s
epoch 150| loss: 0.21077 | val_0_rmspe: 0.21687 |  0:05:28s
Stop training because you reached max_epochs = 200 with best_epoch = 172 and best_val_0_rmspe = 0.21685
Best weights from best epoch are automatically used!
Su

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917003   0.2152273        1       16   16      1     1.85         2            2          0.00          0.01158809      entmax   200     1   0.00010000  35:32 
[32m[I 2021-09-17 21:09:14,184][0m Trial 3 finished with value: 0.21522729523252876 and parameters: {'n_d': 16, 'n_a': 16, 'n_steps': 1, 'gamma': 1.8512532363486294, 'n_independent': 2, 'n_shared': 2, 'lambda_sparse': 0.001, 'learning_rate': 0.011588091035802618}. Best is trial 0 with value: 0.21353625583126887.[0m


OOF score across folds: 0.21522729523252876
Training fold 0
Device used : cuda
epoch 0  | loss: 149.63884| val_0_rmspe: 25.37738|  0:00:02s
epoch 50 | loss: 0.22709 | val_0_rmspe: 0.24037 |  0:02:11s
epoch 100| loss: 0.21476 | val_0_rmspe: 0.21909 |  0:04:19s
epoch 150| loss: 0.20589 | val_0_rmspe: 0.21134 |  0:06:27s

Early stopping occurred at epoch 179 with best_epoch = 129 and best_val_0_rmspe = 0.211
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917004_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 150.89711| val_0_rmspe: 20.89955|  0:00:02s
epoch 50 | loss: 0.22931 | val_0_rmspe: 0.26365 |  0:02:11s
epoch 100| loss: 0.21089 | val_0_rmspe: 0.21658 |  0:04:20s
epoch 150| loss: 0.20665 | val_0_rmspe: 0.21457 |  0:06:29s
Stop training because you reached max_epochs = 200 with best_epoch = 171 and best_val_0_rmspe = 0.21359
Best weights from best epoch are automatically used!
Successfully save

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917004   0.2134098        1       17   17      2     1.96         3            3          0.00          0.02421223      entmax   200     1   0.00010000  38:44 
[32m[I 2021-09-17 21:47:58,433][0m Trial 4 finished with value: 0.2134098425361398 and parameters: {'n_d': 17, 'n_a': 17, 'n_steps': 2, 'gamma': 1.9607034966122827, 'n_independent': 3, 'n_shared': 3, 'lambda_sparse': 0.001, 'learning_rate': 0.024212229612217914}. Best is trial 4 with value: 0.2134098425361398.[0m


OOF score across folds: 0.2134098425361398
Training fold 0
Device used : cuda
epoch 0  | loss: 248.14693| val_0_rmspe: 36.97003|  0:00:02s
epoch 50 | loss: 0.24611 | val_0_rmspe: 0.24452 |  0:02:06s
epoch 100| loss: 0.21724 | val_0_rmspe: 0.22275 |  0:04:10s
epoch 150| loss: 0.20725 | val_0_rmspe: 0.21188 |  0:06:14s
Stop training because you reached max_epochs = 200 with best_epoch = 166 and best_val_0_rmspe = 0.21164
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917005_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 181.37516| val_0_rmspe: 31.54778|  0:00:02s
epoch 50 | loss: 0.23879 | val_0_rmspe: 0.248   |  0:02:07s
epoch 100| loss: 0.21254 | val_0_rmspe: 0.21741 |  0:04:12s
epoch 150| loss: 0.20591 | val_0_rmspe: 0.21563 |  0:06:16s
Stop training because you reached max_epochs = 200 with best_epoch = 184 and best_val_0_rmspe = 0.21518
Best weights from best epoch are automatically used!
Suc

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917005   0.2136636        1       18   16      2     1.90         2            3          0.00          0.02856818      entmax   200     1   0.00010000  35:17 
[32m[I 2021-09-17 22:23:16,061][0m Trial 5 finished with value: 0.2136636232332618 and parameters: {'n_d': 18, 'n_a': 16, 'n_steps': 2, 'gamma': 1.899131184141647, 'n_independent': 2, 'n_shared': 3, 'lambda_sparse': 0.0, 'learning_rate': 0.028568181121925343}. Best is trial 4 with value: 0.2134098425361398.[0m


OOF score across folds: 0.2136636232332618
Training fold 0
Device used : cuda
epoch 0  | loss: 109.61789| val_0_rmspe: 11.52867|  0:00:02s
epoch 50 | loss: 0.24651 | val_0_rmspe: 0.23671 |  0:01:48s
epoch 100| loss: 0.21844 | val_0_rmspe: 0.21591 |  0:03:35s
epoch 150| loss: 0.20785 | val_0_rmspe: 0.21387 |  0:05:21s

Early stopping occurred at epoch 196 with best_epoch = 146 and best_val_0_rmspe = 0.21225
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917006_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 127.09303| val_0_rmspe: 7.35667 |  0:00:02s
epoch 50 | loss: 0.23839 | val_0_rmspe: 0.23454 |  0:01:48s
epoch 100| loss: 0.21258 | val_0_rmspe: 0.22552 |  0:03:35s
epoch 150| loss: 0.20527 | val_0_rmspe: 0.21895 |  0:05:22s

Early stopping occurred at epoch 194 with best_epoch = 144 and best_val_0_rmspe = 0.21542
Best weights from best epoch are automatically used!
Successfully saved model at /

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917006   0.2133087        1       15   18      1     1.97         2            1          0.00          0.02100062      entmax   200     1   0.00010000  32:27 
[32m[I 2021-09-17 22:55:43,150][0m Trial 6 finished with value: 0.2133086646354432 and parameters: {'n_d': 15, 'n_a': 18, 'n_steps': 1, 'gamma': 1.9723264184253528, 'n_independent': 2, 'n_shared': 1, 'lambda_sparse': 0.001, 'learning_rate': 0.02100062287063875}. Best is trial 6 with value: 0.2133086646354432.[0m


OOF score across folds: 0.2133086646354432
Training fold 0
Device used : cuda
epoch 0  | loss: 53.46574| val_0_rmspe: 4.37901 |  0:00:02s
epoch 50 | loss: 0.21404 | val_0_rmspe: 0.22715 |  0:01:48s
epoch 100| loss: 0.20592 | val_0_rmspe: 0.21253 |  0:03:35s
epoch 150| loss: 0.20141 | val_0_rmspe: 0.21345 |  0:05:21s

Early stopping occurred at epoch 153 with best_epoch = 103 and best_val_0_rmspe = 0.21197
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917007_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 78.57781| val_0_rmspe: 4.19806 |  0:00:02s
epoch 50 | loss: 0.22323 | val_0_rmspe: 0.21967 |  0:01:48s
epoch 100| loss: 0.21003 | val_0_rmspe: 0.21692 |  0:03:36s
epoch 150| loss: 0.20198 | val_0_rmspe: 0.21646 |  0:05:22s

Early stopping occurred at epoch 183 with best_epoch = 133 and best_val_0_rmspe = 0.21536
Best weights from best epoch are automatically used!
Successfully saved model at /ho

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917007   0.2129706        1       14   16      1     1.94         2            1          0.00          0.01986204      entmax   200     1   0.00010000  30:03 
[32m[I 2021-09-17 23:25:46,353][0m Trial 7 finished with value: 0.21297060056178208 and parameters: {'n_d': 14, 'n_a': 16, 'n_steps': 1, 'gamma': 1.938938819615773, 'n_independent': 2, 'n_shared': 1, 'lambda_sparse': 0.0, 'learning_rate': 0.01986203869418597}. Best is trial 7 with value: 0.21297060056178208.[0m


OOF score across folds: 0.21297060056178208
Training fold 0
Device used : cuda
epoch 0  | loss: 134.84606| val_0_rmspe: 25.19872|  0:00:02s
epoch 50 | loss: 0.23381 | val_0_rmspe: 0.29996 |  0:01:51s
epoch 100| loss: 0.22338 | val_0_rmspe: 0.22133 |  0:03:40s
epoch 150| loss: 0.20589 | val_0_rmspe: 0.21175 |  0:05:29s

Early stopping occurred at epoch 199 with best_epoch = 149 and best_val_0_rmspe = 0.21162
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917008_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 138.45581| val_0_rmspe: 24.38071|  0:00:02s
epoch 50 | loss: 0.33355 | val_0_rmspe: 0.35836 |  0:01:51s
epoch 100| loss: 0.24386 | val_0_rmspe: 0.2414  |  0:03:40s
epoch 150| loss: 0.21504 | val_0_rmspe: 0.22231 |  0:05:28s
Stop training because you reached max_epochs = 200 with best_epoch = 184 and best_val_0_rmspe = 0.21873
Best weights from best epoch are automatically used!
Successfully sa

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917008   0.2145262        1       17   17      1     1.98         3            1          0.00          0.01627786      entmax   200     1   0.00010000  33:16 
[32m[I 2021-09-17 23:59:02,951][0m Trial 8 finished with value: 0.214526172871051 and parameters: {'n_d': 17, 'n_a': 17, 'n_steps': 1, 'gamma': 1.980689222867, 'n_independent': 3, 'n_shared': 1, 'lambda_sparse': 0.0, 'learning_rate': 0.016277859441750763}. Best is trial 7 with value: 0.21297060056178208.[0m


OOF score across folds: 0.214526172871051
Training fold 0
Device used : cuda
epoch 0  | loss: 240.38018| val_0_rmspe: 99.22467|  0:00:02s
epoch 50 | loss: 0.22955 | val_0_rmspe: 0.22494 |  0:02:08s
epoch 100| loss: 0.21717 | val_0_rmspe: 0.21476 |  0:04:13s
epoch 150| loss: 0.20769 | val_0_rmspe: 0.21314 |  0:06:19s
Stop training because you reached max_epochs = 200 with best_epoch = 159 and best_val_0_rmspe = 0.21217
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917009_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 238.87055| val_0_rmspe: 306.61303|  0:00:02s
epoch 50 | loss: 0.25141 | val_0_rmspe: 0.25736 |  0:02:08s
epoch 100| loss: 0.21539 | val_0_rmspe: 0.21795 |  0:04:13s
epoch 150| loss: 0.20743 | val_0_rmspe: 0.21557 |  0:06:19s
Stop training because you reached max_epochs = 200 with best_epoch = 181 and best_val_0_rmspe = 0.21415
Best weights from best epoch are automatically used!
Suc

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917009   0.2141445        1       18   17      2     1.81         3            2          0.00          0.01375725      entmax   200     1   0.00010000  41:54 
[32m[I 2021-09-18 00:40:57,586][0m Trial 9 finished with value: 0.21414452534835468 and parameters: {'n_d': 18, 'n_a': 17, 'n_steps': 2, 'gamma': 1.813161814638165, 'n_independent': 3, 'n_shared': 2, 'lambda_sparse': 0.0, 'learning_rate': 0.013757245381037059}. Best is trial 7 with value: 0.21297060056178208.[0m


OOF score across folds: 0.21414452534835468
Training fold 0
Device used : cuda
epoch 0  | loss: 152.37041| val_0_rmspe: 50.72075|  0:00:02s
epoch 50 | loss: 0.23046 | val_0_rmspe: 0.22622 |  0:02:11s
epoch 100| loss: 0.21132 | val_0_rmspe: 0.21433 |  0:04:20s
epoch 150| loss: 0.20573 | val_0_rmspe: 0.21292 |  0:06:29s

Early stopping occurred at epoch 199 with best_epoch = 149 and best_val_0_rmspe = 0.21271
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917010_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 136.95321| val_0_rmspe: 56.8595 |  0:00:02s
epoch 50 | loss: 0.22703 | val_0_rmspe: 0.22983 |  0:02:11s
epoch 100| loss: 0.21322 | val_0_rmspe: 0.21833 |  0:04:21s
epoch 150| loss: 0.20818 | val_0_rmspe: 0.2183  |  0:06:30s

Early stopping occurred at epoch 167 with best_epoch = 117 and best_val_0_rmspe = 0.21503
Best weights from best epoch are automatically used!
Successfully saved model at 

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917010   0.2137879        1       16   15      3     1.98         2            1          0.00          0.01998502      entmax   200     1   0.00010000  40:42 
[32m[I 2021-09-18 01:21:39,990][0m Trial 10 finished with value: 0.21378786430510952 and parameters: {'n_d': 16, 'n_a': 15, 'n_steps': 3, 'gamma': 1.9793876594695212, 'n_independent': 2, 'n_shared': 1, 'lambda_sparse': 0.001, 'learning_rate': 0.019985015393063126}. Best is trial 7 with value: 0.21297060056178208.[0m


OOF score across folds: 0.21378786430510952
Training fold 0
Device used : cuda
epoch 0  | loss: 28.82134| val_0_rmspe: 2.09301 |  0:00:01s
epoch 50 | loss: 0.21477 | val_0_rmspe: 0.22172 |  0:01:44s
epoch 100| loss: 0.20395 | val_0_rmspe: 0.21281 |  0:03:28s

Early stopping occurred at epoch 135 with best_epoch = 85 and best_val_0_rmspe = 0.2122
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917011_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 38.20095| val_0_rmspe: 3.95067 |  0:00:02s
epoch 50 | loss: 0.21639 | val_0_rmspe: 0.22237 |  0:01:46s
epoch 100| loss: 0.20531 | val_0_rmspe: 0.21458 |  0:03:30s

Early stopping occurred at epoch 141 with best_epoch = 91 and best_val_0_rmspe = 0.21404
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917011_fold1.model.zip
Training fold 2
Device used : cuda
epoch 0  | loss: 55.20741| val

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917011   0.2131993        1       14   18      1     1.95         1            1          0.00          0.02761733      entmax   200     1   0.00010000  23:06 
[32m[I 2021-09-18 01:44:46,134][0m Trial 11 finished with value: 0.2131992503454295 and parameters: {'n_d': 14, 'n_a': 18, 'n_steps': 1, 'gamma': 1.9480579200097077, 'n_independent': 1, 'n_shared': 1, 'lambda_sparse': 0.0, 'learning_rate': 0.027617332064611547}. Best is trial 7 with value: 0.21297060056178208.[0m


OOF score across folds: 0.2131992503454295
Training fold 0
Device used : cuda
epoch 0  | loss: 290.98204| val_0_rmspe: 65.03957|  0:00:02s
epoch 50 | loss: 0.24802 | val_0_rmspe: 0.23898 |  0:02:06s
epoch 100| loss: 0.21958 | val_0_rmspe: 0.21666 |  0:04:11s
epoch 150| loss: 0.21228 | val_0_rmspe: 0.21575 |  0:06:15s
Stop training because you reached max_epochs = 200 with best_epoch = 179 and best_val_0_rmspe = 0.21239
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917012_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 201.87899| val_0_rmspe: 69.4562 |  0:00:02s
epoch 50 | loss: 0.23582 | val_0_rmspe: 0.23232 |  0:02:06s
epoch 100| loss: 0.21869 | val_0_rmspe: 0.22459 |  0:04:11s
epoch 150| loss: 0.20978 | val_0_rmspe: 0.21798 |  0:06:15s
Stop training because you reached max_epochs = 200 with best_epoch = 163 and best_val_0_rmspe = 0.21677
Best weights from best epoch are automatically used!
Suc

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917012   0.2146514        1       15   16      3     1.82         1            1          0.00          0.01671690      entmax   200     1   0.00010000  40:29 
[32m[I 2021-09-18 02:25:15,793][0m Trial 12 finished with value: 0.2146514000212205 and parameters: {'n_d': 15, 'n_a': 16, 'n_steps': 3, 'gamma': 1.8210391646540933, 'n_independent': 1, 'n_shared': 1, 'lambda_sparse': 0.001, 'learning_rate': 0.01671690048953213}. Best is trial 7 with value: 0.21297060056178208.[0m


OOF score across folds: 0.2146514000212205
Training fold 0
Device used : cuda
epoch 0  | loss: 305.97989| val_0_rmspe: 67.81943|  0:00:02s
epoch 50 | loss: 0.22017 | val_0_rmspe: 0.22099 |  0:02:06s
epoch 100| loss: 0.20821 | val_0_rmspe: 0.21384 |  0:04:10s
epoch 150| loss: 0.20296 | val_0_rmspe: 0.21389 |  0:06:13s

Early stopping occurred at epoch 177 with best_epoch = 127 and best_val_0_rmspe = 0.21181
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917013_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 342.63143| val_0_rmspe: 79.58143|  0:00:02s
epoch 50 | loss: 0.23448 | val_0_rmspe: 0.23214 |  0:02:07s
epoch 100| loss: 0.21825 | val_0_rmspe: 0.21997 |  0:04:12s
epoch 150| loss: 0.20648 | val_0_rmspe: 0.2179  |  0:06:15s

Early stopping occurred at epoch 153 with best_epoch = 103 and best_val_0_rmspe = 0.21769
Best weights from best epoch are automatically used!
Successfully saved model at /

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917013   0.2142939        1       18   18      3     1.99         1            1          0.00          0.01850070      entmax   200     1   0.00010000  37:02 
[32m[I 2021-09-18 03:02:18,275][0m Trial 13 finished with value: 0.21429393105156097 and parameters: {'n_d': 18, 'n_a': 18, 'n_steps': 3, 'gamma': 1.99396672975095, 'n_independent': 1, 'n_shared': 1, 'lambda_sparse': 0.001, 'learning_rate': 0.018500703044194815}. Best is trial 7 with value: 0.21297060056178208.[0m


OOF score across folds: 0.21429393105156097
Training fold 0
Device used : cuda
epoch 0  | loss: 123.9744| val_0_rmspe: 106.77523|  0:00:02s
epoch 50 | loss: 0.22713 | val_0_rmspe: 0.23016 |  0:02:02s
epoch 100| loss: 0.21193 | val_0_rmspe: 0.21458 |  0:04:03s
epoch 150| loss: 0.20677 | val_0_rmspe: 0.21216 |  0:06:03s
Stop training because you reached max_epochs = 200 with best_epoch = 170 and best_val_0_rmspe = 0.21194
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917014_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 175.58117| val_0_rmspe: 67.06066|  0:00:02s
epoch 50 | loss: 0.23789 | val_0_rmspe: 0.23155 |  0:02:03s
epoch 100| loss: 0.21551 | val_0_rmspe: 0.22265 |  0:04:04s
epoch 150| loss: 0.20677 | val_0_rmspe: 0.21561 |  0:06:04s
Stop training because you reached max_epochs = 200 with best_epoch = 182 and best_val_0_rmspe = 0.21527
Best weights from best epoch are automatically used!
Su

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917014   0.2142864        1       15   17      2     1.99         1            3          0.00          0.01421623      entmax   200     1   0.00010000  38:01 
[32m[I 2021-09-18 03:40:19,590][0m Trial 14 finished with value: 0.21428637866348973 and parameters: {'n_d': 15, 'n_a': 17, 'n_steps': 2, 'gamma': 1.9942425522109481, 'n_independent': 1, 'n_shared': 3, 'lambda_sparse': 0.0, 'learning_rate': 0.01421622830663111}. Best is trial 7 with value: 0.21297060056178208.[0m


OOF score across folds: 0.21428637866348973
Training fold 0
Device used : cuda
epoch 0  | loss: 68.68506| val_0_rmspe: 8.16448 |  0:00:02s
epoch 50 | loss: 0.21534 | val_0_rmspe: 0.21522 |  0:01:47s
epoch 100| loss: 0.20798 | val_0_rmspe: 0.21523 |  0:03:34s
epoch 150| loss: 0.20272 | val_0_rmspe: 0.21301 |  0:05:20s

Early stopping occurred at epoch 188 with best_epoch = 138 and best_val_0_rmspe = 0.211
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917015_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 63.66979| val_0_rmspe: 24.69537|  0:00:02s
epoch 50 | loss: 0.21419 | val_0_rmspe: 0.22482 |  0:01:49s
epoch 100| loss: 0.20547 | val_0_rmspe: 0.21613 |  0:03:35s

Early stopping occurred at epoch 130 with best_epoch = 80 and best_val_0_rmspe = 0.21451
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917015_fold1.model.zip
Train

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917015   0.2128403        1       14   14      1     1.94         1            2          0.00          0.02394244      entmax   200     1   0.00010000  30:02 
[32m[I 2021-09-18 04:10:22,509][0m Trial 15 finished with value: 0.21284028675292316 and parameters: {'n_d': 14, 'n_a': 14, 'n_steps': 1, 'gamma': 1.9372640832466164, 'n_independent': 1, 'n_shared': 2, 'lambda_sparse': 0.0, 'learning_rate': 0.023942436341719324}. Best is trial 15 with value: 0.21284028675292316.[0m


OOF score across folds: 0.21284028675292316
Training fold 0
Device used : cuda
epoch 0  | loss: 77.1076 | val_0_rmspe: 4.71387 |  0:00:02s
epoch 50 | loss: 0.21848 | val_0_rmspe: 0.21606 |  0:01:49s
epoch 100| loss: 0.20755 | val_0_rmspe: 0.21636 |  0:03:35s
epoch 150| loss: 0.20156 | val_0_rmspe: 0.2157  |  0:05:21s

Early stopping occurred at epoch 156 with best_epoch = 106 and best_val_0_rmspe = 0.21204
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917016_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 58.9196 | val_0_rmspe: 8.48058 |  0:00:02s
epoch 50 | loss: 0.2604  | val_0_rmspe: 0.25883 |  0:01:49s
epoch 100| loss: 0.20989 | val_0_rmspe: 0.22351 |  0:03:35s
epoch 150| loss: 0.20254 | val_0_rmspe: 0.22016 |  0:05:21s

Early stopping occurred at epoch 163 with best_epoch = 113 and best_val_0_rmspe = 0.21599
Best weights from best epoch are automatically used!
Successfully saved model at /h

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917016   0.2141119        1       14   14      1     1.93         1            2          0.00          0.02375626      entmax   200     1   0.00010000  28:46 
[32m[I 2021-09-18 04:39:08,921][0m Trial 16 finished with value: 0.21411189770218841 and parameters: {'n_d': 14, 'n_a': 14, 'n_steps': 1, 'gamma': 1.9313511975051108, 'n_independent': 1, 'n_shared': 2, 'lambda_sparse': 0.0, 'learning_rate': 0.023756264938729718}. Best is trial 15 with value: 0.21284028675292316.[0m


OOF score across folds: 0.21411189770218841
Training fold 0
Device used : cuda
epoch 0  | loss: 81.1144 | val_0_rmspe: 5.49105 |  0:00:02s
epoch 50 | loss: 0.22026 | val_0_rmspe: 0.22883 |  0:01:52s
epoch 100| loss: 0.21438 | val_0_rmspe: 0.21521 |  0:03:44s
epoch 150| loss: 0.20607 | val_0_rmspe: 0.21392 |  0:05:34s
Stop training because you reached max_epochs = 200 with best_epoch = 164 and best_val_0_rmspe = 0.21263
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917017_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 89.1035 | val_0_rmspe: 9.55241 |  0:00:02s
epoch 50 | loss: 0.23151 | val_0_rmspe: 0.2365  |  0:01:53s
epoch 100| loss: 0.21231 | val_0_rmspe: 0.21797 |  0:03:44s
epoch 150| loss: 0.20455 | val_0_rmspe: 0.22567 |  0:05:35s

Early stopping occurred at epoch 166 with best_epoch = 116 and best_val_0_rmspe = 0.21512
Best weights from best epoch are automatically used!
Successfully save

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917017   0.2143960        1       14   14      1     1.93         3            2          0.00          0.02274588      entmax   200     1   0.00010000  32:38 
[32m[I 2021-09-18 05:11:46,949][0m Trial 17 finished with value: 0.21439597685382844 and parameters: {'n_d': 14, 'n_a': 14, 'n_steps': 1, 'gamma': 1.9255856981300656, 'n_independent': 3, 'n_shared': 2, 'lambda_sparse': 0.0, 'learning_rate': 0.022745878639265898}. Best is trial 15 with value: 0.21284028675292316.[0m


OOF score across folds: 0.21439597685382844
Training fold 0
Device used : cuda
epoch 0  | loss: 79.59824| val_0_rmspe: 8.70163 |  0:00:02s
epoch 50 | loss: 0.23056 | val_0_rmspe: 0.23512 |  0:01:50s
epoch 100| loss: 0.21379 | val_0_rmspe: 0.22868 |  0:03:38s
epoch 150| loss: 0.20554 | val_0_rmspe: 0.21318 |  0:05:27s
Stop training because you reached max_epochs = 200 with best_epoch = 178 and best_val_0_rmspe = 0.21287
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917018_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 89.92384| val_0_rmspe: 17.00592|  0:00:02s
epoch 50 | loss: 0.43608 | val_0_rmspe: 0.44013 |  0:01:50s
epoch 100| loss: 0.28602 | val_0_rmspe: 0.24947 |  0:03:39s
epoch 150| loss: 0.22566 | val_0_rmspe: 0.22836 |  0:05:28s
Stop training because you reached max_epochs = 200 with best_epoch = 197 and best_val_0_rmspe = 0.21952
Best weights from best epoch are automatically used!
Succ

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917018   0.2184007        1       15   15      1     1.91         2            2          0.00          0.02108952      entmax   200     1   0.00010000  31:20 
[32m[I 2021-09-18 05:43:07,863][0m Trial 18 finished with value: 0.21840074899432663 and parameters: {'n_d': 15, 'n_a': 15, 'n_steps': 1, 'gamma': 1.9075223641081287, 'n_independent': 2, 'n_shared': 2, 'lambda_sparse': 0.0, 'learning_rate': 0.02108952181429731}. Best is trial 15 with value: 0.21284028675292316.[0m


OOF score across folds: 0.21840074899432663
Training fold 0
Device used : cuda
epoch 0  | loss: 92.75645| val_0_rmspe: 21.47523|  0:00:02s
epoch 50 | loss: 0.22476 | val_0_rmspe: 0.22164 |  0:01:49s
epoch 100| loss: 0.21091 | val_0_rmspe: 0.27168 |  0:03:35s
epoch 150| loss: 0.20533 | val_0_rmspe: 0.21256 |  0:05:22s
Stop training because you reached max_epochs = 200 with best_epoch = 150 and best_val_0_rmspe = 0.21256
Best weights from best epoch are automatically used!
Successfully saved model at /home/xuming/workspace/output/tabnet_917019_fold0.model.zip
Training fold 1
Device used : cuda
epoch 0  | loss: 128.02522| val_0_rmspe: 27.24875|  0:00:02s
epoch 50 | loss: 0.25322 | val_0_rmspe: 0.26272 |  0:01:49s
epoch 100| loss: 0.22582 | val_0_rmspe: 0.23343 |  0:03:35s
epoch 150| loss: 0.21345 | val_0_rmspe: 0.22004 |  0:05:22s
Stop training because you reached max_epochs = 200 with best_epoch = 189 and best_val_0_rmspe = 0.21815
Best weights from best epoch are automatically used!
Suc

 suffix  rmspe_score  cat_emb_dim  n_d  n_a  n_steps  gamma  n_independent  n_shared  lambda_sparse  optimizer_params  mask_type  T_0  T_mult eta_min     time
 917019   0.2153700        1       16   14      1     1.94         1            2          0.00          0.01076705      entmax   200     1   0.00010000  33:08 
[32m[I 2021-09-18 06:16:16,644][0m Trial 19 finished with value: 0.2153699706954589 and parameters: {'n_d': 16, 'n_a': 14, 'n_steps': 1, 'gamma': 1.9399151944545263, 'n_independent': 1, 'n_shared': 2, 'lambda_sparse': 0.0, 'learning_rate': 0.010767050638480795}. Best is trial 15 with value: 0.21284028675292316.[0m


OOF score across folds: 0.2153699706954589
