In [1]:
import cudf as gd
import pandas as pd
import numpy as np
import math
import xgboost as xgb
from functools import partial
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Memory of the GPU in GB; change it to match your card.
# It is used to work out how many test rows are skipped when loading on the GPU.
GPU_MEMORY = 32 # GB. please change it accordingly

In [3]:
TEST_ROWS = 453653104 # number of rows in test data
# Nothing is skipped when the GPU has 32 GB of memory (or more);
# otherwise rows are skipped proportionally to the missing memory.
# The max(0, ...) clamp keeps SKIP_ROWS from going negative when
# GPU_MEMORY is larger than 32, which would yield an invalid skiprows value.
SKIP_ROWS = max(0, int((1 - GPU_MEMORY/32.0)*TEST_ROWS))

# Functions

In [4]:
from timeit import default_timer

class Timer(object):
    """Wall-clock timer that can be driven manually or used as a context manager.

    After ``stop()`` (or on leaving a ``with`` block) the elapsed time in
    seconds is available as ``interval``; the raw timestamps are kept in
    ``start_time`` and ``end``. The same instance can be reused any number
    of times.
    """

    def __init__(self):
        self._timer = default_timer
        self.start_time = None
        self.end = None
        self.interval = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Start the timer."""
        # Stored as `start_time`, not `start`: assigning to `self.start`
        # would shadow this method with a float and break every later
        # start() call (including re-entering the context manager).
        self.start_time = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds.

        Raises:
            RuntimeError: if the timer was never started.
        """
        if self.start_time is None:
            raise RuntimeError("Timer.stop() called before Timer.start()")
        self.end = self._timer()
        self.interval = self.end - self.start_time

In [5]:
def multi_weighted_logloss(y_true, y_preds, classes, class_weights):
    """
    Class-weighted multi-class log loss for the PLAsTiCC challenge.

    Refactored from the version by
    @author olivier https://www.kaggle.com/ogrellier

    y_true        : 1-D array of labels, one per sample.
    y_preds       : predicted probabilities; reshaped in column-major order
                    to (len(y_true), len(classes)).
    classes       : sequence whose length gives the number of classes.
    class_weights : dict of class -> weight; weights are applied in sorted
                    key order.

    Returns the weighted loss as a scalar.

    Class 99 is not handled here: it does not occur in the training set and
    gets a special treatment elsewhere.
    NOTE(review): the one-hot columns are derived from the values present in
    y_true, so presumably every class has to occur in y_true for the shapes
    to line up -- confirm.
    """
    n_samples = y_true.shape[0]
    n_classes = len(classes)
    probs = y_preds.reshape(n_samples, n_classes, order='F')
    # Keep probabilities inside [1e-15, 1 - 1e-15] so the log stays finite
    probs = np.clip(a=probs, a_min=1e-15, a_max=1 - 1e-15)

    # One-hot encode the labels; .values drops the DataFrame index
    one_hot = pd.get_dummies(y_true)
    log_sum_per_class = (one_hot.values * np.log(probs)).sum(axis=0)
    # Number of positive samples for each class
    count_per_class = one_hot.sum(axis=0).values.astype(float)

    # Weighted average of the per-class mean log-probabilities
    weights = np.array([class_weights[key] for key in sorted(class_weights.keys())])
    weighted_terms = log_sum_per_class * weights / count_per_class
    return - np.sum(weighted_terms) / np.sum(weights)

def xgb_multi_weighted_logloss(y_predicted, y_true, classes, class_weights):
    """Adapter around multi_weighted_logloss for use as a custom eval function.

    y_predicted : predictions forwarded unchanged to multi_weighted_logloss.
    y_true      : object exposing get_label(), which supplies the true labels.

    Returns the pair ('wloss', loss value).
    """
    labels = y_true.get_label()
    wloss = multi_weighted_logloss(labels, y_predicted, classes, class_weights)
    return 'wloss', wloss

In [6]:
def ravel_column_names(cols):
    """Flatten a two-level column index into a list of '<level0>_<level1>' names."""
    flat_names = []
    for top, sub in zip(cols.get_level_values(0), cols.get_level_values(1)):
        flat_names.append("%s_%s"%(top,sub))
    return flat_names
    
def etl_cpu(df,df_meta):
    """Build per-object features with pandas and attach them to the metadata.

    df      : time-series frame with columns object_id, mjd, passband, flux,
              flux_err and detected. Two helper columns ('flux_ratio_sq' and
              'flux_by_flux_ratio_sq') are added to it in place.
    df_meta : metadata frame keyed by object_id; it is not modified.

    Returns the metadata without ra/decl/gal_l/gal_b, left-joined with the
    aggregated features on object_id.
    """
    # Row-level helper columns (written straight into the caller's frame)
    df['flux_ratio_sq'] = np.power(df['flux'].div(df['flux_err']), 2.0)
    df['flux_by_flux_ratio_sq'] = df['flux'].mul(df['flux_ratio_sq'])

    stats_per_column = {
        'passband': ['mean'],
        'flux': ['min', 'max', 'mean'],
        'flux_err': ['min', 'max', 'mean'],
        'detected': ['mean'],
        'mjd':['max','min'],
        'flux_ratio_sq':['sum'],
        'flux_by_flux_ratio_sq':['sum'],
    }
    features = df.groupby('object_id').agg(stats_per_column)
    # Flatten the (column, stat) MultiIndex into 'column_stat' names
    features.columns = ravel_column_names(features.columns)

    # Derived features; the insertion order fixes the final column order
    flux_range = features['flux_max'] - features['flux_min']
    features['flux_diff'] = flux_range
    features['flux_dif2'] = flux_range / features['flux_mean']
    features['flux_w_mean'] = features['flux_by_flux_ratio_sq_sum'] / features['flux_ratio_sq_sum']
    features['flux_dif3'] = flux_range / features['flux_w_mean']

    features['mjd_diff'] = features['mjd_max'] - features['mjd_min']
    features = features.drop(['mjd_max','mjd_min'],axis=1).reset_index()

    meta_trimmed = df_meta.drop(['ra','decl','gal_l','gal_b'],axis=1)
    return meta_trimmed.merge(features,on='object_id',how='left')

In [7]:
# Each source column is dropped from `df` as soon as all of its aggregations
# are done. This costs a little speed but keeps GPU memory usage down and
# avoids GPU OOM.
def groupby_aggs(df,aggs,col):
    """Run the aggregations in `aggs` one at a time, grouped by `col`.

    df   : frame to aggregate; every column named in `aggs` is removed from
           it in place via drop_column.
    aggs : dict mapping a column name to a list of aggregation names.
    col  : name of the grouping key.

    Returns the individual results left-merged on `col`, or None when `aggs`
    is empty.
    """
    merged = None
    for column, funcs in aggs.items():
        for func in funcs:
            piece = df.groupby(col).agg({column:[func]})
            merged = piece if merged is None else merged.merge(piece,on=[col],how='left')
        # all aggregations for this column are done; release it
        df.drop_column(column)
    return merged

def etl_gpu(df,df_meta):
    """GPU counterpart of etl_cpu: per-object features joined to the metadata.

    df      : time-series frame; its columns are consumed (dropped in place)
              by groupby_aggs as the aggregations proceed.
    df_meta : metadata frame; ra/decl/gal_l/gal_b are dropped from it in place.

    Returns the trimmed metadata left-joined with the aggregated features on
    object_id.
    """
    first_pass = {
        'passband': ['mean'],
        'detected': ['mean'],
        'mjd':['max','min'],
    }
    features = groupby_aggs(df,first_pass,'object_id')
    # groupby_aggs has removed 'passband', 'detected' and 'mjd' from df by now

    # Squared flux / flux_err ratio and its product with flux; no named
    # temporaries are kept for the row-level data.
    df['flux_ratio_sq'] = (df['flux'] / df['flux_err']).applymap(lambda v: math.pow(v,2))
    df['flux_by_flux_ratio_sq'] = df['flux'] * df['flux_ratio_sq']

    second_pass = {
        'flux_ratio_sq':['sum'],
        'flux_by_flux_ratio_sq':['sum'],
        'flux': ['min', 'max', 'mean'],
        'flux_err': ['min', 'max', 'mean'],
    }
    features = features.merge(groupby_aggs(df,second_pass,'object_id'),
                              on=['object_id'],how='left')

    # Derived features; the insertion order fixes the final column order
    flux_range = features['max_flux'] - features['min_flux']
    features['flux_diff'] = flux_range
    features['flux_dif2'] = flux_range / features['mean_flux']
    features['flux_w_mean'] = features['sum_flux_by_flux_ratio_sq'] / features['sum_flux_ratio_sq']
    features['flux_dif3'] = flux_range / features['flux_w_mean']

    features['mjd_diff'] = features['max_mjd'] - features['min_mjd']
    for used in ('max_mjd','min_mjd'):
        features.drop_column(used)

    for unused in ['ra','decl','gal_l','gal_b']:
        df_meta.drop_column(unused)

    return df_meta.merge(features,on=['object_id'],how='left')

# Load data

In [8]:
%%time
# read data on cpu
# Load the test and training time-series CSVs and their metadata CSVs into
# pandas DataFrames; these feed the CPU ETL (etl_cpu).
test = pd.read_csv('../input/test_set.csv')
test_meta = pd.read_csv('../input/test_set_metadata.csv')

train = pd.read_csv('../input/training_set.csv')
train_meta = pd.read_csv('../input/training_set_metadata.csv')

CPU times: user 4min 40s, sys: 47.5 s, total: 5min 27s
Wall time: 4min


In [9]:
%%time
# read data on gpu
# Column names and dtypes are passed explicitly; ts_dtypes has one entry per
# name in ts_cols, in the same order.
ts_cols = ['object_id', 'mjd', 'passband', 'flux', 'flux_err', 'detected']
ts_dtypes = ['int32', 'float32', 'int32', 'float32','float32','int32']

# skiprows = 1 header line + SKIP_ROWS leading data rows (test set only)
test_gd = gd.read_csv('../input/test_set.csv',
            names=ts_cols,dtype=ts_dtypes,skiprows=1+SKIP_ROWS) # skip the header
train_gd = gd.read_csv('../input/training_set.csv',
            names=ts_cols,dtype=ts_dtypes,skiprows=1)

# Metadata columns; dtypes is built to line up with cols entry by entry.
cols = ['object_id', 'ra', 'decl', 'gal_l', 'gal_b', 'ddf',
       'hostgal_specz', 'hostgal_photoz', 'hostgal_photoz_err', 
       'distmod','mwebv', 'target']
dtypes = ['int32']+['float32']*4+['int32']+['float32']*5+['int32']

train_meta_gd = gd.read_csv('../input/training_set_metadata.csv',
            names=cols,dtype=dtypes,skiprows=1)
# Remove the trailing 'target' entry (and its dtype) in place before reading
# the test metadata -- presumably because that file has no target column.
del cols[-1],dtypes[-1]
test_meta_gd = gd.read_csv('../input/test_set_metadata.csv',
            names=cols,dtype=dtypes,skiprows=1)

CPU times: user 20.2 s, sys: 7.06 s, total: 27.2 s
Wall time: 21.7 s


# ETL

In [10]:
%%time
# CPU: build the training feature table with pandas.
# etl_cpu also adds two helper columns to `train` in place.
train_final = etl_cpu(train,train_meta)

CPU times: user 9.09 s, sys: 148 ms, total: 9.24 s
Wall time: 233 ms


In [11]:
%%time
# GPU: build the training feature table with cudf.
# etl_gpu drops columns from train_gd and train_meta_gd in place.
train_final_gd = etl_gpu(train_gd,train_meta_gd)

1:int32
2:int32
3:int32
1:int32
1:int32
2:int32
3:float32
1:int32
1:int32
1:float32
2:int32
3:float32
1:float32
2:int32
3:float32
1:float32
1:float32
2:int32
3:float32
1:float32
1:float32
1:float32
2:int32
3:float32
1:float32
1:float32
1:float32
1:float32
2:int32
3:float32
1:float32
1:float32
1:float32
1:float32
1:float32
2:int32
3:float32
1:float32
1:float32
1:float32
1:float32
1:float32
1:float32
2:int32
3:float32
1:float32
1:float32
1:float32
1:float32
1:float32
1:float32
1:float32
2:int32
3:float32
1:int32
1:int32
1:float32
1:float32
2:int32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
1:int32
1:float32
1:float32
1:float32
1:float32
1:float32
1:int32
2:int32
3:int32
3:int32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
CPU times: user 1.2 s, sys: 224 ms, total: 1.42 s
Wall time: 1.05 s


In [12]:
%%time
# CPU: build the test feature table with pandas.
# etl_cpu also adds two helper columns to `test` in place.
test_final = etl_cpu(test,test_meta)

CPU times: user 4min 1s, sys: 2min 24s, total: 6min 26s
Wall time: 1min 58s


In [13]:
%%time
# GPU: build the test feature table with cudf.
# etl_gpu drops columns from test_gd and test_meta_gd in place.
test_final_gd = etl_gpu(test_gd,test_meta_gd)

1:int32
2:int32
3:int32
1:int32
1:int32
2:int32
3:float32
1:int32
1:int32
1:float32
2:int32
3:float32
1:float32
2:int32
3:float32
1:float32
1:float32
2:int32
3:float32
1:float32
1:float32
1:float32
2:int32
3:float32
1:float32
1:float32
1:float32
1:float32
2:int32
3:float32
1:float32
1:float32
1:float32
1:float32
1:float32
2:int32
3:float32
1:float32
1:float32
1:float32
1:float32
1:float32
1:float32
2:int32
3:float32
1:float32
1:float32
1:float32
1:float32
1:float32
1:float32
1:float32
2:int32
3:float32
1:int32
1:int32
1:float32
1:float32
2:int32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
1:int32
1:float32
1:float32
1:float32
1:float32
1:float32
2:int32
3:int32
3:int32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
3:float32
CPU times: user 3.84 s, sys: 3.11 s, total: 6.96 s
Wall time: 12.7 s


# Train and validation

In [14]:
# CPU
# Feature matrices: everything except the id (and the target for training data)
X = train_final.drop(['object_id','target'],axis=1).values
y = train_final['target']
Xt = test_final.drop(['object_id'],axis=1).values
assert X.shape[1] == Xt.shape[1]
classes = sorted(y.unique())
# Taken from Giba's topic : https://www.kaggle.com/titericz
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194
# with Kyle Boone's post https://www.kaggle.com/kyleboone
# Every class has weight 1 except classes 64 and 15, which have weight 2
class_weights = {c: 1 for c in classes}
class_weights.update({c:2 for c in [64, 15]})

# Map the original class ids to consecutive integer labels 0..n_classes-1
lbl = LabelEncoder()
y = lbl.fit_transform(y)
print(lbl.classes_)

# Fixed random_state: without it the split, and therefore the reported
# validation loss, changes on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,stratify=y,
                                                    random_state=42)

[ 6 15 16 42 52 53 62 64 65 67 88 90 92 95]


In [15]:
# XGBoost parameters for the CPU run (histogram tree method, 14-class softprob)
cpu_params = dict(
    objective='multi:softprob',
    tree_method='hist',
    nthread=16,
    num_class=14,
    max_depth=7,
    silent=1,
    subsample=0.7,
    colsample_bytree=0.7,
)

In [16]:
# Pre-bind classes and class_weights so that only the predictions and the
# labelled matrix remain as arguments; func_loss is passed as `feval` to
# xgb.train in the training cells.
func_loss = partial(xgb_multi_weighted_logloss, 
                        classes=classes, 
                        class_weights=class_weights)

In [17]:
%%time
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalid = xgb.DMatrix(data=X_test, label=y_test)
dtest = xgb.DMatrix(data=Xt)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
clf = xgb.train(cpu_params, dtrain=dtrain,
                num_boost_round=50,evals=watchlist,
                feval=func_loss,early_stopping_rounds=10,
                verbose_eval=1000)
yp = clf.predict(dvalid)
loss = multi_weighted_logloss(y_test, yp, classes, class_weights)
ysub = clf.predict(dtest)
print('validation loss %.4f'%loss)

[14:18:10] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	eval-merror:0.332484	train-merror:0.27637	eval-wloss:1.98637	train-wloss:1.85982
Multiple eval metrics have been passed: 'train-wloss' will be used for early stopping.

Will train until train-wloss hasn't improved in 10 rounds.
[49]	eval-merror:0.273885	train-merror:0.00538	eval-wloss:1.26831	train-wloss:0.131698
validation loss 1.2683
CPU times: user 5min 31s, sys: 1.98 s, total: 5min 33s
Wall time: 21.6 s


In [18]:
# GPU
# Pull the target off the GPU frame and re-encode it with the same
# LabelEncoder instance (it is re-fitted here).
y = train_final_gd['target'].to_array()
y = lbl.fit_transform(y)
# NOTE(review): columns are dropped from the frames in place, so re-running
# this cell on the same frames will likely fail -- confirm.
for col in ['object_id','target']:
    train_final_gd.drop_column(col)
# Replace missing values with 0 and cast every feature to float32
for col in train_final_gd.columns:
    train_final_gd[col] = train_final_gd[col].fillna(0).astype('float32')
X = train_final_gd.as_matrix()

# Same treatment for the test features (no target column to drop here)
for col in ['object_id']:
    test_final_gd.drop_column(col)
for col in test_final_gd.columns:
    test_final_gd[col] = test_final_gd[col].fillna(0).astype('float32')
Xt = test_final_gd.as_matrix()

In [19]:
# Stratified 90/10 split of the GPU-built features. The fixed random_state
# makes the split, and therefore the reported validation loss, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,stratify=y,
                                                    random_state=42)

In [20]:
# GPU
# Same settings as the CPU run, with the objective pinned to multi:softprob
# and the tree method switched to the GPU histogram implementation.
gpu_params = {**cpu_params,
              'objective': 'multi:softprob',
              'tree_method': 'gpu_hist'}

In [21]:
%%time
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dvalid = xgb.DMatrix(data=X_test, label=y_test)
dtest = xgb.DMatrix(data=Xt)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
clf = xgb.train(gpu_params, dtrain=dtrain,
                num_boost_round=50,evals=watchlist,
                feval=func_loss,early_stopping_rounds=10,
                verbose_eval=1000)
yp = clf.predict(dvalid)
loss = multi_weighted_logloss(y_test, yp, classes, class_weights)
ysub = clf.predict(dtest)
print('validation loss %.4f'%loss)

[0]	eval-merror:0.323567	train-merror:0.276795	eval-wloss:2.00989	train-wloss:1.94981
Multiple eval metrics have been passed: 'train-wloss' will be used for early stopping.

Will train until train-wloss hasn't improved in 10 rounds.
[49]	eval-merror:0.26879	train-merror:0.008637	eval-wloss:1.12085	train-wloss:0.150667
validation loss 1.1208
CPU times: user 1min 12s, sys: 1.44 s, total: 1min 13s
Wall time: 5.86 s
