In [39]:
import pandas as pd
import numpy as np
np.random.seed(2018)

from tqdm import tqdm

from sklearn import preprocessing
from sklearn.model_selection import KFold

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import mean_absolute_error

In [27]:
# Read the training table; the path is relative to the notebook's working directory.
train = pd.read_csv(filepath_or_buffer='../input/train.csv')

In [28]:
# Integer-encode every object-dtype column of `train` into a sibling column
# named '<original name>_id'; the original columns are left in place.
cat_feats = train.select_dtypes(include=["object"]).columns

for col_name in tqdm(cat_feats):
    encoded = pd.factorize(train[col_name])  # (codes, uniques) pair
    train[col_name + '_id'] = encoded[0]

100%|██████████| 116/116 [00:01<00:00, 103.43it/s]


In [29]:
# Collect the model inputs by column name:
#   num_feats - columns whose name contains 'cont'
#   id_feats  - columns whose name contains '_id' (the integer-encoded categoricals)
num_feats, id_feats = [], []
for column in train.columns:
    if 'cont' in column:
        num_feats.append(column)
    if '_id' in column:
        id_feats.append(column)

# Feature matrix (numeric columns first, then encoded ones) and target vector.
selected_columns = num_feats + id_feats
X = train[selected_columns].values
y = train['loss'].values

## Modeling

In [7]:
# Baseline tree models, scored with 3-fold cross-validated MAE on the raw target.
# random_state is pinned on each estimator so the scores do not depend on the
# state of NumPy's global RNG (i.e. on which cells happened to run before this
# one); re-running the cell now reproduces the same numbers.
models = [
    ('dt', DecisionTreeRegressor(max_depth=10, random_state=2018)),
    ('rf', RandomForestRegressor(max_depth=10, n_estimators=20, random_state=2018)),
]

kf = KFold(n_splits=3, shuffle = True, random_state = 2018)

for model_name, model in models:
    scores = []
    for train_idx, test_idx in kf.split(X):
        # the same estimator object is re-fitted on every fold
        model.fit(X[train_idx], y[train_idx])
        y_pred = model.predict(X[test_idx])

        score = mean_absolute_error(y[test_idx], y_pred)
        scores.append(score)

    # mean and standard deviation of the per-fold MAE
    print(model_name, np.mean(scores), np.std(scores) )

dt 1362.8732376665685 1.7448180097867887
rf 1317.5370763126657 2.540738629807585


## XGBoost

In [20]:
# XGBoost regressor used for the raw-target experiment.
model = xgb.XGBRegressor(
    objective='reg:linear',
    # boosting schedule
    n_estimators=20,
    learning_rate=0.2,
    # tree shape
    max_depth=12,
    # row / column subsampling
    subsample=0.7,
    colsample_bytree=0.6,
    # threading and seeding
    nthread=-1,
    seed=2018,
)

In [9]:
# 3-fold cross-validation of the XGBoost `model` on the raw target.
# Training and validation MAE are logged during fitting via eval_set.
kf = KFold(n_splits=3, shuffle = True, random_state = 2018)
scores = []
for num_iter, (train_idx, test_idx) in enumerate(kf.split(X)):

    model.fit(X[train_idx], y[train_idx],
       eval_metric='mae',
       eval_set=[(X[train_idx], y[train_idx]), (X[test_idx], y[test_idx])],
       verbose=20)

    y_pred = model.predict(X[test_idx])
    # negative predictions are floored at zero before scoring
    y_pred[y_pred<0] = 0

    score = mean_absolute_error(y[test_idx], y_pred)
    print("Fold{0}, score={1}".format(num_iter+1, score))
    scores.append(score)

# Summarise the per-fold scores (mean, std) so this cell reports an aggregate
# in the same way as the other cross-validation cells in this notebook.
np.mean(scores), np.std(scores)

[0]	validation_0-mae:2456.46	validation_1-mae:2426.53
[19]	validation_0-mae:995.507	validation_1-mae:1214.37
Fold1, score=1214.3632879701288
[0]	validation_0-mae:2439.77	validation_1-mae:2464.2
[19]	validation_0-mae:971.745	validation_1-mae:1233.69
Fold2, score=1233.6858121375053
[0]	validation_0-mae:2443.05	validation_1-mae:2457.38
[19]	validation_0-mae:976.681	validation_1-mae:1230.65
Fold3, score=1230.6541292733866


## Log transformation

Let's apply a log transformation to the target variable.

In [31]:
# Same baseline models, but fitted on log(loss); predictions are mapped back
# with exp() so the MAE is still measured on the original scale.
# random_state is pinned on each estimator so the scores do not depend on the
# state of NumPy's global RNG (i.e. on cell execution order).
models = [
    ('dt', DecisionTreeRegressor(max_depth=10, random_state=2018)),
    ('rf', RandomForestRegressor(max_depth=10, n_estimators=20, random_state=2018)),
]

kf = KFold(n_splits=3, shuffle = True, random_state = 2018)

# offset is added before the log and subtracted again after the exp.
# With offset = 0 every target value must be strictly positive, otherwise
# np.log yields -inf / nan.
offset = 0
y_log = np.log(y + offset)
for model_name, model in models:
    scores = []
    for train_idx, test_idx in kf.split(X):
        model.fit(X[train_idx], y_log[train_idx])
        y_pred_log = model.predict(X[test_idx])
        # invert the transformation applied to the target
        y_pred = np.exp(y_pred_log) - offset

        score = mean_absolute_error(y[test_idx], y_pred)
        scores.append(score)

    # mean and standard deviation of the per-fold MAE
    print(model_name, np.mean(scores), np.std(scores) )

dt 1306.4424390640017 7.353250529878342
rf 1263.2329451162102 6.530534772533145


## XGBoost + log(loss)

In [35]:
# XGBoost regressor used for the log(loss) experiment.
model = xgb.XGBRegressor(
    objective='reg:linear',
    # boosting schedule
    n_estimators=20,
    learning_rate=0.3,
    # tree shape
    max_depth=12,
    # row / column subsampling
    subsample=0.7,
    colsample_bytree=0.6,
    # threading and seeding
    nthread=-1,
    seed=2018,
)

In [37]:
# 3-fold cross-validation of the XGBoost `model` fitted on log(loss).
# The MAE logged during fitting (eval_set) is measured in log space; the
# per-fold `score` printed below is measured on the original scale.
kf = KFold(n_splits=3, shuffle = True, random_state = 2018)
scores = []

y_log = np.log(y)
for num_iter, (train_idx, test_idx) in enumerate(kf.split(X)):
    model.fit(X[train_idx], y_log[train_idx],
       eval_metric='mae',
       eval_set=[(X[train_idx], y_log[train_idx]), (X[test_idx], y_log[test_idx])],
       verbose=5)

    y_pred_log = model.predict(X[test_idx])
    # Map predictions back to the original scale. exp() never returns a
    # negative number, so no clamping at zero is needed here.
    y_pred = np.exp( y_pred_log )

    score = mean_absolute_error(y[test_idx], y_pred)
    print("Fold{0}, score={1}".format(num_iter+1, score))
    scores.append(score)

# mean and standard deviation of the per-fold MAE
np.mean(scores), np.std(scores)

[0]	validation_0-mae:5.75178	validation_1-mae:5.7428
[5]	validation_0-mae:1.88924	validation_1-mae:1.88523
[10]	validation_0-mae:0.693799	validation_1-mae:0.706503
[15]	validation_0-mae:0.437333	validation_1-mae:0.472321
[19]	validation_0-mae:0.392509	validation_1-mae:0.439671
Fold1, score=1214.0610349104584
[0]	validation_0-mae:5.74742	validation_1-mae:5.75233
[5]	validation_0-mae:1.88847	validation_1-mae:1.89159
[10]	validation_0-mae:0.692816	validation_1-mae:0.708938
[15]	validation_0-mae:0.436444	validation_1-mae:0.473941
[19]	validation_0-mae:0.391495	validation_1-mae:0.441838
Fold2, score=1237.98094127552
[0]	validation_0-mae:5.74817	validation_1-mae:5.75204
[5]	validation_0-mae:1.88895	validation_1-mae:1.89127
[10]	validation_0-mae:0.694181	validation_1-mae:0.709357
[15]	validation_0-mae:0.437371	validation_1-mae:0.47301
[19]	validation_0-mae:0.393367	validation_1-mae:0.440859
Fold3, score=1231.2871999216247


(1227.7763920358677, 10.075872472937519)

## LightGBM

In [60]:
# LightGBM settings: gradient-boosted trees with an L1 objective, reporting MAE.
lgb_params = dict(
    learning_rate=0.35,
    boosting='gbdt',
    objective='regression_l1',
    metric='mae',
    max_depth=12,
    feature_fraction=0.9,
    bagging_fraction=0.75,
    num_leaves=31,
    bagging_freq=1,
    min_data_per_leaf=250,
)

# Same feature columns as X, passed as a DataFrame, with the raw target.
lgb_train = lgb.Dataset(train[num_feats + id_feats], label=y)

# 3-fold, non-stratified CV: up to 100 rounds, early stopping after 15 rounds,
# progress reported every 10 rounds.
cv = lgb.cv(
    params=lgb_params,
    train_set=lgb_train,
    num_boost_round=100,
    nfold=3,
    stratified=False,
    early_stopping_rounds=15,
    verbose_eval=10,
)

[10]	cv_agg's l1: 1546.51 + 7.70168
[20]	cv_agg's l1: 1373.26 + 9.42364
[30]	cv_agg's l1: 1324.17 + 8.54692
[40]	cv_agg's l1: 1300.88 + 8.25568
[50]	cv_agg's l1: 1285.73 + 7.70553
[60]	cv_agg's l1: 1277.55 + 6.91272
[70]	cv_agg's l1: 1273.5 + 6.30497
[80]	cv_agg's l1: 1270.39 + 5.28638
[90]	cv_agg's l1: 1268.54 + 5.12904
[100]	cv_agg's l1: 1269.28 + 5.34698


## Advanced objective function: [Kaggle discussion thread](https://www.kaggle.com/c/allstate-claims-severity/discussion/24520)

![](../images/ln_cosh.png)