### Load libraries

In [46]:
library(xgboost)
library(data.table)
library(Metrics)

### Load Data
To simplify, let's read only `train.csv`

In [63]:
# Read the training file, keep the target column aside, then remove the
# identifier and target columns from the table so only predictors remain.
train <- fread("../input/train.csv", showProgress = TRUE)
y_train <- train[["loss"]]
set(train, j = c("id", "loss"), value = NULL)

### Encode categorical variables as integers (label encoding, not one-hot)

In [37]:
# Work on a real copy: plain assignment of a data.table only creates another
# reference to the same table, so `train` would otherwise be at risk of being
# modified by the in-place updates below.
df <- copy(train)

# `id` and `loss` were already dropped from `train`, so every remaining column
# is treated as a feature.
features <- names(df)

# Replace each character column by integer codes. Codes follow the sorted
# order of the distinct values found in that column (1 = first level).
for (f in features) {
  if (is.character(df[[f]])) {
    lvls <- sort(unique(df[[f]]))
    set(df, j = f, value = as.integer(factor(df[[f]], levels = lvls)))
  }
}

## train model

In [68]:
# Fix the RNG seed before fitting.
set.seed(2017)

# Bundle the encoded feature matrix and the target for xgboost.
dtrain <- xgb.DMatrix(data = as.matrix(df), label = y_train)

# Booster settings: regression objective, MAE as the reported metric,
# trees up to depth 10.
# NOTE(review): newer xgboost releases appear to treat "reg:linear" as a
# deprecated alias - confirm against the installed version.
xgb_params <- list(
  objective = "reg:linear",
  eval_metric = "mae",
  max_depth = 10
)

# Fit 30 boosting rounds on the full training matrix.
model <- xgb.train(
  params = xgb_params,
  data = dtrain,
  nrounds = 30,
  maximize = FALSE
)

## predict model

In [69]:
# Score the same matrix the model was fitted on and report the MAE against
# the training target (an in-sample figure, not a validation score).
y_pred <- predict(model, newdata = dtrain)
mae(actual = y_train, predicted = y_pred)

## Any issues?

### Tips
- validation
- categorical variables

###  k-fold Cross Validation

In [70]:
# 3-fold cross-validation over 30 rounds with the current `xgb_params`;
# `prediction = TRUE` asks xgb.cv to return predictions as well, and
# `verbose = 1` prints the per-round train/test metric.
res <- xgb.cv(
  params = xgb_params,
  data = dtrain,
  nrounds = 30,
  nfold = 3,
  prediction = TRUE,
  verbose = 1
)

[1]	train-mae:2153.059245+7.470566	test-mae:2158.650635+19.567839 
[2]	train-mae:1637.004964+6.762716	test-mae:1656.250366+17.548189 
[3]	train-mae:1375.164388+7.881921	test-mae:1412.830566+12.958487 
[4]	train-mae:1249.742432+9.041052	test-mae:1304.854492+10.572649 
[5]	train-mae:1185.916870+9.486604	test-mae:1258.568441+8.812283 
[6]	train-mae:1153.010539+9.291867	test-mae:1241.483805+9.637509 
[7]	train-mae:1133.609660+10.127964	test-mae:1234.950033+8.126698 
[8]	train-mae:1117.499715+10.801563	test-mae:1231.734416+6.997436 
[9]	train-mae:1104.506266+10.648119	test-mae:1230.018311+5.752851 
[10]	train-mae:1094.974528+8.493094	test-mae:1229.443156+6.860485 
[11]	train-mae:1085.386352+9.017015	test-mae:1228.035238+6.789870 
[12]	train-mae:1075.641561+9.897941	test-mae:1227.096436+5.976828 
[13]	train-mae:1067.792277+9.417031	test-mae:1225.805990+5.954158 
[14]	train-mae:1057.835246+8.166014	test-mae:1224.805827+6.278466 
[15]	train-mae:1048.299113+8.388607	test-mae:1223.641276+6.30605

## Any issues?

### Tips
- compare train-mae vs test-mae

Let's decrease `max_depth` from 10 to 5

In [81]:
# Same settings as before, but with shallower trees (depth 5 instead of 10).
xgb_params <- list(
  objective = "reg:linear",
  eval_metric = "mae",
  max_depth = 5
)

# Re-run the 3-fold cross-validation so the train/test MAE gap can be
# compared with the depth-10 run.
res <- xgb.cv(
  params = xgb_params,
  data = dtrain,
  nrounds = 30,
  nfold = 3,
  prediction = TRUE,
  verbose = 1
)

[1]	train-mae:2158.451172+3.542729	test-mae:2159.255208+8.664711 
[2]	train-mae:1680.657104+1.212815	test-mae:1683.335164+9.322993 
[3]	train-mae:1454.712931+0.562066	test-mae:1458.127319+7.882483 
[4]	train-mae:1359.080241+2.006828	test-mae:1364.622721+8.459649 
[5]	train-mae:1319.730754+0.707579	test-mae:1326.753255+5.642203 
[6]	train-mae:1298.236125+3.333178	test-mae:1307.390340+7.172061 
[7]	train-mae:1287.257528+0.949397	test-mae:1297.876058+4.524106 
[8]	train-mae:1280.598796+2.008944	test-mae:1292.802531+1.723555 
[9]	train-mae:1273.578532+1.936364	test-mae:1287.634359+1.978019 
[10]	train-mae:1266.206380+4.255002	test-mae:1281.524170+0.973047 
[11]	train-mae:1258.862468+3.048014	test-mae:1275.310873+1.915191 
[12]	train-mae:1252.787761+3.281192	test-mae:1270.040487+1.354098 
[13]	train-mae:1246.876221+3.881155	test-mae:1265.515625+2.816074 
[14]	train-mae:1242.045776+4.051238	test-mae:1261.715373+3.079416 
[15]	train-mae:1236.667765+3.425280	test-mae:1257.236410+2.521531 
[16]

### Play around with:
- eta
- subsample (0..1)
- colsample_bytree (0..1)

## Play around with target variable

## Links
* [xgboost docs](http://xgboost.readthedocs.io/en/latest/R-package/xgboostPresentation.html)
* [tutorial #1](https://www.analyticsvidhya.com/blog/2016/01/xgboost-algorithm-easy-steps/), [tutorial #2](https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/)
* [xgbfi](https://github.com/Far0n/xgbfi) is worth checking out
 