In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn import preprocessing
from sklearn.cross_validation import KFold

from sklearn.metrics import mean_absolute_error

%matplotlib inline



In [2]:
# Read the training set into a DataFrame; 'train.csv' is resolved relative to
# the notebook's current working directory.
train = pd.read_csv('train.csv')

In [3]:
# Every object-dtype (string) column gets an integer-coded companion column
# named '<column>_id'. The original columns are kept; the column list is
# captured before the loop, so the newly added columns are not re-encoded.
cat_feats = train.select_dtypes(include=["object"]).columns

for feat in cat_feats:
    # A fresh encoder per column: codes are only meaningful within one column.
    encoder = preprocessing.LabelEncoder()
    train[feat + '_id'] = encoder.fit_transform(train[feat].values)

In [4]:
# Features are picked by column name: anything containing 'cont' is treated as
# numeric, anything containing '_id' as a label-encoded categorical.
num_feats = [name for name in train.columns if 'cont' in name]
id_feats = [name for name in train.columns if '_id' in name]

# Plain NumPy arrays: numeric columns first, then the encoded categoricals.
feature_columns = num_feats + id_feats
X = train[feature_columns].values
y = train['loss'].values

In [5]:
# Gradient-boosted regressor that the cross-validation cell fits once per fold.
model = xgb.XGBRegressor(
    # learning task
    objective='reg:linear',
    # boosting schedule
    n_estimators=20,
    learning_rate=0.2,
    # tree shape
    max_depth=12,
    # per-tree row / column sampling
    subsample=0.7,
    colsample_bytree=0.6,
    # not set (kept commented out for reference):
    # gamma = 5290.,
    # min_child_weight = 4.2922,
    # runtime / reproducibility
    nthread=-1,
    silent=0,
    seed=2017,
)

In [6]:
# K-fold cross-validation of `model` on the raw (untransformed) target `y`.
# For each fold the model is fit on the training part, evaluated on the
# held-out part with mean absolute error, and a summary is printed at the end.
#
# NOTE(review): this uses the legacy sklearn.cross_validation.KFold signature
# (KFold(n, n_folds=...) iterated directly). sklearn.model_selection.KFold
# takes n_splits and is iterated through .split(X) instead.
nfolds = 3
folds = KFold(len(y), n_folds=nfolds, shuffle=True, random_state=2017)

fold_scores = []  # held-out MAE of every fold, for the summary after the loop

for num_iter, (train_index, test_index) in enumerate(folds):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Reuse the slices made above for the evaluation sets; indexing X and y
    # again with the same indices would only create two more array copies.
    model.fit(X_train, y_train,
              eval_metric='mae',
              eval_set=[(X_train, y_train), (X_test, y_test)],
              verbose=True)

    # Negative predictions are floored at zero before scoring.
    y_pred = np.maximum(model.predict(X_test), 0)

    score = mean_absolute_error(y_test, y_pred)
    fold_scores.append(score)
    print("Fold{0}, score={1}".format(num_iter+1, score))

# A single number to compare against other model / target variants.
print("Mean score over {0} folds={1}".format(nfolds, np.mean(fold_scores)))

[0]	validation_0-mae:2444.05	validation_1-mae:2456.15
[1]	validation_0-mae:1994.57	validation_1-mae:2015.93
[2]	validation_0-mae:1684.08	validation_1-mae:1718.83
[3]	validation_0-mae:1482.49	validation_1-mae:1534.73
[4]	validation_0-mae:1343.5	validation_1-mae:1414.39
[5]	validation_0-mae:1245.19	validation_1-mae:1334.26
[6]	validation_0-mae:1179.91	validation_1-mae:1285.21
[7]	validation_0-mae:1140.75	validation_1-mae:1260.76
[8]	validation_0-mae:1107.5	validation_1-mae:1243.2
[9]	validation_0-mae:1084.5	validation_1-mae:1233.41
[10]	validation_0-mae:1068.61	validation_1-mae:1229.43
[11]	validation_0-mae:1055.57	validation_1-mae:1226.41
[12]	validation_0-mae:1043.66	validation_1-mae:1224.81
[13]	validation_0-mae:1035.79	validation_1-mae:1224.13
[14]	validation_0-mae:1025.62	validation_1-mae:1223.09
[15]	validation_0-mae:1017.92	validation_1-mae:1223.85
[16]	validation_0-mae:1009.26	validation_1-mae:1223.29
[17]	validation_0-mae:1002.76	validation_1-mae:1223.5
[18]	validation_0-mae:995

## Task

The cell above trains a model that uses `y` directly as the target variable.
Modify the code so that it trains on the log-transformed target variable instead.


Some tips:
1. y_log_train = np.log(y_train)
2. model.fit(X_train, y_log_train, ...
3. y_log_pred = model.predict(X_test)
4. y_pred = np.exp(y_log_pred)
