In [None]:
# This Python environment is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show what is in the input directory by shelling out to `ls`;
# check_output returns bytes, so decode them before printing.
print(
    check_output(["ls", "../input"])
    .decode("utf8")
)

# Any results you write to the current directory are saved as output.

In [None]:
# Locations of the competition files, all relative to the notebook's
# working directory (one level up, inside "input").
DATA_DIR = os.path.join(os.pardir, 'input')
TRAIN_FILE, TEST_FILE, SUBMISSION_FILE = (
    os.path.join(DATA_DIR, filename)
    for filename in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Read both CSVs, keyed by their 'id' column.
train_df = pd.read_csv(TRAIN_FILE).set_index('id')
test_df = pd.read_csv(TEST_FILE).set_index('id')

# The model is trained on log(loss); the raw 'loss' column is then removed
# so the training frame holds features only.
target = np.log(train_df['loss'])
train_df = train_df.drop(columns=['loss'])

print(train_df.shape, test_df.shape)

In [None]:
# Partition the feature names by prefix: 'cat*' columns and 'cont*' columns.
cat_cols = list(filter(lambda name: name.startswith('cat'), train_df.columns))
cont_cols = list(filter(lambda name: name.startswith('cont'), train_df.columns))

In [None]:
# Stack the train rows on top of the test rows so both get the same
# preprocessing; train rows come first, which the later split relies on.
train_test_df = pd.concat((train_df, test_df), axis='index')
print(train_test_df.shape)

In [None]:
# Integer-encode every 'cat*' column. sort=True makes the codes follow the
# sorted order of the distinct values; pd.factorize marks missing values as -1.
train_test_df[cat_cols] = train_test_df[cat_cols].apply(
    lambda column: pd.factorize(column, sort=True)[0]
)

# Standardise the 'cont*' columns. StandardScaler requires a 2-D input, so the
# whole sub-frame is passed in one call rather than one 1-D Series at a time
# (which current scikit-learn rejects with "Expected 2D array, got 1D array").
# Each column is still centred and scaled independently.
# NOTE(review): the scaler is fitted on every row of train_test_df, i.e. the
# statistics are computed over the combined frame, not the training rows only.
if cont_cols:  # nothing to scale when there are no 'cont*' columns
    train_test_df[cont_cols] = StandardScaler().fit_transform(train_test_df[cont_cols])

In [None]:
# Undo the earlier concatenation: the first len(train_df) rows are the training
# rows, the remainder are the test rows. Both slices are computed from the
# current train_df length before either name is rebound.
train_df, test_df = (
    train_test_df.iloc[:len(train_df)],
    train_test_df.iloc[len(train_df):],
)
# The combined frame is no longer needed under its own name.
del train_test_df

In [None]:
# Hold out 10% of the training rows for a final check; the fixed
# random_state keeps the split reproducible across runs.
X_train, X_cv, y_train, y_cv = train_test_split(
    train_df,
    target,
    test_size=0.1,
    random_state=42,
)

In [None]:
# Base settings passed to XGBRegressor; the grid search varies further
# parameters on top of these.
xgb_params = {
    'seed': 42,
    # 'silent' is deprecated in XGBoost in favour of 'verbosity';
    # 1 (warnings only) is the level that the old silent=0 corresponds to.
    'verbosity': 1,
    # 'reg:linear' is the deprecated name of the squared-error objective.
    'objective': 'reg:squarederror',
    'nthread': -1,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.75,
    'learning_rate': 0.1,
}

In [None]:
# Full catalogue of candidate values for each tunable hyper-parameter.
master_grid_params = dict(
    colsample_bylevel=[0.3, 0.6, 0.75, 0.9],
    colsample_bytree=[0.3, 0.6, 0.75, 0.9],
    learning_rate=[0.01, 0.03, 0.09, 0.3],
    max_depth=[3, 6, 9],
    min_child_weight=[1, 3, 6, 9],
    n_estimators=[300, 600, 900],
    reg_alpha=[0, 0.01, 0.1, 0.3, 1, 100],
    subsample=[0.6, 0.75, 0.9],
)

# First search stage: pick out only the parameters to vary in this run.
param_test1 = dict(
    (name, master_grid_params[name]) for name in ('n_estimators',)
)

In [None]:
# Regressor configured with the base settings from xgb_params.
xgb_model = XGBRegressor(**xgb_params)

# Exhaustive search over param_test1 with 5-fold cross-validation, scored by
# negated mean absolute error (higher is better); n_jobs=-1 uses all available
# cores and verbose=3 logs progress per fit.
gsearch = GridSearchCV(
    xgb_model,
    param_test1,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=5,
    verbose=3,
)

In [None]:
# Run the grid search on the training split. As the last expression in the
# cell, the fitted search object is also displayed as the cell's output.
gsearch.fit(X_train, y_train)

In [None]:
# Predict for the held-out rows using the fitted search object. The predictions
# are on the same scale as the y_train values the search was fitted on.
y_pred = gsearch.predict(X_cv)

In [None]:
# Report the hold-out mean absolute error after exponentiating both the true
# values and the predictions.
print(
    mean_absolute_error(
        y_true=np.exp(y_cv),
        y_pred=np.exp(y_pred),
    )
)