In [9]:
import pandas as pd
import numpy as np

from matplotlib.pyplot import plot as plt
from sklearn.model_selection import train_test_split, KFold
%matplotlib inline

In [10]:
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

In [11]:
import seaborn as sns
import xgboost as xgb

In [14]:
mat = train_df.drop(['ID','target'], axis=1).values
labels = train_df['target'].values

In [15]:
train_mat, test_mat, train_labels, test_labels = train_test_split(mat, labels, test_size = 0.1, random_state=20)

### Create baseline using xgboost

In [17]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
    params = {
          'objective': 'reg:linear', 
          'eval_metric': 'rmse',
          'eta': 0.001,
          'max_depth': 10, 
          'subsample': 0.6, 
          'colsample_bytree': 0.6,
          'alpha':0.001,
          'random_state': 42, 
          'silent': True}
    
    tr_data = xgb.DMatrix(train_X, train_y)
    va_data = xgb.DMatrix(val_X, val_y)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model_xgb = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds = 100, verbose_eval=100)
    
    dtest = xgb.DMatrix(test_X)
    xgb_pred_y = np.expm1(model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit))
    
    return xgb_pred_y, model_xgb

In [None]:
kf = KFold(n_splits=5)

for train_index, valid_index in kf.split(train_mat):
    train_mat = mat[train_index]
    valid_mat = mat[valid_index]
    train_label = labels[train_index]
    valid_label = labels[valid_index]
    
    run_xgb(train_mat, train_label, valid_mat, valid_label, test_mat)
    
    

[0]	train-rmse:1.01199e+07	valid-rmse:1.01099e+07
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:9.60023e+06	valid-rmse:9.65309e+06
[200]	train-rmse:9.13574e+06	valid-rmse:9.25457e+06
[300]	train-rmse:8.7221e+06	valid-rmse:8.91211e+06
[400]	train-rmse:8.34849e+06	valid-rmse:8.6124e+06
[500]	train-rmse:8.01269e+06	valid-rmse:8.35652e+06
[600]	train-rmse:7.71726e+06	valid-rmse:8.13961e+06
[700]	train-rmse:7.44931e+06	valid-rmse:7.95455e+06
[800]	train-rmse:7.20877e+06	valid-rmse:7.79836e+06
[900]	train-rmse:6.99011e+06	valid-rmse:7.66544e+06
[1000]	train-rmse:6.79582e+06	valid-rmse:7.54946e+06
[1100]	train-rmse:6.61752e+06	valid-rmse:7.45702e+06
[1200]	train-rmse:6.45754e+06	valid-rmse:7.37626e+06
[1300]	train-rmse:6.31095e+06	valid-rmse:7.30795e+06
[1400]	train-rmse:6.17982e+06	valid-rmse:7.254e+06
[1500]	train-rmse:6.0605e+06	valid-rmse:7.20643e+06
[1600]	train-rmse:5.947



[0]	train-rmse:1.01461e+07	valid-rmse:1.00038e+07
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:9.62368e+06	valid-rmse:9.57519e+06
[200]	train-rmse:9.152e+06	valid-rmse:9.19765e+06
[300]	train-rmse:8.72936e+06	valid-rmse:8.87511e+06
[400]	train-rmse:8.35178e+06	valid-rmse:8.59598e+06
[500]	train-rmse:8.01331e+06	valid-rmse:8.35456e+06
[600]	train-rmse:7.71285e+06	valid-rmse:8.14687e+06
[700]	train-rmse:7.44465e+06	valid-rmse:7.97163e+06
[800]	train-rmse:7.20053e+06	valid-rmse:7.81792e+06
[900]	train-rmse:6.98281e+06	valid-rmse:7.68698e+06
[1000]	train-rmse:6.78265e+06	valid-rmse:7.57601e+06
[1100]	train-rmse:6.60087e+06	valid-rmse:7.48212e+06
[1200]	train-rmse:6.43801e+06	valid-rmse:7.40327e+06
[1300]	train-rmse:6.28947e+06	valid-rmse:7.33571e+06
[1400]	train-rmse:6.15084e+06	valid-rmse:7.28003e+06
[1500]	train-rmse:6.02741e+06	valid-rmse:7.23374e+06
[1600]	train-rmse:5.

In [None]:
Ids = test_df['ID'].tolist()
submission = pd.DataFrame(data={'ID': pd.Series(Ids), 'target': pd.Series(result)})

In [None]:
submission.to_csv('submission.csv', index=False)