In [0]:
# To download files
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate this Colab session, then build a PyDrive client that reuses
# the application-default Google credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# (Drive file id, local filename) pairs, fetched in this order into the
# current working directory.
for drive_file_id, local_filename in (
    ('1IIZQyDW6gyw1_yigC4whlqLeuEr6_b-d', 'train_clean.csv'),
    ('1sOh376Y8hs0KKevZ7CArKmFEIty0FWAO', 'test_clean.csv'),
    ('1yP-mQHvkkncFAV_Mxyyna05ZyC0GzPKe', 'sample_submission.csv'),
):
    downloaded = drive.CreateFile({'id': drive_file_id})
    downloaded.GetContentFile(local_filename)

In [0]:
import pandas as pd
import numpy as np

# https://github.com/dmlc/xgboost
!pip install -q xgboost==0.7.post3
import xgboost as xgb

In [0]:
# Read the three CSV files fetched earlier into DataFrames:
# training rows, test rows, and the sample submission.
training_data, testing_data, sample_submission = [
    pd.read_csv(csv_name)
    for csv_name in ('train_clean.csv', 'test_clean.csv', 'sample_submission.csv')
]

In [0]:
# Target vector: the raw 'loss' column as a 1-D NumPy array.
# (.values instead of Series.ravel(), which is deprecated in recent pandas.)
y_train = training_data['loss'].values

# Feature matrices: every training column except the target, and every test column.
# `axis` is passed by keyword because pandas 2.x no longer accepts it positionally.
x_train = np.array(training_data.drop('loss', axis=1))
x_test = np.array(testing_data)

# xgboost consumes its own DMatrix container.
# The model is fit on log(loss) (a log transform of the target, not a log-loss
# objective), so predictions must be passed through np.exp to get back to the
# original scale.
# NOTE(review): np.log assumes every 'loss' value is strictly positive - confirm.
dtrain = xgb.DMatrix(x_train, label=np.log(y_train))

In [0]:
# Booster configuration handed to xgb.train.
params = {
    # what is optimised and what is reported
    'objective': 'reg:linear',
    'eval_metric': 'mae',

    # boosting step size and tree shape
    'learning_rate': 0.075,
    'max_depth': 6,
    'min_child_weight': 1,

    # row / column subsampling
    'subsample': 0.7,
    'colsample_bytree': 0.7,

    # regularisation terms
    'alpha': 8.75,
    'gamma': 0.8,

    # fixed seed for the run
    'seed': 2,
}

In [0]:
# Fit the gradient-boosted trees on the training DMatrix for 600 boosting rounds.
model = xgb.train(params, dtrain, num_boost_round=600)

In [13]:
# Training-set mean absolute error on the original loss scale:
# predictions are in log space, so exponentiate before comparing to y_train.
train_pred = np.exp(model.predict(dtrain))
mae = np.mean(np.abs(train_pred - y_train))
# Parenthesised single-argument print works on both Python 2 and Python 3.
print('Training MAE: ' + str(mae))

Training MAE: 1092.7269133544594


In [0]:
# The model was trained on log(loss), so raw predictions are in log space;
# np.exp maps them back to the original loss scale.
dtest = xgb.DMatrix(data=x_test)
res = model.predict(data=dtest)
res_exp = np.exp(res)

In [0]:
# Assemble the submission frame: ids taken from the sample submission,
# paired with the back-transformed test predictions.
# NOTE(review): assumes sample_submission rows are in the same order as the test rows - confirm.
final_result = pd.DataFrame({
    'id': sample_submission['id'],
    'loss': res_exp,
})

In [0]:
# Write the submission frame to disk without the DataFrame index column.
final_result.to_csv(path_or_buf='submission3.csv', index=False)

In [0]:
# Hand 'submission3.csv' from the Colab runtime to the browser as a file download.
from google.colab import files
files.download('submission3.csv')

Score: 1122.23929<br>
Position: 683 / 3055 (22.35%)

In [0]:
# Save this model's predictions on the training rows (original loss scale)
# as the 'model3' column, for use as an input feature when stacking.
#
# Distinct variable names are used so the test-set predictions and the
# submission frame built earlier are not overwritten; re-running the
# submission cells after this one would otherwise pick up training-set values.
#
# NOTE(review): these are in-sample predictions from a model fit on the same
# rows; out-of-fold predictions are usually preferred for stacking - confirm intent.
train_res_exp = np.exp(model.predict(dtrain))
stacking_result = pd.DataFrame({'model3': train_res_exp})
stacking_result.to_csv('model3.csv', index=False)

In [0]:
# Hand 'model3.csv' from the Colab runtime to the browser as a file download.
from google.colab import files
files.download('model3.csv')