In [9]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import Imputer
from xgboost import XGBRegressor

In [3]:
data = pd.read_csv('./train.csv')
data.shape

(1460, 81)

In [4]:
data.dropna(axis=0, subset=['SalePrice'], inplace=True)
data.shape

(1460, 81)

In [5]:
y = data.SalePrice
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])

In [6]:
Xtrn, Xtst, ytrn, ytst = train_test_split(X.as_matrix(), y.as_matrix(), test_size=0.25)

In [7]:
imputer = Imputer()
Xtrn = imputer.fit_transform(Xtrn)
Xtst = imputer.fit_transform(Xtst)

### Train model

In [15]:
model = XGBRegressor()
model.fit(Xtrn, ytrn, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [17]:
predict = model.predict(Xtst)
mean_absolute_error(predict, ytst)

15955.090239726027

### With additional parameters

In [30]:
model = XGBRegressor(n_estimators=1000)
model.fit(Xtrn, ytrn, early_stopping_rounds=5, eval_set=[(Xtst, ytst)], verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [31]:
predict = model.predict(Xtst)
mean_absolute_error(predict, ytst)

15966.784053938356

### With learning rate parameter

In [32]:
model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
model.fit(Xtrn, ytrn, early_stopping_rounds=5, eval_set=[(Xtst, ytst)], verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [33]:
predict = model.predict(Xtst)
mean_absolute_error(predict, ytst)

16242.464383561644

### With categorical data

In [58]:
# get train data with categorical columns
X = data.drop(['SalePrice'], axis=1)
X = pd.get_dummies(X)
# split training and testing
Xtr, Xts, ytr, yts = train_test_split(X.as_matrix(), y.as_matrix(), test_size=0.25)
# imput
Xtr = imputer.fit_transform(Xtr)
Xts = imputer.fit_transform(Xts)

In [62]:
# train
model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
model.fit(Xtr, ytr)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [63]:
# predict
predictions = model.predict(Xts)
# mae
mean_absolute_error(predictions, yts)

14412.325278253425

### Submission 4

In [53]:
test_data = pd.read_csv('./test.csv')
X = data.drop(['SalePrice'], axis=1)
# hot encode
Xtr = pd.get_dummies(X)
Xts = pd.get_dummies(test_data)
# alignt
Xtr, Xts = Xtr.align(Xts, join='inner', axis=1)
# impute
Xtr = imputer.fit_transform(Xtr)
Xts = imputer.fit_transform(Xts)

In [54]:
# train model
model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
model.fit(Xtr, y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1000,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [55]:
# make predictions
predictions = model.predict(Xts)
submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': predictions})

In [57]:
# write csv
submission.to_csv('submission4.csv', index=False)