### Load library & prepare data

In [5]:
# load library
import pandas as pd
from sklearn.model_selection import train_test_split

# read the data (both files are indexed by their 'Id' column) and
# remove training rows with a missing target
train_data = pd.read_csv('train.csv', index_col='Id').dropna(axis=0, subset=['SalePrice'])
X_test_full = pd.read_csv('test.csv', index_col='Id')

# separate target from predictors without mutating the loaded frame in place
y = train_data['SalePrice']
X = train_data.drop(columns=['SalePrice'])

# break off validation data from the training data (80% / 20%, fixed seed)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# select categorical (object dtype) columns with relatively low cardinality
# (fewer than 10 distinct values in the training split)
low_cardinality_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].nunique() < 10 and X_train_full[cname].dtype == 'object'
]

# select numeric columns
numeric_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].dtype in ['int64', 'float64']
]

# keep selected columns only
my_cols = low_cardinality_cols + numeric_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# one-hot encode the data
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)

# Align validation/test columns to the training columns. join='left' keeps
# exactly the training columns; a dummy column that exists only in the training
# frame means "no row has this category" in the other frame, so it is filled
# with 0 rather than left as NaN. fill_value only applies to columns introduced
# by the alignment, so genuine missing values in existing columns stay NaN.
X_train, X_valid = X_train.align(X_valid, join='left', axis=1, fill_value=0)
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

### Build model
Build an initial (first) model with gradient boosting.

In [11]:
# load library
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# build first model with the library's default hyperparameters
my_model_1 = XGBRegressor()

# train first model on the training split
my_model_1.fit(X_train, y_train)

# get first predictions on the validation split
predictions_1 = my_model_1.predict(X_valid)

# calculate MAE; arguments follow the documented (y_true, y_pred) order
mae_1 = mean_absolute_error(y_valid, predictions_1)
print('Mean Absolute Error: ', mae_1)

Mean Absolute Error:  16803.434690710616


### Improve the model
Build a better model with gradient boosting by setting parameters such as `n_estimators` and `learning_rate`.

In [13]:
# build second model: 1000 boosting rounds with a learning rate of 0.05
my_model_2 = XGBRegressor(n_estimators=1000, learning_rate=0.05)

# train second model on the training split
my_model_2.fit(X_train, y_train)

# get second predictions on the validation split
predictions_2 = my_model_2.predict(X_valid)

# calculate MAE; arguments follow the documented (y_true, y_pred) order
mae_2 = mean_absolute_error(y_valid, predictions_2)
print('Mean Absolute Error: ', mae_2)

Mean Absolute Error:  16084.123354559075


### Break the model
Build a model that performs worse than the first model. This helps us understand how to set the parameters.

In [14]:
# build third model: only 50 boosting rounds with a learning rate of 0.1
my_model_3 = XGBRegressor(n_estimators=50, learning_rate=0.1)

# train third model on the training split
my_model_3.fit(X_train, y_train)

# get third predictions on the validation split
predictions_3 = my_model_3.predict(X_valid)

# calculate MAE; arguments follow the documented (y_true, y_pred) order
mae_3 = mean_absolute_error(y_valid, predictions_3)
print('Mean Absolute Error: ', mae_3)

Mean Absolute Error:  17742.43953339041
