In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import pickle
%matplotlib inline

In [3]:
# Load the preprocessed training and test tables from disk.
df_train = pd.read_csv("dataset/train_final.csv")
df_test = pd.read_csv("dataset/test_final.csv")

cols_train = df_train.columns.tolist()
cols_test = df_test.columns.tolist()

# Features are the columns present in both tables, kept in training-table
# order. 'Id' is then taken out of that list (remove() raises ValueError if
# it is absent).
cols_to_train = list(filter(lambda name: name in cols_test, cols_train))
cols_to_train.remove('Id')

In [4]:
# Unscaled feature tables: the same column list, in the same order, is taken
# from the training and the test frame.
X_train_i, X_test_i = (frame[cols_to_train] for frame in (df_train, df_test))
# Regression target from the training frame, and the test-set Id column.
Y_train, X_Id = df_train['SalePrice'], df_test['Id']

In [5]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both feature tables.
scaler = StandardScaler().fit(X_train_i)
X_train, X_test = scaler.transform(X_train_i), scaler.transform(X_test_i)

In [6]:
# Report the dimensions of the scaled feature matrices and of the target.
print(
    *(
        template.format(data.shape)
        for template, data in (
            ("Training(+validation) set shape : {}", X_train),
            ("Y_train shape : {}", Y_train),
            ("Test set shape : {}", X_test),
        )
    ),
    sep="\n",
)

Training(+validation) set shape : (1448, 184)
Y_train shape : (1448,)
Test set shape : (1459, 184)


In [22]:
# Hyper-parameter grid for the XGBoost regressor. Every entry except
# n_estimators holds a single candidate, so the search only compares the
# three listed tree counts.
param = dict(
    learning_rate=[.01],
    n_estimators=[5000, 5005, 5010],
    max_depth=[3],
    min_child_weight=[0],
    gamma=[0],
    subsample=[0.7],
    colsample_bytree=[0.7],
    objective=['reg:squarederror'],
    nthread=[-1],
    scale_pos_weight=[1],
    seed=[27],
    reg_alpha=[0.00006],
)
estmtr = XGBRegressor()
# Five random 80/20 train/validation partitions, made repeatable by random_state.
cv_split = ShuffleSplit(n_splits=5, train_size=.80, test_size=.20, random_state=0)
# Candidates are scored with negated mean absolute error.
best_model = GridSearchCV(
    estmtr,
    param,
    scoring='neg_mean_absolute_error',
    cv=cv_split,
)

In [23]:
# Run the cross-validated grid search on the scaled training features and target.
best_model.fit(X=X_train, y=Y_train)

GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=0, test_size=0.2, train_size=0.8),
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=...
                                    tree_method=None, validate_parameters=None,
                                    verbosity=None),
             param_grid={'colsample_bytree': [0.7], 'gamma': [0],
                         'learning_rate': [0.01], 'max_depth': [3],
                         'min_child_weight': [0],
                         'n_estimators': [5000, 

In [24]:
# Report the winning hyper-parameter combination found by the grid search.
best_param = best_model.best_params_
print("Best parameters are : {}".format(best_param))

# GridSearchCV was created with its default refit=True, so once fit() has run
# it already holds a model trained on the complete training set with
# best_param. Reuse that model instead of training the same configuration a
# second time; estmtr stays bound to a fitted regressor for any later cells.
estmtr = best_model.best_estimator_
preds_train = estmtr.predict(X_train)
preds_test = estmtr.predict(X_test)

# Pair every test-set Id with its predicted SalePrice and write the table to
# CSV without the DataFrame index.
result = pd.DataFrame({
    'Id': X_Id,
    'SalePrice': preds_test
})

result.to_csv("dataset/result_xgb_CV.csv", index=False)

Best parameters are : {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 0, 'n_estimators': 5000, 'nthread': -1, 'objective': 'reg:squarederror', 'reg_alpha': 6e-05, 'scale_pos_weight': 1, 'seed': 27, 'subsample': 0.7}


In [9]:
# TODO:
#     1. Polynomial features
#     2. Grid Search (widen the grid: only n_estimators is currently varied)
#     3. Improve ordinal variables
#     4. Feature Selection
#     5. Remove outliers