In [8]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn import feature_selection
from sklearn.model_selection import cross_validate
import seaborn as sns
import pickle
%matplotlib inline

In [9]:
# Load the train and test tables from their CSV files
df_train = pd.read_csv("dataset/train_poly.csv")
df_test = pd.read_csv("dataset/test_poly.csv")

cols_train = list(df_train.columns)
cols_test = list(df_test.columns)

# Features used for training: columns present in both frames (train order),
# minus the 'Id' column. remove() raises ValueError if 'Id' is not shared.
cols_to_train = [
    name
    for name in cols_train
    if name in cols_test
]
cols_to_train.remove('Id')

In [10]:
# Unscaled feature frames restricted to the shared training columns
X_train_i, X_test_i = df_train[cols_to_train], df_test[cols_to_train]

# Regression target, taken from the training frame
Y_train = df_train['SalePrice']

# Test-set ids; used later as the 'Id' column of the result frame
X_Id = df_test['Id']

In [11]:
# Standardize the features: the scaler's statistics are learned from the
# training frame only and then applied unchanged to the test frame.
scaler = StandardScaler().fit(X_train_i)
X_train = scaler.transform(X_train_i)
X_test = scaler.transform(X_test_i)

In [12]:
# Sanity check: report the shapes of the scaled matrices and the target
shape_report = [
    ("Training(+validation) set shape : {}", X_train),
    ("Y_train shape : {}", Y_train),
    ("Test set shape : {}", X_test),
]
for template, data in shape_report:
    print(template.format(data.shape))

Training(+validation) set shape : (1448, 244)
Y_train shape : (1448,)
Test set shape : (1459, 244)


In [13]:
# Gradient-boosted tree regressor; this same (unfitted) estimator is handed to
# RFECV, to both cross_validate runs and to the final fit further down.
model = XGBRegressor(
    colsample_bytree=0.7,
    gamma=0,
    learning_rate=0.01,
    max_depth=3,
    min_child_weight=0,
    n_estimators=5000,
    n_jobs=-1,  # was `nthread=-1`: deprecated alias in the sklearn wrapper
    objective='reg:squarederror',
    reg_alpha=6e-05,
    scale_pos_weight=1,
    random_state=27,  # was `seed=27`: deprecated alias in the sklearn wrapper
    subsample=0.7,
)

# 10 random 80% / 20% train/validation splits, reproducible via random_state
cv_split = ShuffleSplit(n_splits=10, test_size=.20, train_size=.80, random_state=0)

In [None]:
# Recursive feature elimination with cross-validation: one feature is removed
# per step and each candidate subset is scored with negative MAE on cv_split.
model_rfe = feature_selection.RFECV(
    estimator=model,
    step=1,
    scoring='neg_mean_absolute_error',
    cv=cv_split,
)
model_rfe.fit(X_train, Y_train)

In [None]:
# Column labels kept by RFECV: its boolean support mask is applied to the
# column labels of the unscaled training frame.
X_rfe = X_train_i.columns.values[model_rfe.get_support()]

# Persist the selection to disk. The context manager closes the file even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
with open("X_rfe_poly.pickle", "wb") as pickle_out:
    pickle.dump(X_rfe, pickle_out)

In [None]:
# Reload the selected column labels from the pickle file.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# load a pickle that this notebook (or another trusted source) produced.
# The context manager ensures the file handle is closed after reading.
with open("X_rfe_poly.pickle", "rb") as pickle_in:
    X_rfe = pickle.load(pickle_in)

In [None]:
# Standardize only the selected columns: the scaler is re-fitted on the
# training subset and then applied to the matching test subset.
# Note that this rebinds `scaler` to the new, subset-fitted instance.
scaler = StandardScaler().fit(X_train_i[X_rfe])
X_train_fs = scaler.transform(X_train_i[X_rfe])
X_test_fs = scaler.transform(X_test_i[X_rfe])

# Alternative kept for reference (needs model_rfe in memory instead of the
# pickled labels): pick the already-scaled columns via the RFECV support mask.
# X_train_fs = X_train[:, model_rfe.get_support()]
# X_test_fs = X_test[:, model_rfe.get_support()]

In [None]:
# Sanity check: report the shapes after feature selection
shape_report = [
    ("Training(+validation) set shape : {}", X_train_fs),
    ("Y_train shape : {}", Y_train),
    ("Test set shape : {}", X_test_fs),
]
for template, data in shape_report:
    print(template.format(data.shape))

In [None]:
# Baseline: cross-validate the model on all scaled features.
# Scores are negative mean absolute error, so values closer to 0 are better.
cv_results = cross_validate(
    model,
    X_train,
    Y_train,
    cv=cv_split,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
)
print("Without feature selection: ")
print("Mean train score = {}".format(np.mean(cv_results['train_score'])))
print("Mean validation score = {}".format(np.mean(cv_results['test_score'])))

In [None]:
# Cross-validate the model on the RFE-selected, re-scaled features.
# Scores are negative mean absolute error, so values closer to 0 are better.
cv_results = cross_validate(
    model,
    X_train_fs,
    Y_train,
    cv=cv_split,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
)
print("With feature selection: ")
print("Mean train score = {}".format(np.mean(cv_results['train_score'])))
print("Mean validation score = {}".format(np.mean(cv_results['test_score'])))

In [None]:
# Final fit on the whole training set using only the selected features
model.fit(X_train_fs, Y_train)
preds_train, preds_test = model.predict(X_train_fs), model.predict(X_test_fs)

# Two-column output frame: the test-set Id next to its predicted SalePrice
result = pd.DataFrame({'Id': X_Id})
result['SalePrice'] = preds_test

# Written without the index so the CSV holds only the two columns above
result.to_csv("dataset/result_xgb_RFE_poly.csv", index=False)

In [9]:
# TODO:

#     2. Grid Search
#     3. Improve ordinal variables


#     6. Regularization