In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
# Load the prepared training and test tables from disk.
df_train = pd.read_csv("dataset/train_final.csv")
df_test = pd.read_csv("dataset/test_final.csv")

cols_train = df_train.columns.tolist()
cols_test = df_test.columns.tolist()

# Only columns present in BOTH tables can be used as features; the order
# follows the training table.
cols_to_train = []
for column_name in cols_train:
    if column_name in cols_test:
        cols_to_train.append(column_name)

# 'Id' is a row identifier, not a feature (raises ValueError if it is absent).
cols_to_train.remove('Id')

In [3]:
# Target values and the test-set Id column (used later when building `result`).
Y_train = df_train['SalePrice']
X_Id = df_test['Id']

# Unscaled feature matrices restricted to the shared feature columns.
X_train_i = df_train.loc[:, cols_to_train]
X_test_i = df_test.loc[:, cols_to_train]

In [4]:
# Standardize the features using statistics learned from the training set only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_i)
# Apply the training-set mean/scale to the test set. Refitting the scaler on
# the test data would put the two matrices on different scales, so the model
# would be evaluated on inputs that do not match what it was trained on.
X_test = scaler.transform(X_test_i)

In [5]:
# Sanity check: report the dimensions of the arrays handed to the models.
for template, array in (
    ("Training(+validation) set shape : {}", X_train),
    ("Y_train shape : {}", Y_train),
    ("Test set shape : {}", X_test),
):
    print(template.format(array.shape))

Training(+validation) set shape : (1460, 160)
Y_train shape : (1460,)
Test set shape : (1459, 160)


In [14]:
# Hyper-parameter grid for the gradient boosting search. `loss` is not part of
# the grid, so the estimator's default loss is used for every candidate.
# NOTE(review): the 'mae' criterion was deprecated in later scikit-learn
# releases -- confirm it is accepted by the installed version.
param = dict(
    learning_rate = [.01, .05, .1, .2],
    n_estimators = [100, 200, 300, 400, 500],
    criterion = ['mae'],
    max_depth = [3, 4, 5],
    random_state = [0],
)
# verbose=1 makes every individual fit print its per-iteration training loss.
estmtr = GradientBoostingRegressor(verbose = 1)
# Ten random 80/20 train/validation shuffles, seeded for repeatability.
cv_split = ShuffleSplit(n_splits = 10, train_size = .80, test_size = .20, random_state = 0)
# Candidates are ranked by negated mean absolute error (higher is better).
best_model = GridSearchCV(
    estimator = estmtr,
    param_grid = param,
    cv = cv_split,
    scoring = 'neg_mean_absolute_error',
)

In [None]:
# Run the grid search on the scaled training data. Every combination in
# `param` is fitted on every split of `cv_split`, so this cell is slow and
# (because the estimator was built with verbose=1) prints a lot of output.
best_model.fit(X_train, Y_train)

      Iter       Train Loss   Remaining Time 
         1  6074408724.8968           21.03s
         2  5993767577.8597           20.97s
         3  5914356449.8165           20.70s
         4  5836660640.9013           20.61s
         5  5760741820.5130           20.58s
         6  5686277123.2937           20.31s
         7  5612987540.6256           20.03s
         8  5540808633.5404           19.78s
         9  5472421003.3700           19.55s
        10  5401768904.4112           19.23s
        20  4764632271.4536           16.92s
        30  4230712316.6760           14.76s
        40  3786845176.2714           12.71s
        50  3401383745.4688           10.91s
        60  3060093881.8060            8.82s
        70  2766448149.4587            6.61s
        80  2519454187.5761            4.41s
        90  2306098255.0427            2.20s
       100  2111759506.6892            0.00s
      Iter       Train Loss   Remaining Time 
         1  6089787103.4301           21.52s
        

In [None]:
# Keep the best hyper-parameter combination found by the grid search
# (only available after best_model.fit has completed).
best_param = best_model.best_params_

In [115]:
# Baseline: fit a random forest on the full scaled training set and write its
# test-set predictions to a CSV with one (Id, SalePrice) row per test sample.
# A fixed random_state keeps the fitted forest -- and therefore the CSV --
# identical across runs.
model = RandomForestRegressor(n_estimators = 100, random_state = 0)
model.fit(X_train, Y_train)
preds_train = model.predict(X_train)
preds_test = model.predict(X_test)

result = pd.DataFrame({
    'Id':X_Id,
    'SalePrice':preds_test
})

# index=False: the DataFrame index carries no information, keep it out of the file.
result.to_csv("dataset/result_rf.csv",index=False)

In [116]:
# Hold out part of the training data for validation, using train_test_split's
# default split proportions. The split is seeded so that the validation MAE
# reported below can be reproduced on a fresh kernel.
X_trainn, X_valid, Y_trainn, Y_valid = train_test_split(X_train, Y_train, random_state = 0)

In [117]:
def get_mae_valid(model, X_train, X_valid, Y_train, Y_valid):
    """Fit ``model`` on the training split and score it on the validation split.

    Note that ``model`` is fitted in place, so after the call the object
    passed in is trained on ``X_train``/``Y_train`` only.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Features used to fit the model.
        X_valid: Features the fitted model is evaluated on.
        Y_train: Targets matching ``X_train``.
        Y_valid: Targets matching ``X_valid``.

    Returns:
        The mean absolute error between ``Y_valid`` and the model's
        predictions for ``X_valid``.
    """
    model.fit(X_train, Y_train)
    valid_predictions = model.predict(X_valid)
    validation_mae = mean_absolute_error(Y_valid, valid_predictions)
    return validation_mae

# Report the hold-out MAE. get_mae_valid refits `model` in place, so after this
# line `model` is trained on X_trainn/Y_trainn rather than the full training set.
print(get_mae_valid(model,X_trainn, X_valid, Y_trainn, Y_valid))

16572.04104109589


In [119]:
# TODO:
#     1. Polynomial features
#     2. Grid Search
#     3. Improve ordinal variables
#     4. Feature Selection
#     5. Remove outliers