For the sake of learning, let's submit the datasets to TensorFlow and compare the results with the previous version, which used XGBoost and LassoCV.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
import matplotlib
import matplotlib.pyplot as plt

from scipy.stats import skew
from scipy.stats.stats import pearsonr

from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.preprocessing import Imputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LassoCV, LassoLarsCV

from sklearn.preprocessing import StandardScaler

import os
# Show which files are available in the input directory
print(os.listdir("../input"))


Capture data of interest

In [None]:
# Paths to the training and test CSV files (Iowa housing data)
train_file_path = '../input/train.csv'
test_file_path = '../input/test.csv'

# Read both files into DataFrames in one pass
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)


Verify training data

In [None]:
# List every column available in the training set
train_data.columns

In [None]:
# Summary statistics of the training set
train_data.describe()

Verify test data

In [None]:
# Summary statistics of the test set
test_data.describe()

For benchmark purposes, let's apply a strict regression using the most prominent numeric attributes, delimiting the information used in the predictions as the data of interest.

In [None]:
# Numeric (non-object) columns used as features for the benchmark model.
# The target 'SalePrice' is excluded: leaving it among the features leaks the
# answer into the model and makes the benchmark scores meaningless.
# 'Id' is excluded as well because it is only a row identifier.
numeric_columns = train_data.columns[train_data.dtypes != 'object']
data_interest = np.array(numeric_columns[~numeric_columns.isin(['SalePrice', 'Id'])])
  

Split the data to train and test

In [None]:
# Define the training and test sets for the benchmark model.
# A fixed random_state keeps the split (and the reported error) reproducible
# across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    train_data[data_interest], train_data['SalePrice'],
    test_size=0.25, random_state=42)


Testing the preliminary model for future reference

In [None]:
# Version using a regressor inside a pipeline (imputation followed by XGBoost)
reg_model = make_pipeline(Imputer(), XGBRegressor())
reg_model.fit(train_x, train_y)

# Error on the held-out split
predicted_home_prices = reg_model.predict(test_x)
holdout_mae = mean_absolute_error(test_y, predicted_home_prices)
print("Regressor - Mean Absolute Error : " + str(holdout_mae))

# 5-fold cross-validated RMSE over the whole training file
fold_neg_mse = cross_val_score(
    reg_model,
    train_data[data_interest],
    train_data['SalePrice'],
    scoring="neg_mean_squared_error",
    cv=5,
)
scores = np.sqrt(-fold_neg_mse)
print(scores.mean())
print(scores)

First assessment of correlation

In [None]:
# Wide figure so that all 40 bars stay readable
matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)

# Pairwise correlations; undefined coefficients are treated as 0
corr_coef = train_data.corr().fillna(0)

# 20 most negative and 20 most positive correlations with the sale price
sale_corr_sorted = corr_coef['SalePrice'].sort_values()
top_coef = pd.concat([sale_corr_sorted.head(20), sale_corr_sorted.tail(20)])
ax_coefs = top_coef.plot.bar()
print('Mean of correlation for full set of attributes:' + str(corr_coef['SalePrice'].mean()))
ax_coefs.plot()

Preparing the data for the new model: pre-processing it to fix problems (such as missing values) and high levels of skewness

In [None]:
# Replace missing values with the column means of each set
train_means = train_data.mean()
test_means = test_data.mean()
train_data = train_data.fillna(train_means)
test_data = test_data.fillna(test_means)

# Stack both sets so that the following transformations are applied to both
data = pd.concat([train_data, test_data], sort=False)

Removing the attributes with a high level of bias or outliers (see the [EDA Kernel](https://www.kaggle.com/marcelofms/house-prices-competition-eda-kernel) for details)

In [None]:

# Attributes discarded from the combined data set
remove_list = [
    'Alley', 'LandContour', 'Utilities', 'LandSlope', 'Condition2',
    'RoofStyle', 'HouseStyle', 'Street', 'RoofMatl', 'Foundation',
    'BsmtCond', 'Heating', 'Electrical', 'FireplaceQu', 'GarageYrBlt',
    'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
    'BsmtFinSF2', 'LotShape', 'BsmtFinType1', 'BsmtFinType2', 'BldgType',
    'Exterior2nd', 'MiscVal', 'MSSubClass', 'Functional', 'Exterior1st',
    'MSZoning', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
]

data = data.drop(columns=remove_list)

Normalization of data to make compatible with Tensorflow

*As neither of the two methods below was able to improve the prediction (no real gains), both were disabled.*

In [None]:
# Normalize the data to better fit to the tensorflow
#interest_data = data[data.columns.difference(['SalePrice']) & data.columns.difference(['Id'])]
#cols_interest = np.array(interest_data.columns[interest_data.dtypes != 'object'])

#mean = data[cols_interest].mean(axis=0)
#std = data[cols_interest].std(axis=0)
#data[cols_interest] = (data[cols_interest] - mean) / std

#log transform skewed numeric features:
#numeric_feats = data.dtypes[data.dtypes != "object"].index

#skewed_feats = data[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
#skewed_feats = skewed_feats[skewed_feats > 0.75]
#skewed_feats.drop(columns=['SalePrice'], inplace=True, errors='ignore')
#skewed_feats = skewed_feats.index

#data[skewed_feats] = np.log1p(data[skewed_feats])

Encoding the data

In [None]:
# One-hot encoding: pd.get_dummies expands the categorical (object) columns
# into indicator columns
data_cat = pd.get_dummies(data)

Split the data in test and train again.

In [None]:
# Split the encoded data again: rows with a known sale price are the training
# rows, the remaining ones are the test rows
has_sale_price = data_cat['SalePrice'].notnull()
train_data_cat = data_cat[has_sale_price]
test_data_cat = data_cat[~has_sale_price]

# The test rows carry no target, so the (empty) column is removed
test_data_cat = test_data_cat.drop(columns=['SalePrice'])

Check new levels of correlation

In [None]:
# Correlation of every encoded attribute with the sale price
df_data_cat = train_data_cat.assign(SalePrice=train_data['SalePrice'])
corr_coef = df_data_cat.corr().fillna(0)

# 20 most negative and 20 most positive correlations with the sale price
sale_corr_sorted = corr_coef['SalePrice'].sort_values()
top_coef = pd.concat([sale_corr_sorted.head(20), sale_corr_sorted.tail(20)])

# Mean correlation with 'SalePrice'. The previous `.iloc[1]` picked the second
# row of the matrix (an unrelated attribute) instead of the 'SalePrice' column,
# and the value was never reported.
val_mean_coef = corr_coef['SalePrice'].mean()
print('Mean of correlation for full set of attributes:' + str(val_mean_coef))

ax_coefs = top_coef.plot.bar()
ax_coefs.plot()

Prepare the sets of data training and test to fit the model

In [None]:
# Features: every encoded column except the target
feature_names = train_data_cat.columns.difference(['SalePrice'])
train_x = train_data_cat[feature_names]
test_x = test_data_cat

# Target: taken from the original training frame
train_y = train_data['SalePrice']


Apply cross validation to check the model

In [None]:
# Fit the tuned XGBoost regressor on the full training set
data_model = XGBRegressor(max_depth=3, n_estimators=500, learning_rate=0.1)
data_model.fit(train_x, train_y)

# 5-fold cross-validated RMSE
fold_neg_mse = cross_val_score(
    data_model, train_x, train_y,
    scoring="neg_mean_squared_error", cv=5,
)
scores = np.sqrt(-fold_neg_mse)
print('Scores for XGBRegressor')
print(scores.mean())
print(scores)

Preparing the model evaluation

In [None]:
from __future__ import absolute_import, division, print_function

import tensorflow as tf
from tensorflow import keras

import numpy as np

print(tf.__version__)

First try using pre loaded Estimators of Tensorflow (TO-DO)

In [None]:
def df_to_tensords(df_features, df_labels):
    """Convert a features DataFrame and its labels into a TensorFlow dataset.

    Args:
        df_features: DataFrame whose columns are the model features.
        df_labels: labels matching the rows of `df_features`.

    Returns:
        A tuple `(ds, features_columns)`: the `tf.data.Dataset` built from
        slices of the (features dict, labels) pair, and one numeric feature
        column per column of `df_features`.
    """
    column_names = list(df_features)

    # Map each column name to its values
    features = {name: df_features[name] for name in column_names}
    # Every column is declared as a numeric feature column
    features_columns = [
        tf.feature_column.numeric_column(key=name) for name in column_names
    ]

    ds = tf.data.Dataset.from_tensor_slices((features, df_labels))
    return ds, features_columns
      

In [None]:
# Shuffle and split the training data into training and evaluation subsets.
# A fixed random_state keeps the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train_x, train_y, test_size=0.25, random_state=42)

# Datasets and feature columns consumed by the TensorFlow estimators
input_ds, input_features = df_to_tensords(X_train, y_train)
eval_ds, eval_features = df_to_tensords(X_test, y_test)

In [None]:
# Input functions for the model
def input_train():
    """Return the next-batch op of the shuffled, batched, repeating training set."""
    batches = input_ds.shuffle(1000).batch(128).repeat()
    iterator = batches.make_one_shot_iterator()
    return iterator.get_next()

def input_eval():
    """Return the next-batch op of the shuffled, batched evaluation set (single pass)."""
    batches = eval_ds.shuffle(1000).batch(128)
    iterator = batches.make_one_shot_iterator()
    return iterator.get_next()


In [None]:
#build and check the model - LinearRegressor

#model = tf.estimator.LinearRegressor(feature_columns=input_features,
#    optimizer=lambda: tf.train.AdamOptimizer(
#        learning_rate=tf.train.exponential_decay(
#            learning_rate=0.1,
#            global_step=tf.train.get_global_step(),
#            decay_steps=10000,
#           decay_rate=0.96)))

#model.train(input_fn=input_train, steps=500)


In [None]:
#DNNLinearCombinedRegressor
#model = tf.estimator.DNNLinearCombinedRegressor(linear_feature_columns=input_features,
#                                                    linear_optimizer=lambda: tf.train.AdamOptimizer(
#                                                        learning_rate=tf.train.exponential_decay(
#                                                        learning_rate=0.1,
#                                                        global_step=tf.train.get_global_step(),
#                                                        decay_steps=10000,
#                                                        decay_rate=0.96)),
#                                                   loss_reduction=tf.losses.Reduction.MEAN)

#model.train(input_fn=input_train)

This Estimator was discarded due to the lack of proper documentation about the mandatory use of bucketized features. The output created as data for the features doesn't allow better predictions or analysis.

In [None]:
#BoostedTreesREgressor
#model = tf.estimator.BoostedTreesRegressor(feature_columns=input_features, 
#                                          n_batches_per_layer=100)

#model.train(input_fn=input_train)

Assessment of the MSE for the model.

In [None]:
# Evaluate the TensorFlow Estimator, if one was trained.
# Every estimator cell above is commented out, so on a fresh kernel `model`
# does not exist and an unconditional call raises NameError. On a re-run it
# may hold the Keras model defined further down, whose `evaluate` takes no
# `input_fn`. Only evaluate when `model` really is an Estimator.
estimator = globals().get('model')
if isinstance(estimator, tf.estimator.Estimator):
    eval_result = estimator.evaluate(input_fn=input_eval)

    average_loss = eval_result["average_loss"]

    print("Loss for the test set: {:7.2f}".format(average_loss))
else:
    print("No TensorFlow Estimator is trained (the estimator cells are disabled); skipping evaluation.")

Using Neural Network with Keras

In [None]:
def build_model():
  """Build and compile the Keras regression network.

  The network has three hidden ReLU layers of 64 units and a single linear
  output unit. Its input width is the number of columns of `train_x`.

  Returns:
    The model compiled with MSE loss, the Adam optimizer (learning rate 0.01)
    and MAE as the tracked metric.
  """
  n_features = train_x.shape[1]
  hidden_units = 64

  # First hidden layer declares the input shape
  layers = [
    keras.layers.Dense(hidden_units, activation=tf.nn.relu,
                       input_shape=(n_features,))
  ]
  # Two further hidden layers, then the single-value output
  for _ in range(2):
    layers.append(keras.layers.Dense(hidden_units, activation=tf.nn.relu))
  layers.append(keras.layers.Dense(1))

  model = keras.Sequential(layers)

  # RMSProp (0.001) and plain gradient descent (0.001) were also tried as
  # experiments; Adam is the optimizer in use.
  optimizer = tf.train.AdamOptimizer(0.01)

  model.compile(loss='mse', optimizer=optimizer, metrics=['mae'])
  return model

# Instantiate the network and print its layer-by-layer summary
model = build_model()
model.summary()

In [None]:
class PrintDot(keras.callbacks.Callback):
  """Show training progress: one dot per completed epoch, a new line every 100 epochs."""

  def on_epoch_end(self, epoch, logs=None):
    """Print the progress marker for `epoch`.

    `logs` is accepted for compatibility with the Keras callback interface
    (where it defaults to None) and is not used.
    """
    if epoch % 100 == 0:
      print('')
    print('.', end='')

# Maximum number of training epochs
EPOCHS = 500

# Train silently (progress is shown by PrintDot) and keep the training stats;
# 20% of the training rows are held out for validation
history = model.fit(
    train_x,
    train_y,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=[PrintDot()],
)

In [None]:
import matplotlib.pyplot as plt


def plot_history(history):
  """Plot the training and validation mean absolute error per epoch.

  Args:
    history: object returned by `model.fit`; its `.epoch` list and `.history`
      dict are read.
  """
  recorded = history.history
  # Depending on the Keras version, the metric compiled as 'mae' is recorded
  # either as 'mean_absolute_error' or as 'mae'; accept both.
  mae_key = 'mean_absolute_error' if 'mean_absolute_error' in recorded else 'mae'

  plt.figure()
  plt.xlabel('Epoch')
  plt.ylabel('Mean Abs Error')
  plt.plot(history.epoch, np.array(recorded[mae_key]),
           label='Train Loss')
  plt.plot(history.epoch, np.array(recorded['val_' + mae_key]),
           label = 'Val loss')
  plt.legend()

# Plot the error curves of the training run stored in `history`
plot_history(history)

In [None]:
# Error of the trained network on the data it was fitted on.
# These are the training rows, not a held-out test set, so the message is
# labelled accordingly (it previously claimed "Testing set").
[loss, mae] = model.evaluate(train_x, train_y, verbose=0)

print("Training set Mean Abs Error: ${:7.2f}".format(mae))

In [None]:
# Start again from a freshly initialised network
model = build_model()

# Stop training once the validation loss has not improved for `patience` epochs
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)

history = model.fit(
    train_x,
    train_y,
    epochs=EPOCHS,
    validation_split=0.2,
    verbose=0,
    callbacks=[early_stop, PrintDot()],
)

plot_history(history)

Creating submission from the final configuration of the selected model

In [None]:
# Align the test columns with the training features, dropping any attribute
# that is not part of the training set
training_features = train_x[train_x.columns.difference(['SalePrice'])]
test_x = test_x[training_features.columns]

In [None]:
# Predict the sale price of every test row
preds_val = model.predict(test_x)

# The network has a single output unit, so the predictions come back as a
# 2-D (n_rows, 1) array; flatten it so the new column is assigned from a
# plain 1-D array.
result = test_data.assign(SalePrice=np.ravel(preds_val))

# Write the submission file with the two columns written below
submission_file_path = 'submission.csv'
result.to_csv(submission_file_path,sep=',',columns=['Id', 'SalePrice'], index=False)

In [None]:
# Preview the first rows of the submission frame
result.head()