In [70]:
import pandas as pd
import numpy as np
from utils import *

# Retrieve Data
data = retrieve_data()
train = data['train']
test = data['test']
train_num = data['train_num']
y_feature = 'SalePrice'

train = train.drop(['Id'], axis=1)

In [71]:
# Test data now
test_cat = data['test_cat']
# Get the dics for the missing values in the test dataset
# Sicne there are no 'SalePrice' features in the test set
# then we should apply the values for train to test
dics = {}
for feat in test_cat:
    dics[feat] = rank_categorical_values(train, feat)
    test[feat] = impute_rank_weight(test[feat].copy(), dics[feat][0])

In [72]:
# Numerical encoding for test data:
# LotFrontage and MasVnrArea should equal zero when na 
zeros = [
    'LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',  
    'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars'
]

imp = ['GarageYrBlt', 'GarageArea']

for feat in zeros:
    test[feat] = test[feat].fillna(0)
    
for im in imp:
    test[im] = test[im].fillna(test[im].mean())

In [73]:
# Impute both train and test data
# 1. encode categorical
train = encode_categorical(train.copy(), data['train_cat'].copy(), y_feature='SalePrice')
# 2. impute numericals
train = impute_numerical(train)
# 3. randomize the data
normalize_train = normalize(train)

In [74]:
# Breaking the x and y splits:
# Finding the features
features = normalize_train.corr()['SalePrice'].nlargest(9)[1:].keys().to_list()
X = normalize_train[features]
y = normalize_train[y_feature]


In [102]:

devs = []
for i in range(10):
    dev_data = train.sample(n=438, random_state=i)
    dev_x = dev_data[features]
    dev_y = dev_data[y_feature]
    devs.append((dev_x, dev_y))

In [75]:
# Check to see if the imputation worked
True in normalize_train.isna().any()

False

In [76]:
normalize_train.head(5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0.07335,0.387032,0.212804,-0.207071,0.064216,0.223014,-0.736346,-0.057799,0.026171,-0.345831,...,-0.068668,-0.04466,0.445272,0.168335,-0.087658,-1.598563,0.13873,-0.255454,-0.195569,0.347154
1,-0.872264,0.387032,0.645526,-0.091855,0.064216,0.223014,-0.736346,-0.057799,0.026171,-0.259309,...,-0.068668,-0.04466,0.445272,0.168335,-0.087658,-0.488943,-0.614228,-0.255454,-0.195569,0.007286
2,0.07335,0.387032,0.299349,0.073455,0.064216,0.223014,1.14692,-0.057799,0.026171,-0.345831,...,-0.068668,-0.04466,0.445272,0.168335,-0.087658,0.990552,0.13873,-0.255454,-0.195569,0.53597
3,0.309753,0.387032,0.068564,-0.096864,0.064216,0.223014,1.14692,-0.057799,0.026171,0.06097,...,-0.068668,-0.04466,0.445272,0.168335,-0.087658,-1.598563,-1.367186,-0.255454,-1.176173,-0.515105
4,0.07335,0.387032,0.760919,0.37502,0.064216,0.223014,1.14692,-0.057799,0.026171,-0.259309,...,-0.068668,-0.04466,0.445272,0.168335,-0.087658,2.100173,0.13873,-0.255454,-0.195569,0.869545


In [77]:
# Some categorical features in training data don't have nan's but
# they do in test, hence the transform method of sklearn.simpleimputer will be used
from sklearn.impute import SimpleImputer

missing_test = test[test.columns[test.isna().any()].tolist()]

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

imp_mean.fit(train[test.columns[test.isna().any()].tolist()])
d = imp_mean.transform(np.array(missing_test))

df = {}
for feat in test.columns[test.isna().any()].tolist():
    df[feat] = d[:, test.columns[test.isna().any()].tolist().index(feat)]
test[test.columns[test.isna().any()].tolist()] = pd.DataFrame(df)

In [78]:
True in list(test.isna().any())

False

In [79]:
# Set the normalized X_test
X_test = normalize(test[features].copy())
# Get the mean and std to remake the predicted values
std = data['train'][y_feature].std()
mean = data['train'][y_feature].mean()

In [80]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
import matplotlib.pyplot as plt
import seaborn as sns

In [161]:
# Softmax does not make sense, drop out and batchnormalization works
def build_model03():
  model = keras.Sequential([
    layers.InputLayer(input_shape=[len(X.keys())]),
      
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dense(64),
    layers.BatchNormalization(),
      
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dense(128),
    layers.BatchNormalization(),
      
    layers.Dense(256, activation='relu'),
    layers.Dense(64),
    
    layers.BatchNormalization(),
    layers.Dense(16), 
    layers.Dense(1)
  ])

  optimizer = tf.keras.optimizers.Adam(0.001)

  model.compile(loss='msle',
                optimizer=optimizer,
               )
  return model

model = build_model03()

# The patience parameter is the amount of epochs to check for improvement
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=250)

In [148]:
EPOCHS = 1000
batch_size = 128 

In [162]:
model.fit(X, y, batch_size=batch_size, epochs=EPOCHS,
          verbose=0, validation_data=devs[0],
          callbacks=[early_stop, tfdocs.modeling.EpochDots()])


Epoch: 0, loss:0.1813,  val_loss:144.7977,  
....................................................................................................
Epoch: 100, loss:0.0469,  val_loss:120.4040,  
....................................................................................................
Epoch: 200, loss:0.0475,  val_loss:121.4320,  
....................................................................................................
Epoch: 300, loss:0.0412,  val_loss:129.0302,  
.....................................................

<tensorflow.python.keras.callbacks.History at 0x7f426e52d5b0>

In [146]:
for i in range(10):
    model.evaluate(devs[i][0], devs[i][1], batch_size=batch_size)



In [22]:
def f(x, std, mean):
    exponent = ((x - mean) / std) ** 2 * (-1) * 0.5
    hyp = std * np.sqrt(2 * np.pi)
    
    return np.exp(exponent) / hyp

In [129]:
pred_y = model.predict(X_test, batch_size=20, steps=73, verbose=0)
pred = pred_y * std + mean

In [130]:
# It would make sense to convert all of the data to int 
# instead of float since there no floats in trainig.
modified = [] 
for num in list(pd.DataFrame(pred)[0].values):
    if num - int(num) >= 0.5:
        modified.append(int(num) + 1)
    else:
        modified.append(int(num))

In [131]:
modified[:10]

[83332, 133333, 157436, 177533, 219502, 173956, 140336, 159178, 185269, 72267]

In [132]:
output = pd.DataFrame({'Id': test.Id,
                      'SalePrice': modified})
output.to_csv('submission.csv', index=False)