In [1]:
# Imports:
import pandas as pd
import numpy as np
from utils import *
import seaborn as sns

#### Note: Check all the unique categorical features in both training and testing datasets to see which features are missing in test dataset, This part might not be necessary but could be helpful.

In [4]:
# Retrieve Data
data = retrieve_data()
train = data['train'].copy()
test = data['test'].copy()
train_num = data['train_num']
y_feat = 'SalePrice'

# Generates a dictionary of values corresponding to the 
# categorical features within the dataset
cat_dics = {}
cat_feats = train.select_dtypes('object').columns.to_list()

for feat in cat_feats:
    cat_dics[feat] = rank_categorical_values(train, feat)[0]

# There might be some missing values in the categorical features in the
# testing data, which will be treated as numerical and imputed with that
# respect.
## Note: mappings should be done after combining datasets

# Get the length of dataset so I can rebreak them after combining
train_len = train.shape[0]
test_len = test.shape[0]

# Get the column for the dependent data into a seprate variable
dep_col = train[y_feat]

# Drop the dependent column in train
train.drop([y_feat], axis=1 , inplace = True)
feat_cols = train.append(test) # Combine datasets
feat_cols.reset_index(inplace=True) # Reset Indexes
feat_cols.drop(['index', 'Id'], inplace=True, axis=1) # Drop Id and index columns

# In order to impute and decode data, first it is needed 
# to break it into categorical and numerical datasets
feat_cols_cat = feat_cols.select_dtypes('object').columns.to_list()
feat_cols_num = feat_cols.select_dtypes(['float64', 'int64']).columns.to_list()

# Decode the categorical features in the combined dataset
for feat in cat_feats:
    feat_cols[feat] = impute_rank_weight(feat_cols[feat].copy(), cat_dics[feat])

In [5]:
cat_dics['Neighborhood']

{'CollgCr': 0.04304425976649378,
 'Veenker': 0.05191703153946601,
 'Crawfor': 0.04579673165007603,
 'NoRidge': 0.07290421209470918,
 'Mitchel': 0.03397825609535837,
 'Somerst': 0.049004977454523396,
 'NWAmes': 0.04110569276735894,
 'OldTown': 0.027880390973622476,
 'BrkSide': 0.027143022046897163,
 'Sawyer': 0.029743319483390717,
 'NridgHt': 0.06876761896720276,
 'NAmes': 0.031711944403308295,
 'SawyerW': 0.04056335615506725,
 'IDOTRR': 0.021770198380387268,
 'MeadowV': 0.021433761682225477,
 'Edwards': 0.027879173157315654,
 'Timber': 0.052672549788386036,
 'Gilbert': 0.041932902480024487,
 'StoneBr': 0.06751267852111145,
 'ClearCr': 0.04621870422721212,
 'NPkVill': 0.031026457909772254,
 'Blmngtn': 0.04237132883976904,
 'BrDale': 0.02272037253329444,
 'SWISU': 0.0310040439665444,
 'Blueste': 0.029897015116482902}

In [6]:
# Impute the missing categorical variables with KNNImputer
from sklearn.impute import KNNImputer

missings = feat_cols.columns[feat_cols.isna().any()].tolist()

imputer = KNNImputer(n_neighbors=300, weights="distance")
feat_cols[missings] = pd.DataFrame(imputer.fit_transform(feat_cols[missings]))

In [7]:
# Now rebreak the data into train and test
imp_train = feat_cols.iloc[:train_len]
imp_train.loc[:, (y_feat)] = dep_col
imp_test = feat_cols[train_len:].reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [8]:
imp_train.corr()[y_feat].nlargest(15)[1:]

OverallQual     0.790982
Neighborhood    0.738630
GrLivArea       0.708624
ExterQual       0.690933
BsmtQual        0.681905
KitchenQual     0.675721
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
GarageFinish    0.553059
FireplaceQu     0.542181
TotRmsAbvGrd    0.533723
Name: SalePrice, dtype: float64

In [9]:
# Feature Engineering
imp_train.corr()[y_feat].nlargest(100).keys().to_list()

['SalePrice',
 'OverallQual',
 'Neighborhood',
 'GrLivArea',
 'ExterQual',
 'BsmtQual',
 'KitchenQual',
 'GarageCars',
 'GarageArea',
 'TotalBsmtSF',
 '1stFlrSF',
 'FullBath',
 'GarageFinish',
 'FireplaceQu',
 'TotRmsAbvGrd',
 'YearBuilt',
 'YearRemodAdd',
 'Foundation',
 'GarageYrBlt',
 'GarageType',
 'MasVnrArea',
 'Fireplaces',
 'BsmtFinType1',
 'HeatingQC',
 'MasVnrType',
 'Exterior2nd',
 'Exterior1st',
 'BsmtExposure',
 'BsmtFinSF1',
 'SaleType',
 'SaleCondition',
 'LotFrontage',
 'MSZoning',
 'WoodDeckSF',
 '2ndFlrSF',
 'OpenPorchSF',
 'HouseStyle',
 'GarageQual',
 'GarageCond',
 'HalfBath',
 'LotShape',
 'LotArea',
 'CentralAir',
 'Electrical',
 'RoofStyle',
 'PavedDrive',
 'BsmtFullBath',
 'BsmtCond',
 'BsmtUnfSF',
 'Fence',
 'BldgType',
 'Condition1',
 'RoofMatl',
 'BsmtFinType2',
 'BedroomAbvGr',
 'LandContour',
 'ExterCond',
 'PoolQC',
 'LotConfig',
 'Alley',
 'Functional',
 'Heating',
 'ScreenPorch',
 'Condition2',
 'PoolArea',
 'MiscFeature',
 'LandSlope',
 'MoSold',
 '3Ss

In [10]:
# Breaking the x and y splits:
# Finding the features
features = imp_train.corr()[y_feat].nlargest(100)[1:].keys().to_list()

# Training datasets
X = imp_train[features]
y = imp_train[y_feat]

# It makes more sense to use batchnormalization in NN instead
# of feeding normalized data into the model.
norm_X = normalize(X)
norm_y = normalize(y)

# Testing datasets
X_test = imp_test[features]
norm_X_test = normalize(X_test)

In [11]:
# Chunks of data used to check for overfitting
devs = []
dev_batch_size = int(imp_train.shape[0] * 0.3)

for i in range(10):
    dev_data = imp_train.sample(n=438, random_state=i)
    dev_x = dev_data[features]
    dev_y = dev_data[y_feat]
    devs.append((dev_x, dev_y))

In [12]:
# Check to see if the imputation worked
True in X.isna().any()

False

# Fitting parts

In [145]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers, regularizers, losses, metrics
from layers import Dense, Dropout, BatchNormalization

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

from sklearn.metrics import mean_absolute_error as MAE

In [181]:
X['GarageCars'][10:20]

10    1.0
11    3.0
12    1.0
13    3.0
14    1.0
15    2.0
16    2.0
17    2.0
18    2.0
19    1.0
Name: GarageCars, dtype: float64

In [173]:
# Softmax does not make sense, drop out and batchnormalization works
# For metrics, mse and msle should be considered
# The only place to use the BatchNormalization layer is the 
def build_model05():
    model = keras.Sequential([
        layers.InputLayer(input_shape=[len(X.keys())]),
        layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.0001)),
        layers.Dense(64, activation='elu', kernel_regularizer=regularizers.l2(0.01)),
        layers.Dense(64),
        
        layers.Dense(256, activation='relu'),
        layers.Dense(256, activation='elu', kernel_regularizer=regularizers.l2(0.01)),
        layers.Dense(256),
        
        layers.Dense(2048, kernel_regularizer=regularizers.l2(0.1)),
        layers.Dropout(0.70),
        layers.Dense(256, activation = 'elu', kernel_regularizer=regularizers.l1_l2(0.001, 0.001)),
        layers.Dense(256, activation = 'elu', kernel_regularizer=regularizers.l1(0.001)),
        
        layers.Dense(16, activation = 'elu'),
        layers.Dense(16, activation = 'relu'),
        layers.Dense(16, activation = 'elu'),
        layers.Dense(1)
      ])
    
    time_lr = tf.keras.optimizers.schedules.InverseTimeDecay(
      0.0025,
      decay_steps=1460 // 5,
      decay_rate=1.2,
      staircase=False
    )
    
    exp_lr = tf.keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate = 0.0025, 
        decay_steps=1460 // 5, 
        decay_rate=0.5, 
        staircase=False, name=None
    )
    
    optimizer = tf.keras.optimizers.Adam(exp_lr)
        
    model.compile(
                loss=losses.MeanSquaredLogarithmicError(name='MSLE'), 
                optimizer=optimizer, 
                metrics=[metrics.MeanSquaredLogarithmicError(name='msle')]
    )
  
    return model

model = build_model05()

def validate():
    # Check to see if there have been an overfit or underfit
    for i in range(10):
        model.evaluate(devs[i][0], devs[i][1], batch_size=batch_size)

In [151]:
EPOCHS = 2500
batch_size = 1460 // 20

# The patience parameter is the amount of epochs to check for improvement
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=110)

In [169]:
history = model.fit(X, y, batch_size=batch_size, epochs=EPOCHS,
          verbose=0, validation_data=devs[0], steps_per_epoch=3,
          callbacks=[early_stop, tfdocs.modeling.EpochDots()])
print('------------------------------------------------------------------------')
validate()


Epoch: 0, loss:190.1047,  msle:124.5566,  val_loss:181.6721,  val_msle:117.6487,  
....................................................................................................
Epoch: 100, loss:13.6205,  msle:11.2187,  val_loss:14.4337,  val_msle:12.0475,  
....................................................................................................
Epoch: 200, loss:5.3139,  msle:4.4443,  val_loss:5.2251,  val_msle:4.3574,  
....................................................................................................
Epoch: 300, loss:2.9869,  msle:2.3947,  val_loss:3.0826,  val_msle:2.4919,  
....................................................................................................
Epoch: 400, loss:1.8248,  msle:1.3620,  val_loss:1.8477,  val_msle:1.3857,  
....................................................................................................
Epoch: 500, loss:1.3172,  msle:0.8993,  val_loss:1.2426,  val_msle:0.8252,  
......................

In [174]:
history = model.fit(X, y, batch_size=batch_size, epochs=EPOCHS,
          verbose=0, validation_data=devs[0], steps_per_epoch=3,
          callbacks=[early_stop, tfdocs.modeling.EpochDots()])
print('------------------------------------------------------------------------')
validate()


Epoch: 0, loss:190.9156,  msle:125.4110,  val_loss:129.1139,  val_msle:64.8590,  
....................................................................................................
Epoch: 100, loss:5.4127,  msle:0.6525,  val_loss:6.0034,  val_msle:1.2683,  
....................................................................................................
Epoch: 200, loss:2.6480,  msle:0.3259,  val_loss:2.9844,  val_msle:0.6668,  
....................................................................................................
Epoch: 300, loss:1.8969,  msle:0.2542,  val_loss:2.0800,  val_msle:0.4403,  
....................................................................................................
Epoch: 400, loss:1.3533,  msle:0.1859,  val_loss:1.4236,  val_msle:0.2594,  
....................................................................................................
Epoch: 500, loss:1.0871,  msle:0.1900,  val_loss:1.0133,  val_msle:0.1182,  
...........................

In [30]:
b012 = load_bench_data(file_name='012008.csv', root='./submissions/')['SalePrice']
b011 = load_bench_data(file_name='011978.csv', root='./submissions/')['SalePrice']

In [151]:
exp = 15
den = (0.12008 ** exp + 0.11978 ** exp)

w012 = 1 - 0.12008 ** exp / den
w011 = 1 - 0.11978 ** exp / den

pred_y = b012 * w012 + b011 * w011

In [175]:
# Fit it to all of the data
model.fit(X, y, 
          epochs=500, steps_per_epoch=5, 
          verbose=0, callbacks=[tfdocs.modeling.EpochDots()]
         )

pred_y = pd.DataFrame(model.predict(X_test, batch_size=20, steps=73, verbose=0))[0]
# It would make sense to convert all of the data to int 
# instead of float since there no floats in trainig.
modified = quantize(pred_y)

modified[:5]


Epoch: 0, loss:0.2186,  msle:0.0720,  
....................................................................................................
Epoch: 100, loss:0.1973,  msle:0.0642,  
....................................................................................................
Epoch: 200, loss:0.1753,  msle:0.0534,  
....................................................................................................
Epoch: 300, loss:0.1611,  msle:0.0499,  
....................................................................................................
Epoch: 400, loss:0.1471,  msle:0.0454,  
....................................................................................................

[123193, 146664, 175908, 174839, 156534]

In [176]:
# val_loss of 0.0197 is close
print('b011:', int(MAE(b011, modified)) / 1000)
print('b012:', int(MAE(b012, modified)) / 1000)
print('-----------------------------------')
print('b:', int(MAE(b011, b012)) / 1000)

b011: 21.68
b012: 22.369
-----------------------------------
b: 5.76


In [177]:
output = pd.DataFrame({'Id': test.Id,
                      'SalePrice': modified})
output.to_csv('submissions/submission.csv', index=False)