In [1]:
# Imports:
import pandas as pd
import numpy as np
from utils import *
import seaborn as sns

In [2]:
# Retrieve Data
data = retrieve_data()
train = data['train'].copy()
test = data['test'].copy()
# The dependent feature
y_feat = 'SalePrice'

# Preprocessing:
The general strategy is to combine both the categorical and numerical values in the training and testing and then process them at the same time. For categorical variables we will be getting dictionaries from the training data and then process them for the combined dataframe. In the case of numerical features, imputation is going to be done using the KNN imputation.

1. Encoding categorical features: I'll be using some functions written in utils.py to come up with meaning values for the unique keys in each of the categorical features, then map them in the data given certain conditions.
2. Imputing numerical data: The numerical 

## In-depth analysis of categorical variables:
1. Compare the different NaNs for the same categories (and not) in the number of NaNs they have.
2. Given that 90% data is not missing for a given feature (column) map their encoded numerical values in the dataframe, otherwise, only impute non-nan values in the feature and then impute the rest of the missing values using any other technique. Dropping the column for values with too many missing might be a general option but in order to use the data for Nerual Networks, it would make sense to just impute the missing values with zeros.

## Encoding categorical variables:
In order to come up with a meaningful value for any given unique value in a categorical feature column, we will be considering the average SalePrice for each of those unique values and weight them relative to each other. The important thing to note would be that given that more than 90% exists in a column we could just impute the minor missing values with the average of SalePrice for those columns. But if less then 90% of the data existed then there would be a problem since our measures would not make sense and since we are using Neural Networks it would make more sense to impute them with zeros.


In [33]:
def missing_info(data):
    """ retuns two dataframes (train, test) defining their relative missing values. """
    # test data:
    cat_dict = {
        "Test": dict(data['test'][data['test_cat_missing']].isna().sum()), 
        "Train": dict(data['train'][data['train_cat_missing']].isna().sum())
    }

    # train data:
    num_dict = {
        "Test": dict(data['train'][data['train_num_missing']].isna().sum()), 
        "Train": dict(data['test'][data['test_num_missing']].isna().sum())
    }

    return pd.DataFrame(cat_dict).fillna(0), pd.DataFrame(num_dict).fillna(0)


In [34]:
# Get the DataFrames
cat_info, num_info = missing_info(data)

In [37]:
cat_info

Unnamed: 0,Test,Train
Alley,1352,1369
MasVnrType,16,8
BsmtQual,44,37
BsmtCond,45,37
BsmtExposure,44,38
BsmtFinType1,42,37
BsmtFinType2,42,38
Electrical,0,1
FireplaceQu,730,690
GarageType,76,81


In [None]:
"""
    # Put this in a markdown cell
    1. Alley, PoolQc, Fence, MiscFeature are the features with an ecessive number of missing values both in training and testing.
    2. FireplaceQu is not as bad ass the described functions but it is going to be treated the same way.

    Note: To conclude there are 5 features that the np.nan values in them should not be imputed with their given dictionary value but a zero.
"""

In [38]:
num_info

Unnamed: 0,Test,Train
LotFrontage,259.0,227
MasVnrArea,8.0,15
GarageYrBlt,81.0,78
BsmtFinSF1,0.0,1
BsmtFinSF2,0.0,1
BsmtUnfSF,0.0,1
TotalBsmtSF,0.0,1
BsmtFullBath,0.0,2
BsmtHalfBath,0.0,2
GarageCars,0.0,1


In [None]:
""" 
    #  Put this in Mark down cell:
    Based on this dataframe, there some features missing in Training that are not missing in the test data. There is no need manually impute anything in the case of numerical values and I am just going to let KNN handle it.
"""

In [40]:
def combine_train_test(train, test, y_feat='SalePrice'):
    train.drop([y_feat], axis=1 , inplace = True) # Drop the dependent column in traininig data
    feat_cols = train.append(test) # Combine datasets
    feat_cols.reset_index(inplace=True) # Reset Indexes
    feat_cols.drop(['index', 'Id'], inplace=True, axis=1) # Drop Id and index columns

    return feat_cols

In [43]:
# Get the length of training data to rebreak the combined data further along the way
train_len = train.shape[0]

# Combine the train and test 
feat_cols = combine_train_test(train.copy(), test.copy())

In [44]:
feat_cols

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
2915,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
2916,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
2917,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [None]:
# Impute the missing values with KNNImputer
from sklearn.impute import KNNImputer

# Get the list of columns with missing values
missings = feat_cols.columns[feat_cols.isna().any()].tolist()
# The number of neighbors that the function look for is the 1/3 of the whole dataframe
num = (train_len + test_len) // 3

# Imputer object
imputer = KNNImputer(n_neighbors=num, weights="distance")
# Get the new imputed data
feat_cols[missings] = pd.DataFrame(imputer.fit_transform(feat_cols[missings]))

In [182]:
# Now rebreak the data into train and test
imp_train = feat_cols[: train_len]
# test-data
imp_test = feat_cols[train_len:].reset_index()
imp_test.drop(['index'], inplace=True, axis=1)

In [188]:
pd.Series(dep_col)

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [29]:
# Feature Engineering
imp_train.corr()[y_feat].nlargest(21)[1:]

OverallQual     0.790982
Neighborhood    0.738630
GrLivArea       0.708624
ExterQual       0.690933
BsmtQual        0.681905
KitchenQual     0.675721
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
GarageFinish    0.553059
FireplaceQu     0.542181
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
Foundation      0.506328
GarageYrBlt     0.506210
GarageType      0.499204
MasVnrArea      0.477596
Name: SalePrice, dtype: float64

In [191]:
# Breaking the x and y splits:
# Finding the features
# features = imp_train.corr()[y_feat].nlargest(21)[1:].keys().to_list()

# Training datasets
X = imp_train
y = dep_col

# It makes more sense to use batchnormalization in NN instead
# of feeding normalized data into the model.
norm_X = normalize(X)
norm_y = normalize(y)

# Testing datasets
X_test = imp_test
norm_X_test = normalize(X_test)

# Check to see if the imputation worked
print('Check nans in X:', True in X.isna().any())
print('Check nans in y:', True in dict(y.isna()).values())

Check nans in X: False
Check nans in y: False


In [140]:
# Chunks of data used to check for overfitting
# devs = []
# dev_batch_size = int(imp_train.shape[0] * 0.3)

# for i in range(10):
#     dev_data = imp_train.sample(n=438, random_state=i)
#     dev_x = dev_data[features]
#     dev_y = dev_data[y_feat]
#     devs.append((dev_x, dev_y))

# Fitting parts

In [184]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers, regularizers, losses, metrics
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

from sklearn.metrics import mean_absolute_error as MAE

In [185]:
# Softmax does not make sense, drop out and batchnormalization works
# For metrics, mse and msle should be considered
# The only place to use the BatchNormalization layer is at the beginning

#
def build_model05():
    model = keras.Sequential([
        layers.InputLayer(input_shape=[len(X.keys())]),
        
        layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dense(64, activation='elu', kernel_regularizer=regularizers.l2(0.01)),
        layers.Dense(64),
        layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l1(0.001)),
        layers.Dense(64, activation='elu', kernel_regularizer=regularizers.l1(0.001)),
        
        layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dense(256, activation='elu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dense(256, activation='relu'),
        layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l1(0.001)),
        layers.Dense(256, activation='elu', kernel_regularizer=regularizers.l2(0.001)),
        
        layers.Dense(1024),
        layers.Dropout(0.5),
        
        layers.Dense(16, activation = 'elu'),
        layers.Dense(16, activation = 'elu'),
        layers.Dense(16, activation = 'relu'),
        
        Dense(8, activation = 'elu'),
        Dense(8, activation = 'elu'),
        Dense(8, activation = 'relu'),
        
        Dense(4),
        Dense(4, kernel_regularizer=regularizers.l1_l2(0.001, 0.01)),
        Dense(4),
        
        layers.Dense(1)
      ])
    
    time_lr = tf.keras.optimizers.schedules.InverseTimeDecay(
      0.0025,
      decay_steps=1460 // 5,
      decay_rate=1.2,
      staircase=False
    )
    
    exp_lr = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate = 0.25, 
        decay_steps=1460 // 20, 
        decay_rate=0.02,
        staircase=False, name=None
    )
    
    optimizer = tf.keras.optimizers.Adam(time_lr)
        
    model.compile(
                loss=losses.MeanSquaredLogarithmicError(name='MSLE'), 
                optimizer=optimizer, 
    )
  
    return model

model = build_model05()

def validate():
    # Check to see if there have been an overfit or underfit
    for i in range(10):
        model.evaluate(devs[i][0], devs[i][1], batch_size=batch_size)

In [172]:
EPOCHS = 3500
batch_size = 1460 // 20

# The patience parameter is the amount of epochs to check for improvement
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=60)

In [187]:
history = model.fit(X, y, epochs=EPOCHS,
          verbose=0, validation_split=0.33,
          callbacks=[early_stop, tfdocs.modeling.EpochDots()])
# print('------------------------------------------------------------------------')
# validate()


Epoch: 0, loss:0.6405,  val_loss:0.6394,  
....................................................................................................
Epoch: 100, loss:0.2473,  val_loss:0.2510,  
....................................................................................................
Epoch: 200, loss:0.1323,  val_loss:0.1372,  
....................................................................................................
Epoch: 300, loss:0.0959,  val_loss:0.1013,  
....................................................................................................
Epoch: 400, loss:0.0832,  val_loss:0.0889,  
....................................................................................................
Epoch: 500, loss:0.0780,  val_loss:0.0834,  
....................................................................................................
Epoch: 600, loss:0.0743,  val_loss:0.0800,  
................................................................................

In [189]:
b012 = load_bench_data(file_name='012008.csv', root='./submissions/')['SalePrice']
b011 = load_bench_data(file_name='011978.csv', root='./submissions/')['SalePrice']

In [151]:
exp = 15
den = (0.12008 ** exp + 0.11978 ** exp)

w012 = 1 - 0.12008 ** exp / den
w011 = 1 - 0.11978 ** exp / den

pred_y = b012 * w012 + b011 * w011

In [192]:
# A very low patience rate for the 
# train_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

# # Fit it to all of the data
# model.fit(X, y, 
#           epochs=1500, steps_per_epoch=5, validation_split=0.3,
#           verbose=0, callbacks=[tfdocs.modeling.EpochDots(), train_stop]
#          )

pred_y = pd.DataFrame(model.predict(X_test, batch_size=20, steps=73, verbose=0))[0]
# It would make sense to convert all of the data to int 
# instead of float since there no floats in trainig.
modified = quantize(pred_y)

modified[:5]

[142684, 179834, 193854, 190913, 173967]

In [193]:
# val_loss of 0.0197 is close
print('b011:', int(MAE(b011, modified)) / 1000)
print('b012:', int(MAE(b012, modified)) / 1000)
print('-----------------------------------')
print('b:', int(MAE(b011, b012)) / 1000)

b011: 20.887
b012: 21.302
-----------------------------------
b: 5.76


In [177]:
output = pd.DataFrame({'Id': test.Id,
                      'SalePrice': modified})
output.to_csv('submissions/submission.csv', index=False)