In [1]:
# Imports:
import pandas as pd
import numpy as np
from utils import *
import seaborn as sns

In [None]:
# Name of the dependent (target) column
y_feat = 'SalePrice'

# Load the dataset bundle once, then take private copies of both splits so
# that later cells never modify the frames stored inside `data`.
data = retrieve_data()
train, test = (data[split].copy() for split in ('train', 'test'))

# Preprocessing:
The general strategy is to combine the training and testing features (both categorical and numerical) into a single dataframe and process them at the same time. For categorical variables, we build encoding dictionaries from the training data and then apply them to the combined dataframe. For numerical features, imputation is done using KNN imputation.

1. Encoding categorical features: I'll be using some functions written in utils.py to come up with meaningful values for the unique keys in each of the categorical features, then map them onto the data given certain conditions.
2. Imputing numerical data: The numerical features are imputed with KNN imputation on the combined train and test data.

## In-depth analysis of categorical variables:
1. Compare the number of NaNs each categorical feature has in the training data with the number it has in the testing data.
2. If at least 90% of the data is present for a given feature (column), map the encoded numerical values onto the dataframe; otherwise, only encode the non-NaN values in the feature and then impute the remaining missing values using another technique. Dropping columns with too many missing values is a general option, but in order to use the data for Neural Networks it makes more sense to just impute the missing values with zeros.

## Encoding categorical variables:
In order to come up with a meaningful value for each unique value in a categorical feature column, we consider the average SalePrice for each of those unique values and weight the averages relative to each other. The important thing to note is that when more than 90% of the data exists in a column, we can simply impute the few missing values with the average SalePrice of the rows where that column is missing. But if less than 90% of the data exists, such a measure would not make sense, and since we are using Neural Networks it makes more sense to impute the missing values with zeros.


In [1]:
# def missing_info(data):
#     """ Returns two dataframes (categorical, numerical) with the per-feature NaN counts in train and test. """
#     # categorical features:
#     cat_dict = {
#         "Test": dict(data['test'][data['test_cat_missing']].isna().sum()), 
#         "Train": dict(data['train'][data['train_cat_missing']].isna().sum())
#     }

#     # numerical features:
#     # NOTE: the labels below are swapped -- "Test" is computed from data['train'] and "Train" from data['test'].
#     num_dict = {
#         "Test": dict(data['train'][data['train_num_missing']].isna().sum()), 
#         "Train": dict(data['test'][data['test_num_missing']].isna().sum())
#     }

#     return pd.DataFrame(cat_dict).fillna(0), pd.DataFrame(num_dict).fillna(0)


In [6]:
# Get the DataFrames: the first result is bound to cat_info (categorical
# features) and the second to num_info (numerical features).
cat_info, num_info = missing_info(data)

NameError: name 'missing_info' is not defined

In [5]:
# Display the categorical summary unpacked from missing_info(data)
cat_info

NameError: name 'cat_info' is not defined

### Important to note for categorical features:
1. Alley, PoolQC, Fence, and MiscFeature are the features with an excessive number of missing values in both training and testing.
2. FireplaceQu is not as bad as the features listed above, but it is going to be treated the same way.
3. For some of these features, though, NA just means that the house doesn't have that feature: Alley, MiscFeature, PoolQC.

Note: To conclude there are 5 features that the np.nan values in them should not be imputed with their given dictionary value but a zero.


In [6]:
# Display the numerical summary unpacked from missing_info(data)
num_info

Unnamed: 0,Test,Train
LotFrontage,259.0,227
MasVnrArea,8.0,15
GarageYrBlt,81.0,78
BsmtFinSF1,0.0,1
BsmtFinSF2,0.0,1
BsmtUnfSF,0.0,1
TotalBsmtSF,0.0,1
BsmtFullBath,0.0,2
BsmtHalfBath,0.0,2
GarageCars,0.0,1


## Numerical data:
Based on this dataframe, there are some features with missing values in Training that are not missing in the test data. There is no need to manually impute anything in the case of numerical values; I am just going to let KNN handle it.

In [9]:
# def combine_train_test(train, test, y_feat='SalePrice'):
#     """ Returns a combined version of the train and test datasets. """
#     train.drop([y_feat], axis=1 , inplace = True) # Drop the dependent column in traininig data
#     feat_cols = train.append(test) # Combine datasets
#     feat_cols.reset_index(inplace=True) # Reset Indexes
#     feat_cols.drop(['index', 'Id'], inplace=True, axis=1) # Drop Id and index columns

#     return feat_cols

In [7]:
# Remember how many rows belong to the training split, so the combined frame
# can be cut back into train and test further along the way.
train_len = len(train)

# Build the combined feature frame from throw-away copies: the original
# `train` and `test` frames stay unchanged and can still be used afterwards.
feat_cols = combine_train_test(
    train.copy(),
    test.copy(),
)

In [8]:
# Display the combined (train + test) feature frame
feat_cols

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
2915,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
2916,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
2917,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [9]:
# def encode_categorical_feature(df, category, y_feature='SalePrice', outlier=False,  include_nan=True):
#     """
#         Given that there categorical variables, we 
#         want to have them ranked based on their value
        
#         # Arguments:
#             df: Dataframe
#             category: the category (feature) to be imputed
#             y_feature: the dependent (target) feature that we base our
#                 ranking on
#             Type: 
#                 'average' would average the values, 
#                 'norm' returns the normalized version of means
#             outlier: given that it is set to True, the outliers in the 
#                 y_feature of the dataframe would not be considered
            
#         # Returns:
#             imputed column values with the encoding dictionary
#             True if the data needed raking False if not
#     """
#     vals_list = list(df[category].unique())
    
#     unique_categories = stringify_keys(vals_list)
#     haveNan = False # Check to see if there is na/nan in unique vales
    
#     # Deleting NaNs since they are going to be considered seperately
#     if 'nan' in unique_categories:
#         haveNan = True
#         i = unique_categories.index('nan')
#         unique_categories.pop(i)
    
    
#     # Dictionary containing mean values of different values in column
#     means = {}
    
#     AVG = 0 # Sum of all averages
    
#     if not include_nan:
#         haveNan = False
#         means[np.nan] = 0
    
    
#     # Going through unique values
#     for cat in unique_categories:
#         cat_avg = df.loc[df[category] == cat][y_feature].mean()
#         means[cat] = cat_avg
#         AVG += cat_avg
        
#     # Now considering the nan's or the values that were not in any of the unique
#     if haveNan:
#         na_avg = df.loc[~df[category].isin(unique_categories)][y_feature].mean()
#         means[np.nan] = na_avg
#         AVG += na_avg
#         unique_categories.append('nan')
    
#     for cat in unique_categories:
#         if cat == 'nan':
#             means[np.nan] = round(means[np.nan] / AVG, 4)
#         else:
#             means[cat] = round(means[cat] / AVG, 4)
    
#     # IF the Type was not softmax return averages
#     return means


# def get_encoding_dicts(df, features):
#     """ Returns the dictionary containing the encoded values for each unique
#     value inside the categorical columns. """
#     cat_dicts = {}
#     len_df = df.shape[0]
    
#     for feature in features:
#         if df[feature].isna().sum() / len_df < 0.1:
#             cat_dicts[feature] = encode_categorical_feature(df, feature)
#         else:
#             cat_dicts[feature] = encode_categorical_feature(df, feature, include_nan=False)
    
#     return cat_dicts

# Get the needed dictionaries to be used for encoding categorical features.
# They are built from the training frame `train` for the columns listed in
# data['train_cat_list'].
cat_dicts = get_encoding_dicts(train, data['train_cat_list'])  

In [10]:
# # Implement: Don't impute ones with more than 10% missing data.
# def encode_categorical(df, cat_dicts):
#     """ Encodes the dataframe's categorical features by mapping them 
# 	to their relative dictionary values. """
    
#     for feature in cat_dicts.keys():
#         df[feature] = df[feature].map(cat_dicts[feature])
		
#     return df

# Do the encoding. A copy of feat_cols is passed in, and the encoded result
# is bound to the separate name encoded_feat_cols.
encoded_feat_cols = encode_categorical(feat_cols.copy(), cat_dicts)

In [12]:
# Inspect the distinct values of the encoded 'Alley' column
encoded_feat_cols['Alley'].unique()

array([0.38729798, 0.25802482, 0.3546772 ])

## Imputing Data with KNN:
- The features of both train and test are going to be imputed together, at the same time, using the KNN algorithm.

In [None]:
# Impute the missing values with KNNImputer
from sklearn.impute import KNNImputer

# Work on the *encoded* frame: KNNImputer needs numeric input, while the raw
# combined frame still holds the categorical columns as strings. The result
# keeps the name `feat_cols` because the following cells split that name.
feat_cols = encoded_feat_cols.copy()

# Get the list of columns with missing values
missings = feat_cols.columns[feat_cols.isna().any()].tolist()
# The number of neighbors the imputer looks at is 1/3 of the whole
# (train + test) dataframe
num = len(feat_cols) // 3

# Imputer object
imputer = KNNImputer(n_neighbors=num, weights="distance")
# Write the imputed values back. Index and column labels are passed explicitly
# so the assignment cannot misalign rows or columns. Nothing to do when no
# column has missing values.
if missings:
    feat_cols[missings] = pd.DataFrame(
        imputer.fit_transform(feat_cols[missings]),
        index=feat_cols.index,
        columns=missings,
    )

In [182]:
# Now rebreak the data into train and test
imp_train = feat_cols[: train_len]
# test-data: restart the row labels at 0. drop=True discards the old labels
# directly instead of materialising an 'index' column and deleting it in place.
imp_test = feat_cols[train_len:].reset_index(drop=True)

In [188]:
# The dependent (target) column, taken from the training frame. It is defined
# here so that the notebook does not depend on leftover kernel state.
dep_col = train[y_feat]
dep_col

0       208500
1       181500
2       223500
3       140000
4       250000
         ...  
1455    175000
1456    210000
1457    266500
1458    142125
1459    147500
Name: SalePrice, Length: 1460, dtype: int64

In [29]:
# Feature Engineering
# Rank the features by their correlation with the target and show the top 20.
# The combined feature frame has no target column, so every feature is
# correlated against the target taken from `train` (rows align on the index).
imp_train.corrwith(train[y_feat]).nlargest(20)

OverallQual     0.790982
Neighborhood    0.738630
GrLivArea       0.708624
ExterQual       0.690933
BsmtQual        0.681905
KitchenQual     0.675721
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
GarageFinish    0.553059
FireplaceQu     0.542181
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
Foundation      0.506328
GarageYrBlt     0.506210
GarageType      0.499204
MasVnrArea      0.477596
Name: SalePrice, dtype: float64

In [191]:
# Breaking the x and y splits:
# Finding the features
# features = imp_train.corr()[y_feat].nlargest(21)[1:].keys().to_list()

# Training datasets
X = imp_train
# The target comes straight from the training frame, so this cell does not
# rely on a variable that no cell defines.
y = train[y_feat]

# It makes more sense to use batchnormalization in NN instead
# of feeding normalized data into the model.
norm_X = normalize(X)
norm_y = normalize(y)

# Testing datasets
X_test = imp_test
norm_X_test = normalize(X_test)

# Check to see if the imputation worked.
# `True in <Series>` tests the index labels, not the values, so the flags are
# reduced with .any() instead.
print('Check nans in X:', bool(X.isna().any().any()))
print('Check nans in y:', bool(y.isna().any()))

Check nans in X: False
Check nans in y: False


In [140]:
# Chunks of data used to check for overfitting
# devs = []
# dev_batch_size = int(imp_train.shape[0] * 0.3)

# for i in range(10):
#     dev_data = imp_train.sample(n=438, random_state=i)
#     dev_x = dev_data[features]
#     dev_y = dev_data[y_feat]
#     devs.append((dev_x, dev_y))

# Fitting parts

In [13]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers, regularizers, losses, metrics
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling


# Scheduler objects to control the optimizer learning rate:
from tensorflow.keras.optimizers.schedules import InverseTimeDecay, ExponentialDecay

def TimeDecayScheduler(learning_rate=0.001, decay_steps=200, decay_rate=1.2, name=""):
    """
        Builds an InverseTimeDecay learning-rate schedule for an optimizer.

        # params:
            learning_rate: the initial learning rate of the schedule
            decay_steps: forwarded as `decay_steps` to InverseTimeDecay
            decay_rate: forwarded as `decay_rate` to InverseTimeDecay
            name: forwarded as `name` to InverseTimeDecay

        # returns: an InverseTimeDecay object
    """
    return InverseTimeDecay(
        initial_learning_rate=learning_rate,
        decay_steps=decay_steps,
        decay_rate=decay_rate,
        name=name
    )


def ExponentialScheduler(initial_learning_rate, decay_steps, decay_rate, name=""):
    """
        Builds an ExponentialDecay learning-rate schedule for an optimizer.

        # params:
            initial_learning_rate: the initial learning rate of the schedule
            decay_steps: forwarded as `decay_steps` to ExponentialDecay
            decay_rate: forwarded as `decay_rate` to ExponentialDecay
            name: forwarded as `name` to ExponentialDecay

        # returns: an ExponentialDecay object
    """
    # Build the schedule this function is named and documented for. It used
    # to return an InverseTimeDecay by mistake.
    return ExponentialDecay(
        initial_learning_rate=initial_learning_rate,
        decay_steps=decay_steps,
        decay_rate=decay_rate,
        name=name
    )


# Actual Optimizers: Adam and RMSprop are the main two optimizers that are going to be used for this project since they accept schedulers and happen to be effective.

from tensorflow.keras.optimizers import Adam, RMSprop

def AdamOptimizer(learning_rate=0.001, scheduler=None):
    """
        Builds an Adam optimizer.

        # params:
        learning_rate: the initial learning rate to be used
        scheduler: If this is passed by the user then use it in the optimizer instead of the learning rate

        # returns: an Adam optimizer
    """
    # Identity check: `== None` would call the scheduler's own __eq__, which
    # may be overloaded. `is None` only matches the real "not given" case.
    if scheduler is None:
        return Adam(learning_rate)
    return Adam(scheduler)
    

def RMSpropOptimizer(learning_rate=0.001, scheduler=None):
    """
        Builds an RMSprop optimizer.

        # params:
            learning_rate: the initial learning rate to be used
            scheduler: If this is passed by the user then use it in the 
            optimizer instead of the learning rate
        
        # returns: an RMSprop optimizer
    """
    # Identity check: `== None` would call the scheduler's own __eq__, which
    # may be overloaded. `is None` only matches the real "not given" case.
    if scheduler is None:
        return RMSprop(learning_rate)
    return RMSprop(scheduler)

# CallBacks:
from tensorflow.keras.callbacks import EarlyStopping

def EarlyStopCallBack(patience=100):
    """
        Builds the early-stopping callback used during fitting.

        # params:
            patience: forwarded to EarlyStopping as its `patience` argument
        # returns: an EarlyStopping callback object that monitors 'val_loss'
    """
    stopper = EarlyStopping(patience=patience, monitor='val_loss')
    return stopper


# Models: 
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization  # Layers 
from tensorflow.keras.regularizers import l2, l1, l1_l2, L1L2  # Regularizer
from tensorflow.keras.losses import MeanSquaredLogarithmicError # Error-metric
import tensorflow_docs as tfdocs # For logging puposes

def Model01(config):
    """
        Placeholder for a configuration-driven model.

        # params:
            config: configuration dictionary intended to drive how the model
                is compiled and fitted (not used yet)

        # returns: intended to return the history object of the fit. The body
            is not implemented yet, so the call currently returns None.
    """

## Ideas to try out and improve the model
1. Weight-Initializers:
    - Use tf.keras.initializers.RandomNormal and tf.keras.initializers.RandomUniform
    - Tweak their properties and see how they would work.
2. Bias in Dense layers:
    - Set up an initializer and regularizer for the bias of the layer
    - Also use those for the weights too
3. Layers:
    - Use LeakyReLU/ThresholdedReLU as a layer
    - Maybe try out tf.keras.layers.experimental.preprocessing.Normalization*
    - Tweak BatchNormalization layer arguments

In [185]:
def build_model01():
    """
        Builds and compiles the fully-connected regression network.

        The input width is read from the notebook-level feature frame `X`,
        so `X` must exist before this function is called.

        # returns: a compiled keras.Sequential model with a single output
            unit, MSLE loss and an Adam optimizer driven by an
            InverseTimeDecay learning-rate schedule
    """
    model = keras.Sequential([
        layers.InputLayer(input_shape=[len(X.keys())]),

        # 64-unit stack
        layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dense(64, activation='elu', kernel_regularizer=regularizers.l2(0.01)),
        layers.Dense(64),
        layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l1(0.001)),
        layers.Dense(64, activation='elu', kernel_regularizer=regularizers.l1(0.001)),

        # 256-unit stack
        layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dense(256, activation='elu', kernel_regularizer=regularizers.l2(0.001)),
        layers.Dense(256, activation='relu'),
        layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l1(0.001)),
        layers.Dense(256, activation='elu', kernel_regularizer=regularizers.l1(0.001)),

        # Wide linear layer followed by dropout
        layers.Dense(1024),
        layers.Dropout(0.5),

        # Narrowing stacks: 16 -> 8 -> 4 units
        layers.Dense(16, activation='elu'),
        layers.Dense(16, activation='elu'),
        layers.Dense(16, activation='relu'),

        layers.Dense(8, activation='elu'),
        layers.Dense(8, activation='elu'),
        layers.Dense(8, activation='relu'),

        layers.Dense(4),
        layers.Dense(4, kernel_regularizer=regularizers.l1_l2(0.001, 0.01)),
        layers.Dense(4),

        # Single regression output
        layers.Dense(1)
    ])

    # Learning-rate schedule for the optimizer.
    # NOTE(review): 1460 looks like the number of training rows -- confirm.
    time_lr = tf.keras.optimizers.schedules.InverseTimeDecay(
        0.0025,
        decay_steps=1460 // 5,
        decay_rate=1.2,
        staircase=False
    )

    optimizer = tf.keras.optimizers.Adam(time_lr)

    model.compile(
        loss=losses.MeanSquaredLogarithmicError(name='MSLE'),
        optimizer=optimizer,
    )

    return model

# Build the network with the builder defined in this notebook
# (`build_model05` is not defined anywhere in it).
model = build_model01()

def validate():
    """
        Evaluates the notebook-level `model` on the first ten entries of the
        notebook-level `devs` list, to check for an overfit or underfit.

        Each entry is indexed as (inputs, targets). The notebook-level
        `batch_size` is passed through to `model.evaluate`.
    """
    for chunk_no in range(10):
        chunk = devs[chunk_no]
        model.evaluate(chunk[0], chunk[1], batch_size=batch_size)

In [172]:
# Maximum number of training epochs
EPOCHS = 3500
batch_size = 1460 // 20

# Early-stopping callback: watches 'val_loss' with a patience of 60 epochs
early_stop = keras.callbacks.EarlyStopping(patience=60, monitor='val_loss')

In [187]:
# Fit on X / y with a 0.33 validation split, quiet logging, early stopping
# and the EpochDots progress callback.
# NOTE(review): the `batch_size` variable defined earlier is not passed to
# fit() here -- confirm whether that is intended.
history = model.fit(
    X,
    y,
    validation_split=0.33,
    epochs=EPOCHS,
    callbacks=[early_stop, tfdocs.modeling.EpochDots()],
    verbose=0,
)
# print('------------------------------------------------------------------------')
# validate()


Epoch: 0, loss:0.6405,  val_loss:0.6394,  
....................................................................................................
Epoch: 100, loss:0.2473,  val_loss:0.2510,  
....................................................................................................
Epoch: 200, loss:0.1323,  val_loss:0.1372,  
....................................................................................................
Epoch: 300, loss:0.0959,  val_loss:0.1013,  
....................................................................................................
Epoch: 400, loss:0.0832,  val_loss:0.0889,  
....................................................................................................
Epoch: 500, loss:0.0780,  val_loss:0.0834,  
....................................................................................................
Epoch: 600, loss:0.0743,  val_loss:0.0800,  
................................................................................

In [192]:
# Predict every row of the test set. No hard-coded `steps` is passed, so the
# number of predicted rows always follows the size of X_test instead of being
# capped at steps * batch_size.
raw_pred = model.predict(X_test, batch_size=20, verbose=0)
pred_y = quantize(pd.DataFrame(raw_pred)[0])

pred_y

[142684, 179834, 193854, 190913, 173967]

In [None]:
def validate(y_pred):
    """
        Prints how far the given predictions are from two earlier benchmark
        submissions.

        # params:
            y_pred: predicted SalePrice values to compare

        Prints the mean absolute error against each benchmark, and between
        the two benchmarks themselves, each truncated to an integer and
        divided by 1000.
    """
    from sklearn.metrics import mean_absolute_error as MAE

    # Load the benchmark submissions (same order as before: 012008, then 011978)
    submissions_root = './submissions/'
    b012 = load_bench_data(file_name='012008.csv', root=submissions_root)['SalePrice']
    b011 = load_bench_data(file_name='011978.csv', root=submissions_root)['SalePrice']

    def in_thousands(reference, candidate):
        """ MAE between the two series, truncated to int, divided by 1000. """
        return int(MAE(reference, candidate)) / 1000

    # Print out the differences
    print('b011:', in_thousands(b011, y_pred))
    print('b012:', in_thousands(b012, y_pred))
    print('-----------------------------------')
    print('base-differences:', in_thousands(b011, b012))

In [177]:
# Write the submission file from the predictions computed above.
# NOTE(review): this used to read a variable named `modified` that no cell in
# the notebook defines. `pred_y` is the only prediction result available, so
# confirm that no extra post-processing step was meant to sit in between.
output = pd.DataFrame({'Id': test.Id,
                      'SalePrice': pred_y})
output.to_csv('submissions/submission.csv', index=False)