In [135]:
import pandas as pd
import numpy as np
from utils import *

In [None]:
# Another more concrete way of imputing data could be to seperate the
# independent feature columns from both train and test set and then 
# impute them all at once, then break it again into test and train
# and then run the model on it.

In [198]:
# Retrieve Data
data = retrieve_data()
train = data['train'].copy()
test = data['test'].copy()
train_num = data['train_num']
y_feat = 'SalePrice'

#### Note: Check all the unique categorical features in both training and testing datasets to see which features are missing in test dataset, This part might not be necessary but could be helpful.

In [199]:
# Generates a dictionary of values corresponding to the 
# categorical features within the dataset
cat_dics = {}
cat_feats = train.select_dtypes('object').columns.to_list()

for feat in cat_feats:
    cat_dics[feat] = rank_categorical_values(train, feat)[0]

# There might be some missing values in the categorical features in the
# testing data, which will be treated as numerical and imputed with that
# respect.
## Note: mappings should be done after combining datasets

In [200]:
# Get the length of dataset so I can rebreak them after combining
train_len = train.shape[0]
test_len = test.shape[0]

# Get the column for the dependent data into a seprate variable
dep_col = train[y_feature]

# Drop the dependent column in train
train.drop([y_feat], axis=1 , inplace = True)
feat_cols = train.append(test) # Combine datasets
feat_cols.reset_index(inplace=True) # Reset Indexes
feat_cols.drop(['index', 'Id'], inplace=True, axis=1) # Drop Id and index columns

In [201]:
# In order to impute and decode data, first it is needed 
# to break it into categorical and numerical datasets
feat_cols_cat = feat_cols.select_dtypes('object').columns.to_list()
feat_cols_num = feat_cols.select_dtypes(['float64', 'int64']).columns.to_list()

In [202]:
# Decode the categorical features in the combined dataset
for feat in cat_feats:
    feat_cols[feat] = impute_rank_weight(feat_cols[feat].copy(), cat_dics[feat])

In [203]:
missings = feat_cols.columns[feat_cols.isna().any()].tolist()

In [204]:
# Impute the missing categorical variables with KNNImputer
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=300, weights="distance")
feat_cols[missings] = pd.DataFrame(imputer.fit_transform(feat_cols[missings]))

In [205]:
# Now rebreak the data into train and test
imp_train = feat_cols[:train_len]
imp_train[y_feature] = dep_col
imp_test = feat_cols[train_len:].reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imp_train[y_feature] = dep_col


In [251]:
imp_train.corr()[y_feature].nlargest(18)[1:]

OverallQual     0.790982
Neighborhood    0.738630
GrLivArea       0.708624
ExterQual       0.690933
BsmtQual        0.681905
KitchenQual     0.675721
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
GarageFinish    0.553059
FireplaceQu     0.542181
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
Foundation      0.506328
Name: SalePrice, dtype: float64

In [252]:
# Breaking the x and y splits:
# Finding the features
features = imp_train.corr()[y_feature].nlargest(18)[1:].keys().to_list()

# Training datasets
X = imp_train[features]
y = imp_train[y_feature]
# It makes more sense to use batchnormalization in NN instead
# of feeding normalized data into the model.
norm_X = normalize(X)
norm_y = normalize(y)

# Testing datasets
X_test = imp_test[features]
norm_X_test = normalize(X_test)

In [253]:
# Chunks of data used to check for overfitting
devs = []
dev_batch_size = int(imp_train.shape[0] * 0.3)

for i in range(10):
    dev_data = imp_train.sample(n=438, random_state=i)
    dev_x = dev_data[features]
    dev_y = dev_data[y_feature]
    devs.append((dev_x, dev_y))

In [225]:
# Check to see if the imputation worked
True in X.isna().any()

False

In [227]:
norm_X.head(5)

Unnamed: 0,OverallQual,Neighborhood,GrLivArea,ExterQual,BsmtQual,KitchenQual,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath
0,0.651256,0.290473,0.370207,0.9239,0.401816,0.581115,0.311618,0.35088,-0.459145,-0.793162,0.78947
1,-0.071812,0.985904,-0.482347,-0.666429,0.401816,-0.763002,0.311618,-0.06071,0.466305,0.257052,0.78947
2,0.651256,0.290473,0.514836,0.9239,0.401816,0.581115,0.311618,0.63151,-0.313261,-0.627611,0.78947
3,0.651256,0.506207,0.383528,-0.666429,-0.741365,0.581115,1.649742,0.790533,-0.687089,-0.521555,-1.025689
4,1.374324,2.630839,1.298881,0.9239,0.401816,0.581115,1.649742,1.697903,0.199611,-0.045596,0.78947


# Fitting parts

In [254]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
import matplotlib.pyplot as plt
import seaborn as sns

In [255]:
# Softmax does not make sense, drop out and batchnormalization works
def build_model04():
  model = keras.Sequential([
    layers.InputLayer(input_shape=[len(X.keys())]),
    layers.BatchNormalization(),
    layers.Dense(64),
    layers.Dense(64, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)
  ])

  optimizer = tf.keras.optimizers.Adam(0.001)

  model.compile(loss='msle', optimizer=optimizer,)
  return model

model = build_model04()

# The patience parameter is the amount of epochs to check for improvement
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=250)

In [256]:
EPOCHS = 1000
batch_size = 128 

In [233]:
model.fit(X, y, batch_size=batch_size, epochs=EPOCHS,
          verbose=0, validation_data=devs[0],
          callbacks=[early_stop, tfdocs.modeling.EpochDots()])


Epoch: 0, loss:123.1985,  val_loss:81.0794,  
....................................................................................................
Epoch: 100, loss:0.2243,  val_loss:0.2128,  
....................................................................................................
Epoch: 200, loss:0.0778,  val_loss:0.0730,  
....................................................................................................
Epoch: 300, loss:0.0411,  val_loss:0.0384,  
....................................................................................................
Epoch: 400, loss:0.0298,  val_loss:0.0280,  
....................................................................................................
Epoch: 500, loss:0.0276,  val_loss:0.0250,  
....................................................................................................
Epoch: 600, loss:0.0255,  val_loss:0.0246,  
.............................................................................

<tensorflow.python.keras.callbacks.History at 0x7f86bc4fd850>

In [257]:
model.fit(X, y, batch_size=batch_size, epochs=EPOCHS,
          verbose=0, validation_data=devs[0],
          callbacks=[early_stop, tfdocs.modeling.EpochDots()])


Epoch: 0, loss:124.6541,  val_loss:40.6012,  
....................................................................................................
Epoch: 100, loss:0.2236,  val_loss:0.2366,  
....................................................................................................
Epoch: 200, loss:0.0742,  val_loss:0.0702,  
....................................................................................................
Epoch: 300, loss:0.0450,  val_loss:0.0416,  
....................................................................................................
Epoch: 400, loss:0.0326,  val_loss:0.0305,  
....................................................................................................
Epoch: 500, loss:0.0264,  val_loss:0.0251,  
....................................................................................................
Epoch: 600, loss:0.0243,  val_loss:0.0231,  
.............................................................................

<tensorflow.python.keras.callbacks.History at 0x7f86b4371940>

In [258]:
for i in range(10):
    model.evaluate(devs[i][0], devs[i][1], batch_size=batch_size)



In [264]:
model.save_weights('./weights/3-015')

In [22]:
def f(x, std, mean):
    exponent = ((x - mean) / std) ** 2 * (-1) * 0.5
    hyp = std * np.sqrt(2 * np.pi)
    
    return np.exp(exponent) / hyp

In [259]:
pred_y = pd.DataFrame(model.predict(X_test, batch_size=20, steps=73, verbose=0))[0]

In [260]:
# It would make sense to convert all of the data to int 
# instead of float since there no floats in trainig.
modified = [] 
for num in list(pd.DataFrame(pred_y)[0].values):
    if num - int(num) >= 0.5:
        modified.append(int(num) + 1)
    else:
        modified.append(int(num))

In [261]:
modified[:10]

[120429,
 143637,
 178952,
 190009,
 215563,
 180544,
 172236,
 179009,
 178711,
 122787]

In [262]:
output = pd.DataFrame({'Id': test.Id,
                      'SalePrice': modified})
output.to_csv('submission.csv', index=False)