In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory,
# in the order os.walk yields them, then print one path per line.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
#Locations of the training and testing CSV files
filepath_train = '../input/home-data-for-ml-course/train.csv'
filepath_test = '../input/home-data-for-ml-course/test.csv'

#Both files are read the same way, with the 'Id' column used as the row index
X, X_test = (
    pd.read_csv(path, index_col='Id')
    for path in (filepath_train, filepath_test)
)

#Split the target off the training features (pop removes 'SalePrice' from X)
y = X.pop('SalePrice')

In [None]:
#Split the feature names by dtype: object columns are treated as categorical,
#every remaining column as numerical
is_object = (X.dtypes == 'object').to_numpy()
cat_cols = X.columns[is_object].tolist()
#assume_unique=True: column names are taken to be unique
num_cols = np.setdiff1d(X.columns, cat_cols, assume_unique=True)

In [None]:
#Numerical columns with missing values in the training data, followed by
#those that are only missing in the test data
missing_in_train = [col for col in num_cols if X[col].isnull().any()]
missing_only_in_test = [
    col for col in num_cols
    if X_test[col].isnull().any() and col not in missing_in_train
]

#'GarageYrBlt' is imputed separately (with the training mean), so leave it
#out of this list. It is excluded by name rather than by position: a
#positional pop(2) silently drops a different column whenever the order or
#content of the list changes (e.g. when this cell is re-run after
#'GarageYrBlt' has already been filled).
missing_num = [
    col for col in missing_in_train + missing_only_in_test
    if col != 'GarageYrBlt'
]
print(missing_num)

In [None]:
#Fill the gaps in 'GarageYrBlt' with the mean of the training column.
#The training frame is processed first, then the test frame; the test frame
#is filled from the training column's mean as well, not from its own.
garage_year = 'GarageYrBlt'
for frame in (X, X_test):
    frame.loc[:, garage_year] = frame.loc[:, garage_year].fillna(X[garage_year].mean())

In [None]:
#Categorical columns with missing values in the training data, followed by
#those that are only missing in the test data
missing_cat = [col for col in cat_cols if X[col].isnull().any()]
#The guard must look at missing_cat: checking the numerical list is always
#true for a categorical column, which listed every column missing in both
#train and test twice.
missing_cat = missing_cat + [
    col for col in cat_cols
    if X_test[col].isnull().any() and col not in missing_cat
]
print(missing_cat)

In [None]:
#Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

def _make_dense_one_hot_encoder():
    """Return a OneHotEncoder that ignores unknown categories and outputs dense arrays.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    later releases removed the old name. Try the new keyword first and fall
    back to the old one so the notebook runs on both.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


#Numerical columns: remaining missing values become 0, then standardise
num_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())
])

#Categorical columns: missing values become the most frequent category,
#then one-hot encode (categories unseen during fit are encoded as all zeros)
cat_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('OH_encoder', _make_dense_one_hot_encoder())
])

#Apply each pipeline to its own group of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

In [None]:
#Preprocessing and splitting the data
#The preprocessor is fitted on the training features only and then reused,
#unchanged, on the test features.
X_pp = pd.DataFrame(preprocessor.fit_transform(X))
X_test_pp = pd.DataFrame(preprocessor.transform(X_test))
#Fixed random_state: without it every run draws a different validation set,
#so reported validation losses are not comparable between runs.
X_train, X_val, y_train, y_val = train_test_split(X_pp, y, random_state=0)


In [None]:
#Model
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

#Width of the network input, taken from the preprocessed training data.
#A hard-coded value (previously 289) stops matching as soon as the one-hot
#encoding produces a different number of columns.
n_features = X_train.shape[1]

#Fully connected regression network: four ReLU blocks of decreasing width,
#each followed by batch normalisation and 30% dropout, then one linear output
model = keras.Sequential([
    layers.Dense(2048, activation='relu', input_shape=[n_features]),
    layers.BatchNormalization(),

    layers.Dropout(0.3),
    layers.Dense(1024, activation='relu'),
    layers.BatchNormalization(),

    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),

    layers.Dropout(0.3),
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),

    layers.Dropout(0.3),
    layers.Dense(1)

])

#Mean absolute error as the training loss, optimised with Adam
model.compile(
    optimizer='adam',
    loss='mae'
)

#Stop once the monitored loss has not improved by at least min_delta for
#20 consecutive epochs, and roll back to the best weights seen
early_stopping = EarlyStopping(
    min_delta=0.001,
    patience=20,
    restore_best_weights=True,
)

In [None]:
#Fit the network on the training split and evaluate it on the validation
#split after every epoch. The epoch limit is deliberately huge; the early
#stopping callback is what ends the run. verbose=0 keeps the cell silent.
fit_options = dict(
    validation_data=(X_val, y_val),
    batch_size=256,
    epochs=5000,
    callbacks=[early_stopping],
    verbose=0,
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
#One row per epoch, one column per recorded metric
history_df = pd.DataFrame(history.history)

#Plot training and validation loss, skipping the first five epochs
loss_curves = history_df.loc[5:, ['loss', 'val_loss']]
loss_curves.plot()

#Lowest validation loss reached over the whole run
best_val_loss = history_df['val_loss'].min()
print("Best Validation Loss: {:0.4f}".format(best_val_loss))

In [None]:
#Run the trained model on the preprocessed test features
test_features = X_test_pp
predictions = model.predict(test_features)
#A bare name as the last line makes the notebook display the result
predictions

In [None]:
#Saving predictions to file
#One 'SalePrice' column, indexed by the Id of each test row
output = pd.DataFrame(predictions, index=X_test.index, columns=['SalePrice'])
#The index must be written out: with index=False the Id column is dropped
#and the file only contains prices, with nothing tying them to a house.
#index_label pins the header name regardless of how the index is named.
output.to_csv('submission.csv', index=True, index_label='Id')