In [61]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l2
import tensorflow as tf
import random

In [62]:
random.seed(1)
np.random.seed(1)
tf.set_random_seed(1)

In [63]:
df_orig = pd.read_csv("train.csv")
print(df_orig.shape)
print("Null Counts:")
print(df_orig.isnull().sum()[df_orig.isnull().sum() > 0])

(1460, 81)
Null Counts:
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [64]:
df = df_orig.drop(columns=["MiscFeature", "Fence", "PoolQC", "Alley", "Id", "SalePrice"])
print(df.shape)

(1460, 75)


In [65]:
def fill_nulls(df, mean, mode):
    df.fillna(mean, inplace=True)
    df.fillna(mode, inplace=True)

mean = df.mean()
mode = df.mode().iloc[0]
fill_nulls(df, mean, mode)
print(df.isnull().sum()[df.isnull().sum() > 0])

Series([], dtype: int64)


In [66]:
def encode_categorical_features(df, enc):
    # Flatten enc.categories_ which is a list of np.array
    cat_list = np.concatenate(enc.categories_).ravel()
    df[cat_list] = pd.DataFrame(enc.transform(df[categorical_features]).toarray(), index=df.index)
    df = df.drop(columns=categorical_features)
    return df

categorical_features = df.select_dtypes(exclude=[np.number]).columns
onehot = OneHotEncoder(handle_unknown='ignore')
onehot.fit(df[categorical_features])
df = encode_categorical_features(df, onehot)
print(df.shape)

(1460, 203)


In [67]:
def standardize_numerical_features(df):
    numerical_features = df.select_dtypes(include=[np.number]).columns
    scaler = StandardScaler()
    df[numerical_features] = scaler.fit_transform(df[numerical_features])
    return df

df = standardize_numerical_features(df)

In [68]:
X = df
y = df_orig.SalePrice
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1095, 203) (1095,)
(365, 203) (365,)


In [73]:
reg = 0.01
model = Sequential()
model.add(Dense(units=90, activation='relu', input_dim=X_train.shape[1]))
#model.add(Dropout(0.1))
model.add(Dense(units=90, activation='relu'))
#model.add(Dropout(0.1))
model.add(Dense(units=1, activation=None))

model.compile(loss='mean_squared_error',
              optimizer='Adam',
              metrics=['mean_squared_logarithmic_error'])

model.fit(X_train, y_train, epochs=5, batch_size=16)
train_loss, train_msle = model.evaluate(X_train, y_train)
test_loss, test_msle = model.evaluate(X_test, y_test)
print(np.sqrt(train_msle), np.sqrt(test_msle))
# Current best is 0.14

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
3.5632377710728487 3.8069319682324965


In [None]:
df_test_orig = pd.read_csv("test.csv")
df_test = df_test_orig.drop(columns=["MiscFeature", "Fence", "PoolQC", "Alley", "Id"])
fill_nulls(df_test, mean, mode)
df_test = encode_categorical_features(df_test, onehot)
df_test = standardize_numerical_features(df_test)
predictions = model.predict(df_test)
predictions = np.squeeze(predictions)
print(predictions.shape)
df_submit = pd.DataFrame({'Id': df_test_orig.Id, 'SalePrice': predictions})
df_submit.to_csv('submission.csv', index=False)