In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

In [15]:
# Read the training table from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='train.csv')

In [16]:
# Houses with 4000 sq ft or more of above-ground living area are treated as
# outliers; keep only the rows below that size.
is_regular_size = data["GrLivArea"] < 4000
data = data.loc[is_regular_size]

In [17]:
# Columns left out of the modelling frame
unused_columns = [
    "Id", "PoolQC", "MiscVal", "MiscFeature", "Fence",
    "FireplaceQu", "LotFrontage", "Alley", "GarageYrBlt",
]
data_new = data.drop(unused_columns, axis=1)

In [18]:
# Fill the missing MasVnrArea values with the column median.
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and later
# removed; `sklearn.impute.SimpleImputer` is its replacement. Fall back to the
# old class so the cell still runs on legacy installs.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer as SimpleImputer

median_imputer = SimpleImputer(strategy='median')
# Select with a list of columns so the imputer receives the 2-D
# (n_samples, 1) input it expects; Series.reshape is deprecated and is not
# available in current pandas. ravel() flattens the (n_samples, 1) result back
# into a single column of values.
data_new['MasVnrArea'] = median_imputer.fit_transform(data_new[['MasVnrArea']]).ravel()
# NOTE(review): the median is learned from every row, i.e. before the
# train/test split performed further down.

# Number of missing values per column (all dtypes), largest first
total = data_new.isnull().sum().sort_values(ascending=False)

  after removing the cwd from sys.path.


In [19]:
# Numeric (float / int / bool) columns are carried over unchanged
numerical_features = data_new.select_dtypes(include=["float","int","bool"]).copy()

# Text columns: convert each one to a categorical and keep only its integer
# codes (pandas uses the code -1 for missing values).
obj_df = data_new.select_dtypes(include=['object']).copy()
for column_name in obj_df.columns:
    obj_df[column_name] = obj_df[column_name].astype('category').cat.codes

# Put both parts side by side: numeric columns first, encoded columns after
dataset = pd.concat([numerical_features, obj_df], axis=1)

In [20]:
# Hold out a quarter of the rows as a test (validation) set, using every
# column except the target as a feature.
frac_test = 0.25
y = dataset["SalePrice"]
X = dataset.drop("SalePrice", axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=frac_test, random_state=5
)

# Report the shapes of the full frame and of each partition
for heading, shapes in (
    ('Full data size:', (dataset.shape, data['SalePrice'].shape)),
    ('\nTraining data size:', (X_train.shape, y_train.shape)),
    ('\nTest data size:', (X_test.shape, y_test.shape)),
):
    print(heading)
    print(*shapes)

Full data size:
(1456, 72) (1456,)

Training data size:
(1092, 71) (1092,)

Test data size:
(364, 71) (364,)


In [21]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaling statistics are learned from the
# training rows ONLY and then re-used, unchanged, on the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# (the fitted statistics can be inspected via scaler.mean_ and scaler.scale_)
X_test_scaled = scaler.transform(X_test)

In [22]:
# Rank the features by their correlation with SalePrice, computed on the
# training rows only.
corr_concat = pd.concat([X_train, y_train], axis=1)
corrmat = corr_concat.corr()
# Remove the target's self-correlation by label. The previous positional
# `.iloc[1:]` only worked as long as SalePrice happened to sort first.
corr_list = corrmat['SalePrice'].drop('SalePrice').sort_values(ascending=False)

# Keep features whose correlation with the target is above 0.45 or below -0.4.
# NOTE(review): the two thresholds are asymmetric (the earlier comment claimed
# -0.45); the code's values are kept so the selected feature set is unchanged.
feat = corr_list[(corr_list > 0.45) | (corr_list < -0.4)].index.tolist()

# Hand-picked exclusions from the selected set.
# NOTE(review): the original note said these have "low correlation among them";
# presumably they were dropped as redundant with other selected features — confirm.
remove_list = ['1stFlrSF','GarageArea','TotRmsAbvGrd','YearRemodAdd']
feat = [x for x in feat if x not in remove_list]

In [33]:
# Redo the train/test split, this time restricted to the selected features.
# Same test fraction and random_state as the first split.
# NOTE(review): these frames are unscaled; X_train_scaled / X_test_scaled from
# the scaler cell are not used by any of the cells visible here.
frac_test = 0.25
X, y = dataset[feat], dataset["SalePrice"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=frac_test, random_state=5
)

# Report the shapes of the reduced frame and of each partition
shape_report = [
    ('Full data size:', dataset[feat].shape, data['SalePrice'].shape),
    ('\nTraining data size:', X_train.shape, y_train.shape),
    ('\nTest data size:', X_test.shape, y_test.shape),
]
for title, features_shape, target_shape in shape_report:
    print(title)
    print(features_shape, target_shape)

Full data size:
(1456, 11) (1456,)

Training data size:
(1092, 11) (1092,)

Test data size:
(364, 11) (364,)


In [39]:
import numpy as np
import os
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

# Seed NumPy's global random generator.
# NOTE(review): the Keras backend may keep its own random state, so this alone
# might not make training fully reproducible — confirm for the backend in use.
seed = 7
np.random.seed(seed)

# Take the input width from the training frame instead of hard-coding 11, so
# the network stays valid if the selected feature list changes.
n_features = X_train.shape[1]

# Model: fully connected regressor, 200-100-50-25 ReLU units and one linear output
model = Sequential()
model.add(Dense(200, input_dim=n_features, kernel_initializer='normal', activation='relu'))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
model.add(Dense(50, kernel_initializer='normal', activation='relu'))
model.add(Dense(25, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
# Compile model: mean squared logarithmic error, Adadelta with default settings
model.compile(loss= 'mean_squared_logarithmic_error', optimizer=keras.optimizers.Adadelta())

feature_cols = X_train
labels = y_train
print(feature_cols.shape)

# Train on plain NumPy arrays. The returned History object is kept so the loss
# per epoch can be inspected afterwards instead of only echoing its repr.
# NOTE(review): the model is fitted on the unscaled X_train.
history = model.fit(np.array(X_train), np.array(y_train), epochs=50, batch_size=10)

(1092, 11)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fe07f0ed828>

In [40]:
# Score the fitted model on the held-out rows produced by train_test_split;
# the value shown is the loss the model was compiled with.
test_features = np.array(X_test)
test_targets = np.array(y_test)
model.evaluate(test_features, test_targets)



0.046024817952906694

In [37]:
# Predict sale prices for the held-out test rows
import itertools
feature_cols_test = X_test
labels_test = y_test

m = model.predict(np.array(feature_cols_test))
# Collect the prediction rows into a list. The bound is the number of test
# labels; the previous bound used the full target `y`, which is longer than
# the test set and only worked because islice stops when `m` is exhausted.
predictions = list(itertools.islice(m, labels_test.shape[0]))

In [38]:
# Sanity check: number of predictions collected from the model for X_test
len(predictions)

364