# 1. Prepare the Data Set


In [60]:
# Make the necessary imports
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


In [3]:
# Load the dataset
from google.colab import files
uploaded = files.upload()

Saving Housing Pricing.csv to Housing Pricing.csv


In [None]:
data = pd.read_csv("Housing Pricing.csv")
data.head()

In [5]:
data.shape

(1460, 77)

In [6]:
data.drop('Id',inplace=True,axis=1)

In [None]:
data.sample()

In [8]:
len(data.columns)

76

# 2. Fill in Missing Values


In [None]:
features_with_missing= list(data.columns[data.isna().any()])
features_with_missing

In [10]:
#drops data that is 50% missing
data.dropna(axis=1,thresh=0.5*(len(data)),inplace=True)

In [11]:
len(features_with_missing)

15

In [12]:
mean_fill = ['LotFrontage','MasVnrArea']
bfill= ['FireplaceQu']
ffill = list(set(features_with_missing)-set(mean_fill)-set(bfill))

In [13]:
for feature in mean_fill:
  data[feature].fillna(data[feature].mean(),inplace=True)

In [14]:
for feature in bfill:
  data[feature].fillna(method = 'bfill',inplace = True)

In [15]:
for feature in ffill:
  data[feature].fillna(method = 'ffill',inplace = True)

In [None]:
data.isna().any()

# 3. Encode the Dataset


In [17]:
original_features =list(data.columns)

In [None]:
categorical_data = list(data.select_dtypes(include=['object']).columns)
categorical_data

In [19]:
nominal =['MSZoning','LandContour','LotConfig','Neighborhood']
ordinal = list(set(categorical_data)-set(nominal))
numerical = list(set(original_features)-set(categorical_data))
target= ['SalePrice']

In [20]:
#Integer encoding
for feature in ordinal:
  data[feature]=(data[feature].astype('category')).cat.codes

In [21]:
# One hot encoding
df_nominal= pd.get_dummies(data[nominal])
df_nominal.sample()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker
90,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
df_ordinal = data[ordinal]
df_numerical = data[numerical]

In [23]:
encoded_data= pd.concat([df_nominal,df_numerical,df_ordinal],axis=1)
encoded_data.sample()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,1stFlrSF,...,GarageCars,KitchenAbvGr,YearRemodAdd,PoolArea,LotFrontage,Condition2,PavedDrive,HouseStyle,Electrical,LandSlope,BldgType,FireplaceQu,Exterior1st,Foundation,MasVnrType,GarageType,SaleCondition,GarageCond,Exterior2nd,BsmtExposure,ExterQual,BsmtFinType2,ExterCond,BsmtFinType1,SaleType,GarageQual,Heating,LotShape,Functional,KitchenQual,Utilities,GarageFinish,RoofMatl,Condition1,BsmtQual,Street,RoofStyle,BsmtCond,CentralAir,HeatingQC
1169,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1831,...,3,1,1996,0,118.0,2,2,5,4,0,0,4,6,2,1,1,4,4,6,1,2,5,2,2,8,4,1,0,6,2,0,0,1,2,0,1,3,3,1,0


# 4. Standardize the Dataset


In [24]:
#Data Normalization - numbers are made to be between 0 and 1

In [25]:
X= encoded_data.drop('SalePrice',axis = 1)

In [26]:
y = data[target]

In [27]:
#Standardize y
y= StandardScaler().fit_transform(y)

# 5. Perform Feature selection using L1 

In [28]:
regressor=LassoCV()
regressor.fit(X,y)

  y = column_or_1d(y, warn=True)


LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
        max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [29]:
coef_col = (regressor.coef_)
selected_features =(X.columns[(coef_col != 0)])
selected_features

Index(['MasVnrArea', 'GrLivArea', 'MiscVal', 'GarageArea', 'LotArea',
       'TotalBsmtSF', 'YearBuilt', 'BsmtFinSF1', '2ndFlrSF', 'WoodDeckSF',
       'YearRemodAdd'],
      dtype='object')

After perfomring Lasso Regularization on the data, L1 selected 11 features that will be used for modelling


In [None]:
rejected_features =  X.columns[(coef_col == 0)]
rejected_features

In [31]:
#Drop rejected features.
#Standardize new features.
new_X = encoded_data.drop(rejected_features, axis=1)
new_X = StandardScaler().fit_transform(new_X)

# 6. Train a regression model using Deep Feed Forward ANN on 70% of the data set


In [52]:
X_train,X_test,y_train,y_test=train_test_split(new_X,y,test_size=0.3)

In [65]:
epochs =100
model=Sequential()

model.add(Dense(13, input_shape=(12,), activation='relu'))
model.add(Dense(13, activation='relu'))
model.add(Dense(13, activation='relu'))
model.add(Dense(13, activation='relu'))
model.add(Dense(13, activation='relu'))
model.add(Dense(1,))

model.compile(Adam(learning_rate=0.001),loss='mean_squared_error')

history = model.fit(new_X, y, verbose=0, epochs=epochs)

In [54]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_39 (Dense)             (None, 13)                169       
_________________________________________________________________
dense_40 (Dense)             (None, 13)                182       
_________________________________________________________________
dense_41 (Dense)             (None, 13)                182       
_________________________________________________________________
dense_42 (Dense)             (None, 13)                182       
_________________________________________________________________
dense_43 (Dense)             (None, 13)                182       
_________________________________________________________________
dense_44 (Dense)             (None, 1)                 14        
Total params: 911
Trainable params: 911
Non-trainable params: 0
________________________________________________________

# 7. Test on 30% of the data and explain the metrics


In [66]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [67]:
train_score = r2_score(y_train, y_train_pred)
train_score

0.9991558679280468

In [68]:
test_score = r2_score(y_test, y_test_pred)
test_score

0.9992923666289288

- The metric that was used for this specific model is R- Squared.
- This metrics gives us a measure of how well the actual outcomes are replicated by the model
- R-Squared is always between 0 and 1 or between 0% to 100%
- For this model the R-squared value for the training set is 0.99915 while the R-squared value for the test set is 0.99929.
- This implies that the model is doing a good job at prediction from the 11 selected features