### Import Libraries and Load Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [2]:
#read train and test data from the EDA & Cleaning noteboook
%store -r traindf
%store -r testdf

In [3]:
traindf.shape

(1966, 71)

In [4]:
testdf.shape

(878, 70)

### Feature Engineering
Based on the correlations between feature and target and some basic domain knowledge on housing prices I decided to create some new features to capture the number baths and total square feet as well as polynomial variables of these highly correlated features to see how they impact the score of the model. Lastly I will dummify all the categorical features so that they can be fed into the model.

In [5]:
#Here I am creating two new interaction features within the training and testing data to get the total baths and total square feet
traindf['Baths'] = traindf['Full Bath'] + (traindf['Half Bath'] *0.5) + traindf['Bsmt Full Bath'] + (traindf['Bsmt Half Bath'] *0.5)
traindf['HouseSF'] = traindf['Total Bsmt SF'] + traindf['Gr Liv Area'] + traindf['Garage Area']

testdf['Baths'] = testdf['Full Bath'] + (testdf['Half Bath'] *0.5) + testdf['Bsmt Full Bath'] + (testdf['Bsmt Half Bath'] *0.5)
testdf['HouseSF'] = testdf['Total Bsmt SF'] + testdf['Gr Liv Area'] + testdf['Garage Area']

### Dummify Variables
Encode categorical variables as 1s and 0s so that they can be interpreted by the linear model

In [6]:
#dummify all the categorical features
dummies= traindf[['MS Zoning', 
                   'Street',
                   'Lot Shape',
                   'Land Contour',      
                   'Utilities', 
                   'Lot Config',
                   'Land Slope',
                   'Neighborhood',      
                   'Condition 1',      
                   'Condition 2',       
                   'Bldg Type',        
                   'House Style',
                   'Roof Style', 
                   'Roof Matl',         
                   'Exterior 1st',      
                   'Exterior 2nd',      
                   'Mas Vnr Type',      
                   'Exter Qual',        
                   'Exter Cond',        
                   'Foundation',        
                   'Bsmt Qual',         
                   'Bsmt Cond',         
                   'Bsmt Exposure',     
                   'BsmtFin Type 1',
                   'BsmtFin Type 2',
                    'Heating',           
                    'Heating QC',        
                    'Central Air',       
                    'Electrical',        
                    'Kitchen Qual',      
                    'Functional',               
                    'Paved Drive',
                    'Sale Type']]  

In [7]:
#Create the dummy columns for each categorical feature
dummy_col=pd.concat([pd.get_dummies(dummies[col], drop_first=True) for col in dummies], axis=1, keys=dummies.columns)

In [8]:
#drop the original categorical columns as we replaced these with the dummy columns
traindf.drop(columns= ['MS Zoning', 
                   'Street',
                   'Lot Shape',
                   'Land Contour',      
                   'Utilities', 
                   'Lot Config',
                   'Land Slope',
                   'Neighborhood',      
                   'Condition 1',      
                   'Condition 2',       
                   'Bldg Type',        
                   'House Style',
                   'Roof Style', 
                   'Roof Matl',         
                   'Exterior 1st',      
                   'Exterior 2nd',      
                   'Mas Vnr Type',      
                   'Exter Qual',        
                   'Exter Cond',        
                   'Foundation',        
                   'Bsmt Qual',         
                   'Bsmt Cond',         
                   'Bsmt Exposure',     
                   'BsmtFin Type 1',
                   'BsmtFin Type 2',
                    'Heating',           
                    'Heating QC',        
                    'Central Air',       
                    'Electrical',        
                    'Kitchen Qual',      
                    'Functional',               
                    'Paved Drive',
                    'Sale Type'], inplace= True)

In [9]:
#join the dummy features with the training dataframe
traindf= traindf.join(dummy_col)

  traindf= traindf.join(dummy_col)


In [10]:
dummies_test= testdf[['MS Zoning', 
                   'Street',
                   'Lot Shape',
                   'Land Contour',      
                   'Utilities', 
                   'Lot Config',
                   'Land Slope',
                   'Neighborhood',      
                   'Condition 1',      
                   'Condition 2',       
                   'Bldg Type',        
                   'House Style',
                   'Roof Style', 
                   'Roof Matl',         
                   'Exterior 1st',      
                   'Exterior 2nd',      
                   'Mas Vnr Type',      
                   'Exter Qual',        
                   'Exter Cond',        
                   'Foundation',        
                   'Bsmt Qual',         
                   'Bsmt Cond',         
                   'Bsmt Exposure',     
                   'BsmtFin Type 1',
                   'BsmtFin Type 2',
                    'Heating',           
                    'Heating QC',        
                    'Central Air',       
                    'Electrical',        
                    'Kitchen Qual',      
                    'Functional',               
                    'Paved Drive',
                    'Sale Type']]  

dummy_col2=pd.concat([pd.get_dummies(dummies_test[col], drop_first=True) for col in dummies_test], axis=1, keys=dummies_test.columns)

In [11]:
testdf.drop(columns= ['MS Zoning', 
                   'Street',
                   'Lot Shape',
                   'Land Contour',      
                   'Utilities', 
                   'Lot Config',
                   'Land Slope',
                   'Neighborhood',      
                   'Condition 1',      
                   'Condition 2',       
                   'Bldg Type',        
                   'House Style',
                   'Roof Style', 
                   'Roof Matl',         
                   'Exterior 1st',      
                   'Exterior 2nd',      
                   'Mas Vnr Type',      
                   'Exter Qual',        
                   'Exter Cond',        
                   'Foundation',        
                   'Bsmt Qual',         
                   'Bsmt Cond',         
                   'Bsmt Exposure',     
                   'BsmtFin Type 1',
                   'BsmtFin Type 2',
                    'Heating',           
                    'Heating QC',        
                    'Central Air',       
                    'Electrical',        
                    'Kitchen Qual',      
                    'Functional',               
                    'Paved Drive',
                    'Sale Type'], inplace= True)

In [12]:
testdf= testdf.join(dummy_col2)

  testdf= testdf.join(dummy_col2)


### Polynomial Features 
Create polynomial features to emphasis features that have high correlation to the target variable

In [13]:
#define poly feats
features= ['HouseSF', 'Baths', 'Bedroom AbvGr', 'Overall Qual']
poly_feats= traindf[['HouseSF', 'Baths', 'Bedroom AbvGr', 'Overall Qual']]

In [14]:
# Instantiate PolynomialFeatures
poly= PolynomialFeatures(include_bias = False)

In [15]:
# Create X_poly
X_poly= poly.fit_transform(poly_feats)

# View X_poly in a DataFrame
temp_df= pd.DataFrame(X_poly, columns = poly.get_feature_names_out(features))

#Drop the original columns as these are already in the traindf
temp_df.drop(columns=['HouseSF', 'Baths', 'Bedroom AbvGr',  'Overall Qual'], inplace= True )

#Add these polynomial features to the train dataset
traindf= traindf.join(temp_df)

In [16]:
temp_df.columns

Index(['HouseSF^2', 'HouseSF Baths', 'HouseSF Bedroom AbvGr',
       'HouseSF Overall Qual', 'Baths^2', 'Baths Bedroom AbvGr',
       'Baths Overall Qual', 'Bedroom AbvGr^2', 'Bedroom AbvGr Overall Qual',
       'Overall Qual^2'],
      dtype='object')

In [17]:
missing_values_test = traindf.isna().sum().reset_index()
missing_values_test.columns = ['feature','nulls']
missing_values_test = missing_values_test[missing_values_test['nulls'] != 0].sort_values(by= 'nulls',ascending=False)
missing_values_test

Unnamed: 0,feature,nulls
215,HouseSF^2,83
216,HouseSF Baths,83
217,HouseSF Bedroom AbvGr,83
218,HouseSF Overall Qual,83
219,Baths^2,83
220,Baths Bedroom AbvGr,83
221,Baths Overall Qual,83
222,Bedroom AbvGr^2,83
223,Bedroom AbvGr Overall Qual,83
224,Overall Qual^2,83


In [18]:
traindf.dropna(inplace=True)

In [19]:
# Create X_poly
test_poly= poly.fit_transform(poly_feats)

# View X_poly in a DataFrame
temp_df_test= pd.DataFrame(test_poly, columns = poly.get_feature_names_out(features))

#Drop the original columns as these are already in the traindf
temp_df_test.drop(columns=['HouseSF', 'Baths', 'Bedroom AbvGr', 'Overall Qual'], inplace= True )

#Add these polynomial features to the train dataset
testdf= testdf.join(temp_df_test)

In [20]:
#Save and store the data for model building and evaluation
%store traindf
%store testdf

Stored 'traindf' (DataFrame)
Stored 'testdf' (DataFrame)
