# Feature Engineering for House Prices

### Import Data and Libraries

In [1]:
# import pandas and MinMaxScaler from sklearn
import pandas as pd
from sklearn.preprocessing import MinMaxScaler 

In [3]:
# Load the house data set from the CSV file next to this notebook
# into a pandas DataFrame.
houseDf = pd.read_csv(filepath_or_buffer='./ml_house_data_set.csv')

In [None]:
# Preview the raw data (nothing has been scaled at this point;
# scaling happens later in the Feature Engineering section)
# note: use .info() and .head()
houseDf.head()


In [4]:
# Summarise the frame: row count plus each column's non-null count and dtype
houseDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42703 entries, 0 to 42702
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   year_built           42703 non-null  int64  
 1   stories              42703 non-null  int64  
 2   num_bedrooms         42703 non-null  int64  
 3   full_bathrooms       42703 non-null  int64  
 4   half_bathrooms       42703 non-null  int64  
 5   livable_sqft         42703 non-null  int64  
 6   total_sqft           42703 non-null  int64  
 7   garage_type          42703 non-null  object 
 8   garage_sqft          42703 non-null  int64  
 9   carport_sqft         42703 non-null  int64  
 10  has_fireplace        42703 non-null  bool   
 11  has_pool             42703 non-null  bool   
 12  has_central_heating  42703 non-null  bool   
 13  has_central_cooling  42703 non-null  bool   
 14  house_number         42703 non-null  int64  
 15  street_name          42703 non-null 

### Feature Engineering

In [5]:
# Drop the features considered irrelevant for modelling.
# The result is reassigned instead of using inplace=True, and
# errors='ignore' skips columns that are already gone, so this cell can be
# re-run without raising a KeyError.
irrelevantFeats = ['house_number', 'street_name', 'unit_number', 'zip_code']
houseDf = houseDf.drop(columns=irrelevantFeats, errors='ignore')

In [6]:
# Preview the first rows to confirm the irrelevant columns are gone
houseDf.head()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_type,garage_sqft,carport_sqft,has_fireplace,has_pool,has_central_heating,has_central_cooling,city,sale_price
0,1978,1,4,1,1,1689,1859,attached,508,0,True,False,True,True,Hallfort,270897.0
1,1958,1,3,1,1,1984,2002,attached,462,0,True,False,True,True,Hallfort,302404.0
2,2002,1,3,2,0,1581,1578,none,0,625,False,False,True,True,Lake Christinaport,2519996.0
3,2004,1,4,2,0,1829,2277,attached,479,0,True,False,True,True,Lake Christinaport,197193.0
4,2006,1,4,2,0,1580,1749,attached,430,0,True,False,True,True,Lake Christinaport,207897.0


In [7]:
# One-hot encode the categorical columns with pandas.get_dummies():
# each listed column is replaced by one indicator column per distinct value,
# named "<column>_<value>".
oneHotEncodingFeats = ['garage_type', 'city']
houseDf = pd.get_dummies(data=houseDf, columns=oneHotEncodingFeats)
houseDf.head(n=5)

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_sqft,carport_sqft,has_fireplace,has_pool,has_central_heating,has_central_cooling,sale_price,garage_type_attached,garage_type_detached,garage_type_none,city_Amystad,city_Brownport,city_Chadstad,city_Clarkberg,city_Coletown,city_Davidfort,city_Davidtown,city_East Amychester,city_East Janiceville,city_East Justin,city_East Lucas,city_Fosterberg,city_Hallfort,city_Jeffreyhaven,city_Jenniferberg,city_Joshuafurt,city_Julieberg,city_Justinport,city_Lake Carolyn,city_Lake Christinaport,city_Lake Dariusborough,city_Lake Jack,city_Lake Jennifer,city_Leahview,city_Lewishaven,city_Martinezfort,city_Morrisport,city_New Michele,city_New Robinton,city_North Erinville,city_Port Adamtown,city_Port Andrealand,city_Port Daniel,city_Port Jonathanborough,city_Richardport,city_Rickytown,city_Scottberg,city_South Anthony,city_South Stevenfurt,city_Toddshire,city_Wendybury,city_West Ann,city_West Brittanyview,city_West Gerald,city_West Gregoryview,city_West Lydia,city_West Terrence
0,1978,1,4,1,1,1689,1859,508,0,True,False,True,True,270897.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1958,1,3,1,1,1984,2002,462,0,True,False,True,True,302404.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2002,1,3,2,0,1581,1578,0,625,False,False,True,True,2519996.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2004,1,4,2,0,1829,2277,479,0,True,False,True,True,197193.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2006,1,4,2,0,1580,1749,430,0,True,False,True,True,207897.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# Rescale every column of the DataFrame into the [0, 1] range.
# The scaler is fitted on houseDf first and then applied to the same frame;
# the fitted scaler is kept so its scaling numbers can be inspected below.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(houseDf)
houseDataScaled = scaler.transform(houseDf)

In [10]:
# Print the scaling numbers the scaler learned for the sale_price column.
# The column position is looked up by name rather than hard-coding 13, so
# the cell keeps reporting sale_price if columns are added, dropped, or
# reordered upstream. The scaler was fitted on houseDf, so its per-column
# arrays follow houseDf's column order.
salePriceIdx = houseDf.columns.get_loc('sale_price')
print( scaler.scale_[salePriceIdx] )
print( scaler.min_[salePriceIdx] )

4.752541801931927e-08
-2.9750911680093863e-05


In [16]:
# Show the sale_price scaling numbers as fixed-point decimals instead of
# scientific notation.
# MinMaxScaler transforms with X_scaled = X * scale_ + min_, so scale_ is the
# multiplier and min_ is the (here negative) offset that gets added.
# The column position is looked up by name rather than hard-coding 13.
salePriceIdx = houseDf.columns.get_loc('sale_price')
print(
    "Multiplied {:.10f}, Added {:.10f}".format(
        scaler.scale_[salePriceIdx], scaler.min_[salePriceIdx]
    )
)

Mutliplied 0.0000000475, Subtracted -0.0000297509


In [17]:
# Wrap the scaled NumPy array in a pandas DataFrame again, reusing the
# column labels of the frame the scaler was applied to.
houseDfScaled = pd.DataFrame(
    data=houseDataScaled,
    columns=houseDf.columns.to_numpy(),
)

### Export the Data for Deep Learning

In [18]:
# Write the scaled DataFrame to CSV for the deep-learning step.
# index=False leaves the row index out of the file.
houseDfScaled.to_csv(
    path_or_buf='./ml_house_data_set_scaled.csv',
    index=False,
)