# House Prices EDA

In [20]:
import pandas as pd

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [21]:
houseDf = pd.read_csv('./ml_house_data_set.csv')

In [22]:
houseDf.head()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_type,garage_sqft,carport_sqft,has_fireplace,has_pool,has_central_heating,has_central_cooling,house_number,street_name,unit_number,city,zip_code,sale_price
0,1978,1,4,1,1,1689,1859,attached,508,0,True,False,True,True,42670,Lopez Crossing,,Hallfort,10907,270897.0
1,1958,1,3,1,1,1984,2002,attached,462,0,True,False,True,True,5194,Gardner Park,,Hallfort,10907,302404.0
2,2002,1,3,2,0,1581,1578,none,0,625,False,False,True,True,4366,Harding Islands,,Lake Christinaport,11203,2519996.0
3,2004,1,4,2,0,1829,2277,attached,479,0,True,False,True,True,3302,Michelle Highway,,Lake Christinaport,11203,197193.0
4,2006,1,4,2,0,1580,1749,attached,430,0,True,False,True,True,582,Jacob Cape,,Lake Christinaport,11203,207897.0


In [23]:
houseDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42703 entries, 0 to 42702
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   year_built           42703 non-null  int64  
 1   stories              42703 non-null  int64  
 2   num_bedrooms         42703 non-null  int64  
 3   full_bathrooms       42703 non-null  int64  
 4   half_bathrooms       42703 non-null  int64  
 5   livable_sqft         42703 non-null  int64  
 6   total_sqft           42703 non-null  int64  
 7   garage_type          42703 non-null  object 
 8   garage_sqft          42703 non-null  int64  
 9   carport_sqft         42703 non-null  int64  
 10  has_fireplace        42703 non-null  bool   
 11  has_pool             42703 non-null  bool   
 12  has_central_heating  42703 non-null  bool   
 13  has_central_cooling  42703 non-null  bool   
 14  house_number         42703 non-null  int64  
 15  street_name          42703 non-null 

In [24]:
irrelevantFeatures = ['house_number', 'street_name', 'unit_number', 'zip_code']
houseDf_relevant =  houseDf.drop( irrelevantFeatures, axis=1, inplace=False )

In [25]:
houseDf_relevant.head()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_type,garage_sqft,carport_sqft,has_fireplace,has_pool,has_central_heating,has_central_cooling,city,sale_price
0,1978,1,4,1,1,1689,1859,attached,508,0,True,False,True,True,Hallfort,270897.0
1,1958,1,3,1,1,1984,2002,attached,462,0,True,False,True,True,Hallfort,302404.0
2,2002,1,3,2,0,1581,1578,none,0,625,False,False,True,True,Lake Christinaport,2519996.0
3,2004,1,4,2,0,1829,2277,attached,479,0,True,False,True,True,Lake Christinaport,197193.0
4,2006,1,4,2,0,1580,1749,attached,430,0,True,False,True,True,Lake Christinaport,207897.0


In [26]:
houseDf_relevant.describe()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_sqft,carport_sqft,sale_price
count,42703.0,42703.0,42703.0,42703.0,42703.0,42703.0,42703.0,42703.0,42703.0,42703.0
mean,1990.993209,1.365759,3.209283,1.923659,0.527153,1987.758986,2127.155446,455.8498,41.656324,413507.1
std,19.199987,0.513602,1.043396,0.759699,0.499268,846.76627,922.807342,243.453463,168.715867,318549.7
min,1852.0,0.0,0.0,0.0,0.0,-3.0,5.0,-4.0,0.0,626.0
25%,1980.0,1.0,3.0,1.0,0.0,1380.0,1466.0,412.0,0.0,270899.0
50%,1994.0,1.0,3.0,2.0,1.0,1808.0,1937.0,464.0,0.0,378001.0
75%,2005.0,2.0,4.0,2.0,1.0,2486.0,2640.0,606.0,0.0,497697.0
max,2017.0,4.0,31.0,8.0,1.0,12406.0,15449.0,8318.0,9200.0,21042000.0


In [27]:
# Finding total of uniques values for all categorical features
categoricalFeatures = ['has_pool','garage_type','has_fireplace','has_central_heating','has_central_cooling','city']

for value in categoricalFeatures:
    print(value + ':', len(houseDf_relevant[value].unique()))

has_pool: 2
garage_type: 3
has_fireplace: 2
has_central_heating: 2
has_central_cooling: 2
city: 47


In [28]:
houseDf_encoded = pd.get_dummies( houseDf_relevant, columns=['garage_type', 'city'] )

In [29]:
houseDf_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42703 entries, 0 to 42702
Data columns (total 64 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   year_built                 42703 non-null  int64  
 1   stories                    42703 non-null  int64  
 2   num_bedrooms               42703 non-null  int64  
 3   full_bathrooms             42703 non-null  int64  
 4   half_bathrooms             42703 non-null  int64  
 5   livable_sqft               42703 non-null  int64  
 6   total_sqft                 42703 non-null  int64  
 7   garage_sqft                42703 non-null  int64  
 8   carport_sqft               42703 non-null  int64  
 9   has_fireplace              42703 non-null  bool   
 10  has_pool                   42703 non-null  bool   
 11  has_central_heating        42703 non-null  bool   
 12  has_central_cooling        42703 non-null  bool   
 13  sale_price                 42703 non-null  flo

In [30]:
houseDf_encoded.head()

Unnamed: 0,year_built,stories,num_bedrooms,full_bathrooms,half_bathrooms,livable_sqft,total_sqft,garage_sqft,carport_sqft,has_fireplace,...,city_South Anthony,city_South Stevenfurt,city_Toddshire,city_Wendybury,city_West Ann,city_West Brittanyview,city_West Gerald,city_West Gregoryview,city_West Lydia,city_West Terrence
0,1978,1,4,1,1,1689,1859,508,0,True,...,0,0,0,0,0,0,0,0,0,0
1,1958,1,3,1,1,1984,2002,462,0,True,...,0,0,0,0,0,0,0,0,0,0
2,2002,1,3,2,0,1581,1578,0,625,False,...,0,0,0,0,0,0,0,0,0,0
3,2004,1,4,2,0,1829,2277,479,0,True,...,0,0,0,0,0,0,0,0,0,0
4,2006,1,4,2,0,1580,1749,430,0,True,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Selecting features and label
labelDf = houseDf_encoded['sale_price']
featuresDf = houseDf_encoded.drop('sale_price', axis=1, inplace=False)

In [None]:
asdasd

In [32]:
trainF, testF, trainL, testL = train_test_split(featuresDf, labelDf, test_size=0.4, random_state=33)

In [33]:
# Baseline
# Gradient Boosting Regressor
gbr_model = GradientBoostingRegressor(
    n_estimators=1000,         # Number of decision trees in the Regressor
    learning_rate=0.1,         # Time spent learning the parameters
    max_depth=6,               # Max depth for any tree in the Regressor
    min_samples_leaf=9,        # Minimum number of types of observation 
    max_features=0.1,          # Maximum features (in percentage) to learn from for any tree 
    loss='huber'               # Loss function for calculating the error of the mode
)

In [34]:
# Learn from the training data set
gbr_model.fit( trainF, trainL )

GradientBoostingRegressor(loss='huber', max_depth=6, max_features=0.1,
                          min_samples_leaf=9, n_estimators=1000)

In [35]:
# Metrics for training
predicted_labels_train = gbr_model.predict(trainF)
mean_absolute_error(trainL, predicted_labels_train)

47813.75635344522

In [36]:
# Metrics for test
predicted_labels_test = gbr_model.predict(testF)
mean_absolute_error(testL, predicted_labels_test)

59095.03880230847

In [37]:
# Improved
# Gradient Boosting Regressor
gbr_model2 = GradientBoostingRegressor(
    n_estimators=500,         # Number of decision trees in the Regressor
    learning_rate=0.1,         # Time spent learning the parameters
    max_depth=6,               # Max depth for any tree in the Regressor
    min_samples_leaf=9,        # Minimum number of types of observation 
    max_features=0.1,          # Maximum features (in percentage) to learn from for any tree 
    loss='huber'               # Loss function for calculating the error of the mode
)

In [38]:
# Learn from the training data set
gbr_model2.fit( trainF, trainL )

# Metrics for training
predicted_labels_train = gbr_model2.predict(trainF)
mae_train = mean_absolute_error(trainL, predicted_labels_train)
print(mae_train)

# Metrics for test
predicted_labels_test = gbr_model2.predict(testF)
mae_test = mean_absolute_error(testL, predicted_labels_test)
print(mae_test)

52587.50623031135
60762.29705990332
