Decision Tree Model for predicting housing prices given a dataset with many features. Using what I have learned from the Kaggle learn course and previous courses as well.

I am a beginner machine learning student who comes from a full-stack developer background. I have made many many websites and consider myself quite familiar with React. I know many languages and am a competitive programmer, attending the Competitive Programming Club at my university (University of Delaware) where I received a Bachelors in Computer Science from.

Enough about my background, let's get to the competition submission!

In [34]:
## imports
import pandas as pd
import numpy as np

In [35]:
housing_data = pd.read_csv('./train.csv')
housing_data.columns.sort_values()

test_data = pd.read_csv('./test.csv')

In [36]:
housing_data.columns[housing_data.isna().any()].tolist()

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [37]:
housing_data['LotFrontage'].fillna(housing_data['LotFrontage'].mean(), inplace=True)
housing_data['MasVnrArea'].fillna(housing_data['MasVnrArea'].mean(), inplace=True)
housing_data['GarageYrBlt'].fillna(housing_data['GarageYrBlt'].mean(), inplace=True)

In [38]:
housing_data.columns[housing_data.isna().any()].tolist()

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [39]:
housing_data.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [40]:
## let's do an "out-of-sample" testing, using sklearn's train_test_split method
from sklearn.model_selection import train_test_split

In [41]:
## prediction target, the column we are trying to predict
y = housing_data['SalePrice']

y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [42]:
housing_data_rest = housing_data.drop(columns=['SalePrice'])
housing_data_rest['LotFrontage'] = housing_data_rest['LotFrontage'].round()
housing_data_rest['MasVnrArea'] = housing_data_rest['MasVnrArea'].round()
housing_data_rest['GarageYrBlt'] = housing_data_rest['GarageYrBlt'].round()

What if we just pass in *all* the numerical features into a decision tree, what is our mae then?

In [43]:
rest_x = ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']

In [44]:
rest_features = housing_data_rest[rest_x]

In [45]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

rest_train_x, rest_val_x, rest_train_y, rest_val_y = train_test_split(rest_features, y)

rest_decision_tree = DecisionTreeRegressor()
rest_decision_tree.fit(rest_train_x, rest_train_y)

mean_absolute_error(rest_decision_tree.predict(rest_val_x), rest_val_y)

from sklearn.ensemble import RandomForestRegressor

def get_rf_score(max_estimators: int, r_df, r_train_x, r_train_y, val_x, val_y):
    rf_tree = RandomForestRegressor(n_estimators=max_estimators)
    rf_tree.fit(r_train_x, r_train_y)
    score = mean_absolute_error(rf_tree.predict(val_x), val_y)
    r_df.loc[len(r_df.index)] = [max_estimators, score]

estimator_df = pd.DataFrame()

estimator_df['Estimators'] = []
estimator_df['Score'] = []

# for i in range(100, 500):
#     get_rf_score(i, estimator_df, rest_train_x, rest_train_y, rest_val_x, rest_val_y)


# stats = estimator_df.describe()

# min_score = stats['Score']['min']

# result = estimator_df[estimator_df['Score'] == min_score]

# result

# 155 <--- best # of n_estimators

In [46]:
housing_data.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [47]:
feature_names = ['MSSubClass', 'LotArea', 'LotFrontage', 'MoSold', 'OverallQual', 'OverallCond']

In [48]:
features = housing_data[feature_names]

In [49]:
housing_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,22.024023,9981.264932,1.382997,1.112799,30.202904,20.645407,180.569112,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,60.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,70.049958,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,79.0,11601.5,7.0,6.0,2000.0,2004.0,164.25,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [50]:
## import the tree model
from sklearn.tree import DecisionTreeRegressor

In [51]:
housing_model = DecisionTreeRegressor()

In [52]:
## fit the model
housing_model.fit(features, y)

DecisionTreeRegressor()

In [53]:
test_data = pd.read_csv('./test.csv')
test_data.columns.sort_values()

Index(['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'Alley', 'BedroomAbvGr',
       'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtFinType1', 'BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath',
       'BsmtQual', 'BsmtUnfSF', 'CentralAir', 'Condition1', 'Condition2',
       'Electrical', 'EnclosedPorch', 'ExterCond', 'ExterQual', 'Exterior1st',
       'Exterior2nd', 'Fence', 'FireplaceQu', 'Fireplaces', 'Foundation',
       'FullBath', 'Functional', 'GarageArea', 'GarageCars', 'GarageCond',
       'GarageFinish', 'GarageQual', 'GarageType', 'GarageYrBlt', 'GrLivArea',
       'HalfBath', 'Heating', 'HeatingQC', 'HouseStyle', 'Id', 'KitchenAbvGr',
       'KitchenQual', 'LandContour', 'LandSlope', 'LotArea', 'LotConfig',
       'LotFrontage', 'LotShape', 'LowQualFinSF', 'MSSubClass', 'MSZoning',
       'MasVnrArea', 'MasVnrType', 'MiscFeature', 'MiscVal', 'MoSold',
       'Neighborhood', 'OpenPorchSF', 'OverallCond', 'OverallQual',
       'PavedDrive', 'PoolArea', 'Po

In [54]:
test_data_features = test_data[feature_names]

In [55]:
housing_model.predict(test_data_features)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
train_x, val_x, train_y, val_y = train_test_split(features, y)

In [None]:
housing_model_v2 = DecisionTreeRegressor()

In [None]:
housing_model_v2.fit(train_x, train_y)

DecisionTreeRegressor()

In [None]:
mean_squared_error(housing_model_v2.predict(val_x), val_y)

3504451013.0305934