In [151]:
# import important packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn')
from matplotlib import rcParams
rcParams["figure.figsize"] = 10, 6 # figure size in inches
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

Step 1:
- Read the test and train datasets
- Separate the SalePrice as target variable
- Create a train and validation dataset from the train dataset that you created in the first step
- Check if you have any categorical features
- Check if you have any numerical features

In [152]:
traindat = pd.read_csv('data/train.csv', index_col=0)
testdat = pd.read_csv('data/test.csv', index_col=0)
traindat.dtypes.value_counts()

object     43
int64      34
float64     3
dtype: int64

# Step 1

In [153]:
def partition_cols_bytype(df):
    out = {}
    for type in df.dtypes.unique().astype(str):
        out[type] = df.columns[df.dtypes==type]
    return out


In [154]:
X0 = traindat.drop('SalePrice', axis=1)
y0 = traindat.SalePrice.astype('float64')

Xtrn, Xval, ytrn, yval = train_test_split(X0, y0)

## Dtype stuff:

In [155]:
# convert Garage year built to integer, and all others to nullable pandas integers:
X0.GarageYrBlt = X0.GarageYrBlt.astype('Int64')
X0[partition_cols_bytype(X0)['int64']] = X0[partition_cols_bytype(X0)['int64']].astype('Int64')

# convert all features in square feet to floats:
cols_in_sqft = []
for substr in ['SF', 'Area', 'Porch']:
    cols_in_sqft += X0.columns[X0.columns.str.contains(substr)].to_list()
    cols_in_sqft = list(np.unique(cols_in_sqft))
X0[cols_in_sqft] = X0[cols_in_sqft].astype('float64')
# convert all features in USD to floats:
X0[['MiscVal']] = X0[['MiscVal']].astype('float64')

partition_cols_bytype(X0)

{'Int64': Index(['MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
        'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
        'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
        'GarageCars', 'MoSold', 'YrSold'],
       dtype='object'),
 'object': Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
        'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
        'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
        'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
        'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
        'SaleType', 'SaleCondition'],
       dtype='object'),
 'float6

In [156]:
for col in partition_cols_bytype(X0)['object']:
    unqvals = X0[col].unique()
    if any([val.isdigit() for val in unqvals[~pd.isnull(unqvals)]]):
        print(col, X0[col].unique())
# this confirms all object columns are categorical (not numeric in string format)

In [157]:
# 'MSSubClass' looks like an integer but is actually nominal categorical:
X0[['MSSubClass']] = X0[['MSSubClass']].astype('str')
# 'OverallQual' & 'OverallCond' look like integers but are actually ordinal categorical:
X0[['OverallQual', 'OverallCond']] = X0[['OverallQual', 'OverallCond']].astype('str')
partition_cols_bytype(X0)

{'object': Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
        'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
        'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
        'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'],
       dtype='object'),
 'float64': Index(['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
        'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
        'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
        '3SsnPorch', 'Scre

In [158]:
partition = partition_cols_bytype(X0)

cont_fts = partition['float64'] # continuous numerical features
dsct_fts = partition['Int64'] # discrete numerical features

numr_fts = cont_fts.append(dsct_fts)

In [159]:
catg_fts = partition['object'] # all categorical features

binary_fts = catg_fts[X0[catg_fts].nunique()==2]
ordinal_fts = pd.Index(['LotShape', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual', 'PavedDrive', 'Fence'])
# note ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC'] are somewhat ordinal, except they have a "none of the above" option which makes it complicated
nominal_fts = catg_fts.drop(binary_fts.append(ordinal_fts))
nominal_fts, binary_fts, ordinal_fts

(Index(['MSSubClass', 'MSZoning', 'LandContour', 'LotConfig', 'Neighborhood',
        'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
        'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'Heating', 'Electrical', 'Functional', 'FireplaceQu', 'GarageType',
        'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'MiscFeature',
        'SaleType', 'SaleCondition'],
       dtype='object'),
 Index(['Street', 'Alley', 'Utilities', 'CentralAir'], dtype='object'),
 Index(['LotShape', 'Utilities', 'LandSlope', 'OverallQual', 'OverallCond',
        'ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual', 'PavedDrive',
        'Fence'],
       dtype='object'))

# Step 2

Step 2:
- Create a pipeline of SimpleImputer and StandardScaler transformers for the numerical data
- Create a SimpleImputer and OneHotEncoder for the categorical data
- Bundle the pre-processing steps into a column transformer
- Create a RandomForestRegressor

In [160]:
num_tf = Pipeline([
    ('numimputer', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('scaler', StandardScaler())
])

cat_tf = Pipeline([
    ('catimputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='if_binary', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('cat', cat_tf, catg_fts),
    ('num', num_tf, numr_fts)
])

In [161]:
model = Pipeline([
    ('preprocessor', preprocessor),
    ('estimator', RandomForestRegressor(min_samples_leaf=30))
])

model.fit(Xtrn, ytrn)
print(model.score(Xval, yval))

0.8036396190516942


In [162]:
pred_prices = model.predict(testdat)