# House price analysis

## Setup

In [1]:
# This takes a long time / hangs, so don't do it unless we need to.
!pip install -Uq pandas kaggle fastai scikit-learn waterfallcharts treeinterpreter dtreeviz

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydata-profiling 4.1.2 requires pandas!=1.4.0,<1.6,>1.1, but you have pandas 2.0.1 which is incompatible.
woodwork 0.23.0 requires pandas<2.0.0,>=1.4.3, but you have pandas 2.0.1 which is incompatible.
wfdb 4.1.0 requires pandas<2.0.0,>=1.0.0, but you have pandas 2.0.1 which is incompatible.
pymc3 3.11.5 requires numpy<1.22.2,>=1.15.0, but you have numpy 1.23.5 which is incompatible.
pymc3 3.11.5 requires scipy<1.8.0,>=1.7.3, but you have scipy 1.9.3 which is incompatible.
featuretools 1.25.0 requires pandas<2.0.0,>=1.5.0, but you have pandas 2.0.1 which is incompatible.
beatrix-jupyterlab 2023.46.184821 requires jupyter-server~=1.16, but you have jupyter-server 2.5.0 which is incompatible.[0m[31m
[0m

In [2]:
import pandas as pd
import numpy as np
import os, zipfile
from fastai import * 
from fastai.tabular.all import *
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import *
from dtreeviz.trees import *
import graphviz
from IPython.display import Image, display_svg, SVG

pd.options.display.max_rows = 20
pd.options.display.max_columns = 8



In [3]:
creds = '{"username":"divodivenson","key":"f0f9e3f8378e39e818097a5df978aa64"}'
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)

In [4]:
import kaggle
comp = 'house-prices-advanced-regression-techniques'
path = Path(f'../input/{comp}')
path
path.ls(file_type='text')

(#4) [Path('../input/house-prices-advanced-regression-techniques/sample_submission.csv'),Path('../input/house-prices-advanced-regression-techniques/data_description.txt'),Path('../input/house-prices-advanced-regression-techniques/train.csv'),Path('../input/house-prices-advanced-regression-techniques/test.csv')]

## First look at data


In [5]:
trn_path = path/'train.csv'
df = pd.read_csv(trn_path, low_memory=False)

df_test = pd.read_csv(path/'test.csv', low_memory=False)

df_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,...,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,...,6,2010,WD,Normal
1,1462,20,RL,81.0,...,6,2010,WD,Normal
2,1463,60,RL,74.0,...,3,2010,WD,Normal
3,1464,60,RL,78.0,...,6,2010,WD,Normal
4,1465,120,RL,43.0,...,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,...,6,2006,WD,Normal
1455,2916,160,RM,21.0,...,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,...,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,...,7,2006,WD,Normal


In [6]:
# Lets get an idea of where the ordianls are
df.columns.map(lambda col: df[col].unique().size )

Int64Index([1460,   15,    5,  111, 1073,    2,    3,    4,    4,    2,    5,
               3,   25,    9,    8,    5,    8,   10,    9,  112,   61,    6,
               8,   15,   16,    5,  328,    4,    5,    6,    5,    5,    5,
               7,  637,    7,  144,  780,  721,    6,    5,    2,    6,  753,
             417,   24,  861,    4,    3,    4,    3,    8,    4,    4,   12,
               7,    4,    6,    7,   98,    4,    5,  441,    6,    6,    3,
             274,  202,  120,   20,   76,    8,    4,    5,    5,   21,   12,
               5,    9,    6,  663],
           dtype='int64')

In [7]:
def print_levels(level):
    if df[level].unique().size < 50:
        return f"{level} {df[level].unique()}\n"
    else:
        return f"{level} - Continuous\n"
    
df.columns.map(print_levels)

Index(['Id - Continuous\n',
       'MSSubClass [ 60  20  70  50 190  45  90 120  30  85  80 160  75 180  40]\n',
       'MSZoning ['RL' 'RM' 'C (all)' 'FV' 'RH']\n',
       'LotFrontage - Continuous\n', 'LotArea - Continuous\n',
       'Street ['Pave' 'Grvl']\n', 'Alley [nan 'Grvl' 'Pave']\n',
       'LotShape ['Reg' 'IR1' 'IR2' 'IR3']\n',
       'LandContour ['Lvl' 'Bnk' 'Low' 'HLS']\n',
       'Utilities ['AllPub' 'NoSeWa']\n',
       'LotConfig ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']\n',
       'LandSlope ['Gtl' 'Mod' 'Sev']\n',
       'Neighborhood ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'\n 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'\n 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'\n 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']\n',
       'Condition1 ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']\n',
       'Condition2 ['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']\n',
       'Bld

In [8]:
# Make note of anything that appears to follow an order, just for reference. Not exhaustive
# Not intending to input this anywhere
# Also derived from data description https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data
ordinals = ['BldgType', 'HouseStyle', 'Utilities', 'ExterCond', 'ExterQual', 'Functional', 'PavedDrive']

# There are even more, lets just go with the overall Home functionality
functional = ['Typ' 'Min1' 'Maj1' 'Min2' 'Mod' 'Maj2' 'Sev']
df['Functional'] = df['Functional'].astype('category')
df['Functional'].cat.set_categories(functional, ordered=True)

df_test['Functional'] = df_test['Functional'].astype('category')
df_test['Functional'].cat.set_categories(functional, ordered=True)

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
1454    NaN
1455    NaN
1456    NaN
1457    NaN
1458    NaN
Name: Functional, Length: 1459, dtype: category
Categories (1, object): ['TypMin1Maj1Min2ModMaj2Sev']

### Dates

In [9]:
df['saledate'] = df['MoSold'].astype(str) + '/' + df['YrSold'].astype(str)
df = add_datepart(df, 'saledate')
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,...,saleIs_quarter_start,saleIs_year_end,saleIs_year_start,saleElapsed
0,1,60,RL,65.0,...,False,False,False,1.201824e+09
1,2,20,RL,80.0,...,False,False,False,1.177978e+09
2,3,60,RL,68.0,...,False,False,False,1.220227e+09
3,4,70,RL,60.0,...,False,False,False,1.138752e+09
4,5,60,RL,84.0,...,False,False,False,1.228090e+09
...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,...,False,False,False,1.185926e+09
1456,1457,20,RL,85.0,...,False,False,False,1.264982e+09
1457,1458,70,RL,66.0,...,False,False,False,1.272672e+09
1458,1459,20,RL,68.0,...,True,False,False,1.270080e+09


In [10]:
# Do the same for the test dataset
df_test['saledate'] = df_test['MoSold'].astype(str) + '/' + df_test['YrSold'].astype(str)
df_test = add_datepart(df_test, 'saledate')
df_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,...,saleIs_quarter_start,saleIs_year_end,saleIs_year_start,saleElapsed
0,1461,20,RH,80.0,...,False,False,False,1.275350e+09
1,1462,20,RL,81.0,...,False,False,False,1.275350e+09
2,1463,60,RL,74.0,...,False,False,False,1.267402e+09
3,1464,60,RL,78.0,...,False,False,False,1.275350e+09
4,1465,120,RL,43.0,...,True,False,True,1.262304e+09
...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,...,False,False,False,1.149120e+09
1455,2916,160,RM,21.0,...,True,False,False,1.143850e+09
1456,2917,20,RL,160.0,...,False,False,False,1.157069e+09
1457,2918,85,RL,62.0,...,True,False,False,1.151712e+09


### Depedent variable

`SalePrice` is the dependent variable. We are told the evaluation is on the RMSE between the log of the predicted and actual sale price (Taking logs means that errors in predicting expensive houses and cheap houses will affect the result equally.)

Update - I'm not sure if this should be done in the submission or not

In [11]:
dep_var = 'SalePrice'
#df[dep_var] = np.log(df[dep_var])

## Decision Tree
First lets have a go at building a plain old decision tree and submit that to Kaggle

### Test and validation set
In this case I don't see the test set contain future dates compared to the training set, so my first attempt at a validation set will just take a random sample of the training set.

In [12]:
df['YrSold'].unique(), df_test['YrSold'].unique()

(array([2008, 2007, 2006, 2009, 2010]), array([2010, 2009, 2008, 2007, 2006]))

In [13]:
from sklearn.model_selection import train_test_split
df_train, df_valid = train_test_split(df, test_size=0.2)
df_train.size, df_valid.size
# Need to get the index from the row in df of all the fields in df_valid
train_idx = df_train.Id.map(lambda id: df.index[df.Id == id][0])
valid_idx = df_valid.Id.map(lambda id: df.index[df.Id == id][0])
splits = (list(train_idx), list(valid_idx))

### Handle missing data and strings
Sklearn cannot handle missing data or strings.

`Categorify` is a TabularProc that replaces a column with a numeric categorical column. 

`FillMissing` is a TabularProc that replaces missing values with the median of the column

In [14]:
cont, cat = cont_cat_split(df, 1, dep_var=dep_var)
procs = [Categorify, FillMissing]
tabular_data = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)

The underlying data is now all numeric

In [15]:
tabular_data.items.head(1)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,...,saleElapsed,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
415,416,20,4,73.0,...,1185926000.0,1,1,1


Need to apply the same transform to the test dataset

In [16]:
test_data = TabularPandas(df_test, procs, cat, cont)
# test = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)
len(cont) + len(cat) # 96, now 104
df # 94 cols
tabular_data # 97 cols
test_data # 104 cols
tabular_data.all_col_names # 105
#dir(tabular_data)
len(tabular_data.cat_names) + len(tabular_data.cont_names) # 104


104

## Creating the decision tree

In [17]:
def draw_tree(t, df, size=10, ratio=0.6, precision=0, **kwargs):
    s=export_graphviz(t, out_file=None, feature_names=df.columns, filled=True, rounded=True,
                      special_characters=True, rotate=False, precision=precision, **kwargs)
    return graphviz.Source(re.sub('Tree {', f'Tree {{ size={size}; ratio={ratio}', s))

In [18]:
tabular_data.train.xs

KeyError: "['BsmtFinSF1_na', 'BsmtFinSF2_na', 'BsmtUnfSF_na', 'TotalBsmtSF_na', 'BsmtFullBath_na', 'BsmtHalfBath_na', 'GarageCars_na', 'GarageArea_na'] not in index"

In [None]:
xs, y = tabular_data.train.xs, tabular_data.train.y # The independent and dependent params
valid_xs, valid_y = tabular_data.valid.xs, tabular_data.valid.y

m = DecisionTreeRegressor(max_leaf_nodes=5)
m.fit(xs, y)

draw_tree(m, xs, size=10, leaves_parallel=True, precision=2)

In [None]:
dtreeviz.model(m,
               X_train=xs, y_train=y,
               feature_names=list(xs.columns),
               target_name='Sale Price').view()

Create a decision tree with more nodes. We shouldn't have as many nodes as we can as this will overfit the data.

In [None]:
m = DecisionTreeRegressor(min_samples_leaf=25)
m.fit(xs, y)

I'm adding the log here in the `r_mse` function instead of in the data itself. I'm unsure if this is correct but the submission sample suggests the result should just be the predicted price, not the log of it.

In [None]:
def r_mse(prediction, y): return round(math.sqrt(((np.log(prediction) - np.log(y) ) ** 2).mean()), 6)
def m_rmse(model, xs, y): return r_mse(model.predict(xs), y)    

In [None]:
m_rmse(m, xs, y), m_rmse(m, valid_xs, valid_y)
# What are these numbers, what is the kind of scale I should be thinking about
# What is a good rmse? The part that's confusing me is the book say 0.331466 is bad but 
# 0.323396 is good.
# In any case (0.165274, 0.197932) looks pretty good.

## Submit first attempt
Take a look at the submission sample to see what we need. A CSV of ID and sale price

In [None]:
sub_sample = pd.read_csv(path/'sample_submission.csv')
sub_sample

In [None]:
preds = m.predict(test_data.xs)
# Zip up the ids and predictions, then transform to 2D array from Array of tuples
submission = [list(t) for t in list(zip(test_data['Id'], preds))]

# Convert to Pandas dataframe to output CSV for submission.
sub_df = pd.DataFrame(submission, columns=['Id', 'SalePrice'])
sub_df.to_csv('subm.csv', index=False)
sub_sample.Id, test_data.columns

In [None]:
from kaggle import api
#api.competition_submit_cli('subm.csv', 'Initial decision tree', comp)