In [102]:
import pandas as pd
import numpy as np
import os, pathlib

Some helper functions:

In [103]:
def unique_values(df, type_ = 'cat'):
    """Count the distinct values in each column of one kind, largest first.

    Useful to find categorical features "disguised" as numeric features
    (or vice-versa).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    type_ : {'cat', 'num'}, default 'cat'
        'cat' selects the columns whose dtype is ``object``; 'num' selects
        every other column.

    Returns
    -------
    pandas.Series
        ``nunique()`` of each selected column, sorted in descending order.

    Raises
    ------
    ValueError
        If ``type_`` is neither 'cat' nor 'num'.
    """
    if type_ not in ('cat', 'num'):
        # An unknown value used to fall through and silently return None.
        raise ValueError(f"type_ must be 'cat' or 'num', got {type_!r}")

    dtypes = df.dtypes.values
    mask = (dtypes == 'O') if type_ == 'cat' else (dtypes != 'O')

    return df[df.columns[mask]].nunique().sort_values(ascending=False)

In [104]:
def fix_na(df, add_boolean=True):
    """Return a copy of ``df`` with its missing values filled in.

    Object-dtype columns receive the literal label 'Missing'; every other
    column receives its own median. With ``add_boolean`` (the default) each
    column also gains a ``<column>_is_missing`` companion holding 1 where the
    value was missing and 0 elsewhere. The frame passed in is left untouched.
    """
    filled = df.copy()

    dtypes = filled.dtypes.values
    names = filled.columns
    object_cols = names[dtypes == 'O']
    other_cols = names[dtypes != 'O']

    # Object columns are handled first, then the rest, so the indicator
    # columns are appended in that same order.
    for name in object_cols:
        if add_boolean:
            filled[name + '_is_missing'] = filled[name].isna() * 1
        filled[name] = filled[name].fillna('Missing')

    for name in other_cols:
        if add_boolean:
            filled[name + '_is_missing'] = filled[name].isna() * 1
        filled[name] = filled[name].fillna(filled[name].median())

    return filled

In [105]:
def encode_categoricals(df, return_maps = False):
    """Replace the labels of every object-dtype column with integer codes.

    Within each column the codes 0, 1, 2, ... are assigned in the order that
    ``Series.unique()`` yields the labels. The work is done on a copy, so the
    caller's frame is not modified.

    Returns the encoded frame, or the tuple ``(frame, maps)`` when
    ``return_maps`` is true, where ``maps[column]`` is that column's
    ``{label: code}`` dictionary.
    """
    encoded = df.copy()
    object_cols = encoded.columns[encoded.dtypes.values == 'O']

    # One {label: code} dictionary per encoded column
    label_codes = {}
    for name in object_cols:
        labels = encoded[name].unique()
        codes = dict(zip(labels, range(len(labels))))
        encoded[name] = encoded[name].map(codes)
        label_codes[name] = codes

    return (encoded, label_codes) if return_maps else encoded

In [143]:
# Evaluate a local model using the random forest out-of-bag (OOB) error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

def local_valid_oob(X, y, model=None, score=None):
    """Fit ``model`` on ``(X, y)`` and report the error of its out-of-bag predictions.

    Parameters
    ----------
    X, y
        Training features and target, passed straight to ``model.fit``.
    model : estimator, optional
        Must expose ``oob_prediction_`` after ``fit``. When omitted, a fresh
        ``RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)``
        is built on every call.
    score : callable, optional
        Called as ``score(y, oob_predictions)``. When omitted, the root mean
        squared error (RMSE) is reported. For backward compatibility a scorer
        named ``mean_squared_error`` is also reported as RMSE, as this helper
        always did for it.

    Returns
    -------
    float
        The score, which is also printed.

    Raises
    ------
    AttributeError
        If the fitted model has no ``oob_prediction_`` attribute.
    """
    if model is None:
        # Built per call: an estimator placed in the signature is created once
        # at definition time and the same instance is shared by every call.
        model = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)

    model.fit(X, y)
    try:
        oob_predictions = model.oob_prediction_
    except AttributeError as exc:
        raise AttributeError(
            "model has no `oob_prediction_` after fit; "
            "pass a regressor created with oob_score=True"
        ) from exc

    if score is None or getattr(score, '__name__', None) == 'mean_squared_error':
        # RMSE computed directly instead of forwarding `squared=False`: that
        # keyword is not accepted by other metrics nor by recent scikit-learn.
        # Root per output column, then averaged (a plain RMSE for a 1-D target).
        residuals = np.asarray(y, dtype=float) - np.asarray(oob_predictions, dtype=float)
        oob_score = float(np.mean(np.sqrt(np.mean(residuals ** 2, axis=0))))
    else:
        oob_score = score(y, oob_predictions)

    print(f"OOB score:{oob_score:.4f}")
    return oob_score

Basic outline of a workflow for a fast baseline model (before any preprocessing, including categorical variables). Steps:

- **Basic Cleaning**: make sure data types are correct. Any categoricals that are encoded as integers can be used on the baseline model (even though this might not be the best encoding). Drop useless features (id is usually one of them).
- **Missing Values**: impute numeric features with the median and add an `is_missing` indicator column (stored as 0/1 integers, so it needs no further encoding). For categorical features, just add 'Missing' as a new category (the `fix_na` helper also adds the indicator column for them).  
- **Model**: Random Forest or Gradient Boosting. Two options: ignoring categorical variables at first or simply encoding categoricals as integers. Evaluate model performance using cross-validation (this notebook uses the random forest's out-of-bag error as a cheaper stand-in).

Loading Data

In [106]:
data_dir = pathlib.Path('./data/house_prices/')
# os.listdir returns entries in arbitrary order; sort so the listing is the
# same on every run and every machine.
sorted(os.listdir(data_dir))

['data_description.txt', 'sample_submission.csv', 'test.csv', 'train.csv']

In [107]:
# Read the training split from the dataset folder (`data_dir`).
train = pd.read_csv(data_dir / 'train.csv')

Id is probably not useful. The dataset contains a mix of numerical and categorical variables.

In [108]:
# Transpose the preview so each feature is a row and each of the first rows a column.
train.head().T

Unnamed: 0,0,1,2,3,4
Id,1,2,3,4,5
MSSubClass,60,20,60,70,60
MSZoning,RL,RL,RL,RL,RL
LotFrontage,65,80,68,60,84
LotArea,8450,9600,11250,9550,14260
Street,Pave,Pave,Pave,Pave,Pave
Alley,,,,,
LotShape,Reg,Reg,IR1,IR1,IR1
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub


Columns with missing values:

In [109]:
# One boolean per column: True if the column holds at least one NaN.
# Sorting in descending order lists the True entries first.
train.isna().any().sort_values(ascending=False)

FireplaceQu       True
GarageCond        True
BsmtFinType1      True
BsmtExposure      True
BsmtCond          True
BsmtQual          True
Electrical        True
MasVnrArea        True
MasVnrType        True
GarageType        True
GarageYrBlt       True
GarageFinish      True
GarageQual        True
BsmtFinType2      True
LotFrontage       True
Alley             True
MiscFeature       True
Fence             True
PoolQC            True
LotConfig        False
RoofStyle        False
MSSubClass       False
MSZoning         False
LotArea          False
Foundation       False
ExterCond        False
ExterQual        False
Exterior2nd      False
Exterior1st      False
RoofMatl         False
                 ...  
GarageArea       False
PavedDrive       False
WoodDeckSF       False
OpenPorchSF      False
3SsnPorch        False
BsmtUnfSF        False
ScreenPorch      False
PoolArea         False
MiscVal          False
MoSold           False
YrSold           False
SaleType         False
Functional 

Dropping Id:

In [110]:
# Guarded so the cell can be re-run: once Id has been dropped there is nothing
# left to check or remove (the unguarded version raised KeyError on a re-run).
if 'Id' in train.columns:
    # Sanity check: one distinct value per row, i.e. Id is just a row identifier.
    assert train['Id'].nunique() == train.shape[0]
    # Reassign instead of inplace=True so the cell produces a new frame.
    train = train.drop(columns='Id')

Fixing missing values:

In [111]:
# NOTE: not idempotent. fix_na adds a `<column>_is_missing` column for every
# column it sees, so re-running this cell would also add indicators for the
# indicator columns. Reload `train` before running it again.
train = fix_na(train)

Some numerical features seem to be categorical: features involving dates (like YearBuilt) and quality (like OverallQual). As mentioned in the introduction, we won't worry about these for now.

In [112]:
# Distinct-value counts for the non-object columns, largest first; a very low
# count hints at a categorical feature stored as a number.
unique_values(train, type_ ='num')

LotArea                     1073
GrLivArea                    861
BsmtUnfSF                    780
1stFlrSF                     753
TotalBsmtSF                  721
SalePrice                    663
BsmtFinSF1                   637
GarageArea                   441
2ndFlrSF                     417
MasVnrArea                   327
WoodDeckSF                   274
OpenPorchSF                  202
BsmtFinSF2                   144
EnclosedPorch                120
YearBuilt                    112
LotFrontage                  110
GarageYrBlt                   97
ScreenPorch                   76
YearRemodAdd                  61
LowQualFinSF                  24
MiscVal                       21
3SsnPorch                     20
MSSubClass                    15
MoSold                        12
TotRmsAbvGrd                  12
OverallQual                   10
OverallCond                    9
PoolArea                       8
BedroomAbvGr                   8
YrSold                         5
          

None of the categorical features seems to be a numeric feature in disguise (each has only a small number of unique values):

In [113]:
# Distinct-value counts for the object-dtype columns, largest first; a very
# high count would hint at a numeric feature stored as text.
unique_values(train, type_ ='cat')

Neighborhood     25
Exterior2nd      16
Exterior1st      15
SaleType          9
Condition1        9
Condition2        8
HouseStyle        8
RoofMatl          8
Functional        7
BsmtFinType1      7
GarageType        7
BsmtFinType2      7
RoofStyle         6
Heating           6
SaleCondition     6
Electrical        6
FireplaceQu       6
GarageQual        6
GarageCond        6
Foundation        6
MasVnrType        5
BldgType          5
LotConfig         5
BsmtQual          5
MSZoning          5
ExterCond         5
BsmtCond          5
BsmtExposure      5
HeatingQC         5
MiscFeature       5
Fence             5
LotShape          4
LandContour       4
ExterQual         4
GarageFinish      4
KitchenQual       4
PoolQC            4
PavedDrive        3
LandSlope         3
Alley             3
Utilities         2
CentralAir        2
Street            2
dtype: int64

Processing categorical:

In [114]:
# The label -> integer mappings are discarded here; call with return_maps=True
# if the same encoding has to be applied to other data (e.g. test.csv).
train = encode_categoricals(train)

Baseline Model Evaluation:

In [146]:
# The target is log-transformed, so the reported error is on the log scale
# (RMSE of log SalePrice), not in the original price units.
local_valid_oob(train.drop('SalePrice', axis=1), np.log(train['SalePrice']))

OOB score:0.1432
