In [102]:
from IPython.display import Markdown, display

import pandas as pd
import numpy as np
random_seed = 42
np.random.seed(random_seed)
rng = np.random.default_rng(random_seed)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

from pathlib import Path
data_path = Path('./data')

from sklearn.impute import SimpleImputer
from category_encoders import LeaveOneOutEncoder

from sklearn.ensemble import RandomForestRegressor


In [87]:
train = pd.read_csv(
    data_path / "train.csv",
    usecols=["OverallQual", "LotArea", "GrLivArea", "SalePrice"],
)
test = pd.read_csv(
    data_path / "test.csv", usecols=["OverallQual", "LotArea", "GrLivArea"]
)

display(train.head(), test.head())

Unnamed: 0,LotArea,OverallQual,GrLivArea,SalePrice
0,8450,7,1710,208500
1,9600,6,1262,181500
2,11250,7,1786,223500
3,9550,7,1717,140000
4,14260,8,2198,250000


Unnamed: 0,LotArea,OverallQual,GrLivArea
0,11622,5,896
1,14267,6,1329
2,13830,5,1629
3,9978,6,1604
4,5005,8,1280


In [88]:
display(Markdown("### Train nulls"), train.isna().sum())
display(Markdown("### Test nulls"), test.isna().sum())

### Train nulls

LotArea        0
OverallQual    0
GrLivArea      0
SalePrice      0
dtype: int64

### Test nulls

LotArea        0
OverallQual    0
GrLivArea      0
dtype: int64

## Add nulls to the Categorical (OverallQual) feature

In [89]:
# replace 1 out of 10 values with Nan in OverallQual (Seed was set to 42)
train.loc[rng.choice(train.index, size=len(train) // 10), "OverallQual"] = np.nan
test.loc[rng.choice(test.index, size=len(test) // 10), "OverallQual"] = np.nan

In [90]:
display(Markdown("### Train nulls"), train.isna().sum())
display(Markdown("### Test nulls"), test.isna().sum())

### Train nulls

LotArea          0
OverallQual    135
GrLivArea        0
SalePrice        0
dtype: int64

### Test nulls

LotArea          0
OverallQual    142
GrLivArea        0
dtype: int64

## Impute OverallQual with the most frequent value

We also create a new dataframe for testing

In [91]:
median_train = train.copy()
median_test = test.copy()

imputer = SimpleImputer(strategy="median")
imputer.fit(train[["OverallQual"]])
median_train["OverallQual"] = imputer.transform(median_train[["OverallQual"]])
median_test["OverallQual"] = imputer.transform(median_test[["OverallQual"]])

display(Markdown("### Median (Train) nulls"), median_train.isna().sum())
display(Markdown("### Median (Test) nulls"), median_test.isna().sum())

### Median (Train) nulls

LotArea        0
OverallQual    0
GrLivArea      0
SalePrice      0
dtype: int64

### Median (Test) nulls

LotArea        0
OverallQual    0
GrLivArea      0
dtype: int64

## Enode the original OverallQual feature with Leave One Out Encoder

### Notes:
* it handles nulls by encoding them as the mean of the target
* It is only fit on the training targets (not on the test data)
* The encoding dictionary is created from the training data and applied to the test data

In [92]:
looe_train = train.copy()
looe_test = test.copy()

encoder = LeaveOneOutEncoder(cols=["OverallQual"])
encoder.fit(looe_train["OverallQual"], train["SalePrice"])
looe_train["OverallQual"] = encoder.transform(looe_train["OverallQual"])
looe_test["OverallQual"] = encoder.transform(looe_test["OverallQual"])

display(Markdown("### Leave One Out Encoder (Train) nulls"), looe_train.isna().sum())
display(Markdown("### Leave One Out Encoder (Test) nulls"), looe_test.isna().sum())
display(Markdown("### Leave One Out Encoder (Train)"), looe_train.head(10))

### Leave One Out Encoder (Train) nulls

LotArea        0
OverallQual    0
GrLivArea      0
SalePrice      0
dtype: int64

### Leave One Out Encoder (Test) nulls

LotArea        0
OverallQual    0
GrLivArea      0
dtype: int64

### Leave One Out Encoder (Train)

Unnamed: 0,LotArea,OverallQual,GrLivArea,SalePrice
0,8450,208453.540636,1710,208500
1,9600,161370.917404,1262,181500
2,11250,208453.540636,1786,223500
3,9550,208453.540636,1717,140000
4,14260,273912.802632,2198,250000
5,14115,133497.863128,1362,143000
6,10084,273912.802632,1694,307000
7,10382,208453.540636,2090,200000
8,6120,208453.540636,1774,129900
9,7420,133497.863128,1077,118000


## Make the feature categorical in pandas

In [93]:
from pandas import CategoricalDtype

median_categories = median_train["OverallQual"].unique()
median_categories.sort()

median_train["OverallQual"] = median_train["OverallQual"].astype(
    CategoricalDtype(categories=median_categories, ordered=True)
)
median_test["OverallQual"] = median_test["OverallQual"].astype(
    CategoricalDtype(categories=median_categories, ordered=True)
)

looe_categories = looe_train["OverallQual"].unique()
looe_categories.sort()

looe_train["OverallQual"] = looe_train["OverallQual"].astype(
    CategoricalDtype(categories=looe_categories, ordered=True)
)
looe_test["OverallQual"] = looe_test["OverallQual"].astype(
    CategoricalDtype(categories=looe_categories, ordered=True)
)

In [94]:
display(Markdown("### Median (Train)"), median_train.head())
display(Markdown("### Median (Test)"), median_test.head())
display(Markdown("### Leave One Out Encoder (Train)"), looe_train.head())
display(Markdown("### Leave One Out Encoder (Test)"), looe_test.head())

### Median (Train)

Unnamed: 0,LotArea,OverallQual,GrLivArea,SalePrice
0,8450,7.0,1710,208500
1,9600,6.0,1262,181500
2,11250,7.0,1786,223500
3,9550,7.0,1717,140000
4,14260,8.0,2198,250000


### Median (Test)

Unnamed: 0,LotArea,OverallQual,GrLivArea
0,11622,5.0,896
1,14267,6.0,1329
2,13830,5.0,1629
3,9978,6.0,1604
4,5005,8.0,1280


### Leave One Out Encoder (Train)

Unnamed: 0,LotArea,OverallQual,GrLivArea,SalePrice
0,8450,208453.540636,1710,208500
1,9600,161370.917404,1262,181500
2,11250,208453.540636,1786,223500
3,9550,208453.540636,1717,140000
4,14260,273912.802632,2198,250000


### Leave One Out Encoder (Test)

Unnamed: 0,LotArea,OverallQual,GrLivArea
0,11622,133497.863128,896
1,14267,161370.917404,1329
2,13830,133497.863128,1629
3,9978,161370.917404,1604
4,5005,273912.802632,1280


## Run a RandomForest model

In [None]:
X = looe_train.drop(columns=["SalePrice"])
y = looe_train["SalePrice"]

model = RandomForestRegressor(n_estimators=100, random_state=random_seed)
model.fit(X, y)
preds = model.predict(X)
rmse = np.sqrt(((y - preds) ** 2).mean())
display(Markdown(f"### Leave One Out Encoder (Train) RMSE: {rmse}"))

model

### Leave One Out Encoder (Train) RMSE: 15263.543140272775

In [100]:
X = median_train.drop(columns=["SalePrice"])
y = median_train["SalePrice"]

model = RandomForestRegressor(n_estimators=100, random_state=random_seed)
model.fit(X, y)
preds = model.predict(X)
rmse = np.sqrt(((y - preds) ** 2).mean())
display(Markdown(f"### Median (Train) RMSE: {rmse}"))

model

### Median (Train) RMSE: 15635.175870123865

<br/>
<br/>
<br/>


<hr/>

# Switching gears to use a scikit learn pipeline

<hr/>

<br/>
<br/>
<br/>
