In [1]:
import sys

In [2]:
# Put the parent directory on the import path. The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate '..'
# entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [3]:
from evaluate import load_train_dev_test, rmse

In [4]:
# Load the listings CSV and unpack what load_train_dev_test returns:
# an (X, y) pair for train, an (X, y) pair for dev, and a third `test`
# object that is kept as-is (not unpacked) in this cell.
(X_train, y_train), (X_dev, y_dev), test = load_train_dev_test('../data/home-listings-subset.csv')

{'pct(train)': 0.8071748878923767, 'pct(dev)': 0.08968609865470852, 'pct(test)': 0.1031390134529148}


In [5]:
# Display the training target as a quick sanity check of the split.
y_train

94     346000.0
180    545000.0
59     301000.0
176    530000.0
137    406900.0
         ...   
62     305000.0
111    369000.0
217    931000.0
52     298000.0
126    390220.0
Name: ClosePrice, Length: 180, dtype: float64

To get a first estimate of how well my models are doing, I'll use the mean and the median house price of the training set as baselines.

In [6]:
import numpy as np

# Constant-prediction baselines, both derived from the training target only.
mean_price, median_price = (
    np.mean(y_train),
    np.median(y_train),
)

In [7]:
# Baseline: predict the training-set mean for every row of each split.
# np.full_like copies the dtype of its first argument, so dtype=float is
# pinned explicitly; otherwise an integer-typed target would silently
# truncate the mean before it is scored.
print("Train error with mean price:", rmse(y_train, np.full_like(y_train, mean_price, dtype=float)))
print("Dev error with mean price:", rmse(y_dev, np.full_like(y_dev, mean_price, dtype=float)))

Train error with mean price: 200564.19525065424
Dev error with mean price: 132918.99442758234


In [8]:
# Baseline: predict the training-set median for every row of each split.
# np.full_like copies the dtype of its first argument, so dtype=float is
# pinned explicitly; otherwise an integer-typed target could silently
# truncate a fractional median before it is scored.
print("Train error with median price:", rmse(y_train, np.full_like(y_train, median_price, dtype=float)))
print("Dev error with median price:", rmse(y_dev, np.full_like(y_dev, median_price, dtype=float)))

Train error with median price: 207473.65310801112
Dev error with median price: 128569.6057869044


## Fit first models

In [92]:
from transforms import FeatureProjection, TargetEncoder
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Columns routed through FeatureProjection + SimpleImputer in the union below.
numerical_columns = [
    'CDOM',
    'LotSizeAreaSQFT',
    'SqFtTotal',
]


def build_features_pipe():
    """Return a new, unfitted feature union.

    Each call creates fresh transformer instances. make_pipeline does not
    copy the steps it is given, so handing one shared union to several
    model pipelines would make them share fitted state: fitting one
    pipeline would refit the transformers the other one relies on.
    """
    return make_union(
        make_pipeline(
            FeatureProjection(numerical_columns),
            SimpleImputer(),  # There was no null data on the original dataset, but can be useful for new data
        ),
        TargetEncoder('BathsTotal'),
        TargetEncoder('BedsTotal'),
        TargetEncoder('ElementarySchoolName'),
        TargetEncoder('StructuralStyle', min_freq=1),
    )


# Stand-alone instance kept under the original name for inspection;
# the model pipelines below each get their own copy.
features_pipe = build_features_pipe()

lr_pipe = make_pipeline(
    build_features_pipe(),
    LinearRegression()
)

rf_pipe = make_pipeline(
    build_features_pipe(),
    RandomForestRegressor(n_estimators=20, random_state=42)  # fixed seed for reproducible forests
)

In [86]:
# Fit the linear-regression pipeline on the training split only.
# The trailing ';' suppresses the notebook's display of the returned object.
lr_pipe.fit(X_train, y_train);

In [87]:
# Score the fitted linear-regression pipeline on both splits.
# The predictions are passed to rmse directly: wrapping them in
# np.full_like(y, ...) only made a redundant copy and would have cast the
# predictions to the target's dtype.
print("Train error with Linear Regression:", rmse(y_train, lr_pipe.predict(X_train)))
print("Dev error with Linear Regression:", rmse(y_dev, lr_pipe.predict(X_dev)))

Train error with Linear Regression: 77214.58689202162
Dev error with Linear Regression: 78487.64517138987


In [93]:
# Fit the random-forest pipeline on the training split only.
# The trailing ';' suppresses the notebook's display of the returned object.
rf_pipe.fit(X_train, y_train);

In [94]:
# Score the fitted random-forest pipeline on both splits.
# The predictions are passed to rmse directly: wrapping them in
# np.full_like(y, ...) only made a redundant copy and would have cast the
# predictions to the target's dtype.
print("Train error with Random Forest:", rmse(y_train, rf_pipe.predict(X_train)))
print("Dev error with Random Forest:", rmse(y_dev, rf_pipe.predict(X_dev)))

Train error with Random Forest: 30511.151692300762
Dev error with Random Forest: 60486.630099988455


In [90]:
# NOTE(review): leftover scratch cell. It builds an unfitted
# RandomForestRegressor with default arguments only to display its repr;
# the instance is not assigned or used. Candidate for removal.
RandomForestRegressor()

RandomForestRegressor()