In [2]:
import sys

In [3]:
# Put the parent directory on the import path -- presumably where the local
# `evaluate` and `transforms` modules imported by later cells live (confirm).
# The membership check keeps the cell idempotent: re-running it does not stack
# duplicate '..' entries on sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [4]:
from evaluate import load_train_dev_test, rmse

In [7]:
# Load the listings subset and unpack it into feature/target pairs.
# NOTE(review): the loader's name mentions "test", but only two splits are
# unpacked here -- confirm what load_train_dev_test actually returns.
train_split, dev_split = load_train_dev_test('../data/home-listings-subset.csv')
X_train, y_train = train_split
X_dev, y_dev = dev_split

In [8]:
# Display the training target as a sanity check on what was loaded
# (the notebook renders the cell's last expression).
y_train

141    415000.0
147    420990.0
143    417500.0
196    620345.0
67     310000.0
         ...   
106    364000.0
14     250000.0
92     342990.0
179    542000.0
102    362000.0
Name: ClosePrice, Length: 200, dtype: float64

To get a reference point for judging my models, I'll start with two constant baselines: always predicting the mean, and always predicting the median, of the training-set house prices.

In [9]:
import numpy as np

# Both baseline constants are computed from the training target only, so the
# dev split plays no part in choosing them.
mean_price, median_price = np.mean(y_train), np.median(y_train)

In [10]:
# Constant predictor: an array shaped like each target, filled with the
# training mean, scored against that target.
mean_pred_train = np.full_like(y_train, mean_price)
mean_pred_dev = np.full_like(y_dev, mean_price)

print("Train error with mean price:", rmse(y_train, mean_pred_train))
print("Dev error with mean price:", rmse(y_dev, mean_pred_dev))

Train error with mean price: 194823.66596449667
Dev error with mean price: 214481.06045635117


In [11]:
# Constant predictor: an array shaped like each target, filled with the
# training median, scored against that target.
median_pred_train = np.full_like(y_train, median_price)
median_pred_dev = np.full_like(y_dev, median_price)

print("Train error with median price:", rmse(y_train, median_pred_train))
print("Dev error with median price:", rmse(y_dev, median_pred_dev))

Train error with median price: 200982.08729587073
Dev error with median price: 217804.36574855525


## Fit first models

In [25]:
from transforms import FeatureProjection, TargetEncoder
from sklearn.pipeline import make_pipeline, make_union
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Columns handed to FeatureProjection (presumably a column selector -- confirm
# against transforms.py) and then imputed.
numerical_columns = [
    'CDOM',
    'LotSizeAreaSQFT',
    'SqFtTotal',
]


def make_features_pipe():
    """Build a fresh, unfitted feature-extraction union.

    The union concatenates the imputed numerical columns with one
    TargetEncoder per categorical column. A new object is returned on every
    call so that each model pipeline owns its own transformer state:
    scikit-learn pipelines fit the step instances they are given, so a single
    shared union would be refitted (and overwritten) whenever any of the
    pipelines containing it is fitted.

    Returns:
        The unfitted union produced by ``make_union``.
    """
    return make_union(
        make_pipeline(
            FeatureProjection(numerical_columns),
            # There was no null data in the original dataset, but imputing
            # keeps the pipeline usable on new data that does contain gaps.
            SimpleImputer(),
        ),
        TargetEncoder('BathsTotal'),
        TargetEncoder('BedsTotal'),
        TargetEncoder('ElementarySchoolName'),
        TargetEncoder('StructuralStyle', min_freq=1),
    )


# Kept under its original name for any other cell that refers to it; this
# instance is the one used (and therefore fitted) by the linear model.
features_pipe = make_features_pipe()

lr_pipe = make_pipeline(
    features_pipe,
    LinearRegression(),
)

# The tree-based models use the same feature recipe, each on its own instance.
rf_pipe = make_pipeline(
    make_features_pipe(),
    RandomForestRegressor(n_estimators=20, random_state=42),
)

xg_pipe = make_pipeline(
    make_features_pipe(),
    XGBRegressor(n_estimators=20, random_state=42),
)

In [13]:
# Fit the linear-regression pipeline on the training split. The trailing
# semicolon suppresses the notebook's echo of the returned estimator.
lr_pipe.fit(X_train, y_train);

In [14]:
# Score the pipeline's predictions directly: predict() already yields one
# value per row, so no array needs to be allocated and filled before calling
# rmse (and a wrong-length prediction is no longer silently broadcast).
lr_pred_train = lr_pipe.predict(X_train)
lr_pred_dev = lr_pipe.predict(X_dev)

print("Train error with Linear Regression:", rmse(y_train, lr_pred_train))
print("Dev error with Linear Regression:", rmse(y_dev, lr_pred_dev))

Train error with Linear Regression: 76624.98730320783
Dev error with Linear Regression: 50133.70993858404


In [15]:
# Fit the random-forest pipeline on the training split. The trailing
# semicolon suppresses the notebook's echo of the returned estimator.
rf_pipe.fit(X_train, y_train);

In [16]:
# Score the pipeline's predictions directly: predict() already yields one
# value per row, so no array needs to be allocated and filled before calling
# rmse (and a wrong-length prediction is no longer silently broadcast).
rf_pred_train = rf_pipe.predict(X_train)
rf_pred_dev = rf_pipe.predict(X_dev)

print("Train error with Random Forest:", rmse(y_train, rf_pred_train))
print("Dev error with Random Forest:", rmse(y_dev, rf_pred_dev))

Train error with Random Forest: 30151.948182263655
Dev error with Random Forest: 37226.866988510024


In [26]:
# Fit the XGBoost pipeline on the training split. The trailing semicolon
# suppresses the notebook's echo of the returned estimator.
xg_pipe.fit(X_train, y_train);

In [27]:
# Score the pipeline's predictions directly: predict() already yields one
# value per row, so no array needs to be allocated and filled before calling
# rmse (and a wrong-length prediction is no longer silently broadcast).
xg_pred_train = xg_pipe.predict(X_train)
xg_pred_dev = xg_pipe.predict(X_dev)

print("Train error with XGBoost:", rmse(y_train, xg_pred_train))
print("Dev error with XGBoost:", rmse(y_dev, xg_pred_dev))

Train error with XGBoost: 14961.740490887054
Dev error with XGBoost: 57768.28516137212
