In [2]:
import sys
# Put the shared library directory on the import path (presumably where the
# `taller_model_selection` package imported below lives — confirm).
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path each time.
if '../../lib' not in sys.path:
    sys.path.append('../../lib')

In [3]:
from taller_model_selection.evaluate import load_train_dev_test

# Load the three data splits from the current directory. Train and dev are
# unpacked into (X, y) pairs; `test` is kept exactly as returned (a later cell
# iterates test[0] the same way it iterates X_train and X_dev).
# NOTE(review): the dict of split proportions shown below appears to be printed
# by this call — confirm in load_train_dev_test.
(X_train, y_train), (X_dev, y_dev), test = load_train_dev_test('.')

{'pct(train)': 0.7837289649483001, 'pct(dev)': 0.11952685477518159, 'pct(test)': 0.09674418027651828}


In [5]:
# Largest `created_on` value found in each split, in (train, dev, test) order.
tuple(
    max(record['created_on'] for record in split)
    for split in (X_train, X_dev, test[0])
)

('2021-03-01', '2021-04-15', '2021-05-01')

In [6]:
from taller_model_selection.metrics import rmse

import numpy as np

# Constant predictions used by the baselines: mean and median of y_train.
mean_price, median_price = np.mean(y_train), np.median(y_train)

In [7]:
import pandas as pd

# Score each constant baseline on train and dev: the prediction is the same
# value repeated once per target, compared against the targets with rmse.
baselines = [
    {
        'name': label,
        'train': rmse(y_train, [constant] * len(y_train)),
        'dev': rmse(y_dev, [constant] * len(y_dev)),
    }
    for label, constant in (
        ('mean_price', mean_price),
        ('median_price', median_price),
    )
]

pd.DataFrame(baselines)

Unnamed: 0,name,train,dev
0,mean_price,480141.367604,466108.164794
1,median_price,497116.9988,479062.43408


# Fit the model

In [8]:
from taller_model_selection.transformers import FeatureProjection
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Feature extraction used by both models: three branches whose outputs are
# concatenated column-wise by the union.
# NOTE(review): FeatureProjection is a project transformer; it presumably
# selects the listed fields from each record — confirm in
# taller_model_selection.transformers.
features_pipe = make_union(
    # Size/room-count fields; SimpleImputer() defaults to mean imputation.
    make_pipeline(
        FeatureProjection(['rooms', 'bedrooms', 'bathrooms', 'surface_total', 'surface_covered']),
        SimpleImputer()
    ),
    # 'l3': fill gaps with the most frequent value, then one-hot encode.
    # NOTE(review): with the default handle_unknown='error', a category that
    # only appears at predict time raises; also, `sparse` was renamed to
    # `sparse_output` in scikit-learn 1.2 — left unchanged to match the
    # installed version.
    make_pipeline(
        FeatureProjection(['l3']),
        SimpleImputer(strategy='most_frequent'),
        OneHotEncoder(sparse=False, drop='first')
    ),
    # 'l4': missing values become a single constant placeholder (the imputer's
    # default fill_value), which the encoder then treats as its own category.
    make_pipeline(
        FeatureProjection(['l4']),
        SimpleImputer(strategy='constant'),
        OneHotEncoder(sparse=False, drop='first')
    ),
)

# Both pipelines hold the SAME `features_pipe` instance (make_pipeline does not
# clone its steps), so fitting either pipeline refits the shared transformers.
# That is harmless only while both are fitted on the same training data.
lr_pipe = make_pipeline(
    features_pipe,
    LinearRegression()
)

# Fixed random_state so the forest (bootstrap samples / feature sub-sampling)
# and therefore the reported scores are reproducible across kernel restarts.
rf_pipe = make_pipeline(
    features_pipe,
    RandomForestRegressor(n_estimators=20, random_state=42)
)

In [9]:
# Fit the linear-regression pipeline on the training split; the fitted
# pipeline returned by fit() is what the cell displays.
lr_pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('featureprojection',
                                                                  FeatureProjection(fields=['rooms',
                                                                                            'bedrooms',
                                                                                            'bathrooms',
                                                                                            'surface_total',
                                                                                            'surface_covered'])),
                                                                 ('simpleimputer',
                                                                  SimpleImputer())])),
                                                ('pipeline-2',
                                                 Pi

In [10]:
# Fit the random-forest pipeline on the training split; the fitted pipeline
# returned by fit() is what the cell displays.
rf_pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('featureprojection',
                                                                  FeatureProjection(fields=['rooms',
                                                                                            'bedrooms',
                                                                                            'bathrooms',
                                                                                            'surface_total',
                                                                                            'surface_covered'])),
                                                                 ('simpleimputer',
                                                                  SimpleImputer())])),
                                                ('pipeline-2',
                                                 Pi

In [11]:
def eval_pipe(model_name, pipe):
    """Score a fitted pipeline on the train and dev splits.

    Reads the notebook-level X_train / y_train and X_dev / y_dev.

    Args:
        model_name: Label stored under the 'name' key of the result.
        pipe: Fitted estimator exposing ``predict``.

    Returns:
        dict with keys 'name', 'train' and 'dev'; the last two hold
        ``rmse(targets, predictions)`` for the respective split.
    """
    train_score = rmse(y_train, pipe.predict(X_train))
    dev_score = rmse(y_dev, pipe.predict(X_dev))
    return {'name': model_name, 'train': train_score, 'dev': dev_score}

In [12]:
# One row per model: the constant baselines first, then the two fitted pipelines.
df = pd.DataFrame([
    *baselines,
    eval_pipe('lr', lr_pipe),
    eval_pipe('rf', rf_pipe),
])
df

Unnamed: 0,name,train,dev
0,mean_price,480141.367604,466108.164794
1,median_price,497116.9988,479062.43408
2,lr,416289.600203,385109.650291
3,rf,205356.160365,317820.194874


In [13]:
# Dev RMSE of each row as a fraction of the largest (worst) dev RMSE in the
# table: lower is better and the worst row reads 1.0.
df['pct_lift'] = df['dev'].div(df['dev'].max())
df.sort_values(by='dev')

Unnamed: 0,name,train,dev,pct_lift
3,rf,205356.160365,317820.194874,0.663421
2,lr,416289.600203,385109.650291,0.803882
0,mean_price,480141.367604,466108.164794,0.972959
1,median_price,497116.9988,479062.43408,1.0
