In [26]:
import sys

# Make the project-local packages under ../../lib importable
# (presumably where `taller_model_selection` lives; confirm against the repo layout).
LIB_DIR = '../../lib'
# Guard the append so re-running this cell does not pile up duplicate entries.
if LIB_DIR not in sys.path:
    sys.path.append(LIB_DIR)

In [2]:
import json

def iter_jl(fname):
    """Lazily yield one decoded JSON value per line of a JSON Lines file.

    Parameters
    ----------
    fname : str or path-like
        Path of the ``.jl`` file to read.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(fname, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank or whitespace-only lines carry no record; json.loads would
            # raise on them, so skip them.
            if not line:
                continue
            yield json.loads(line)

In [3]:
# Materialise both JSON Lines files in memory: features first, then targets.
X = list(iter_jl('X_train.jl'))
y = list(iter_jl('y_train.jl'))

In [10]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/dev/test partition (and every score computed from it)
# is the same on each Restart & Run All.
SPLIT_SEED = 42

# First split: scikit-learn's default test_size (0.25) holds out a quarter of the rows.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, random_state=SPLIT_SEED
)
# Second split: halve the hold-out into a dev set (model selection) and a test set.
X_dev, X_test, y_dev, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=SPLIT_SEED
)
len(X_train), len(y_train), len(X_dev), len(y_dev), len(X_test), len(y_test)

(121272, 121272, 20212, 20212, 20212, 20212)

In [48]:
from taller_model_selection.metrics import rmse

In [49]:
import numpy as np

# Constant predictions for the naive baselines, computed from the training
# targets only so nothing from dev/test leaks into them.
mean_price, median_price = np.mean(y_train), np.median(y_train)

In [42]:
import pandas as pd

def _constant_baseline(name, value):
    """Score a predictor that always outputs `value` on the train and dev targets."""
    return {
        'name': name,
        'train': rmse(y_train, [value] * len(y_train)),
        'dev': rmse(y_dev, [value] * len(y_dev)),
    }


# One row per naive baseline; the models fitted later are compared against these.
baselines = [
    _constant_baseline(label, constant)
    for label, constant in (
        ('mean_price', mean_price),
        ('median_price', median_price),
    )
]

pd.DataFrame(baselines)

Unnamed: 0,name,train,dev
0,mean_price,454961.082819,454833.63318
1,median_price,470254.249757,470844.314292


# Fit the model

In [27]:
from taller_model_selection.transformers import FeatureProjection
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Seed for the random forest so its scores are reproducible across runs.
RF_SEED = 42


def make_features():
    """Build a fresh, unfitted feature-extraction union.

    Three parallel branches are concatenated by ``make_union``:
    numeric columns (imputed with SimpleImputer's default strategy), and the
    categorical columns ``l3`` and ``l4`` (imputed, then one-hot encoded).

    A new object is returned on every call so that each model pipeline owns
    its own transformers; sharing one instance between pipelines means that
    fitting one pipeline silently refits the features of the other.
    """
    # NOTE(review): `FeatureProjection` is project code; presumably it selects
    # the listed fields from each record. Confirm against its implementation.
    # NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2
    # and removed in 1.4; kept as-is to match the environment this notebook
    # was run in.
    # NOTE(review): the encoders use the default handle_unknown='error', so a
    # category that appears only in dev/test rows will make transform raise.
    return make_union(
        make_pipeline(
            FeatureProjection(['rooms', 'bedrooms', 'bathrooms', 'surface_total', 'surface_covered']),
            SimpleImputer()
        ),
        make_pipeline(
            FeatureProjection(['l3']),
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(sparse=False, drop='first')
        ),
        make_pipeline(
            FeatureProjection(['l4']),
            SimpleImputer(strategy='constant'),
            OneHotEncoder(sparse=False, drop='first')
        ),
    )


# Kept under its original name for any cell that inspects the feature union.
features_pipe = make_features()

# Each model gets its own feature union (see make_features docstring).
lr_pipe = make_pipeline(
    make_features(),
    LinearRegression()
)

rf_pipe = make_pipeline(
    make_features(),
    RandomForestRegressor(n_estimators=20, random_state=RF_SEED)
)

In [32]:
# Fit the feature union and the linear regression on the training split only.
lr_pipe.fit(X_train, y_train)

In [33]:
# Fit the feature union and the random forest on the training split only.
rf_pipe.fit(X_train, y_train)

In [44]:
def eval_pipe(model_name, pipe):
    """Score a fitted pipeline on the train and dev splits.

    Parameters
    ----------
    model_name : str
        Label stored under the ``'name'`` key of the result.
    pipe : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    dict
        ``{'name', 'train', 'dev'}`` where the last two are the ``rmse`` of the
        predictions against the notebook-level ``y_train`` / ``y_dev``.
    """
    train_rmse = rmse(y_train, pipe.predict(X_train))
    dev_rmse = rmse(y_dev, pipe.predict(X_dev))
    return {'name': model_name, 'train': train_rmse, 'dev': dev_rmse}

In [45]:
# Score the fitted models and stack them under the naive baselines.
model_scores = [
    eval_pipe(label, fitted)
    for label, fitted in (('lr', lr_pipe), ('rf', rf_pipe))
]
df = pd.DataFrame(baselines + model_scores)
df

Unnamed: 0,name,train,dev
0,mean_price,454961.082819,454833.63318
1,median_price,470254.249757,470844.314292
2,lr,389793.704125,393506.516338
3,rf,190121.645911,306484.490109


In [47]:
# Dev RMSE of each row relative to the worst (largest) dev RMSE in the table:
# the worst entry scores 1.0 and lower means better.
# NOTE(review): despite the column name this is a ratio, not a percentage lift.
worst_dev_rmse = df['dev'].max()
df['pct_lift'] = df['dev'] / worst_dev_rmse
df.sort_values(by='dev')

Unnamed: 0,name,train,dev,pct_lift
3,rf,190121.645911,306484.490109,0.650925
2,lr,389793.704125,393506.516338,0.835747
0,mean_price,454961.082819,454833.63318,0.965996
1,median_price,470254.249757,470844.314292,1.0
