In [1]:
# Enable IPython's autoreload extension in mode 2, so that edited modules are
# re-imported before each cell executes and no kernel restart is needed.
%load_ext autoreload
%autoreload 2

In [2]:
import altair as alt
import m2cgen as m2c
import pandas as pd
from category_encoders import TargetEncoder, QuantileEncoder
from lightgbm import LGBMRegressor
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder

from experiments.experiment import Experiment
from experiments.polar import LatLonPolar
from model.extract import extract

# Render floats with a thousands separator and two decimals in every displayed frame.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Data Sample

In [3]:
# Load the 'RENTAL' extract and add a `month` column holding the first day of
# the month in which each row was created.
X = extract('RENTAL')
month_start_labels = X['created_date'].dt.strftime('%Y-%m-01')
X['month'] = pd.to_datetime(month_start_labels)
print(X.shape)
# Transposed so each sampled row reads as a column.
X.sample(5).T

(34486, 14)


title,Apartamento estilo Loft Centro na Lapa,Oportunidade Imperdível no Coração da Tijuca!,"Apartamento para Aluguel - Taquara, 2 Quartos, 42 m2","Recreio dos Bandeirantes | Apartamento 4 quartos, sendo 3 suites",Apartamento para aluguel com 65 metros quadrados com 2 quartos
url,/imovel/apartamento-1-quartos-centro-zona-central-rio-de-janeiro-40m2-aluguel-RS1100-id-2594569902/,/imovel/aluguel-apartamento-2-quartos-tijuca-zona-norte-rio-de-janeiro-rj-100m2-id-2604719781/,/imovel/apartamento-2-quartos-taquara-zona-oeste-rio-de-janeiro-com-garagem-42m2-aluguel-RS990-id-2622525892/,/imovel/aluguel-apartamento-4-quartos-com-interfone-recreio-dos-bandeirantes-zona-oeste-rio-de-janeiro-rj-172m2-id-2582913516/,/imovel/apartamento-2-quartos-maracana-zona-norte-rio-de-janeiro-65m2-aluguel-RS1300-id-2531069409/
origin,vivareal,zapimoveis,vivareal,zapimoveis,vivareal
neighborhood,Centro,Tijuca,Taquara,Recreio Dos Bandeirantes,Maracanã
usable_area,40,100,42,172,65
unit_types,APARTMENT,APARTMENT,APARTMENT,APARTMENT,APARTMENT
floors,10,0,0,0,0
bedrooms,1,2,2,4,2
bathrooms,1,3,1,4,1
suites,0,1,0,3,0
parking_spaces,0,1,1,3,0
amenities,DISABLED_ACCESS|GATED_COMMUNITY|ELECTRONIC_GAT...,,GARAGE|SPORTS_COURT|PARTY_HALL,INTERCOM|SAFETY_CIRCUIT,
lat,-22.91,0.00,0.00,0.00,-22.91


# Simple Model

In [4]:
# Split the target column off the feature frame.
# pop() removes 'total' from X in place, so an unguarded second run of this
# cell would raise KeyError; the membership check keeps the cell re-runnable
# (y then simply keeps the value from the first run).
if 'total' in X.columns:
    y = X.pop('total')

# The third positional argument is reported as n_splits in the Experiment repr.
exp = Experiment(X, y, 5)
exp

Experiment(n_splits=5, metrics=[<function mean_absolute_error at 0x7f285d641e10>, <function rmse at 0x7f285d6aacb0>, <function mean_absolute_percentage_error at 0x7f285d641f30>])

In [5]:
# Column groups shared by the feature-set variants below. Each variant adds one
# ingredient to the previous one so their test metrics can be compared.
NUMERIC_COLS = ['usable_area', 'bedrooms', 'bathrooms', 'parking_spaces']
GEO_COLS = ['lat', 'lon']

# 1) Numeric listing attributes only.
simple_pipe = make_column_transformer(
    ('passthrough', NUMERIC_COLS)
)
exp_simple_pipe = exp.run('simple', LGBMRegressor(), {}, simple_pipe)

# 2) + one-hot unit type.
# handle_unknown='ignore' matches the neighborhood variant below: a category
# that is absent from a training fold is encoded as all zeros instead of
# raising at transform time.
simple_pipe_unit = make_column_transformer(
    ('passthrough', NUMERIC_COLS),
    (OneHotEncoder(handle_unknown='ignore'), ['unit_types'])
)
exp_pipe_unit = exp.run('unit', LGBMRegressor(), {}, simple_pipe_unit)

# 3) + raw latitude / longitude.
simple_pipe_latlon = make_column_transformer(
    ('passthrough', NUMERIC_COLS + GEO_COLS),
    (OneHotEncoder(handle_unknown='ignore'), ['unit_types'])
)
exp_pipe_latlon = exp.run('latlng', LGBMRegressor(), {}, simple_pipe_latlon)

# 4) + one-hot neighborhood.
simple_pipe_neighborhood = make_column_transformer(
    ('passthrough', NUMERIC_COLS + GEO_COLS),
    (OneHotEncoder(handle_unknown='ignore'), ['neighborhood', 'unit_types'])
)
exp_pipe_neighborhood = exp.run('neighborhood', LGBMRegressor(), {}, simple_pipe_neighborhood)

# 5) Same columns as (4), categoricals target-encoded instead of one-hot.
simple_pipe_target = make_column_transformer(
    ('passthrough', NUMERIC_COLS + GEO_COLS),
    (TargetEncoder(), ['neighborhood', 'unit_types'])
)
exp_pipe_target = exp.run('neighborhood_target', LGBMRegressor(), {}, simple_pipe_target)

# 6) Same columns as (4), categoricals quantile-encoded.
simple_pipe_quantile = make_column_transformer(
    ('passthrough', NUMERIC_COLS + GEO_COLS),
    (QuantileEncoder(), ['neighborhood', 'unit_types'])
)
exp_pipe_quantile = exp.run('neighborhood_quantile', LGBMRegressor(), {}, simple_pipe_quantile)

# Mean of each metric over the test rows of every run, best MAE first.
exp.obs_metrics.query('split_name == "test"').groupby(['name', 'metric']).value.mean().unstack().sort_values('mae')

metric,mae,mape,rmse
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
neighborhood,2609.91,0.5,20267.21
neighborhood_target,2629.03,0.49,19865.1
neighborhood_quantile,2666.86,0.52,19902.39
latlng,2995.75,0.59,21425.96
unit,3055.2,0.63,22509.97
simple,3135.44,0.64,22565.79


In [6]:
# Variant of the target-encoded feature set in which lat/lon are not passed
# through raw: together with 'neighborhood' they go through the project's
# LatLonPolar transformer (constructed with 'neighborhood').
polar_target = make_column_transformer(
    ('passthrough', ['usable_area', 'bedrooms', 'bathrooms', 'parking_spaces']),
    (LatLonPolar('neighborhood'), ['neighborhood', 'lat', 'lon']),
    (TargetEncoder(), ['neighborhood', 'unit_types'])
)
# Bound to its own name so that the 'neighborhood_target' result already held
# in exp_pipe_target is not overwritten by this run.
exp_pipe_polar_target = exp.run('polar_target', LGBMRegressor(), {}, polar_target)

# Mean of each metric over the test rows of every run so far, best MAE first.
exp.obs_metrics.query('split_name == "test"').groupby(['name', 'metric']).value.mean().unstack().sort_values('mae')

metric,mae,mape,rmse
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
neighborhood,2609.91,0.5,20267.21
neighborhood_target,2629.03,0.49,19865.1
neighborhood_quantile,2666.86,0.52,19902.39
polar_target,2790.56,0.56,22530.84
latlng,2995.75,0.59,21425.96
unit,3055.2,0.63,22509.97
simple,3135.44,0.64,22565.79


In [7]:
def clip_target(X, y, quantiles=(.05, .95)):
    """Clip the target to a quantile band and return it with the untouched features.

    Used in this notebook as the ``preprocess_train_fn`` argument of ``exp.run``.

    Parameters
    ----------
    X : any
        Feature data; returned as-is (the same object, not a copy).
    y : pandas.Series
        Target values; must provide ``quantile`` and ``clip``.
    quantiles : sequence of two floats, default (0.05, 0.95)
        Lower and upper quantile levels. Only the first two entries are read.
        The default is a tuple rather than a list so that no mutable object is
        shared between calls.

    Returns
    -------
    tuple
        ``(X, y_clipped)`` where ``y_clipped`` is a new Series with values below
        the lower quantile raised to it and values above the upper quantile
        lowered to it. ``y`` itself is not modified.
    """
    lower_bound = y.quantile(quantiles[0])
    upper_bound = y.quantile(quantiles[1])
    return X, y.clip(lower=lower_bound, upper=upper_bound)

# Re-run two earlier feature sets, passing clip_target as preprocess_train_fn.
# NOTE(review): the run labelled 'clipped latlng' is fed simple_pipe_target
# (lat/lon plus target-encoded neighborhood and unit type), not
# simple_pipe_latlon. Confirm whether the label or the pipeline is the intended one.
exp_clipped_latlon = exp.run(
    'clipped latlng',
    LGBMRegressor(),
    {},
    simple_pipe_target,
    preprocess_train_fn=clip_target,
)

exp_clipped_neighborhood = exp.run(
    'clipped neighborhood',
    LGBMRegressor(),
    {},
    simple_pipe_neighborhood,
    preprocess_train_fn=clip_target,
)

# NOTE(review): this table aggregates with the median, whereas the earlier
# summary tables in this notebook use the mean, so figures for the same run
# differ between tables. Confirm which aggregate is intended.
test_scores = exp.obs_metrics.query('split_name == "test"')
test_scores.groupby(['name', 'metric'])['value'].median().unstack().sort_values('mae')

metric,mae,mape,rmse
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
clipped latlng,2463.49,0.28,26635.3
clipped neighborhood,2479.93,0.29,26648.42
neighborhood,2683.82,0.45,23328.0
neighborhood_target,2708.29,0.5,22990.64
polar_target,2761.0,0.49,23469.87
neighborhood_quantile,2782.88,0.54,23237.33
latlng,2997.26,0.55,26564.81
unit,3125.15,0.6,26187.6
simple,3196.31,0.61,26214.83


In [8]:
# One series per "metric | run name | split name"; only the MAPE rows are plotted.
recorded_metrics = exp.obs_metrics
series_label = (
    recorded_metrics['metric'] + ' | ' + recorded_metrics['name'] + ' | ' + recorded_metrics['split_name']
)
base = alt.Chart(
    recorded_metrics.assign(name=series_label).query('metric == "mape"')
)

# Clicking legend entries toggles series; unselected series become fully transparent.
# NOTE(review): selection_multi / add_selection are the pre-5.0 Altair names
# (selection_point / add_params in Altair 5). Check the pinned Altair version
# before upgrading.
selection = alt.selection_multi(fields=['name'], bind='legend')
opacity = alt.condition(selection, alt.value(1.0), alt.value(0))

# Encodings shared by the point layer and the line layer.
shared_encoding = dict(x='split', y='value', color='name', opacity=opacity)
marker_layer = base.mark_point().encode(tooltip=['name', 'value'], **shared_encoding)
line_layer = base.mark_line().encode(**shared_encoding)

(
    marker_layer + line_layer
).add_selection(selection).properties(width=900, height=600).interactive(bind_x=False)

In [13]:
# Cross-tabulate true vs. predicted target after binning both on the same edges.
# NOTE(review): X and y are the full frames the experiment was built from, so
# this table presumably includes rows the model was fitted on. Confirm what
# predict(X, y) does on the object returned by exp.run.
y_pred = exp_clipped_latlon.predict(X, y)

# Bin edges: 0 followed by the right edge of each of the 15 quantile bins of y.
points = [0] + [i.right for i in pd.qcut(y, 15).cat.categories.values]

(
    pd.DataFrame({
        'true': pd.cut(y, points).tolist(),
        'pred': pd.cut(y_pred, points).tolist()
    })
    # Both columns are group keys, so the group size is the cell count.
    # size() states that directly, instead of value_counts() with no columns
    # left to count.
    .groupby(['true', 'pred'])
    .size()
    # (true, pred) pairs that never occur become 0 and the counts stay integers.
    .unstack(fill_value=0)
    .style.background_gradient(axis=1)
)

pred,"(0.0, 1350.0]","(1350.0, 1720.0]","(1720.0, 2070.0]","(2070.0, 2411.0]","(2411.0, 2827.0]","(2827.0, 3250.0]","(3250.0, 3698.0]","(3698.0, 4200.0]","(4200.0, 4880.0]","(4880.0, 5924.0]","(5924.0, 7500.0]","(7500.0, 10450.0]","(10450.0, 15400.0]","(15400.0, 24300.0]","(24300.0, 2660000.0]"
true,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
"(0.0, 1350.0]",230,1388,400,125,58,17,21,17,37,24,8,8,0,0,0
"(1350.0, 1720.0]",34,794,742,481,147,37,12,9,1,7,4,3,0,1,0
"(1720.0, 2070.0]",10,454,620,622,409,123,42,10,2,6,0,0,2,0,0
"(2070.0, 2411.0]",2,122,399,621,638,351,103,34,16,6,3,1,0,0,0
"(2411.0, 2827.0]",1,31,144,364,684,639,251,101,44,27,3,4,2,0,0
"(2827.0, 3250.0]",0,14,62,186,425,644,495,243,183,68,14,0,0,0,1
"(3250.0, 3698.0]",0,5,35,98,201,537,525,374,359,95,24,7,4,0,0
"(3698.0, 4200.0]",0,0,9,44,81,252,448,397,626,368,94,16,5,0,0
"(4200.0, 4880.0]",0,7,2,52,38,92,177,303,594,693,245,56,5,0,0
"(4880.0, 5924.0]",0,0,4,11,14,27,94,133,366,818,555,217,41,8,5


In [12]:
# Pair each input column of the fitted column transformer with the importance
# reported by the final estimator, most important first.
fitted_pipeline = exp_clipped_latlon.model
column_transformer = fitted_pipeline[:-1][0]
# _columns holds one list of column names per transformer; flatten them in order.
feature_names = [column for group in column_transformer._columns for column in group]

pd.DataFrame({
    'feature': feature_names,
    'imp': fitted_pipeline._final_estimator.feature_importances_
}).sort_values('imp', ascending=False)

Unnamed: 0,feature,imp
0,usable_area,1014
4,lat,431
5,lon,371
6,neighborhood,371
2,bathrooms,285
3,parking_spaces,238
1,bedrooms,198
7,unit_types,92


In [11]:
# Transpile the final estimator of the 'clipped neighborhood' run into Go source
# exposing a single function named Predict. m2cgen (m2c) is imported with the
# rest of the dependencies in the notebook's import cell.
code = m2c.export_to_go(exp_clipped_neighborhood.model._final_estimator, function_name='Predict')
code

'func Predict(input []float64) float64 {\n    var var0 float64\n    if input[0] > 220.50000000000003 {\n        if input[9] > 0.000000000000000000000000000000000010000000180025095 {\n            if input[0] > 465.50000000000006 {\n                var0 = 8848.792385028028\n            } else {\n                if input[52] > 0.000000000000000000000000000000000010000000180025095 {\n                    if input[0] > 310.50000000000006 {\n                        var0 = 8321.402760801138\n                    } else {\n                        var0 = 7862.339542949686\n                    }\n                } else {\n                    if input[0] > 250.50000000000003 {\n                        var0 = 8644.104809142893\n                    } else {\n                        var0 = 8211.766526556768\n                    }\n                }\n            }\n        } else {\n            if input[0] > 339.50000000000006 {\n                if input[5] <= -43.48908424377441 {\n                    