In [None]:
# Download the two fastText model files that the feature pipeline further down
# loads by name: 'title_fasttext.bin' and 'desc_fasttext.bin'.
# `-O` makes wget write to exactly those file names in the current directory.
# NOTE(review): re-running this cell downloads both files again.
!wget -O title_fasttext.bin https://www.dropbox.com/s/9iltaakdc0pe0ma/title_fasttext.bin?dl=0
!wget -O desc_fasttext.bin https://www.dropbox.com/s/d2zh0u0knqjcvtl/desc_fasttext.bin?dl=0

In [1]:
import pandas as pd

import sys
# Extend the module search path with '../../lib' (relative to the current
# working directory). This has to run before the `taller_model_selection`
# imports in the cells below.
# NOTE(review): presumably that package lives under ../../lib — confirm.
sys.path.append('../../lib')

In [2]:
from taller_model_selection.evaluate import load_train_dev_test

# Load the three splits from the current directory, then unpack the train and
# dev splits into (features, target) pairs; the test split is kept as returned.
train_split, dev_split, test = load_train_dev_test('.')
X_train, y_train = train_split
X_dev, y_dev = dev_split

{'pct(train)': 0.7837289649483001, 'pct(dev)': 0.11952685477518159, 'pct(test)': 0.09674418027651828}


In [3]:
from taller_model_selection.transformers import FeatureProjection, PretrainedFastTextTransformer
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


# Text branch for the 'title' column, backed by the title_fasttext.bin model file.
title_embedding = PretrainedFastTextTransformer('title_fasttext.bin', 'title')

# Text branch for the 'description' column.
# Important: descriptions are long, so processing them takes quite a while.
# It could make sense to precompute them and store the result on disk.
description_embedding = PretrainedFastTextTransformer('desc_fasttext.bin', 'description')

# Numeric branch: select the numeric columns and impute missing values with
# SimpleImputer's default strategy.
numeric_columns = ['rooms', 'bedrooms', 'bathrooms', 'surface_total', 'surface_covered']
numeric_features = make_pipeline(
    FeatureProjection(numeric_columns),
    SimpleImputer(),
)

# Categorical branch for 'l3': fill gaps with the most frequent value, then
# one-hot encode as a dense array, dropping the first category of the feature.
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output`
# in scikit-learn 1.2 — confirm the installed version still accepts `sparse`.
l3_features = make_pipeline(
    FeatureProjection(['l3']),
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse=False, drop='first'),
)

# Categorical branch for 'l4': same encoding, but gaps are filled with the
# imputer's constant placeholder instead of the most frequent value.
l4_features = make_pipeline(
    FeatureProjection(['l4']),
    SimpleImputer(strategy='constant'),
    OneHotEncoder(sparse=False, drop='first'),
)

# Concatenate the output of every branch into a single feature matrix.
features_pipe = make_union(
    title_embedding,
    description_embedding,
    numeric_features,
    l3_features,
    l4_features,
)

# Linear-regression model stacked on the feature union.
# Note: rf_pipe is built from this same `features_pipe` object, so fitting
# either pipeline (re)fits the shared feature transformers.
lr_pipe = make_pipeline(features_pipe, LinearRegression())


In [4]:
# Random-forest model (20 trees) stacked on the same feature union as lr_pipe.
# random_state is pinned so that re-running the notebook builds the same forest;
# without it the train/dev scores change on every run and the
# "title" vs "title + description" comparison below is noisy.
rf_pipe = make_pipeline(
    features_pipe,
    RandomForestRegressor(n_estimators=20, random_state=42)
)

Si estás usando el encoder de la descripción, el código de abajo tarda aproximadamente 5 minutos en correr.

In [5]:
# Fit both models on the training split, linear regression first.
# Using a loop (rather than a trailing expression) keeps the cell output empty.
for pipe in (lr_pipe, rf_pipe):
    pipe.fit(X_train, y_train)



# Resultados agregando el título

In [8]:
from taller_model_selection.evaluate import Evaluator

# Make sure the feature pipeline above includes the title and NOT the
# description before running this cell.
ev = Evaluator(X_train, y_train, X_dev, y_dev)
for label, pipe in (('lr', lr_pipe), ('rf', rf_pipe)):
    ev.eval_pipe(label, pipe)

# Show the collected evaluations as a table.
pd.DataFrame(ev.evaluations)

Unnamed: 0,name,train,dev
0,lr,392472.789838,358645.591731
1,rf,127622.563339,271146.93304


# Resultados con título + descripción

In [12]:
from taller_model_selection.evaluate import Evaluator

# Make sure the feature pipeline above includes both the title and the
# description before running this cell.
ev = Evaluator(X_train, y_train, X_dev, y_dev)
pipes_by_name = {'lr': lr_pipe, 'rf': rf_pipe}
for name, pipe in pipes_by_name.items():
    ev.eval_pipe(name, pipe)

# Show the collected evaluations as a table.
pd.DataFrame(ev.evaluations)

Unnamed: 0,name,train,dev
0,lr,380659.911304,346740.224595
1,rf,113296.026345,262448.879737


In [None]:
# A bare `eval_pipe` function is not defined or imported anywhere in this
# notebook, so calling it raises NameError. Evaluation is done through
# Evaluator.eval_pipe, and the results are read from `ev.evaluations`, as in
# the cells above.
# A fresh Evaluator is used so the table holds only these two evaluations.
ev = Evaluator(X_train, y_train, X_dev, y_dev)
ev.eval_pipe('lr', lr_pipe)
ev.eval_pipe('rf', rf_pipe)

df = pd.DataFrame(ev.evaluations)
df