# Setup en colab

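Crea un acceso directo (shortcut) en tu Google Drive a la carpeta de [datos](https://drive.google.com/drive/folders/1djjceNkO42vrB10PubYTzQydfccPbzdB?usp=sharing). El notebook la lee desde `/content/gdrive/MyDrive/taller-model-selection-data`, así que el acceso directo debe quedar en la raíz de "Mi unidad" con el nombre `taller-model-selection-data`.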



In [None]:
!pip install fasttext # necesario si vas a usar los embeddings de titulo/descripcion

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 3.9 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.10.0-py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3163457 sha256=781f6dd1b5e09017a5706b00777746d425cfe6be6171712e754eedef19adc9bb
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.0


In [1]:
# Clone the course repo so the code of its lib can be used from this notebook
!git clone https://github.com/elsonidoq/ml-practico-2022.git 

Cloning into 'ml-practico-2022'...
remote: Enumerating objects: 288, done.[K
remote: Counting objects: 100% (288/288), done.[K
remote: Compressing objects: 100% (200/200), done.[K
remote: Total 288 (delta 172), reused 197 (delta 85), pack-reused 0[K
Receiving objects: 100% (288/288), 3.63 MiB | 14.81 MiB/s, done.
Resolving deltas: 100% (172/172), done.


In [2]:
import sys

# Make the cloned repo's lib/ folder importable. The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate entries.
LIB_PATH = 'ml-practico-2022/lib'
if LIB_PATH not in sys.path:
    sys.path.append(LIB_PATH)

In [3]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; the data folder read later lives under it
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Cargamos los datos y evaluamos baselines

In [4]:
from taller_model_selection.evaluate import load_train_dev_test

# Data folder on the mounted Drive (the shortcut created during setup)
DATA_DIR = '/content/gdrive/MyDrive/taller-model-selection-data'

# The loader returns three splits; train and dev are unpacked into (X, y),
# while `test` is kept packed (presumably also an (X, y) pair -- confirm
# against load_train_dev_test).
train_split, dev_split, test = load_train_dev_test(DATA_DIR)
X_train, y_train = train_split
X_dev, y_dev = dev_split

{'pct(train)': 0.7837289649483001, 'pct(dev)': 0.11952685477518159, 'pct(test)': 0.09674418027651828}


In [None]:
# Latest 'created_on' value found in each split, in train / dev / test order
tuple(
    max(row['created_on'] for row in split)
    for split in (X_train, X_dev, test[0])
)

('2021-03-01', '2021-04-15', '2021-05-01')

In [5]:
import numpy as np

# Constant predictions used as baselines below: the median and the mean of
# the training target.
median_price = np.median(y_train)
mean_price = np.mean(y_train)

In [6]:
from taller_model_selection.evaluate import Evaluator

# The evaluator receives the train and dev splits; the cells below use it to
# score predictions/pipelines, and the results are read back from
# `evaluator.evaluations`.
evaluator = Evaluator(X_train, y_train, X_dev, y_dev)

In [7]:
# Baseline: predict the training mean for every row of both splits
evaluator.eval_prediction(
    'mean_price',
    y_hat_train=[mean_price] * len(y_train),
    y_hat_dev=[mean_price] * len(y_dev),
)

{'name': 'mean_price', 'train': 480141.3676039339, 'dev': 466108.16479362577}

In [8]:
# Baseline: predict the training median for every row of both splits
evaluator.eval_prediction(
    'median_price',
    y_hat_train=[median_price] * len(y_train),
    y_hat_dev=[median_price] * len(y_dev),
)

{'name': 'median_price', 'train': 497116.9988004173, 'dev': 479062.4340803305}

In [9]:
import pandas as pd

# Tabulate every evaluation recorded so far (one row per evaluated name)
pd.DataFrame(evaluator.evaluations)

Unnamed: 0,name,train,dev
0,mean_price,480141.367604,466108.164794
1,median_price,497116.9988,479062.43408


# Entrenamos y comparamos modelos

In [12]:
from taller_model_selection.transformers import FeatureProjection, TargetEncoder
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the random-forest scores are reproducible across re-runs
RANDOM_STATE = 42


def build_features_pipe():
    """Build a fresh, unfitted feature union shared in shape by both models.

    The union concatenates three feature groups:
      * the room/surface fields, with missing values filled by SimpleImputer
        (default strategy, i.e. the mean),
      * TargetEncoder applied to 'l3',
      * a one-hot encoding of 'l4', after replacing missing values with a
        constant.

    A new object is returned on every call so that each model pipeline owns
    its own transformers instead of sharing (and re-fitting) the same ones.
    """
    return make_union(
        make_pipeline(
            FeatureProjection(['rooms', 'bedrooms', 'bathrooms', 'surface_total', 'surface_covered']),
            SimpleImputer()
        ),
        # Alternative encoding for 'l3': you can try the target encoder,
        # one-hot, or both.
        # make_pipeline(
        #     FeatureProjection(['l3']),
        #     SimpleImputer(strategy='most_frequent'),
        #     OneHotEncoder(sparse=False, drop='first')
        # ),
        TargetEncoder('l3'),
        make_pipeline(
            FeatureProjection(['l4']),
            SimpleImputer(strategy='constant'),
            # NOTE: `sparse` was renamed to `sparse_output` in scikit-learn 1.2
            # and removed in 1.4; switch the keyword when upgrading.
            OneHotEncoder(sparse=False, drop='first')
        ),
    )


# Kept under its original name; this instance is the one fitted through lr_pipe.
features_pipe = build_features_pipe()

lr_pipe = make_pipeline(
    features_pipe,
    LinearRegression()
)

# The forest gets its own feature union so that fitting one pipeline never
# overwrites the fitted transformers of the other.
rf_pipe = make_pipeline(
    build_features_pipe(),
    RandomForestRegressor(n_estimators=20, random_state=RANDOM_STATE)
)

In [13]:
# Fit the linear-regression pipeline (features + model) on the training split
lr_pipe.fit(X_train, y_train)

Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('featureprojection',
                                                                  FeatureProjection(fields=['rooms',
                                                                                            'bedrooms',
                                                                                            'bathrooms',
                                                                                            'surface_total',
                                                                                            'surface_covered'])),
                                                                 ('simpleimputer',
                                                                  SimpleImputer())])),
                                                ('targetencoder',
                                                

In [14]:
# Fit the random-forest pipeline (features + model) on the training split
rf_pipe.fit(X_train, y_train)

Pipeline(steps=[('featureunion',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('featureprojection',
                                                                  FeatureProjection(fields=['rooms',
                                                                                            'bedrooms',
                                                                                            'bathrooms',
                                                                                            'surface_total',
                                                                                            'surface_covered'])),
                                                                 ('simpleimputer',
                                                                  SimpleImputer())])),
                                                ('targetencoder',
                                                

In [15]:
# Score the fitted linear pipeline on train and dev; recorded under the name 'lr'
evaluator.eval_pipe('lr', lr_pipe)

{'name': 'lr', 'train': 416657.1951026198, 'dev': 386008.88681383367}

In [16]:
# Score the fitted random-forest pipeline on train and dev; recorded under the name 'rf'
evaluator.eval_pipe('rf', rf_pipe)

{'name': 'rf', 'train': 198646.09369999726, 'dev': 287474.7091955039}

In [17]:
# Collect all recorded evaluations (baselines and models) into one table
df = pd.DataFrame(evaluator.evaluations)
df

Unnamed: 0,name,train,dev
0,mean_price,480141.367604,466108.164794
1,median_price,497116.9988,479062.43408
2,lr,416657.195103,386008.886814
3,rf,198646.0937,287474.709196


In [18]:
# Express each dev score as a fraction of the largest (worst) dev score in the
# table, then list the rows from lowest to highest dev score.
worst_dev = df['dev'].max()
df['pct_lift'] = df['dev'] / worst_dev
df.sort_values(by='dev')

Unnamed: 0,name,train,dev,pct_lift
3,rf,198646.0937,287474.709196,0.600078
2,lr,416657.195103,386008.886814,0.805759
0,mean_price,480141.367604,466108.164794,0.972959
1,median_price,497116.9988,479062.43408,1.0
