In [1]:
# General Purpose
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

from tqdm import tqdm
from tqdm import tqdm_notebook
from tqdm import tqdm_pandas
import timeit

# Data access
from dataAccess import aws_df_from_S3_csv

# FE
from CustomTransform import DistanceEncoder, SizeMeanEncoder, RangeEncoder, CustomKNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Pipeline
from sklearn.pipeline import Pipeline

# Models
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Evaluation
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score
from auxModelFunctions import createCenteredGridSearch, modelEvaluate, bestModel

## Lectura de los datos y agregados previos (mi cole y restaurantes)

In [2]:
# Load the train and test splits from the local CSV copies.
#
# Disabled alternative that reads the same file names from S3:
#dataTrain = aws_df_from_S3_csv(file = "04-2021/datos-dep/depDataTrain_Arnau_20-08.csv", bucket = 'datos-viviendas')
#dataTest = aws_df_from_S3_csv(file = "04-2021/datos-dep/depDataTest_Arnau_20-08.csv", bucket = 'datos-viviendas')

dataTrain, dataTest = (
    pd.read_csv(csv_name)
    for csv_name in ('depDataTrain_Arnau_20-08.csv', 'depDataTest_Arnau_20-08.csv')
)

In [3]:
# Split each loaded frame into features (X_*) and target (y_*).
#
# The selection is non-mutating on purpose: DataFrame.pop() removes the column
# from the loaded frame, so running this cell a second time raised KeyError and
# dataTrain/dataTest silently lost their target. With drop() the cell is
# idempotent; note that dataTrain/dataTest now keep the target column, so use
# X_train/X_test wherever features only are needed.

target = 'ide_price'

y_train = dataTrain[target]
X_train = dataTrain.drop(columns=[target])

y_test = dataTest[target]
X_test = dataTest.drop(columns=[target])

## Definición de los columns transformers

En el siguiente apartado definimos todos los transformers que sean necesarios antes de crear el pipeline

In [4]:
# Distance encoders: one DistanceEncoder per named origin, each producing a
# 'fe_distance_<name>' column from the latitude/longitude columns.
distanceOrigin = [['sol', [40.414650, -3.700400]],
                  ['nmi', [40.446278, -3.691814]],
                  ['pca', [40.466070, -3.689280]]]
distanceEncoderColumn = ['ide_latitude', 'ide_longitude']
distanceEncoder = [
    DistanceEncoder('fe_distance_' + origin_name, distanceEncoderColumn, origin_coords)
    for origin_name, origin_coords in distanceOrigin
]

# Size-mean encoders: one per geographic grouping column ('geo_barrio', 'geo_distrito').
sizeMeanEncoderColumn = 'ide_size'
sizeMeanGroup = ['barrio', 'distrito']
sizeMeanEncoders = [
    SizeMeanEncoder('fe_mean_size_' + group_name, sizeMeanEncoderColumn, 'geo_' + group_name)
    for group_name in sizeMeanGroup
]

# Range encoders for latitude and longitude.
# NOTE(review): the third argument (10) is presumably the number of bins - confirm in CustomTransform.
rangeEncoders = [
    RangeEncoder('fe_latitude_bins', 'ide_latitude', 10),
    RangeEncoder('fe_longitude_bins', 'ide_longitude', 10),
]

# Custom KNN imputer
imputer = CustomKNNImputer(n_neighbors = 5, weights = 'distance')

# One-hot encode the categorical columns; every other column is passed through unchanged.
categorical_features = ['geo_distrito', 'geo_barrio', 'fe_latitude_bins', 'fe_longitude_bins']
oneHotColumnTransformer = ColumnTransformer(
    [("oneHotEncoder", OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder = 'passthrough',
)

## Instanciación de las clases de los modelos

In [5]:
# Base estimators placed at the end of each pipeline.
# Both algorithms are stochastic (bootstrap / row and column subsampling), so a
# fixed random_state is set to make repeated runs of the notebook comparable.
randomForestRegresor = RandomForestRegressor(random_state = 42)
xgboostRegressor = XGBRegressor(random_state = 42)

## Creación de los pipelines

In [6]:
# Pipelines
def _build_pipeline(model):
    """Return a Pipeline made of the shared preprocessing steps followed by ``model``.

    The step names ('Encoder1' ... 'Encoder8', 'Imputer1', 'Model') are part of
    the contract: the hyper-parameter grids address the estimator through the
    'Model__<param>' prefix.

    The transformer instances are the module-level ones and are therefore shared
    by every pipeline built here, exactly as in the original hand-written lists.
    """
    return Pipeline([
        ('Encoder1', distanceEncoder[0]),
        ('Encoder2', distanceEncoder[1]),
        ('Encoder3', distanceEncoder[2]),
        ('Encoder4', sizeMeanEncoders[0]),
        ('Encoder5', sizeMeanEncoders[1]),
        # The range encoders are named after the 'fe_*_bins' columns that the
        # one-hot ColumnTransformer lists as categorical, so they come first.
        ('Encoder6', rangeEncoders[0]),
        ('Encoder7', rangeEncoders[1]),
        ('Imputer1', imputer),
        ('Encoder8', oneHotColumnTransformer),
        ('Model', model)])


RandomForestRegressor_pipeline = _build_pipeline(randomForestRegresor)

# Fix: this pipeline previously ended in randomForestRegresor, so XGBoost was
# never trained and the XGBoost grid keys were applied to a RandomForest.
xgboostRegressor_pipeline = _build_pipeline(xgboostRegressor)

"xgboostRegressor_pipeline = Pipeline([\n                     ('Encoder1', distanceEncoder[0]),\n                     ('Encoder2', distanceEncoder[1]),\n                     ('Encoder3', distanceEncoder[2]),\n                     ('Encoder4', sizeMeanEncoders[0]),\n                     ('Encoder5', sizeMeanEncoders[1]),\n                     ('Encoder6', rangeEncoders[0]),\n                     ('Encoder7', rangeEncoders[1]),\n                     ('Imputer1', imputer),\n                     ('Encoder8', oneHotColumnTransformer),\n                     ('Model', randomForestRegresor)])"

## Creación de los grid para el gridSearch

Es necesario definir los grids de la siguiente manera para poder generar una serie semialeatoria de combinaciones a partir de ellos:

In [7]:
# RandomForestRegressor search space.
# Keys use the 'Model__' prefix so they reach the pipeline step named 'Model'.
randomForestRegressor_n_estimators = [int(x) for x in np.linspace(start = 200, stop = 4000, num = 10)]
# NOTE(review): 'auto' is rejected by recent scikit-learn releases - confirm against the installed version.
randomForestRegressor_max_features = ['auto', 'sqrt','log2']
# Depth is left unconstrained. A 5..110 candidate list used to be built here but
# was immediately overwritten by [None], so that dead computation was dropped.
randomForestRegressor_max_depth = [None]
randomForestRegressor_min_samples_split = [2, 5, 10]
randomForestRegressor_min_samples_leaf = [1, 2, 4]
randomForestRegressor_bootstrap = [True]

RandomForestRegressor_grid = {'Model__n_estimators': randomForestRegressor_n_estimators,
                                  'Model__max_features': randomForestRegressor_max_features,
                                  'Model__max_depth': randomForestRegressor_max_depth,
                                  'Model__min_samples_split': randomForestRegressor_min_samples_split,
                                  'Model__min_samples_leaf': randomForestRegressor_min_samples_leaf,
                                  'Model__bootstrap': randomForestRegressor_bootstrap}

# XGBoostRegressor search space.
# Integer-valued parameters are truncated with int(); fractional parameters
# (subsample, colsample_bytree, eta) must stay floats. Casting them with int()
# collapsed every candidate to 0 (or 1 at the upper endpoint), which is outside
# the valid (0, 1] range for the sampling ratios and disables learning for eta.
xgboostRegressor_n_estimators = [int(x) for x in np.linspace(start = 200, stop = 4000, num = 10)]
xgboostRegressor_subsample = np.linspace(start = 0.4, stop = 1, num = 10).tolist()
xgboostRegressor_max_depth = [int(x) for x in np.linspace(10, 120, num = 10)]
xgboostRegressor_colsample_bytree = np.linspace(start = 0.4, stop = 1, num = 10).tolist()
xgboostRegressor_eta = np.linspace(start = 0.0001, stop = 0.5, num = 5).tolist()

xgboostRegressor_grid = {'Model__n_estimators': xgboostRegressor_n_estimators,
               'Model__eta': xgboostRegressor_eta,
               'Model__max_depth': xgboostRegressor_max_depth,
               'Model__subsample': xgboostRegressor_subsample,
               'Model__colsample_bytree': xgboostRegressor_colsample_bytree}

In [8]:
# Registry handed to bestModel: algorithm name -> [pipeline, hyper-parameter grid].
models_dict = {}
models_dict["RandomForestRegressor"] = [RandomForestRegressor_pipeline, RandomForestRegressor_grid]
models_dict["xgboostRegressor"] = [xgboostRegressor_pipeline, xgboostRegressor_grid]

## Llamada al creador de modelos

In [9]:
# Run the model search over every entry of models_dict using the train/test splits.
# bestModel comes from auxModelFunctions; judging by the output captured under this
# cell it tries random grid combinations, then fits a grid centred on the best ones,
# prints a "Model Performance" summary and returns a list of fitted pipelines.
# NOTE(review): that description is inferred from the printed log only - confirm in auxModelFunctions.
bestModel(models_dict, X_train, y_train, X_test, y_test)

Probando algoritmos:   0%|          | 0/1 [00:00<?, ?it/s]

probando 1 combinaciones aleatorias de RandomForestRegressor
Fitting 5 folds for each of 1 candidates, totalling 5 fits




Centrando los mejores modelos aleatorios:   0%|          | 0/1 [00:00<?, ?it/s]

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Model Performance
Accuracy = 83.52%.


[Pipeline(steps=[('Encoder1',
                  DistanceEncoder(new_columns='fe_distance_sol',
                                  origin=[40.41465, -3.7004],
                                  transformed_columns=['ide_latitude',
                                                       'ide_longitude'])),
                 ('Encoder2',
                  DistanceEncoder(new_columns='fe_distance_nmi',
                                  origin=[40.446278, -3.691814],
                                  transformed_columns=['ide_latitude',
                                                       'ide_longitude'])),
                 ('Encoder3',
                  DistanceEncoder(new_columns='fe_distance_...
                               transformed_columns='ide_longitude')),
                 ('Imputer1',
                  CustomKNNImputer(n_neighbors=5, weights='distance')),
                 ('Encoder8',
                  ColumnTransformer(remainder='passthrough',
                                   