# Valuación de inmuebles - Navent

In [1]:
import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 
import datetime
import category_encoders as ce 
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import mean_absolute_error
import re
#GEOCODER
from geopy.geocoders import Nominatim
import ssl
import certifi
import geopy.geocoders
from geopy.exc import GeocoderTimedOut
#------------
from sklearn.impute import KNNImputer
from IPython.display import clear_output

In [None]:
xgb.XGBRegressor()

# Preprocesamiento

In [3]:
#df = pd.read_csv('./train.csv', index_col='id')
df = pd.read_csv('./train_data_geocoded/train-geocoded-imputed.csv', index_col='id')

In [4]:
df.head(2)

Unnamed: 0_level_0,tipodepropiedad,ciudad,provincia,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,gimnasio,usosmultiples,piscina,precio,ano,cp,zona,asenta
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
254099,Apartamento,Benito Juárez,Distrito Federal,,2.0,1.0,2.0,80.0,80.0,0.0,0.0,0.0,2273000.0,2015,3103.0,Urbano,Colonia
53461,Casa en condominio,La Magdalena Contreras,Distrito Federal,10.0,3.0,2.0,2.0,268.0,180.0,0.0,0.0,0.0,3600000.0,2013,10710.0,Urbano,Colonia


## Clases de pipeline

In [11]:
# Dropear columnas que no se usan
class DropFeatures( BaseEstimator, TransformerMixin ):
    
    def __init__( self, features_to_drop ):
        self._features_to_drop = features_to_drop 
      
    def fit( self, X, y = None ):
        return self 
    
    def transform( self, X, y = None ):
        return X.drop(self._features_to_drop,axis=1 )

In [12]:
#Tomo el año de las fechas
class DateTransformer( BaseEstimator, TransformerMixin ):
      
    def fit( self, X, y = None ):
        return self 
    
    def transform( self, X, y = None ):
        X['ano'] = X.fecha.dt.year
        X = X.drop('fecha',axis=1)
        return X

In [13]:
#convertir lat y lng en latlng
class LatLngGenerator( BaseEstimator, TransformerMixin ):
      
    def fit( self, X, y = None ):
        return self 
    
    def transform( self, X, y = None ):
        X['latlng'] = X.apply(lambda row: '{},{}'.format(row['lat'], row['lng']), axis=1)
        X = X.drop(['lat','lng'],axis=1)
        return X

In [14]:
class FullAddressGenerator( BaseEstimator, TransformerMixin ):
      
    def __init__( self ):
        self.ctx = ssl.create_default_context(cafile=certifi.where())
        geopy.geocoders.options.default_ssl_context = self.ctx
        self.geolocator = Nominatim(user_agent="jupytercolab")
        
    def do_reverse_geocode(self, latlng, attempt=1, max_attempts=20):
        try:
            return self.geolocator.reverse(latlng)
        except GeocoderTimedOut:
            if attempt <= max_attempts:
                print('\nTIMEOUT EXCEPT. ATTEMPT N'+str(attempt)+'\n')
                return self.do_reverse_geocode(latlng, attempt=attempt+1)
            raise
            
    def fit( self, X, y = None ):
        return self 
    
    def transform( self, X, y = None ):
        processed_count = 0
        
        for index, row in X.iterrows():
            if(row.latlng != 'nan,nan'):
                try:
                    address_aux = self.do_reverse_geocode(row.latlng).address
                    X.loc[index,'full_address'] = address_aux 
                except:
                    print('Except!!')
            elif row.latlng == 'nan,nan':
                X.loc[index,'full_address'] = np.nan
            processed_count += 1
            print('--FullAddressGenerator for index '+str(index)+' done. -- ('+str(processed_count)+' of '+str(X.shape[0])+') processed records')
            if processed_count % 20 == 0:
                clear_output(wait=True)
            if processed_count % 10000 == 0:
                X.to_csv('backup_FullAddressGenerator_'+str(processed_count)+'.csv')     
        X.to_csv('train_data_with_fulladdress.csv') 
        print('Data backup saved at ./Data/train_data_with_fulladdress.csv')
        return X

In [15]:
class FullAddressImputer( BaseEstimator, TransformerMixin ):
      
    def __init__( self ):
        self.ctx = ssl.create_default_context(cafile=certifi.where())
        geopy.geocoders.options.default_ssl_context = self.ctx
        self.geolocator = Nominatim(user_agent="jupyterc")
        
    def do_geocode(self, address, attempt=1, max_attempts=20):
        try:
            add = self.geolocator.geocode(address)
            if add != None:
                return add.address
            return np.nan
        except GeocoderTimedOut:
            if attempt <= max_attempts:
                print('\n\n\nERROR TIMEOUT N'+str(attempt)+'\n\n')
                return self.do_geocode(address, attempt=attempt+1)
            raise    
        
    def fit( self, X, y = None ):
        return self 
    
    def transform( self, X, y = None ):
        processed_count = 0
        new_addresses_found = 0
        for index, row in X.iterrows():
            if (str(X.loc[index,'full_address']) == 'nan') & (str(X.loc[index,'direccion']) != 'nan'): #Si no tiene full_address pero tiene address..
                row_address = row.direccion
                if str(X.loc[index,'provincia']) != 'nan':
                    row_address += ' '
                    row_address += str(X.loc[index,'provincia'])
                if str(X.loc[index,'ciudad']) != 'nan':
                    row_address += ' '
                    row_address += str(X.loc[index,'ciudad'])
                row_address += ' MX'
                address_aux = self.do_geocode(row_address)
                if address_aux != np.nan:
                    new_addresses_found +=1
                X.loc[index,'full_address'] = address_aux
            processed_count += 1
            print('--FullAddressImputer for index '+str(index)+' done. -- ('+str(processed_count)+' of '+str(X.shape[0])+')')
            if processed_count % 10000 == 0:
                X.to_csv('backup_FullAddressImputer_'+str(processed_count)+'.csv') 
            if processed_count % 20 == 0:
                clear_output(wait=True)
        X.to_csv('train_data_with_fulladdress.csv') 
        print('Data backup saved at ./Data/train_data_with_fulladdress.csv')
        print('FullAddressImputer found '+str(new_addresses_found)+' new addresses.')
        return X

In [16]:
class CpGenerator( BaseEstimator, TransformerMixin ):
      
    def fit( self, X, y = None ):
        return self 
    
    def transform( self, X, y = None ):
        X['cp'] = [re.findall('\D(\d{5})\D',' '+str(cp)+' ')[0] if len(re.findall('\D(\d{5})\D',' '+str(cp)+' ')) == 1 else np.nan for cp in X.full_address]
        return X

In [17]:
#Encoder para las categorias
class CategoryEncoder( BaseEstimator, TransformerMixin ):
      
    def fit( self, X, y = None ):
        self.ciudad_te = ce.TargetEncoder().fit(X['ciudad'], X['precio'])
        self.tipo_te = ce.TargetEncoder().fit(X['tipodepropiedad'], X['precio'])
        self.prov_te = ce.TargetEncoder().fit(X['provincia'], X['precio'])
        self.zona_te = ce.TargetEncoder().fit(X['zona'], X['precio'])
        self.asenta_te = ce.TargetEncoder().fit(X['asenta'], X['precio'])
        return self 
    
    def transform( self, X, y = None ):
        X['ciudad'] = self.ciudad_te.transform(X['ciudad'])
        X['tipodepropiedad'] = self.tipo_te.transform(X['tipodepropiedad'])
        X['provincia'] = self.prov_te.transform(X['provincia'])
        X['zona'] = self.zona_te.transform(X['zona'])
        X['asenta'] = self.asenta_te.transform(X['asenta'])
        return X

In [18]:
class ZonaAndAsentaGenerator( BaseEstimator, TransformerMixin ):
    
    def __init__( self ):
        self.cp_df = pd.concat(pd.read_excel('./CPdescarga.xls', sheet_name=None), ignore_index=True)
        self.cp_df = self.cp_df[['d_codigo','d_zona','d_tipo_asenta']].dropna()
        self.cp_df[['d_codigo']] = self.cp_df.d_codigo.astype(int)
        self.cp_df[['d_codigo']] = self.cp_df.d_codigo.astype(object)
    
    def find_by_cp(self, cp):
        cp = float(cp)
        row = self.cp_df[self.cp_df.d_codigo == cp] 
        
        if row.shape[0] == 0: #si no encuentra con este cp
            count = 1
            while row.shape[0] == 0 and count<6: #empiezo a probar con cps restando 1 (5 intentos)
                row = self.cp_df[self.cp_df.d_codigo == (cp-count)] 
                count +=1
            if row.shape[0] == 0: #termino el while y no lo encontró..
                row = np.nan
        
        if type(row) == float:
            zona = np.nan
            tipo_asenta = np.nan
        elif row.shape[0] > 0:
            zona = row.mode().iloc[0,1]
            tipo_asenta = row.mode().iloc[0,2]
        elif row == np.nan:
            zona = np.nan
            tipo_asenta = np.nan
            
        data = [zona, tipo_asenta]
        return data
    
    def fit( self, X, y = None ):
        return self 
    
    def transform( self, X, y = None ):
        X['zona'] = np.nan
        X['asenta'] = np.nan
        processed_count = 0
        for index, row in X.iterrows():
            cp = X.loc[index,'cp']
            if str(cp) != 'nan':
                data = self.find_by_cp(cp)
                X.loc[index,'zona'] = data[0]
                X.loc[index,'asenta'] = data[1]
                processed_count += 1
                print('ZonaAndAsentaGenerator imputed'+str(processed_count)+ ' zonas and asentas') 
                if processed_count % 10 == 0:
                  clear_output(wait=True)
        X.to_csv('train_data_with_fulladdress.csv') 
        print('Data backup saved at ./Data/train_data_with_fulladdress.csv')
        return X

In [19]:
#Iterative imputer en los valores faltantes

class MissingValuesImputer( BaseEstimator, TransformerMixin ):
      
    def __init__( self, features_to_impute ):
        self._features_to_impute = features_to_impute
        
    def fit( self, X, y = None ):
        return self 
    
    def transform( self, X, y = None ):
        imp = IterativeImputer(missing_values=np.nan, max_iter=30, random_state=42)
        X[features_to_impute] = imp.fit_transform(X[features_to_impute])
        return X

In [20]:
class CpImputer( BaseEstimator, TransformerMixin ):
      
    def fit( self, X, y = None ):
        return self 
    
    def impute_by_idzona(self, X):
        imputed_count = 0
        for index, row in X.iterrows():
            if (str(X.loc[index,'cp'])=='nan') & (str(X.loc[index,'idzona'])!='nan'):
                id_zona = X.loc[index,'idzona']
                idzona_most_freq_cp_df = X[(X.idzona == id_zona) & (~X.cp.isna())]
                if idzona_most_freq_cp_df.shape[0] > 1:
                    cp = idzona_most_freq_cp_df.mode()['cp'][0]
                    imputed_count += 1
                    X.loc[index,'cp'] = cp
        print('\n------\nCP Imputer found '+str(imputed_count)+' new CPs by idzona\n------\\n')
        return X
        
    def impute_by_ciudad(self, X):
        imputed_count = 0
        for index, row in X.iterrows():
            if (str(X.loc[index,'cp'])=='nan') & (str(X.loc[index,'ciudad'])!='nan'):
                ciudad = X.loc[index,'ciudad']
                ciudad_most_freq_cp_df = X[(X.ciudad == ciudad) & (~X.cp.isna())]
                if ciudad_most_freq_cp_df.shape[0] > 1:
                    cp = ciudad_most_freq_cp_df.mode()['cp'][0]
                    imputed_count += 1
                    X.loc[index,'cp'] = cp
        print('\n------\nCP Imputer found '+str(imputed_count)+' new CPs by ciudad\n------\\n')
        return X
    
    def impute_by_provincia(self, X):
        imputed_count = 0
        for index, row in X.iterrows():
            if (str(X.loc[index,'cp'])=='nan') & (str(X.loc[index,'provincia'])!='nan'):
                provincia = X.loc[index,'provincia']
                provincia_most_freq_cp_df = X[(X.provincia == provincia) & (~X.cp.isna())]
                if provincia_most_freq_cp_df.shape[0] > 1:
                    cp = provincia_most_freq_cp_df.mode()['cp'][0]
                    imputed_count += 1
                    X.loc[index,'cp'] = cp
        print('\n------\nCP Imputer found '+str(imputed_count)+' new CPs by provincia\n------\\n')
        return X
    
    def transform( self, X, y = None ):
        X = self.impute_by_idzona(X)
        X = self.impute_by_ciudad(X)
        X = self.impute_by_provincia(X)
        X.to_csv('train_data_with_fulladdress.csv') 
        print('Data backup saved at ./Data/train_data_with_fulladdress.csv')
        return X

In [21]:
class ZonaAndAsentaGenerator( BaseEstimator, TransformerMixin ):
    
    def __init__( self ):
        self.cp_df = pd.concat(pd.read_excel('./CPdescarga.xls', sheet_name=None), ignore_index=True)
        self.cp_df = self.cp_df[['d_codigo','d_zona','d_tipo_asenta']].dropna()
        self.cp_df[['d_codigo']] = self.cp_df.d_codigo.astype(int)
        self.cp_df[['d_codigo']] = self.cp_df.d_codigo.astype(object)
    
    def find_by_cp(self, cp):
        cp = float(cp)
        row = self.cp_df[self.cp_df.d_codigo == cp] 
        
        if row.shape[0] == 0: #si no encuentra con este cp
            count = 1
            while row.shape[0] == 0 and count<6: #empiezo a probar con cps restando 1 (5 intentos)
                row = self.cp_df[self.cp_df.d_codigo == (cp-count)] 
                count +=1
            if row.shape[0] == 0: #termino el while y no lo encontró..
                row = np.nan
        
        if type(row) == float:
            zona = np.nan
            tipo_asenta = np.nan
        elif row.shape[0] > 0:
            zona = row.mode().iloc[0,1]
            tipo_asenta = row.mode().iloc[0,2]
        elif row == np.nan:
            zona = np.nan
            tipo_asenta = np.nan
            
        data = [zona, tipo_asenta]
        return data
    
    def fit( self, X, y = None ):
        return self 
    
    def transform( self, X, y = None ):
        X['zona'] = np.nan
        X['asenta'] = np.nan
        processed_count = 0
        for index, row in X.iterrows():
            cp = X.loc[index,'cp']
            if str(cp) != 'nan':
                data = self.find_by_cp(cp)
                X.loc[index,'zona'] = data[0]
                X.loc[index,'asenta'] = data[1]
                processed_count += 1
                print('ZonaAndAsentaGenerator imputed'+str(processed_count)+ ' zonas and asentas') 
                if processed_count % 10 == 0:
                  clear_output(wait=True)
        X.to_csv('train_data_with_fulladdress.csv') 
        print('Data backup saved at ./Data/train_data_with_fulladdress.csv')
        return X

In [22]:
drop_features_to_begin = ['titulo', 'descripcion','centroscomercialescercanos','escuelascercanas']

features_to_impute = ['gimnasio', 'usosmultiples', 'piscina', 'tipodepropiedad','ciudad','provincia',
                      'habitaciones','garages','banos','metroscubiertos','ano','metrostotales','antiguedad','cp','zona','asenta']

drop_features_to_finish = ['latlng','direccion','idzona','full_address']

before_split_pipeline = Pipeline( steps = [  ('feature_selector', DropFeatures(drop_features_to_begin)),
                                             ('date_transformer',DateTransformer()),
                                             ('latlng_generator',LatLngGenerator()),
                                             ('get_full_address', FullAddressGenerator()),
                                             ('impute_full_address_by_direccion', FullAddressImputer()),
                                             ('get_cp_from_full_address', CpGenerator()),
                                             ('impute_cp_from_idzona_ciudad_provincia',CpImputer()),
                                             ('get_zona_asenta_from_cp',ZonaAndAsentaGenerator()),
                                             ('drop', DropFeatures(drop_features_to_finish))
                                           ] )

after_split_pipeline = Pipeline( steps =   [ ('category_encoder',CategoryEncoder()),
                                             ('missing_values_imputer',MissingValuesImputer(features_to_impute)),
                                           ] )

In [36]:
%%time
#df_preprocessed = before_split_pipeline.fit_transform(df)

ZonaAndAsentaGenerator imputed239891 zonas and asentas
ZonaAndAsentaGenerator imputed239892 zonas and asentas
ZonaAndAsentaGenerator imputed239893 zonas and asentas
ZonaAndAsentaGenerator imputed239894 zonas and asentas
ZonaAndAsentaGenerator imputed239895 zonas and asentas
ZonaAndAsentaGenerator imputed239896 zonas and asentas
ZonaAndAsentaGenerator imputed239897 zonas and asentas
ZonaAndAsentaGenerator imputed239898 zonas and asentas
Data backup saved at ./Data/train_data_with_fulladdress.csv
CPU times: user 2h 15min 44s, sys: 1min 15s, total: 2h 16min 59s
Wall time: 2h 14min 37s


In [333]:
#df_preprocessed

Unnamed: 0_level_0,tipodepropiedad,ciudad,provincia,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,gimnasio,usosmultiples,piscina,precio,ano,cp,zona,asenta
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
254099,Apartamento,Benito Juárez,Distrito Federal,,2.0,1.0,2.0,80.0,80.0,0.0,0.0,0.0,2273000.0,2015,03103,Urbano,Colonia
53461,Casa en condominio,La Magdalena Contreras,Distrito Federal,10.0,3.0,2.0,2.0,268.0,180.0,0.0,0.0,0.0,3600000.0,2013,10710,Urbano,Colonia
247984,Casa,Tonalá,Jalisco,5.0,3.0,2.0,2.0,144.0,166.0,0.0,0.0,0.0,1200000.0,2015,45410,Urbano,Fraccionamiento
209067,Casa,Zinacantepec,Edo. de México,1.0,2.0,1.0,1.0,63.0,67.0,0.0,0.0,0.0,650000.0,2012,50100,Urbano,Colonia
185997,Apartamento,Zapopan,Jalisco,10.0,2.0,1.0,1.0,95.0,95.0,0.0,0.0,0.0,1150000.0,2016,45079,Urbano,Colonia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,Casa,Zinacantepec,Edo. de México,0.0,2.0,2.0,1.0,67.0,,0.0,0.0,0.0,650000.0,2015,51355,Rural,Colonia
259178,Casa,Toluca,Edo. de México,0.0,3.0,3.0,3.0,200.0,250.0,0.0,0.0,0.0,1940000.0,2014,50100,Urbano,Colonia
131932,Apartamento,Benito Juárez,Distrito Federal,20.0,2.0,1.0,2.0,138.0,138.0,0.0,0.0,0.0,3400000.0,2015,03100,Urbano,Colonia
146867,Casa,Iztapalapa,Distrito Federal,20.0,4.0,0.0,4.0,235.0,137.0,1.0,0.0,0.0,2890000.0,2014,08500,Urbano,Colonia


In [23]:
df_preprocessed = df.copy()

In [24]:
#Split en train y test
train_df, test_df = train_test_split(df_preprocessed, test_size=0.1)

In [25]:
after_split_pipeline.fit(train_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the do

Pipeline(memory=None,
         steps=[('category_encoder', CategoryEncoder()),
                ('missing_values_imputer',
                 MissingValuesImputer(features_to_impute=None))],
         verbose=False)

In [26]:
preproc_train_df = after_split_pipeline.transform(train_df)
preproc_test_df = after_split_pipeline.transform(test_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the do

In [27]:
X_train = preproc_train_df.drop('precio',axis=1)
X_test = preproc_test_df.drop('precio',axis=1)

y_train = preproc_train_df['precio']
y_test = preproc_test_df['precio']

In [386]:
X_train

Unnamed: 0_level_0,tipodepropiedad,ciudad,provincia,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,gimnasio,usosmultiples,piscina,ano,cp,zona,asenta
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
259327,2.528136e+06,2.528136e+06,2.528136e+06,6.000000,3.000000,1.000000,2.000000,158.000000,374.000000,0.0,0.0,0.0,2012.0,53100.0,2.528136e+06,2.528136e+06
288516,2.528136e+06,2.528136e+06,2.528136e+06,0.000000,2.571881,0.000000,1.911815,153.528772,200.000000,0.0,0.0,1.0,2013.0,76100.0,2.528136e+06,2.528136e+06
169665,2.528136e+06,2.528136e+06,2.528136e+06,5.000000,3.580390,0.000000,3.000000,330.000000,297.404927,0.0,0.0,0.0,2015.0,91020.0,2.528136e+06,2.528136e+06
148936,2.528136e+06,2.528136e+06,2.528136e+06,9.747993,2.000000,1.000000,1.000000,60.000000,78.563817,0.0,0.0,0.0,2016.0,14426.0,2.528136e+06,2.528136e+06
59299,2.528136e+06,2.528136e+06,2.528136e+06,0.000000,3.000000,1.547884,2.000000,160.000000,90.000000,0.0,0.0,0.0,2016.0,76128.0,2.528136e+06,2.528136e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282940,2.528136e+06,2.528136e+06,2.528136e+06,0.000000,3.000000,2.000000,2.000000,163.000000,300.000000,0.0,0.0,0.0,2015.0,34045.0,2.528136e+06,2.528136e+06
57173,2.528136e+06,2.528136e+06,2.528136e+06,5.000000,3.000000,2.000000,2.000000,180.000000,200.000000,0.0,0.0,0.0,2012.0,72520.0,2.528136e+06,2.528136e+06
245495,2.528136e+06,2.528136e+06,2.528136e+06,4.000000,3.000000,1.000000,2.000000,120.000000,144.000000,0.0,0.0,0.0,2014.0,29050.0,2.528136e+06,2.528136e+06
207347,2.528136e+06,2.528136e+06,2.528136e+06,0.000000,4.000000,3.000000,3.321797,255.000000,300.000000,0.0,0.0,1.0,2015.0,97117.0,2.528136e+06,2.528136e+06


# Modelos

## XGB 

In [57]:
xgbr = xgb.XGBRegressor(learning_rate=0.03,max_depth=8, n_estimators=1000, n_jobs=-1, objective ='reg:squarederror')
xgbr.fit(X_train,y_train)
xgbr_test_pred = xgbr.predict(X_test)
xgbr_train_pred = xgbr.predict(X_train)

In [58]:
mean_absolute_error(y_test,xgbr_test_pred)

611006.1909361979

In [59]:
mean_absolute_error(y_train,xgbr_train_pred)

528701.018723452

# Extra trees regressor

In [28]:
from sklearn.tree import ExtraTreeRegressor

etr = ExtraTreeRegressor(max_depth=1200, min_samples_split=51).fit(X_train, y_train)

In [54]:
etr_test_pred = etr.predict(X_test)
etr_train_pred = etr.predict(X_train)

In [55]:
mean_absolute_error(y_test,etr_test_pred)

792993.8495381813

In [56]:
mean_absolute_error(y_train,etr_train_pred)

701017.4850051504

# GradientBoostingRegressor

In [33]:
from sklearn.ensemble import GradientBoostingRegressor

In [60]:
gbr = GradientBoostingRegressor(max_depth=12, n_estimators=500)

In [61]:
gbr.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=12,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=500,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [62]:
gbr_test_pred = gbr.predict(X_test)
gbr_train_pred = gbr.predict(X_train)

In [63]:
mean_absolute_error(y_test,gbr_test_pred)

597575.1309355508

In [64]:
mean_absolute_error(y_train,gbr_train_pred)

261586.88867745418

# RandomForestRegressor

In [39]:
from sklearn.ensemble import RandomForestRegressor

In [40]:
random_forest = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=35, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=20,
                      min_samples_split=20, min_weight_fraction_leaf=0.0,
                      n_estimators=1000, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [41]:
random_forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=35, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=20,
                      min_samples_split=20, min_weight_fraction_leaf=0.0,
                      n_estimators=1000, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [42]:
rf_test_pred = random_forest.predict(X_test)
rf_train_pred = random_forest.predict(X_train)

In [43]:
mean_absolute_error(y_test,rf_test_pred)

669199.9698723967

In [44]:
mean_absolute_error(y_train,rf_train_pred)

600949.4038571818

# KNN

In [45]:
from sklearn.neighbors import KNeighborsRegressor

In [65]:
knn = KNeighborsRegressor(n_neighbors=100).fit(X_train,y_train)

In [66]:
knn_train_pred = knn.predict(X_train)
knn_test_pred = knn.predict(X_test)

In [67]:
mean_absolute_error(y_test,knn_test_pred)

769481.0670179167

In [68]:
mean_absolute_error(y_train,knn_train_pred)

765334.5002345371

# StackingRegressor

In [69]:
from sklearn.ensemble import StackingRegressor

In [70]:
estimators = [('xgbr',xgbr),
              ('gbr',gbr),
              ('etr',etr),
              ('knn',knn),
              ('random_forest',random_forest)]

In [71]:
final = xgb.XGBRegressor(learning_rate=0.01,max_depth=3, n_estimators=500, n_jobs=-1, objective ='reg:squarederror')

In [72]:
stacking_reg = StackingRegressor(estimators=estimators, final_estimator=final )

In [73]:
stacking_reg.fit(X_train, y_train)

StackingRegressor(cv=None,
                  estimators=[('xgbr',
                               XGBRegressor(base_score=0.5, booster=None,
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=1, gamma=0,
                                            gpu_id=-1, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=0.03,
                                            max_delta_step=0, max_depth=8,
                                            min_child_weight=1, missing=nan,
                                            monotone_constraints=None,
                                            n_estimators=1000, n_jobs=-1,
                                            nu...
                                               max_delta_step=None, max_depth=3,
             

In [74]:
s_test_pred = stacking_reg.predict(X_test)
s_train_pred = stacking_reg.predict(X_train)

In [75]:
mean_absolute_error(y_test,rf_test_pred)

669199.9698723967

In [76]:
mean_absolute_error(y_train,rf_train_pred)

600949.4038571818

## NN con Entity embeddings

In [None]:
from keras.layers import Dense, Dropout, Embedding, Input, Reshape, Concatenate
from keras.models import Model
import keras

In [None]:
X_train.columns

In [None]:
x_train_nn = X_train.drop(["ciudad","metrostotales"],axis=1)
x_val_nn = X_test.drop(["ciudad","metrostotales"],axis=1)

In [None]:
cat_vars = ['tipodepropiedad','provincia','gimnasio','usosmultiples', 'piscina', 'zona', 'asenta']
cont_vars = ['antiguedad', 'habitaciones','garages', 'banos', 'metroscubiertos','ano']

In [None]:
scaler = StandardScaler().fit(x_train_nn[cont_vars])

In [None]:
x_train_nn[cont_vars] = scaler.transform(x_train_nn[cont_vars])
x_val_nn[cont_vars] = scaler.transform(x_val_nn[cont_vars])

In [None]:
from sklearn import preprocessing
le_tipodepropiedad = preprocessing.LabelEncoder().fit(x_train_nn['tipodepropiedad'])
le_provincia = preprocessing.LabelEncoder().fit(x_train_nn['provincia'])
le_zona = preprocessing.LabelEncoder().fit(x_train_nn['zona'])
le_asenta = preprocessing.LabelEncoder().fit(x_train_nn['asenta'])

In [None]:
x_train_nn['tipodepropiedad'] = le_tipodepropiedad.transform(x_train_nn['tipodepropiedad'])
x_val_nn['tipodepropiedad'] = le_tipodepropiedad.transform(x_val_nn['tipodepropiedad'])

x_train_nn['provincia'] = le_provincia.transform(x_train_nn['provincia'])
x_val_nn['provincia'] = le_provincia.transform(x_val_nn['provincia'])

x_train_nn['zona'] = le_zona.transform(x_train_nn['zona'])
x_val_nn['zona'] = le_zona.transform(x_val_nn['zona'])

x_train_nn['asenta'] = le_asenta.transform(x_train_nn['asenta'])
x_val_nn['asenta'] = le_asenta.transform(x_val_nn['asenta'])

In [None]:
x_train = []
x_val = []
x_train.append(x_train_nn[cont_vars].astype('float32').values)
x_val.append(x_val_nn[cont_vars].astype('float32').values)
for cat in cat_vars:
    x_train.append(x_train_nn[cat].values)
    x_val.append(x_val_nn[cat].values)

In [None]:
cat_sizes = {}
cat_embsizes = {}
for cat in cat_vars:
    cat_sizes[cat] = x_train_nn[cat].nunique()
    cat_embsizes[cat] = min(50, cat_sizes[cat]//2+1)

In [None]:
ins = []
concat = []

In [None]:
y = Dropout(0.4, input_shape=(len(cat_vars)+len(cont_vars),))
y = Input((len(cont_vars),), name='cont_vars')
ins.append(y)
concat.append(y)

In [None]:
for cat in cat_vars:
    x = Input((1,), name=cat)
    ins.append(x)
    x = Embedding(cat_sizes[cat]+1, cat_embsizes[cat], input_length=1)(x)
    x = Reshape((cat_embsizes[cat],))(x)
    concat.append(x)

In [None]:
y = Concatenate()(concat)
y = Dense(512, activation= 'relu')(y)
y = Dense(512, activation= 'relu')(y)
y = Dense(512, activation= 'relu')(y)
y = Dense(512, activation= 'relu')(y)
y = Dense(1)(y)
model = Model(ins, y)
model.compile('adam', 'mean_absolute_error')

In [None]:
model.summary ()

In [None]:
from keras.utils import plot_model
plot_model(model, to_file='model.png')

In [None]:
model.fit(x_train, y_train, 
          batch_size= 1024, 
          epochs = 100, 
          validation_data=(x_val, y_test),
          callbacks=[keras.callbacks.EarlyStopping(patience=10)])

# Submission


In [77]:
holdout_df = pd.read_csv('./test_data_geocoded/test-geocoded-imputed.csv')
#holdout_df = pd.read_csv('./test.csv',parse_dates=["fecha"],index_col="id")

In [78]:
#holdout_df = before_split_pipeline.transform(holdout_df)

In [79]:
holdout_df = after_split_pipeline.transform(holdout_df)

In [80]:
holdout_df

Unnamed: 0,tipodepropiedad,ciudad,provincia,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,gimnasio,usosmultiples,piscina,ano,cp,zona,asenta
0,2.531417e+06,2.531417e+06,2.531417e+06,29.000000,3.0,2.1268,4.0,300.0,269.902704,0.0,0.0,0.0,2013.0,11700.0,2.561498e+06,2.591994e+06
1,2.531417e+06,2.531417e+06,2.531417e+06,1.088686,1.0,1.0000,1.0,67.0,67.000000,0.0,0.0,0.0,2015.0,97117.0,2.561498e+06,2.591994e+06
2,2.531417e+06,2.531417e+06,2.531417e+06,0.000000,2.0,1.0000,2.0,87.0,100.000000,0.0,0.0,0.0,2015.0,4369.0,2.561498e+06,2.591994e+06
3,2.531417e+06,2.531417e+06,2.531417e+06,2.000000,2.0,2.0000,2.0,86.0,86.000000,0.0,0.0,0.0,2015.0,39300.0,2.561498e+06,2.232977e+06
4,2.531417e+06,2.531417e+06,2.531417e+06,10.000000,2.0,1.0000,1.0,80.0,76.000000,0.0,0.0,0.0,2013.0,55717.0,2.561498e+06,1.578973e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,2.531417e+06,2.531417e+06,2.531417e+06,20.000000,4.0,3.0000,3.0,291.0,247.511873,0.0,0.0,0.0,2015.0,15540.0,2.561498e+06,2.591994e+06
59996,2.531417e+06,2.531417e+06,2.531417e+06,10.000000,3.0,1.0000,2.0,71.0,87.000000,0.0,0.0,0.0,2016.0,55240.0,2.561498e+06,2.591994e+06
59997,2.531417e+06,2.531417e+06,2.531417e+06,5.000000,3.0,2.0000,2.0,102.0,122.948249,0.0,0.0,0.0,2014.0,67188.0,2.561498e+06,2.591994e+06
59998,2.531417e+06,2.531417e+06,2.531417e+06,0.000000,2.0,1.0000,2.0,130.0,144.000000,0.0,0.0,0.0,2016.0,76060.0,2.561498e+06,2.591994e+06


In [81]:
sumb_pred = gbr.predict(holdout_df)

In [84]:
sumb_pred

array([6875172.20257003, 1040272.99070581, 2313983.33085909, ...,
        870822.09441687, 1982696.28579622, 2203121.79020885])

In [86]:
subm_df = pd.DataFrame({"id":holdout_df.index,"target":sumb_pred})
subm_df

Unnamed: 0,id,target
0,0,6.875172e+06
1,1,1.040273e+06
2,2,2.313983e+06
3,3,1.168673e+06
4,4,5.448702e+05
...,...,...
59995,59995,3.967030e+06
59996,59996,7.460334e+05
59997,59997,8.708221e+05
59998,59998,1.982696e+06


In [87]:
subm_df.to_csv("sumb_final_pipeline.csv",index=False)

# Guardo modelos entrenados

In [None]:
import pickle
from joblib import dump, load

In [None]:
dump(xgbr, 'xgbr-v3.joblib') 
dump(preprocessing_pipeline, 'preprocessing_pipeline-v3.joblib') 

In [None]:
e