# TP de regresión (properati)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import sklearn as sk
from sklearn import model_selection
from sklearn import ensemble
from sklearn import metrics
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# 1. Lectura de datos

In [None]:
# Load the labelled listings and the listings to predict, both indexed by "id".
data_train = pd.read_csv("entrenamiento.csv", index_col="id")
data_test = pd.read_csv("a_predecir.csv", index_col="id")
# Work on copies so the frames as read from disk stay untouched.
df_train, df_test = data_train.copy(), data_test.copy()

# 2. Limpieza y transformación de datos 🧹

In [None]:
# Rename the administrative-level columns (l1..l4) to readable names.
# NOTE(review): the mapping also swaps 'lat' and 'lon' — presumably the raw
# files have them exchanged; confirm against the data.
nuevos_nombres = {'l1':'Pais','l2': 'Provincia','l3':'Barrio','l4':'Sub_barrio','lat':'lon','lon':'lat'}
for _df in (df_train, df_test):
    _df.rename(columns=nuevos_nombres, inplace=True)

In [None]:
# Drop duplicated rows, listings without a price and listings priced at 0.
df_train = df_train.drop_duplicates().dropna(subset=['price'])
df_train = df_train.loc[df_train['price'].ne(0)]

## Titulo, descripcion y barrio a str y minusculas

In [None]:
# Cast title/description to str (missing values become the text 'nan') and
# lower-case title, description and Barrio in both frames.
columnas_texto = ['title', 'description']
for _df in (df_train, df_test):
    _df[columnas_texto] = _df[columnas_texto].astype(str)
    for _col in columnas_texto:
        _df[_col] = _df[_col].str.lower()
    _df['Barrio'] = _df['Barrio'].str.lower()

## Separo en CASA DEPARTAMENTO COCHERA

In [None]:
# Boolean masks selecting each property type in train ...
filtro_casa = df_train.property_type == "Casa"
filtro_depa = df_train.property_type == "Departamento"
filtro_cochera = df_train.property_type == "Cochera"
# ... and in test.
filtro_casa_t = df_test.property_type == "Casa"
filtro_depa_t = df_test.property_type == "Departamento"
filtro_cochera_t = df_test.property_type == "Cochera"

# Split train and test into one frame per property type.
# Explicit .copy(): these frames are mutated in place further down (inplace
# drops, new columns), which on a frame derived from another one triggers
# pandas' SettingWithCopyWarning.
df_train_casa = df_train.loc[filtro_casa].copy()
df_train_depa = df_train.loc[filtro_depa].copy()
df_train_cochera = df_train.loc[filtro_cochera].copy()

df_test_casa = df_test.loc[filtro_casa_t].copy()
df_test_depa = df_test.loc[filtro_depa_t].copy()
df_test_cochera = df_test.loc[filtro_cochera_t].copy()

In [None]:
# Remove the property_type column from every per-type frame.
col_eliminar = ['property_type']

for _df in (df_train_casa, df_train_depa, df_train_cochera,
            df_test_casa, df_test_depa, df_test_cochera):
    _df.drop(columns=col_eliminar, inplace=True)

## Creo filtros para cada uno de los dataset

## CASA

In [None]:
# Bounding box and neighbourhood set of the houses we have to predict.
lat_min, lat_max = df_test_casa['lat'].min(), df_test_casa['lat'].max()
lon_min, lon_max = df_test_casa['lon'].min(), df_test_casa['lon'].max()

barrios_types_casa = df_test_casa['Barrio'].unique()

# Keep only training houses comparable to the test ones: Argentina, for sale,
# priced in USD, in a test neighbourhood and inside the test bounding box.
# Rows with missing lat/lon fail the range checks and are dropped too.
_mantener = (
    df_train_casa["Pais"].eq("Argentina")
    & df_train_casa["operation_type"].eq("Venta")
    & df_train_casa["currency"].eq('USD')
    & df_train_casa["Barrio"].isin(barrios_types_casa)
    & df_train_casa["lat"].between(lat_min, lat_max)
    & df_train_casa["lon"].between(lon_min, lon_max)
)
df_train_casa = df_train_casa.loc[_mantener]

## DEPARTAMENTO

In [None]:
# Bounding box and neighbourhood set of the apartments we have to predict.
lat_min, lat_max = df_test_depa['lat'].min(), df_test_depa['lat'].max()
lon_min, lon_max = df_test_depa['lon'].min(), df_test_depa['lon'].max()

barrios_types_depa = df_test_depa['Barrio'].unique()

# Keep only training apartments comparable to the test ones: Argentina, for
# sale, priced in USD, in a test neighbourhood and inside the test bounding box.
# Rows with missing lat/lon fail the range checks and are dropped too.
_mantener = (
    df_train_depa["Pais"].eq("Argentina")
    & df_train_depa["operation_type"].eq("Venta")
    & df_train_depa["currency"].eq('USD')
    & df_train_depa["Barrio"].isin(barrios_types_depa)
    & df_train_depa["lat"].between(lat_min, lat_max)
    & df_train_depa["lon"].between(lon_min, lon_max)
)
df_train_depa = df_train_depa.loc[_mantener]

In [None]:
# For apartments, flag listings whose description mentions "monoambiente"
# (studio apartment). Missing descriptions count as False.
for _df in (df_train_depa, df_test_depa):
    _df["monoambiente"] = _df['description'].str.contains("monoambiente", case=False, na=False)

## COCHERA

In [None]:
# Bounding box of the garages we have to predict.
# Fix: the box was previously taken from df_test_depa (copy-paste from the
# apartment cell), so garages were filtered with the apartments' extent.
lat_min = df_test_cochera['lat'].min()
lat_max = df_test_cochera['lat'].max()
lon_min = df_test_cochera['lon'].min()
lon_max = df_test_cochera['lon'].max()

barrios_types_cochera=df_test_cochera['Barrio'].unique()

# Keep only training garages comparable to the test ones: Argentina, for sale,
# priced in USD, in a test neighbourhood and inside the test bounding box.
df_train_cochera = df_train_cochera.loc[
    (df_train_cochera["Pais"] == "Argentina") &
    (df_train_cochera["operation_type"] == "Venta") &
    (df_train_cochera["currency"] == 'USD') &
    (df_train_cochera["Barrio"].isin(barrios_types_cochera)) &
    (df_train_cochera["lat"] >= lat_min) &
    (df_train_cochera["lat"] <= lat_max) &
    (df_train_cochera["lon"] >= lon_min) &
    (df_train_cochera["lon"] <= lon_max)
]

In [None]:
# Drop the room-count columns from the garage frames (train and test).
col_eliminar = ['rooms', 'bedrooms', 'bathrooms']
for _df in (df_train_cochera, df_test_cochera):
    _df.drop(columns=col_eliminar, inplace=True)

# Nuevas columnas a partir de description y title

In [None]:
def agregar_caracteristicas(df, col1="title", col2="description"):
    """Add 0/1 amenity flags found in two text columns, plus their count.

    For every amenity a ``<amenity>_r`` column is written with 1 when its
    regex matches ``col1`` or ``col2`` (case-insensitive, missing text counts
    as no match) and 0 otherwise. ``lujoso`` is the number of flags set in
    the row. ``df`` is modified in place and also returned.
    """
    # NOTE(review): "loundry" looks like a misspelling of "laundry" — confirm
    # which spelling the listings actually use before changing the pattern.
    patrones = {
        "gym": r"(gym|gimnasio)",
        "sum": r"\bsum\b",
        "parrilla": r"\bparrilla\b",
        "loundry": r"\bloundry\b",
        "soleado": r"\bsoleado\b",
        "subte": r"\bsubte\b",
        "terraza": r"\bterraza\b",
        "pileta": r"\bpileta\b",
        "jacuzzi": r"([JjYy]acu[zs]{1,2}i)",
        "balcon": r"\bbalc[oó]n\b"
    }

    columnas_flag = []
    for nombre, patron in patrones.items():
        en_col1 = df[col1].str.contains(patron, case=False, na=False)
        en_col2 = df[col2].str.contains(patron, case=False, na=False)
        columna = f"{nombre}_r"
        # A hit in either column is enough to set the flag.
        df[columna] = (en_col1 | en_col2).astype("int64")
        columnas_flag.append(columna)

    df['lujoso'] = df[columnas_flag].sum(axis=1)

    return df

In [None]:
# Add the amenity flags and the 'lujoso' count to the house and apartment
# frames; agregar_caracteristicas modifies each frame in place.
agregar_caracteristicas(df_train_depa)
agregar_caracteristicas(df_train_casa)
agregar_caracteristicas(df_test_casa)
agregar_caracteristicas(df_test_depa)

In [None]:
def agregar_caracteristicas_cochera(df, col1="title", col2="description"):
    """Flag garage listings whose text matches one of several fixed phrases.

    One ``<phrase>_r`` 0/1 column is written per phrase (1 when the phrase
    appears in ``col1`` or ``col2``, case-insensitive, missing text counts as
    no match). ``COMPLEJO COCHERAS`` is 1 when any of those flags is set.
    ``df`` is modified in place and also returned.
    """
    patrones = {
        "garage en block": r"\bgarage en block\b",
        "galpón cocheras": r"\bgalpón cocheras\b",
        "cochera deposito": r"\bcochera deposito\b",
        "cocheras cubiertas": r"\bcocheras cubiertas\b",
        "edificio de cocheras":r"\bedificio de cocheras\b",
        "cocheras + local+ deposito en san cristóbal": r"\bcocheras \+ local\+ deposito en san cristóbal\b",
        "paquete de cocheras": r"\bpaquete de cocheras\b"
    }
    # Column creation order (kept independent of the dict order above).
    features = ["garage en block", "galpón cocheras", "cochera deposito",
                "cocheras + local+ deposito en san cristóbal", "paquete de cocheras","cocheras cubiertas","edificio de cocheras"]

    columnas_flag = [f"{feature}_r" for feature in features]
    for feature, columna in zip(features, columnas_flag):
        patron = patrones[feature]
        en_col1 = df[col1].str.contains(patron, case=False, na=False)
        en_col2 = df[col2].str.contains(patron, case=False, na=False)
        df[columna] = (en_col1 | en_col2).astype(int)

    df["COMPLEJO COCHERAS"] = df[columnas_flag].any(axis=1).astype(int)

    return df

In [None]:
# Add the phrase flags and 'COMPLEJO COCHERAS' to the garage frames;
# agregar_caracteristicas_cochera modifies each frame in place.
agregar_caracteristicas_cochera(df_train_cochera)
agregar_caracteristicas_cochera(df_test_cochera)

## Encontrar el barrio de propiedad en el titulo o descripcion

In [None]:
def encontrar_barrios(df, columna1, columna2, lista_barrios):
    """Tag each row with the first known neighbourhood named in two text columns.

    Parameters
    ----------
    df : DataFrame modified in place (and returned).
    columna1, columna2 : names of the text columns to scan.
    lista_barrios : iterable of neighbourhood names (array, Series or list);
        missing entries (NaN/None) are ignored.

    Returns
    -------
    ``df`` with two new columns, ``barrios_encontrados_title`` (from
    ``columna1``) and ``barrios_encontrados_description`` (from ``columna2``),
    holding the lower-cased neighbourhood found or the sentinel ``"Nada"``.
    Matching is a plain substring test, in the order of ``lista_barrios``.
    """
    # Missing names must be dropped before the str cast: NaN would become the
    # text "nan", which is a substring of ordinary words ("ventanal",
    # "financiacion") and of the 'nan' left by astype(str) on missing text.
    barrios = [str(nombre).lower() for nombre in lista_barrios if not pd.isna(nombre)]

    def buscar_barrio(texto):
        # Non-string cells (e.g. a float NaN) cannot contain a neighbourhood.
        if not isinstance(texto, str):
            return "Nada"
        for barrio in barrios:
            if barrio in texto:
                return barrio
        return "Nada"

    df['barrios_encontrados_title'] = df[columna1].apply(buscar_barrio)
    df['barrios_encontrados_description'] = df[columna2].apply(buscar_barrio)
    return df

def encontrar_consenso(row):
    """Pick one neighbourhood from the title/description matches of a row.

    The title match wins whenever there is one; otherwise the description
    match is used. Returns None when neither column produced a match (both
    hold the sentinel "Nada").
    """
    sin_match = "Nada"
    for clave in ('barrios_encontrados_title', 'barrios_encontrados_description'):
        candidato = row[clave]
        if candidato != sin_match:
            return candidato
    return None

In [None]:
# For every property type, look for the test neighbourhoods of that type in
# title/description (train and test) and reduce both matches to one value.
# encontrar_barrios adds its columns in place, so no re-assignment is needed.
_grupos = (
    (df_train_casa, df_test_casa, barrios_types_casa),
    (df_train_depa, df_test_depa, barrios_types_depa),
    (df_train_cochera, df_test_cochera, barrios_types_cochera),
)
for _train, _test, _barrios in _grupos:
    for _df in (_train, _test):
        encontrar_barrios(_df, "title", "description", _barrios)
        _df['consenso_barrios'] = _df.apply(encontrar_consenso, axis=1)

### Completo los valores faltantes de los barrios segun el consenso que arme

In [None]:
# Where Barrio is missing, use the neighbourhood found in the listing text.
# NOTE(review): the garage frames are not filled here — confirm intentional.
for _df in (df_train_casa, df_train_depa, df_test_casa, df_test_depa):
    _sin_barrio = _df['Barrio'].isna()
    _df.loc[_sin_barrio, 'Barrio'] = _df['consenso_barrios']

### Divido palermo con los sub_barrios

In [None]:
# Replace Barrio by the sub-neighbourhood for houses located in Palermo.
# Fix: Barrio is lower-cased earlier in the notebook, so the comparison must
# use 'palermo' (the previous 'Palermo' never matched and this step did
# nothing). The sub-neighbourhood is lower-cased to stay consistent with
# Barrio, and rows without a Sub_barrio keep 'palermo' instead of becoming NaN.
for _df in (df_train_casa, df_test_casa):
    _es_palermo = _df['Barrio'] == 'palermo'
    _df.loc[_es_palermo, 'Barrio'] = _df.loc[_es_palermo, 'Sub_barrio'].map(
        lambda sub: sub.lower() if isinstance(sub, str) else 'palermo'
    )

In [None]:
# Replace Barrio by the sub-neighbourhood for apartments located in Palermo.
# Fix: Barrio is lower-cased earlier in the notebook, so the comparison must
# use 'palermo' (the previous 'Palermo' never matched and this step did
# nothing). The sub-neighbourhood is lower-cased to stay consistent with
# Barrio, and rows without a Sub_barrio keep 'palermo' instead of becoming NaN.
for _df in (df_train_depa, df_test_depa):
    _es_palermo = _df['Barrio'] == 'palermo'
    _df.loc[_es_palermo, 'Barrio'] = _df.loc[_es_palermo, 'Sub_barrio'].map(
        lambda sub: sub.lower() if isinstance(sub, str) else 'palermo'
    )

## SUPERFICIE TOTAL VS CUBIERTA

In [None]:
def invertir_superficies(df):
    """Swap surface_total and surface_covered where covered exceeds total.

    Rows where either value is missing are left alone (the comparison is
    False for NaN). ``df`` is modified in place and also returned.
    """
    columnas = ['surface_total', 'surface_covered']
    invertidas = df['surface_covered'] > df['surface_total']
    # Read the pair in reverse order and write it back positionally.
    intercambio = df.loc[invertidas, columnas[::-1]].values
    df.loc[invertidas, columnas] = intercambio
    return df

def asignar_superficie_total(df):
    """Fill each surface column from the other one when it is missing.

    surface_total is filled from surface_covered first, then surface_covered
    from the (already filled) surface_total. Rows missing both stay NaN.
    ``df`` is modified in place and also returned.
    """
    total = df['surface_total']
    df['surface_total'] = total.fillna(df['surface_covered'])

    cubierta = df['surface_covered']
    df['surface_covered'] = cubierta.fillna(df['surface_total'])
    return df


# Fix swapped surfaces, then fill a missing surface from the other one, on
# every per-type frame. Both helpers work in place.
for _df in (df_train_casa, df_train_depa, df_train_cochera,
            df_test_casa, df_test_depa, df_test_cochera):
    invertir_superficies(_df)
    asignar_superficie_total(_df)

### Imputacion de outliers para superficie total y cubierta.

In [None]:
# Hand-picked surface corrections for outlier training apartments:
# (listing id, column(s) to overwrite, value).
_correcciones_depa = [
    (139033, ['surface_total','surface_covered'], 47.63),
    (275571, 'surface_total', 108.96),
    (581682, ['surface_total','surface_covered'], 104.89),
    (742758, 'surface_total', 48.17),
    (942718, ['surface_total','surface_covered'], 104.89),
    (994027, ['surface_total','surface_covered'], 21.92),
]
for _id, _columnas, _valor in _correcciones_depa:
    df_train_depa.loc[_id, _columnas] = _valor

In [None]:
# Hand-picked surface corrections for two test apartments, keyed by listing id.
df_test_depa.loc[973771, 'surface_total'] = 69
df_test_depa.loc[973771, 'surface_covered'] = 63
df_test_depa.loc[899423, 'surface_total'] = 81
df_test_depa.loc[899423, 'surface_covered'] = 71.2

In [None]:
# Hand-picked surface correction for one training house, keyed by listing id.
df_train_casa.loc[49928, 'surface_total'] = 377.38
df_train_casa.loc[49928, 'surface_covered'] = 219

In [None]:
# Hand-picked surface correction for one training garage, keyed by listing id.
df_train_cochera.loc[865148, 'surface_total'] = 20.5
# Same property as the previous one, so the duplicate listing is dropped
# (drop raises KeyError if that id is not in the frame).
df_train_cochera.drop(865149,inplace=True)

In [None]:
# Hand-picked surface correction for one test garage, keyed by listing id.
df_test_cochera.loc[866125, 'surface_total'] = 13

## Si conozco el barrio puedo imputar la lat y lon con el promedio de cada variable por barrio

In [None]:
# Mean latitude / longitude per neighbourhood, computed on the training frames.
# Each table is a one-column DataFrame ('lat' or 'lon') indexed by Barrio.
lat_barrio_depa = df_train_depa.groupby("Barrio")["lat"].mean().to_frame()
lon_barrio_depa = df_train_depa.groupby("Barrio")["lon"].mean().to_frame()

lat_barrio_casa = df_train_casa.groupby("Barrio")["lat"].mean().to_frame()
lon_barrio_casa = df_train_casa.groupby("Barrio")["lon"].mean().to_frame()

lat_barrio_cochera = df_train_cochera.groupby("Barrio")["lat"].mean().to_frame()
lon_barrio_cochera = df_train_cochera.groupby("Barrio")["lon"].mean().to_frame()

In [None]:
# Fill missing coordinates in train with the mean coordinate of the listing's
# neighbourhood (rows whose Barrio has no entry in the table stay NaN).
_tablas_train = (
    (df_train_depa, lat_barrio_depa, lon_barrio_depa),
    (df_train_casa, lat_barrio_casa, lon_barrio_casa),
    (df_train_cochera, lat_barrio_cochera, lon_barrio_cochera),
)
for _df, _lat_barrio, _lon_barrio in _tablas_train:
    _sin_lat = _df['lat'].isna()
    _sin_lon = _df['lon'].isna()
    _df.loc[_sin_lat, 'lat'] = _df.loc[_sin_lat, 'Barrio'].map(_lat_barrio['lat'])
    _df.loc[_sin_lon, 'lon'] = _df.loc[_sin_lon, 'Barrio'].map(_lon_barrio['lon'])

In [None]:
# Fill missing coordinates in test with the per-neighbourhood means computed
# on train (rows whose Barrio has no entry in the table stay NaN).
_tablas_test = (
    (df_test_depa, lat_barrio_depa, lon_barrio_depa),
    (df_test_casa, lat_barrio_casa, lon_barrio_casa),
    (df_test_cochera, lat_barrio_cochera, lon_barrio_cochera),
)
for _df, _lat_barrio, _lon_barrio in _tablas_test:
    _sin_lat = _df['lat'].isna()
    _sin_lon = _df['lon'].isna()
    _df.loc[_sin_lat, 'lat'] = _df.loc[_sin_lat, 'Barrio'].map(_lat_barrio['lat'])
    _df.loc[_sin_lon, 'lon'] = _df.loc[_sin_lon, 'Barrio'].map(_lon_barrio['lon'])

## Mediana de precio por barrio

In [None]:
# Median training price per neighbourhood, rounded to an integer, one table
# per property type (one-column DataFrame 'price' indexed by Barrio).
price_barrio_depa = df_train_depa.groupby("Barrio")["price"].median().round().astype(int).to_frame()
price_barrio_casa = df_train_casa.groupby("Barrio")["price"].median().round().astype(int).to_frame()
price_barrio_cochera = df_train_cochera.groupby("Barrio")["price"].median().round().astype(int).to_frame()

In [None]:
# Attach to every listing the median training price of its neighbourhood
# (column name kept as 'promedio_barrio' although the statistic is a median).
# Fix: houses and garages were mapped with the apartment medians
# (price_barrio_depa); each property type now uses its own table.
# Neighbourhoods absent from the table yield NaN.
_medianas_por_tipo = (
    (price_barrio_depa, df_train_depa, df_test_depa),
    (price_barrio_casa, df_train_casa, df_test_casa),
    (price_barrio_cochera, df_train_cochera, df_test_cochera),
)
for _medianas, _train, _test in _medianas_por_tipo:
    for _df in (_train, _test):
        _df["promedio_barrio"] = _df["Barrio"].map(_medianas["price"])

## MICE

In [None]:
# MICE (IterativeImputer) over the numeric columns of the training houses;
# the imputed matrix is written back over the same columns.
_cols_mice = ['lon', 'lat', 'rooms', 'bedrooms', 'bathrooms',
              'surface_total', 'surface_covered', 'promedio_barrio']
imp = IterativeImputer(max_iter=100, random_state=42)
imputed_values = imp.fit_transform(df_train_casa[_cols_mice])

df_train_casa[_cols_mice] = imputed_values

In [None]:
# MICE (IterativeImputer) over the numeric columns of the training apartments;
# the imputed matrix is written back over the same columns.
_cols_mice = ['lon', 'lat', 'rooms', 'bedrooms', 'bathrooms',
              'surface_total', 'surface_covered', 'promedio_barrio']
imp = IterativeImputer(max_iter=100, random_state=42)
imputed_values = imp.fit_transform(df_train_depa[_cols_mice])

df_train_depa[_cols_mice] = imputed_values

In [None]:
# MICE (IterativeImputer) over the numeric columns of the training garages
# (no room counts: those columns were dropped for garages).
# IterativeImputer also offers add_indicator; it is not used here.
_cols_mice = ['lon', 'lat', 'surface_total', 'surface_covered', 'promedio_barrio']
imp = IterativeImputer(max_iter=100, random_state=42)
imputed_values = imp.fit_transform(df_train_cochera[_cols_mice])

df_train_cochera[_cols_mice] = imputed_values

In [None]:
# MICE (IterativeImputer) for the test houses. The imputer is fitted on the
# test frame itself, and the surface columns are not part of this imputation.
# IterativeImputer also offers add_indicator; it is not used here.
_cols_mice = ['lon', 'lat', 'rooms', 'bedrooms', 'bathrooms', 'promedio_barrio']
imp = IterativeImputer(max_iter=100, random_state=42)
imputed_values = imp.fit_transform(df_test_casa[_cols_mice])

df_test_casa[_cols_mice] = imputed_values

In [None]:
# MICE (IterativeImputer) for the test apartments. The imputer is fitted on
# the test frame itself, and the surface columns are not part of this imputation.
# IterativeImputer also offers add_indicator; it is not used here.
_cols_mice = ['lon', 'lat', 'rooms', 'bedrooms', 'bathrooms', 'promedio_barrio']
imp = IterativeImputer(max_iter=100, random_state=42)
imputed_values = imp.fit_transform(df_test_depa[_cols_mice])

df_test_depa[_cols_mice] = imputed_values

### Columnas a eliminar

In [None]:
# Columns to drop after a quick look at the data, in train and test alike;
# for now they do not seem to add anything to the model.
_columnas_comunes = ["consenso_barrios","barrios_encontrados_description","barrios_encontrados_title","Barrio",'Pais','Provincia','operation_type','Sub_barrio','l5','l6','price_period','title','description','ad_type','start_date','end_date','created_on','currency']
# Garages additionally lose the individual phrase flags (only the combined
# 'COMPLEJO COCHERAS' flag is kept).
_flags_cochera = ['garage en block_r', 'galpón cocheras_r',
                  'cochera deposito_r', 'cocheras + local+ deposito en san cristóbal_r',
                  'paquete de cocheras_r', 'cocheras cubiertas_r',
                  'edificio de cocheras_r']

col_eliminar_casa = list(_columnas_comunes)
col_eliminar_depa = list(_columnas_comunes)
col_eliminar_cochera = _flags_cochera + _columnas_comunes

_a_limpiar = (
    (df_train_casa, col_eliminar_casa),
    (df_train_depa, col_eliminar_depa),
    (df_train_cochera, col_eliminar_cochera),
    (df_test_casa, col_eliminar_casa),
    (df_test_depa, col_eliminar_depa),
    (df_test_cochera, col_eliminar_cochera),
)
for _df, _columnas in _a_limpiar:
    _df.drop(columns=_columnas, inplace=True)

## Aplico el log10 a precio

In [None]:
def aplicar_log_base_10_a_columna(df, columna="price"):
    """Replace ``df[columna]`` by its base-10 logarithm, in place.

    Returns the same ``df`` object for convenience.
    """
    valores = df[columna]
    df[columna] = np.log10(valores)
    return df

In [None]:
# Train on log10(price): the 'price' column of each training frame is
# replaced in place by its base-10 logarithm.
aplicar_log_base_10_a_columna(df_train_depa)
aplicar_log_base_10_a_columna(df_train_casa)
aplicar_log_base_10_a_columna(df_train_cochera)

# 3. Entrenamiento del Modelo con Random Forest

In [None]:
def entrenamiento(df):
    """Grid-search a RandomForestRegressor on ``df`` with 10-fold CV and print RMSE.

    Only numeric/bool columns of ``df`` are used; ``price`` is the target and
    every other remaining column is a feature. For each combination of
    ``n_estimators`` and ``max_depth`` the per-fold train/test RMSE is
    printed, followed by the mean and standard deviation across folds.
    Nothing is returned.
    """
    df = df.select_dtypes(include=['float64', 'int64', 'int32', 'int16', 'int8', 'bool'])

    X = df[df.columns.drop('price')]
    y = df['price']

    # Cross-validation: 10 folds, shuffled, fixed seed. The splitter does not
    # depend on the hyperparameters, so it is built once; the fixed seed makes
    # every combination see the same folds.
    kf = sk.model_selection.KFold(n_splits=10, shuffle=True, random_state=42)

    for n_estimators in [50,75,100,1000]:
        for max_depth in [5,10,20,30,40]:
            print(f"{n_estimators=} -- {max_depth=}")

            # Build the model for this hyperparameter combination.
            reg = sk.ensemble.RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth,n_jobs=-1, random_state=42)

            scores_train = []
            scores_test = []

            for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
                # Split this fold into training and held-out parts.
                X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

                # Fit on the training part of the fold.
                reg.fit(X_train, y_train)

                # RMSE on the training part. The square root is taken
                # explicitly because mean_squared_error's `squared=False`
                # argument is deprecated/removed in recent scikit-learn.
                y_pred = reg.predict(X_train)
                score_train = float(np.sqrt(sk.metrics.mean_squared_error(y_train, y_pred)))
                scores_train.append(score_train)

                # RMSE on the held-out part.
                y_pred = reg.predict(X_test)
                score_test = float(np.sqrt(sk.metrics.mean_squared_error(y_test, y_pred)))
                scores_test.append(score_test)

                print("\t", f"{fold=}, {score_train=} {score_test=}")

            print(f"Media de scores en entrenamiento={pd.Series(scores_train).mean()}, std={pd.Series(scores_train).std()}")
            print(f"Media de scores en prueba={pd.Series(scores_test).mean()}, std={pd.Series(scores_test).std()}")
            print()

In [None]:
# Hyperparameter sweep (printed cross-validated RMSE) for the house model.
entrenamiento(df_train_casa)

In [None]:
# Hyperparameter sweep (printed cross-validated RMSE) for the apartment model.
entrenamiento(df_train_depa)

In [None]:
# Hyperparameter sweep (printed cross-validated RMSE) for the garage model.
entrenamiento(df_train_cochera)

# 4. Predicción para kaggle -- ⚠️⚠️⚠️ MODIFICAR HIPERPARÁMETROS ⚠️⚠️⚠️


In [None]:
def predecir(df_train,df_test,n_estimators,max_depth):
    """Fit a random forest on ``df_train`` and predict ``price`` for ``df_test``.

    Every column of ``df_train`` except ``price`` is used as a feature, and
    the same columns are read from ``df_test``. The predictions are stored in
    ``df_test['price']`` (the caller's frame is modified).

    Returns a tuple ``(df_sol, reg, X)``: the one-column ``price`` frame to
    submit, the fitted regressor and the training feature matrix.
    """
    columnas = df_train.columns.drop('price')
    X, y = df_train[columnas], df_train['price']

    reg = sk.ensemble.RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        n_jobs=-1,
        random_state=42,
    )
    reg.fit(X, y)

    # Predict on the test rows, using the training column order.
    df_test['price'] = reg.predict(df_test[columnas])

    return df_test[["price"]], reg, X

In [None]:
# Final house model: 1000 trees, max depth 40. X is overwritten by each
# predecir call below.
predic_casa,mod_casa,X=predecir(df_train_casa,df_test_casa,1000,40)

In [None]:
# Final apartment model: 1000 trees, max depth 40 (X is overwritten again).
predic_depa,mod_depa,X=predecir(df_train_depa,df_test_depa,1000,40)

In [None]:
# Final garage model: 1000 trees, max depth 40. After this call X holds the
# garage training features.
predic_cochera,mod_cochera,X=predecir(df_train_cochera,df_test_cochera,1000,40)

## Concateno las 3 predicciones

In [None]:
# Stack the three per-type prediction frames row-wise into one solution frame.
df_sol = pd.concat([predic_casa, predic_depa, predic_cochera], axis=0)

### Ahora revierto el log10

In [None]:
def revertir_log_base_10_a_columna(df, columna="price"):
    """Replace ``df[columna]`` by 10 raised to its values, in place.

    Inverse of a base-10 logarithm transform. Returns the same ``df`` object.
    """
    exponentes = df[columna]
    df[columna] = np.power(10, exponentes)
    return df

In [None]:
# The models were trained on log10(price): convert the predictions back to
# prices (df_sol is modified in place).
revertir_log_base_10_a_columna(df_sol)

In [None]:
# Round the predicted price to the nearest unit and store it as an integer.
df_sol['price'] = df_sol['price'].round().astype(int)

In [None]:
# Validation checks on the prediction before uploading it.
# These checks MUST pass without error.

# No non-positive prices.
assert (df_sol["price"] <= 0).sum() == 0, "Hay predicciones de precios menores o iguales a 0."
# Exactly 7808 rows are expected in the submission.
assert df_sol.shape[0] == 7808, f"La cantidad de filas no es correcta. Es {df_sol.shape[0]} y debe ser 7808."
# A single column, named 'price', indexed by 'id'.
assert df_sol.shape[1] == 1, f"La cantidad de columnas no es correcta. Es {df_sol.shape[1]} y debe ser 1."
assert 'price' in df_sol.columns, "Falta la columna 'price'."
assert df_sol.index.name == 'id', "El índice debe llamarse 'id'."


In [None]:
# Manual price for one garage to predict whose price was stated in its
# description.
# NOTE(review): this write happens after the validation asserts above, and
# .loc would append a new row if the id were not in df_sol — confirm the id.
df_sol.loc[274482, 'price'] = 2500000

In [None]:
# Save this version of the solution as a CSV named after the version tag,
# keeping the 'id' index as the first column.
version = "v-FINAL - Fpicado"
df_sol['price'].to_csv(f"solucion-{version}.csv", index=True)

In [None]:
# Impurity-based (MDI) feature importances of the garage model, with the
# spread across individual trees shown as error bars. X is the feature matrix
# returned by the last predecir call (the garage one).
importances = mod_cochera.feature_importances_
_por_arbol = np.array([tree.feature_importances_ for tree in mod_cochera.estimators_])
std = _por_arbol.std(axis=0)

forest_importances = pd.Series(importances, index=list(X.columns))

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set(title="Feature importances using MDI", ylabel="Mean decrease in impurity")
fig.tight_layout()