# **Dataset Properati.**
---

#### Librerias

In [4]:
import pandas as pd
import numpy as np
import re
import random
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
from scipy.spatial import cKDTree

from sklearn import linear_model
from sklearn import naive_bayes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, RepeatedKFold

import statsmodels.api as sm

# import foliuma
# from folium.plugins import HeatMap

#### Configuraciones

In [6]:
pd.set_option('display.max_columns', None)

---
#### Cargamos el dataset

In [8]:
df = pd.read_csv("properatti_FINAL.csv", sep=",")

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106342 entries, 0 to 106341
Data columns (total 65 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Unnamed: 0.1                    106342 non-null  int64  
 1   Unnamed: 0                      106342 non-null  int64  
 2   ID                              106342 non-null  int64  
 3   property_type                   106342 non-null  object 
 4   rooms                           106342 non-null  float64
 5   rooms_inputted                  106342 non-null  object 
 6   place_with_parent_names         106342 non-null  object 
 7   NivelAdm1                       106342 non-null  object 
 8   NivelAdm2                       102520 non-null  object 
 9   NivelAdm3                       36662 non-null   object 
 10  NivelAdm4                       1608 non-null    object 
 11  geonames_id                     106342 non-null  float64
 12  geonames_id_name

#### Dummies property type

In [11]:
df.property_type.value_counts(True).mul(100).round(2)

property_type
apartment    56.14
house        35.36
PH            4.94
store         3.56
Name: proportion, dtype: float64

In [12]:
property_type_dummies = pd.get_dummies(df.property_type, prefix='pt', drop_first = True)

In [13]:
df.drop(property_type_dummies.columns, axis=1, inplace=True, errors='ignore')
df = pd.concat([df, property_type_dummies], axis=1)

#### Dummies Geolocalización

In [15]:
df.geonames_id.value_counts()

geonames_id
3430863.0     6733
3838574.0     5963
3860259.0     5477
3430234.0     3015
3436077.0     2672
              ... 
12035939.0       1
3835354.0        1
3473391.0        1
3433456.0        1
3430940.0        1
Name: count, Length: 875, dtype: int64

In [16]:
full_geo_dummies = pd.get_dummies(df.geonames_id, prefix='fullgeo', drop_first = True)

In [17]:
full_geo_dummies.columns

Index(['fullgeo_3427212.0', 'fullgeo_3427213.0', 'fullgeo_3427326.0',
       'fullgeo_3427354.0', 'fullgeo_3427362.0', 'fullgeo_3427366.0',
       'fullgeo_3427367.0', 'fullgeo_3427373.0', 'fullgeo_3427375.0',
       'fullgeo_3427377.0',
       ...
       'fullgeo_12035853.0', 'fullgeo_12035911.0', 'fullgeo_12035915.0',
       'fullgeo_12035934.0', 'fullgeo_12035939.0', 'fullgeo_12035944.0',
       'fullgeo_12035960.0', 'fullgeo_12036353.0', 'fullgeo_12076997.0',
       'fullgeo_12128682.0'],
      dtype='object', length=874)

In [18]:
df = pd.concat([df, full_geo_dummies], axis=1)

#### Dummies Geolocalización Limit

In [20]:
# Establece un límite de cantidad de geonames, el resto los asigna al más cercano
LimitGeonamesPercent = 20

LimitGeonamesCant = int(df.geonames_id.value_counts().shape[0]*(LimitGeonamesPercent/100)) # Vamos a mantener el 25%, que representa el 93.84% de las propiedades

# Obtengo los valores únicos
df_geo = df[['geonames_id','geonames_id_name','geonames_id_lat','geonames_id_lon']].copy().drop_duplicates()

# Genero Points
df_geo.insert(df_geo.columns.get_loc('geonames_id_lon')+1, 'geopoint', gpd.points_from_xy(df_geo.geonames_id_lat, df_geo.geonames_id_lon))

# nueva columna para hacer el mapeo
df_geo['geonames_id_new'] = None
df_geo['geonames_id_new_name'] = None

# Obtengo los $LimitGeonamesCant valores más frecuentes
ListTopGeoID = df.geonames_id.value_counts().head(LimitGeonamesCant).index.tolist()

Capturadas = df.geonames_id.isin(ListTopGeoID).sum()
CapturadasPC = round((Capturadas / df.shape[0])*100, 2)

print(f'Limitando al {LimitGeonamesPercent}% de ubicaciones más populares')
print(f'\tNos quedamos con {LimitGeonamesCant} ubicaciones (o variables dummies)')
print(f'\tRepresentan el {CapturadasPC}% de todas las propiedades')

Limitando al 20% de ubicaciones más populares
	Nos quedamos con 175 ubicaciones (o variables dummies)
	Representan el 90.92% de todas las propiedades


In [21]:
# Asigno geonames_id_new a los más frecuentes
df_geo.loc[df_geo.geonames_id.isin(ListTopGeoID), 'geonames_id_new'] = df_geo['geonames_id']
df_geo.loc[df_geo.geonames_id.isin(ListTopGeoID), 'geonames_id_new_name'] = df_geo['geonames_id_name']

In [22]:
# Genero cKDTree para buscar los mas cercanos
df_geo_top = df_geo[df_geo.geonames_id_new.notnull()].copy()
df_geo_top_btree = cKDTree(np.array(list(df_geo_top.apply(lambda row: (row.geonames_id_lat, row.geonames_id_lon), axis=1))))

In [30]:
# Función que asigna los mas cercanos
def GeoNameMasCercano(row):
    # Usamos 'pd.isna' o 'is None' para mayor seguridad con valores nulos de Pandas
    if pd.isna(row['geonames_id_new']):
        # Extraemos las coordenadas como una tupla o lista (x, y)
        punto_coords = [row.geopoint.x, row.geopoint.y]
        
        # Ahora el btree sí puede procesar los números
        dist, idx = df_geo_top_btree.query(punto_coords, k=1)
        
        row['geonames_id_new'] = df_geo_top.iloc[idx]['geonames_id']
        row['geonames_id_new_name'] = df_geo_top.iloc[idx]['geonames_id_name']
    return row

In [32]:
# Aplico la función... 
df_geo = df_geo.apply(GeoNameMasCercano, axis=1)

# Genero un dict para la asignación, el df_geo fue creado para tener control del resultado, no era necesario
map_geonames = dict(df_geo[['geonames_id', 'geonames_id_new']].values.tolist())

In [34]:
df.drop(['geonames_id_limited'], axis=1, inplace=True, errors='ignore')
df.insert(df.columns.get_loc('geonames_id')+1, '', np.NaN)

# Asigno el geonames limitado
for i in map_geonames:
    df.loc[df.geonames_id == i, 'geonames_id_limited'] = map_geonames.get(i)


In [36]:
# creo dummies para geonames limitados
limited_geo_dummies = pd.get_dummies(df.geonames_id_limited, prefix='geo_limited', drop_first = True)

In [38]:
limited_geo_dummies.columns.shape

(174,)

In [40]:
# Dropeo las columnas por si ya ejecuté esta celda
df.drop(limited_geo_dummies.columns, axis=1, inplace=True, errors='ignore')

# Agrego las columnas
df = pd.concat([df, limited_geo_dummies], axis=1)

In [42]:
df[df.geonames_id != df.geonames_id_limited].shape

(9657, 1118)

In [44]:
df.shape

(106342, 1118)

---
## Modelado
#### Definición de Columnas a utilizar



In [47]:
# Vamos a probar sobre dos objetivos
TargetCols = ['price_usd_per_m2', 'superficie_precio_m2']

# Todas las columnas posibles
Cols = ['pt_apartment', 'pt_house', 'pt_store',
        'rooms',
        'EsBarrioCerrado', 
        'surface_total_in_m2',
        'surface_covered_in_m2',
        'price_aprox_usd',
        'superficie',
        'CercanoHospital', 
        'CercanoCentroSalud', 
        'CercanoPuntoMedico',
        'CercanoEducPrimaria', 
        'CercanoEducSecundaria', 
        'CercanoEducInicial',
        'CercanoEducAdultos', 
        'CercanoUniversidad', 
        'CercanoTerminalOmnibus',
        'CercanoRecreacion', 
        'CercanoPlaza', 
        'CercanoParque',
        'keyWord_amenities', 
        'keyWord_amoblado', 
        'keyWord_avenida',
        'keyWord_banco', 
        'keyWord_buenEstado', 
        'keyWord_cochera',
        'keyWord_dependencias', 
        'keyWord_dueno', 
        'keyWord_electrogeno',
        'keyWord_enPozo', 
        'keyWord_estrenar', 
        'keyWord_extras',
        'keyWord_gimnasio', 
        'keyWord_lujoso', 
        'keyWord_luminoso',
        'keyWord_parrilla', 
        'keyWord_petfriendly', 
        'keyWord_pileta',
        'keyWord_profesional', 
        'keyWord_quincho', 
        'keyWord_seguridad',
        'keyWord_subte', 
        'keyWord_terraza', 
        'keyWord_vista'
]

Cols += limited_geo_dummies.columns.tolist()
#Cols += full_geo_dummies.columns.tolist() # sólo fue para probar... no tiene sentido utilizar el 100% de las ubicaciones


#### Separación Train/Test

In [51]:
#Separamos en train y test
X = df[Cols].copy().astype(int)
y = df.price_usd_per_m2.astype(int)

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.30, random_state=1)

In [53]:
# Escalado ??
#scaler = StandardScaler()
#scaler.fit_transform(Xtrain)

## Modelo Regresion Lineal

In [58]:
# Modelo
model = linear_model.LinearRegression(fit_intercept=True)
model.fit(Xtrain, ytrain)
ypred = model.predict(Xtest)
ypred_train = model.predict(Xtrain)

# Resultado
print('Mean Absolute Error:', round(mean_absolute_error(ytest, ypred), 2))
print('Mean Squared Error:', round(mean_squared_error(ytest, ypred), 2))
print('R2 Score:', round(r2_score(ytest, ypred), 3))

Mean Absolute Error: 332.43
Mean Squared Error: 237623.83
R2 Score: 0.728


## Modelo Regresion Lineal Ridge

In [63]:
lm_ridge = linear_model.Ridge()
reg_lineal_ridge = lm_ridge.fit(Xtrain, ytrain)
ypred_ridge = reg_lineal_ridge.predict(Xtest)

# Resultado
print('--- Resultados Ridge ---')
print('Mean Absolute Error:', round(mean_absolute_error(ytest, ypred_ridge), 2))
print('Mean Squared Error:', round(mean_squared_error(ytest, ypred_ridge), 2))
print('R2 Score:', round(r2_score(ytest, ypred_ridge), 3))
print('Score (R2) directo del modelo:', round(reg_lineal_ridge.score(Xtest, ytest), 3))

--- Resultados Ridge ---
Mean Absolute Error: 332.41
Mean Squared Error: 237602.9
R2 Score: 0.728
Score (R2) directo del modelo: 0.728


## Modelo Regresion Lineal Lasso

In [66]:
# Modelo
model = linear_model.Lasso(alpha=1.0)

# Método de evaluación
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Evaluación
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

# positivos
scores = abs(scores)
print('Mean Absolute Error: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

KeyboardInterrupt: 

## Modelo Regresion Lineal Elastic Net

In [None]:
lm_lasso =  linear_model.ElasticNet()

#Entreno el modelo
reg_lineal_lasso = lm_lasso.fit(Xtrain, ytrain)

#Prediccion con el set de testeo
print('Mean Absolute Error:', mean_absolute_error(ytest, ypred).round(2))
print('Mean Squared Error:', mean_squared_error(ytest, ypred).round(2))
print('square Mean Squared Error:', np.sqrt(mean_squared_error(ytest, ypred)).round(2))
print('R2 Score:', r2_score(ytest, ypred).round(3))
print('Score del modelo Lasso:', reg_lineal_lasso.score(Xtest, ytest))

## Modelo Stats Models OLS

In [None]:
Xs = df[Cols]
ys = df["price_usd_per_m2"]

X = sm.add_constant(Xs)

model = sm.OLS(ys.astype(float), X.astype(float)).fit()
predictions = model.predict(X)
print ("Mean Squared Error:", mean_squared_error(ys, predictions))
print (model.summary())

## Modelo Naive Bayes Gaussian

In [None]:
# Modelo
model = naive_bayes.GaussianNB()
model.fit(Xtrain, ytrain) 
ypred = model.predict(Xtest) 

# Resultado
print(f'Accuracy Score: {round(accuracy_score(ytest, ypred), 2)}')
print ('Mean Absolute Error:', mean_absolute_error(ytest, ypred).round(2))
print ('Mean Squared Error:', mean_squared_error(ytest, ypred).round(2))
print ('R2 Score:', r2_score(ytest, ypred).round(3))