# **Dataset Properati.**
---

#### Librerías

In [2]:
import pandas as pd
import numpy as np
import re
import random
import seaborn as sns
import matplotlib.pyplot as plt
# import geopandas as gpd
from scipy.spatial import cKDTree

from sklearn import linear_model
from sklearn import naive_bayes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, RepeatedKFold

import statsmodels.api as sm

# import folium
# from folium.plugins import HeatMap

#### Configuraciones

In [4]:
# Show every column when displaying wide DataFrames (no column truncation).
pd.set_option('display.max_columns', None)

---
#### Cargamos el dataset

In [6]:
# Load the comma-separated dataset from the notebook's working directory.
df = pd.read_csv("properatti_FINAL.csv", sep=",")

In [7]:
# Column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106342 entries, 0 to 106341
Data columns (total 65 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Unnamed: 0.1                    106342 non-null  int64  
 1   Unnamed: 0                      106342 non-null  int64  
 2   ID                              106342 non-null  int64  
 3   property_type                   106342 non-null  object 
 4   rooms                           106342 non-null  float64
 5   rooms_inputted                  106342 non-null  object 
 6   place_with_parent_names         106342 non-null  object 
 7   NivelAdm1                       106342 non-null  object 
 8   NivelAdm2                       102520 non-null  object 
 9   NivelAdm3                       36662 non-null   object 
 10  NivelAdm4                       1608 non-null    object 
 11  geonames_id                     106342 non-null  float64
 12  geonames_id_name

#### Dummies property type

In [None]:
# Percentage share of each property type, rounded to two decimals.
df['property_type'].value_counts(normalize=True).mul(100).round(2)

In [None]:
# One-hot encode the property type; the first category is dropped and acts as the baseline.
property_type_dummies = pd.get_dummies(
    df['property_type'],
    prefix='pt',
    drop_first=True,
)

In [None]:
# Remove dummy columns left over from a previous run of this cell, then append the fresh ones.
df = df.drop(columns=property_type_dummies.columns, errors='ignore')
df = pd.concat([df, property_type_dummies], axis=1)

#### Dummies Geolocalización

In [None]:
# Number of properties per geonames_id, most frequent first.
df.geonames_id.value_counts()

In [None]:
# One-hot encode every geonames_id; the first category is dropped and acts as the baseline.
full_geo_dummies = pd.get_dummies(
    df['geonames_id'],
    prefix='fullgeo',
    drop_first=True,
)

In [None]:
# Inspect the generated dummy column names.
full_geo_dummies.columns

In [None]:
# Drop any 'fullgeo_*' columns left by a previous run before appending, so that
# re-executing this cell does not create duplicated column names in df.
df = pd.concat(
    [df.drop(columns=full_geo_dummies.columns, errors='ignore'), full_geo_dummies],
    axis=1,
)

#### Dummies Geolocalización Limit

In [None]:
# Keep only the most frequent geonames; every other geoname is later remapped
# to the nearest kept one.
LimitGeonamesPercent = 20

# Frequency of each geonames_id, most frequent first (computed once, reused below).
geonames_counts = df.geonames_id.value_counts()

# Number of locations to keep: LimitGeonamesPercent% of the distinct geonames.
# At least one is kept so the nearest-neighbour lookup always has a candidate.
LimitGeonamesCant = max(1, int(geonames_counts.shape[0] * (LimitGeonamesPercent / 100)))

# Unique geonames together with their names and coordinates.
df_geo = df[['geonames_id', 'geonames_id_name', 'geonames_id_lat', 'geonames_id_lon']].drop_duplicates().copy()

# (lat, lon) pair per geoname, in the same column order used to build the KD-tree.
# Plain tuples are used so the notebook does not depend on geopandas (its import is commented out).
df_geo.insert(
    df_geo.columns.get_loc('geonames_id_lon') + 1,
    'geopoint',
    pd.Series(list(zip(df_geo.geonames_id_lat, df_geo.geonames_id_lon)), index=df_geo.index),
)

# New columns that will hold the remapped geoname id and name.
df_geo['geonames_id_new'] = None
df_geo['geonames_id_new_name'] = None

# The LimitGeonamesCant most frequent geonames.
ListTopGeoID = geonames_counts.head(LimitGeonamesCant).index.tolist()

# How many properties (and what percentage) already belong to a kept geoname.
Capturadas = df.geonames_id.isin(ListTopGeoID).sum()
CapturadasPC = round((Capturadas / df.shape[0])*100, 2)

print(f'Limitando al {LimitGeonamesPercent}% de ubicaciones más populares')
print(f'\tNos quedamos con {LimitGeonamesCant} ubicaciones (o variables dummies)')
print(f'\tRepresentan el {CapturadasPC}% de todas las propiedades')

In [None]:
# The most frequent geonames map to themselves (id and name).
is_top = df_geo['geonames_id'].isin(ListTopGeoID)
df_geo.loc[is_top, 'geonames_id_new'] = df_geo.loc[is_top, 'geonames_id']
df_geo.loc[is_top, 'geonames_id_new_name'] = df_geo.loc[is_top, 'geonames_id_name']

In [None]:
# KD-tree over the (lat, lon) coordinates of the kept geonames, used for nearest-neighbour lookups.
df_geo_top = df_geo.loc[df_geo['geonames_id_new'].notnull()].copy()
coords_top = df_geo_top[['geonames_id_lat', 'geonames_id_lon']].values
df_geo_top_btree = cKDTree(coords_top)

In [None]:
# Function that assigns the nearest kept geoname to rows that do not have one yet
def GeoNameMasCercano(row):
    """Fill 'geonames_id_new' / 'geonames_id_new_name' with the nearest kept geoname.

    Rows that already have a 'geonames_id_new' value are returned untouched.
    For the others, the nearest neighbour is looked up in the module-level
    ``df_geo_top_btree`` and its id and name are copied from ``df_geo_top``.

    Parameters
    ----------
    row : pandas.Series
        A row of ``df_geo``; must contain 'geonames_id_lat', 'geonames_id_lon',
        'geonames_id_new' and 'geonames_id_new_name'.

    Returns
    -------
    pandas.Series
        The same row, with the two target fields filled when they were missing.
    """
    # pd.isnull covers both None and NaN; `== None` silently misses NaN.
    if pd.isnull(row['geonames_id_new']):
        # Query with (lat, lon), the same column order the tree was built with,
        # instead of relying on a geometry object being array-like.
        _, idx = df_geo_top_btree.query((row['geonames_id_lat'], row['geonames_id_lon']), k=1)
        nearest = df_geo_top.iloc[idx]
        row['geonames_id_new'] = nearest['geonames_id']
        row['geonames_id_new_name'] = nearest['geonames_id_name']
    return row

In [None]:
# Fill in the nearest kept geoname for every row that still lacks one.
df_geo = df_geo.apply(GeoNameMasCercano, axis=1)

# original geonames_id -> remapped geonames_id.
# df_geo itself is only kept so the result can be inspected; the dict is what gets used.
map_geonames = dict(zip(df_geo['geonames_id'], df_geo['geonames_id_new']))

In [None]:
# Drop the column first so this cell can be re-run safely.
df = df.drop(columns=['geonames_id_limited'], errors='ignore')

# Insert the limited geoname right after 'geonames_id'.
# A single vectorised map replaces the per-key loop; ids missing from the
# mapping become NaN. The column is inserted under its real name (the original
# inserted an unnamed '' column, which also made a second run fail).
df.insert(
    df.columns.get_loc('geonames_id') + 1,
    'geonames_id_limited',
    df['geonames_id'].map(map_geonames),
)


In [None]:
# One-hot encode the limited geonames; the first category is dropped and acts as the baseline.
limited_geo_dummies = pd.get_dummies(
    df['geonames_id_limited'],
    prefix='geo_limited',
    drop_first=True,
)

In [None]:
# Number of dummy columns generated for the limited geonames.
limited_geo_dummies.columns.shape

In [None]:
# Remove the dummy columns in case this cell was already executed...
df = df.drop(columns=limited_geo_dummies.columns, errors='ignore')

# ...and append the freshly generated ones.
df = pd.concat([df, limited_geo_dummies], axis=1)

In [None]:
# Shape of the subset of rows whose geoname differs from its limited counterpart.
df.loc[df['geonames_id'].ne(df['geonames_id_limited'])].shape

In [None]:
# (rows, columns) of the frame after adding all the dummy columns.
df.shape

---
## Modelado
#### Definición de Columnas a utilizar



In [None]:
# We are going to experiment with two targets
# NOTE(review): TargetCols is not referenced by any other cell visible in this notebook.
TargetCols = ['price_usd_per_m2', 'superficie_precio_m2']

# All candidate feature columns
# NOTE(review): 'price_aprox_usd' together with the surface columns may leak the target
# 'price_usd_per_m2' if the target was derived from them — confirm before trusting the scores.
Cols = ['pt_apartment', 'pt_house', 'pt_store',
        'rooms',
        'EsBarrioCerrado', 
        'surface_total_in_m2',
        'surface_covered_in_m2',
        'price_aprox_usd',
        'superficie',
        'CercanoHospital', 
        'CercanoCentroSalud', 
        'CercanoPuntoMedico',
        'CercanoEducPrimaria', 
        'CercanoEducSecundaria', 
        'CercanoEducInicial',
        'CercanoEducAdultos', 
        'CercanoUniversidad', 
        'CercanoTerminalOmnibus',
        'CercanoRecreacion', 
        'CercanoPlaza', 
        'CercanoParque',
        'keyWord_amenities', 
        'keyWord_amoblado', 
        'keyWord_avenida',
        'keyWord_banco', 
        'keyWord_buenEstado', 
        'keyWord_cochera',
        'keyWord_dependencias', 
        'keyWord_dueno', 
        'keyWord_electrogeno',
        'keyWord_enPozo', 
        'keyWord_estrenar', 
        'keyWord_extras',
        'keyWord_gimnasio', 
        'keyWord_lujoso', 
        'keyWord_luminoso',
        'keyWord_parrilla', 
        'keyWord_petfriendly', 
        'keyWord_pileta',
        'keyWord_profesional', 
        'keyWord_quincho', 
        'keyWord_seguridad',
        'keyWord_subte', 
        'keyWord_terraza', 
        'keyWord_vista'
]

# Append the dummy columns of the limited geonames
Cols += limited_geo_dummies.columns.tolist()
#Cols += full_geo_dummies.columns.tolist() # only a trial... using 100% of the locations makes no sense


#### Separación Train/Test

In [None]:
# Feature matrix and target, both cast to integers, then a 70/30 train/test split.
X = df.loc[:, Cols].astype(int)
y = df['price_usd_per_m2'].astype(int)

Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)

In [None]:
# Escalado ??
#scaler = StandardScaler()
#scaler.fit_transform(Xtrain)

## Modelo Regresión Lineal

In [None]:
# Fit an ordinary least-squares regression and predict on both splits.
model = linear_model.LinearRegression(fit_intercept=True)
model.fit(Xtrain, ytrain)
ypred_train = model.predict(Xtrain)
ypred = model.predict(Xtest)

# Test-set metrics
mae_lr = mean_absolute_error(ytest, ypred).round(2)
mse_lr = mean_squared_error(ytest, ypred).round(2)
r2_lr = r2_score(ytest, ypred).round(3)
print ('Mean Absolute Error:', mae_lr)
print ('Mean Squared Error:', mse_lr)
print ('R2 Score:', r2_lr)

## Modelo Regresión Lineal Ridge

In [None]:
# Ridge regression with the library's default regularisation strength.
lm_ridge = linear_model.Ridge()
reg_lineal_ridge = lm_ridge.fit(Xtrain, ytrain)

# Predict with the Ridge model itself; the metrics below previously reused
# `ypred` from the LinearRegression cell and so reported the wrong model.
ypred_ridge = reg_lineal_ridge.predict(Xtest)

# Test-set metrics
print ('Mean Absolute Error:', mean_absolute_error(ytest, ypred_ridge).round(2))
print ('Mean Squared Error:', mean_squared_error(ytest, ypred_ridge).round(2))
print ('R2 Score:', r2_score(ytest, ypred_ridge).round(3))
print('Score del modelo Ridge:', reg_lineal_ridge.score(Xtest, ytest))

## Modelo Regresión Lineal Lasso

In [None]:
# Lasso regression evaluated with repeated 10-fold cross-validation (3 repeats).
model = linear_model.Lasso(alpha=1.0)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Scores come back as negative MAE, one per fold.
scores = cross_val_score(
    model,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)

# Flip the sign so the values read as plain MAE, then report mean and spread.
scores = np.abs(scores)
print('Mean Absolute Error: %.3f (%.3f)' % (scores.mean(), scores.std()))

## Modelo Regresión Lineal Elastic Net

In [None]:
# Elastic Net regression with the library's default hyper-parameters.
# NOTE: the variable names say "lasso" for historical reasons; they are kept so
# that any other cell referring to them keeps working.
lm_lasso = linear_model.ElasticNet()

# Train the model
reg_lineal_lasso = lm_lasso.fit(Xtrain, ytrain)

# Predict on the test set with the Elastic Net model itself; the metrics below
# previously reused `ypred` from an earlier cell and so reported the wrong model.
ypred_enet = reg_lineal_lasso.predict(Xtest)

mse_enet = mean_squared_error(ytest, ypred_enet)
print('Mean Absolute Error:', mean_absolute_error(ytest, ypred_enet).round(2))
print('Mean Squared Error:', mse_enet.round(2))
print('Root Mean Squared Error:', np.sqrt(mse_enet).round(2))
print('R2 Score:', r2_score(ytest, ypred_enet).round(3))
print('Score del modelo ElasticNet:', reg_lineal_lasso.score(Xtest, ytest))

## Modelo Stats Models OLS

In [None]:
# OLS fitted with statsmodels on the full dataset (no train/test split here),
# so the error printed below is an in-sample figure.
Xs = df[Cols]
ys = df["price_usd_per_m2"]

# Use a dedicated name for the design matrix so the global `X` used by the
# scikit-learn cells is not overwritten, and cast once so that fitting and
# predicting use exactly the same float matrix.
X_ols = sm.add_constant(Xs).astype(float)

model = sm.OLS(ys.astype(float), X_ols).fit()
predictions = model.predict(X_ols)
print ("Mean Squared Error:", mean_squared_error(ys, predictions))
print (model.summary())

## Modelo Naive Bayes Gaussian

In [None]:
# Gaussian Naive Bayes fitted on the integer-cast target.
# NOTE(review): GaussianNB is a classifier, so each distinct integer price is
# presumably treated as its own class — confirm this is the intended experiment.
model = naive_bayes.GaussianNB()
model.fit(Xtrain, ytrain)
ypred = model.predict(Xtest)

# Test-set metrics
accuracy_nb = round(accuracy_score(ytest, ypred), 2)
print(f'Accuracy Score: {accuracy_nb}')
mae_nb = mean_absolute_error(ytest, ypred).round(2)
mse_nb = mean_squared_error(ytest, ypred).round(2)
r2_nb = r2_score(ytest, ypred).round(3)
print ('Mean Absolute Error:', mae_nb)
print ('Mean Squared Error:', mse_nb)
print ('R2 Score:', r2_nb)