In [2]:
import pandas as pd
from numpy import random
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import numpy as np

from scripts.scrapper_gallito import flatten, scrapper, scrap_latlng, format_barrio, encode_barrio

In [None]:
base_url = 'https://www.gallito.com.uy/inmuebles/casas/venta/montevideo/ord_asc?pag={}'
all_urls= [base_url.format(i) for i in range(1, 200)]

threads = 20

urls = np.array_split(all_urls, threads)

all_r = []

with ThreadPoolExecutor(threads) as executor:
    for result in executor.map(scrapper, urls):
        all_r.append(result)

In [None]:
data = pd.concat(all_r).reset_index(drop=True)
data.head()

In [None]:
filename = f'data/ventas_mvdeo_{str(datetime.today().date())}.csv'
data.to_csv(filename, index=False)

## Recupera coordenadas

In [2]:
data = pd.read_csv('data/ventas_mvdeo_2022-08-03.csv')
data.head()

Unnamed: 0,desc,valor,metraje,url
0,3 Dormitorios en Aires Puros,$U 190.000,110.0,https://www.gallito.com.uy/excelente-garaje-2-...
1,3 Dormitorios en Prado,U$S 15.500,100.0,https://www.gallito.com.uy/gran-oportunidad-in...
2,3 Dormitorios en Villa Española,U$S 38.000,150.0,https://www.gallito.com.uy/casa-3-dormitorios-...
3,2 Dormitorios en La Teja,U$S 39.900,60.0,https://www.gallito.com.uy/economica-con-terre...
4,2 Dormitorios en Colon,U$S 45.000,50.0,https://www.gallito.com.uy/oficina-sosa-proxim...


In [4]:
threads = 15

urls = np.array_split(list(data.url.values), threads)

len(urls)

15

In [5]:
all_r = []

with ThreadPoolExecutor(threads) as executor:
    for result in executor.map(scrap_latlng, urls):
        all_r.append(result)

In [6]:
coords = flatten(all_r)

### Guarda las coordenadas

In [19]:
data = pd.read_csv('data/ventas_mvdeo_2022-08-03.csv')
data.head()

Unnamed: 0,desc,valor,metraje,url
0,3 Dormitorios en Aires Puros,$U 190.000,110.0,https://www.gallito.com.uy/excelente-garaje-2-...
1,3 Dormitorios en Prado,U$S 15.500,100.0,https://www.gallito.com.uy/gran-oportunidad-in...
2,3 Dormitorios en Villa Española,U$S 38.000,150.0,https://www.gallito.com.uy/casa-3-dormitorios-...
3,2 Dormitorios en La Teja,U$S 39.900,60.0,https://www.gallito.com.uy/economica-con-terre...
4,2 Dormitorios en Colon,U$S 45.000,50.0,https://www.gallito.com.uy/oficina-sosa-proxim...


In [20]:
r = [i=='Nan,Nan' for i in coords]
sum(r)

168

In [25]:
coords_splitted = [i.split(',') for i in coords]
data[['lat', 'lng']] = coords_splitted

In [26]:
data['lat'] = data['lat'].astype(float).round(5)
data['lng'] = data['lng'].astype(float).round(5)

In [27]:
data.head(8)

Unnamed: 0,desc,valor,metraje,url,lat,lng
0,3 Dormitorios en Aires Puros,$U 190.000,110.0,https://www.gallito.com.uy/excelente-garaje-2-...,,
1,3 Dormitorios en Prado,U$S 15.500,100.0,https://www.gallito.com.uy/gran-oportunidad-in...,-34.85913,-56.20633
2,3 Dormitorios en Villa Española,U$S 38.000,150.0,https://www.gallito.com.uy/casa-3-dormitorios-...,-34.86785,-56.14285
3,2 Dormitorios en La Teja,U$S 39.900,60.0,https://www.gallito.com.uy/economica-con-terre...,-34.8572,-56.24105
4,2 Dormitorios en Colon,U$S 45.000,50.0,https://www.gallito.com.uy/oficina-sosa-proxim...,-34.80505,-56.23007
5,4 Dormitorios en Cerrito,U$S 45.000,97.0,https://www.gallito.com.uy/oportunidad-casa-de...,-34.85944,-56.15642
6,1 Dormitorio en Villa Española,U$S 48.000,39.0,https://www.gallito.com.uy/oficina-sosa-apto-1...,-32.87555,-56.02015
7,2 Dormitorios en La Teja,U$S 48.000,50.0,https://www.gallito.com.uy/casa-a-1-cuadra-de-...,-34.86604,-56.23782


In [28]:
# limpieza
data['valor'].str.replace('U', '')

for word, rep in {"U":" ", "S":"", "$":"", ".":""}.items():
    data['valor'] = data['valor'].str.replace(word, rep, regex=False)

In [29]:
data['valor'] = data['valor'].astype(int)

In [30]:
data = data.loc[~((data.valor == 111111111) | (data.valor < 45000))]
data.head()

Unnamed: 0,desc,valor,metraje,url,lat,lng
0,3 Dormitorios en Aires Puros,190000,110.0,https://www.gallito.com.uy/excelente-garaje-2-...,,
4,2 Dormitorios en Colon,45000,50.0,https://www.gallito.com.uy/oficina-sosa-proxim...,-34.80505,-56.23007
5,4 Dormitorios en Cerrito,45000,97.0,https://www.gallito.com.uy/oportunidad-casa-de...,-34.85944,-56.15642
6,1 Dormitorio en Villa Española,48000,39.0,https://www.gallito.com.uy/oficina-sosa-apto-1...,-32.87555,-56.02015
7,2 Dormitorios en La Teja,48000,50.0,https://www.gallito.com.uy/casa-a-1-cuadra-de-...,-34.86604,-56.23782


In [31]:
data[['dormitorios', 'barrio']] = data['desc'].str.split(' en ', n = 1, expand = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [32]:
data['barrio_ine'] = data['barrio']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [34]:
data['barrio_ine'] = format_barrio(data, 'barrio')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [36]:
data['cod_barrio_ine'] = encode_barrio(data, 'barrio_ine')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
data.to_csv('data/ventas_mvdeo_2022-08-03_depurado.csv', index=False)

In [38]:
data = data.loc[(data.metraje > 10) & (data.metraje < 4000) & (~data.metraje.isna())]

data['metraje'] = data['metraje'].astype(int)

data['valor_metro'] = data['valor'] / data['metraje']

agru = data.groupby('cod_barrio_ine').median('valor_metro').reset_index()

agru['cod_barrio_ine'] = agru['cod_barrio_ine'].astype(int)

agru.to_csv('data/datos_agrupados_2022-08-03.csv', index=False)