# Load Data from S3

In [74]:
import boto3
import json
import pandas as pd
import numpy as np
import re

BASE_URL = "https://clasificados.lostiempos.com"

s3 = boto3.resource('s3')
my_bucket = s3.Bucket('s3-lostiempos-data')

In [75]:
def get_phones(s):
    return re.findall(r"([\d]{8}|[\d]{7})", s)

In [175]:
def get_price(d):
    if len(d['price']) == 0:
        price = get_by_regex(d['description'], r"(\b\d{1,3}(?:\.\d{3})*(?:,\d+)?\sB[S|s]\.)", r"B[s|S]?\.\s*\d+\.\d+")
        return price
    else:
        return d['price']

In [176]:
def get_by_regex(s, regex1, regex2=None):
    result = re.findall(regex1, s)
    if len(result) > 0:
        return result[0]

    elif regex2 is not None:
        result = re.findall(regex2, s)
        if len(result) > 0:
            return result[0]

    return ''

In [172]:
def get_m2(s):
    m2 = get_by_regex(s, r"(\b\d+(\.\d+)? m2\b)", r"(\b\d+(\.\d+)?m2\b)")
    if m2 != '':
        return m2[0]
    return m2

## Cargamos archivos JSON de S3

se carga los archivos JSON desde S3 de manera iterativa

In [180]:
df_result = None

for object_summary in my_bucket.objects.filter(Prefix="json/"):
    key_path = object_summary.key
    content_object = s3.Object('s3-lostiempos-data', key_path)
    file_content = content_object.get()['Body'].read().decode('utf-8')
    json_content = json.loads(file_content)
    print('loaded: ', key_path)

    df = pd.DataFrame(json_content)
    if df_result is None:
        df_result = df.copy()
    else:
        df_result = pd.concat([df_result, df], ignore_index=True)

print( 'tamaño del dataframe:', df_result.shape[0])

loaded:  json/los_tiempos_clasificados_2023-07-28.json
tamaño del dataframe: 67


## Limpiamos el Dataframe

* Eliminamos diplicados
* eliminamos dato innecesarios

In [193]:
df_result = df_result.drop_duplicates(subset=['path'], keep='last')

df_result['link'] = BASE_URL + df_result['path']
df_result['city'] = df_result['keys'].apply(lambda x: x.split('-')[0].strip())
df_result['category'] = df_result['keys'].apply(lambda x: x.split('-')[1].strip())
df_result['home_type'] = df_result['keys'].apply(lambda x: x.split('-')[2].strip())
df_result['phones'] = df_result['description'].apply(get_phones)
df_result['price'] = df_result.apply(get_price, axis=1)
df_result['m2'] = df_result['description'].apply(get_m2)
df_result['n_rooms'] = df_result['description'].apply(
    lambda x: get_by_regex(x, r'(\d{1} dormitorio)', r'(\d{1} cuarto)', ).split(' ')[0]
)
df_result['lote_terreno'] = df_result['description'].str.contains('TERRENO') | df_result['description'].str.contains('LOTE')

In [220]:
df_result['category'] = df_result['category'].astype('category')
df_result['home_type'] = df_result['home_type'].astype('category')
df_result['m2'] = df_result['m2'].astype('float64')
df_result['n_rooms'] = df_result['n_rooms'].astype('float64')

ValueError: could not convert string to float: ''

In [209]:
def clean_prices(p):
    price_result = 1.0
    if p == '':
        return np.nan

    if '$us' in p:
        price_result = 6.94

    price = p.replace('Bs.', '').replace('BS.', '').replace('$us', '').strip().replace('.', '')
    price = int(price)
    price_result = price * price_result
    return price_result



#### TODO: realizar una función que obtenga el precio de la descricion

In [210]:
df_alquileres = df_result[df_result['category'] == 'Alquiler']
df_venta = df_result[df_result['category'] == 'Venta']
df_anticretico = df_result[df_result['category'] == 'Anticrético']

In [217]:
df_alquileres['price'] = df_alquileres['price'].apply(clean_prices)
df_alquileres['price'] = df_alquileres['price'].astype('float64')

TypeError: argument of type 'float' is not iterable

In [164]:
for d in df_result['description']:
    print(d)

ALQUILO departamento, zona norte. Ref. 77412631.
TERRENO amurallado, 320 m2, zona fabrica Copelme, 100 $us m2. Telf. 72004478.
ALQUILER oficina, sobre Heroínas casi Antezana, ideal consultorio dental, segundo piso, 45 m2. Cel. 65711091.
VENDO Departamento América casi Libertador, 3 dormitorios, 2 baños. 90 m2 sin garaje Ref. 70744006
PROPIETARIO Alquila departamento cerca al IC Norte Hipódromo Watsap 70792429
ALQUILO habitación para estudiante, a una cuadra de U. San Simón. Cel. 76404055.
CASA en anticrético, zona Quintanilla, nueva, 3 pisos, dormitorios, roperos empotrados, baños, living, comedor, cocina, garaje, patio, jardín, urbanización Hermanos Bazoalto, $us 30.000. Ref. 64942185, 79798561.
VENDO departamento, zona Pacata de 198 m2, 4 dormitorios, 1 escritorio, sala amplia, cocina, cajonería alta, baja y garaje, precio 130.000 dólares. Ref. 60752144, whatsapp.
DEPARTAMENTOS Condominio Ubuntu. A 3 cuadras de la plaza de Cala – Cala, torre de 25 pisos de entrega inmediata. Supervis