# Load Data from S3

In [74]:
import boto3
import json
import pandas as pd
import numpy as np
import re

# Site root; each ad's relative `path` is appended to it to build a full link.
BASE_URL = "https://clasificados.lostiempos.com"
# Name of the S3 bucket the JSON files are read from (kept in one place so
# it is not repeated as a bare literal further down the notebook).
BUCKET_NAME = 's3-lostiempos-data'

s3 = boto3.resource('s3')
my_bucket = s3.Bucket(BUCKET_NAME)

In [75]:
def get_phones(s):
    """Extract every standalone 7- or 8-digit number from a text.

    Parameters
    ----------
    s : str
        Free text to scan (e.g. an ad description). Any non-string value
        (None, NaN, ...) is treated as "no text".

    Returns
    -------
    list of str
        The matched digit strings in order of appearance; empty if none.
    """
    if not isinstance(s, str):
        return []
    # The lookarounds require the run to be exactly 7 or 8 digits long, so a
    # longer number is not silently truncated into a bogus phone number.
    return re.findall(r"(?<!\d)\d{7,8}(?!\d)", s)

In [138]:
# Patterns tried in order against the description when the ad has no price.
_PRICE_PATTERNS = (
    # Amount followed by the currency: "1.500 Bs.", "1500 Bs.", "12,5 Bs."
    re.compile(r"\b(?:\d{1,3}(?:\.\d{3})+|\d+)(?:,\d+)?\sB[Ss]\."),
    # Currency followed by the amount: "Bs. 1.500", "Bs. 2.500.000", "Bs. 800"
    re.compile(r"B[Ss]?\.\s*\d+(?:\.\d+)*(?:,\d+)?"),
)


def _is_missing(value):
    """Return True for None, NaN, or an empty sized value such as ''."""
    if value is None:
        return True
    if isinstance(value, float):
        return value != value  # NaN is the only float not equal to itself
    try:
        return len(value) == 0
    except TypeError:
        return False


def get_price(d):
    """Return the ad's price, falling back to a price found in its description.

    Parameters
    ----------
    d : mapping
        A record indexable by 'price' and 'description' (a dict or a
        DataFrame row both work).

    Returns
    -------
    The value of d['price'] unchanged when it is present and non-empty.
    Otherwise the first text in d['description'] that looks like an amount
    in bolivianos ("<amount> Bs." is searched first, then "Bs. <amount>"),
    or '' when nothing matches or the description is not a string.
    """
    if not _is_missing(d['price']):
        return d['price']

    description = d['description']
    if not isinstance(description, str):
        return ''

    for pattern in _PRICE_PATTERNS:
        match = pattern.search(description)
        if match:
            return match.group(0)
    return ''

In [159]:
def get_m2(s):
    """Extract the first surface mention such as "320 m2" from a text.

    The form with a space before the unit ("90 m2") is searched first across
    the whole text; only if it is absent is the compact form ("90m2") tried.
    The number may use '.' or ',' as separator ("615.79 m2", "615,79 m2")
    and the unit is matched case-insensitively ("300 M2").

    Parameters
    ----------
    s : str
        Free text to scan. Any non-string value is treated as "no text".

    Returns
    -------
    str
        The matched text exactly as written in `s`, or '' if none is found.
    """
    if not isinstance(s, str):
        return ''

    # Accepting ',' keeps "615,79 m2" whole instead of truncating it to "79 m2".
    number = r"\b\d+(?:[.,]\d+)*"
    for separator in (" ", ""):
        match = re.search(number + separator + r"m2\b", s, flags=re.IGNORECASE)
        if match:
            return match.group(0)
    return ''

## Cargamos archivos JSON de S3

Se cargan los archivos JSON desde S3 de manera iterativa.

In [146]:
# Build one DataFrame per S3 object and concatenate them once at the end;
# concatenating inside the loop would re-copy all accumulated rows each time.
frames = []

for object_summary in my_bucket.objects.filter(Prefix="json/"):
    key_path = object_summary.key
    if key_path.endswith('/'):
        # A key ending in '/' is a folder-style placeholder, not a JSON file
        # (presumably empty; parsing it would fail).
        continue
    content_object = s3.Object(my_bucket.name, key_path)
    file_content = content_object.get()['Body'].read().decode('utf-8')
    json_content = json.loads(file_content)
    print('loaded: ', key_path)

    frames.append(pd.DataFrame(json_content))

# Fail with a clear message instead of an AttributeError on None further down.
if not frames:
    raise RuntimeError("No JSON objects found under 's3://%s/json/'" % my_bucket.name)

df_result = pd.concat(frames, ignore_index=True)

print( 'tamaño del dataframe:', df_result.shape[0])

loaded:  json/los_tiempos_clasificados_2023-07-28.json
tamaño del dataframe: 67


## Limpiamos el Dataframe

* Eliminamos duplicados
* Eliminamos datos innecesarios

In [160]:
def _key_field(keys, position):
    """Return the stripped `position`-th '-'-separated field of `keys`.

    Returns '' when `keys` is not a string or has too few fields, so a single
    malformed row cannot abort the whole cleaning cell.
    """
    if not isinstance(keys, str):
        return ''
    fields = keys.split('-')
    return fields[position].strip() if position < len(fields) else ''


# Keep only the last occurrence of each ad path.
df_result = df_result.drop_duplicates(subset=['path'], keep='last')

df_result['link'] = BASE_URL + df_result['path']
# `keys` is split on '-' into city, category and home type, in that order.
df_result['city'] = df_result['keys'].apply(_key_field, position=0)
df_result['category'] = df_result['keys'].apply(_key_field, position=1)
df_result['home_type'] = df_result['keys'].apply(_key_field, position=2)
df_result['phones'] = df_result['description'].apply(get_phones)
# get_price reads both the 'price' and 'description' columns, so it is applied row-wise.
df_result['price'] = df_result.apply(get_price, axis=1)
df_result['m2'] = df_result['description'].apply(get_m2)

[]
[('320 m2', '')]
[('45 m2', '')]
[('90 m2', '')]
[]
[]
[]
[('198 m2', '')]
[]
[]
[]
[]
[]
[]
[]
[('300 m2', '')]
[]
[]
[]
[]
[]
[]
[]
[('310 m2', '')]
[]
[]
[]
[]
[('134 m2', '')]
[]
[]
[]
[('130 m2', '')]
[('134 m2', '')]
[]
[]
[]
[]
[]
[]
[]
[('79 m2', '')]
[]
[]
[]
[]
[]
[]
[('300 m2', '')]
[]
[]
[]
[]
[]
[('300 m2', '')]
[]
[]
[('615.79 m2', '.79')]
[]
[('400 m2', '')]
[]
[]
[]
[]
[]
[]
[]


In [161]:
# Summarize the columns, non-null counts and dtypes of the cleaned frame.
df_result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67 entries, 0 to 66
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        67 non-null     object
 1   date         67 non-null     object
 2   description  67 non-null     object
 3   price        67 non-null     object
 4   keys         67 non-null     object
 5   path         67 non-null     object
 6   html         67 non-null     object
 7   link         67 non-null     object
 8   city         67 non-null     object
 9   category     67 non-null     object
 10  home_type    67 non-null     object
 11  phones       67 non-null     object
 12  m2           67 non-null     object
dtypes: object(13)
memory usage: 7.3+ KB


#### TODO: realizar una función que obtenga el precio de la descripción

In [162]:
# Display the `m2` column filled in by get_m2 ('' where no surface was found).
df_result['m2']

0           
1     320 m2
2      45 m2
3      90 m2
4           
       ...  
62          
63          
64          
65          
66          
Name: m2, Length: 67, dtype: object

In [164]:
# Print every description in full, one per line, to inspect the raw ad text.
# NOTE(review): the printed text includes contact phone numbers; consider
# clearing this output before sharing the notebook.
for d in df_result['description']:
    print(d)

ALQUILO departamento, zona norte. Ref. 77412631.
TERRENO amurallado, 320 m2, zona fabrica Copelme, 100 $us m2. Telf. 72004478.
ALQUILER oficina, sobre Heroínas casi Antezana, ideal consultorio dental, segundo piso, 45 m2. Cel. 65711091.
VENDO Departamento América casi Libertador, 3 dormitorios, 2 baños. 90 m2 sin garaje Ref. 70744006
PROPIETARIO Alquila departamento cerca al IC Norte Hipódromo Watsap 70792429
ALQUILO habitación para estudiante, a una cuadra de U. San Simón. Cel. 76404055.
CASA en anticrético, zona Quintanilla, nueva, 3 pisos, dormitorios, roperos empotrados, baños, living, comedor, cocina, garaje, patio, jardín, urbanización Hermanos Bazoalto, $us 30.000. Ref. 64942185, 79798561.
VENDO departamento, zona Pacata de 198 m2, 4 dormitorios, 1 escritorio, sala amplia, cocina, cajonería alta, baja y garaje, precio 130.000 dólares. Ref. 60752144, whatsapp.
DEPARTAMENTOS Condominio Ubuntu. A 3 cuadras de la plaza de Cala – Cala, torre de 25 pisos de entrega inmediata. Supervis