In [10]:
import pandas as pd
import time
import random
import requests
from bs4 import BeautifulSoup
import re
import json
from tqdm import tqdm
from unidecode import unidecode 

### Funciones básicas para realizar scraping
#### Funciones para limpiar texto y moneda

In [11]:
# Function to fetch the exchange rate
def usd():
    """Return the most recent numeric observation of Banxico series SF43718 as a float.

    The value is used by callers as the multiplier that converts USD prices
    (presumably the USD/MXN rate -- confirm against Banxico's series catalogue).

    Returns:
        float: the latest observation that can be parsed as a number.

    Raises:
        requests.RequestException: on network errors, timeouts or non-2xx responses.
        ValueError: if the series contains no numeric observation.
    """
    import os

    # Prefer a token supplied through the environment so credentials do not
    # live in the notebook.
    # NOTE(review): the literal fallback is kept only for backward compatibility;
    # it is a secret committed to source and should be rotated and removed.
    token = os.environ.get(
        "BANXICO_TOKEN",
        "0e825df61e5eca2dd60340f1d39766f5cbefc052fb00f49b257095da3e004921",
    )
    banxico = "https://www.banxico.org.mx/SieAPIRest/service/v1/series/SF43718/datos/"

    # A timeout prevents the notebook from hanging forever on a stalled connection,
    # and raise_for_status surfaces HTTP errors instead of failing later on a
    # missing JSON key.
    respuesta = requests.get(banxico, params={"token": token}, timeout=30)
    respuesta.raise_for_status()
    datos = respuesta.json()["bmx"]["series"][0]["datos"]

    # Walk backwards from the newest observation and return the first one that
    # parses as a number (assumes the API may publish non-numeric placeholders
    # for days without data -- TODO confirm).
    for registro in reversed(datos):
        try:
            return float(registro["dato"])
        except (TypeError, ValueError):
            continue
    raise ValueError("Banxico series SF43718 returned no numeric observation")

In [12]:
def limpia_texto(text):
    """Normalize a free-text field for comparison.

    Drops every character that is not a word character, whitespace or a period,
    trims the ends, lowercases the result and transliterates it to ASCII
    (which removes accents). ``None`` becomes an empty string.
    """
    if text is None:
        return ""
    # Keep only word characters, whitespace and periods
    sin_simbolos = re.sub(r'[^\w\s.]', '', text)
    # Trim, lowercase, then strip accents via transliteration
    normalizado = sin_simbolos.strip().lower()
    return unidecode(normalizado)

def limpia_moneda(text):
    """Strip formatting noise from a price string.

    Removes newlines, thousands-separator commas and dollar signs, then trims
    surrounding whitespace. Any currency code or other text in the input is
    left in place.

    Args:
        text: raw price text, or ``None``.

    Returns:
        str: the cleaned text; an empty string when ``text`` is ``None``.
    """
    if text is None:
        return ""
    # Remove newlines
    cleaned_coin = re.sub(r'\n', '', text)
    # Remove commas. This must operate on the partially cleaned value, not on
    # the original text, otherwise the newline removal above is thrown away.
    cleaned_coin = re.sub(r',', '', cleaned_coin)
    # Remove the dollar sign. The "$" has to be escaped: unescaped it is the
    # end-of-string anchor and the substitution removes nothing.
    cleaned_coin = re.sub(r'\$', '', cleaned_coin)

    return cleaned_coin.strip()

In [13]:
# HTTP headers sent with the scraping requests (passed to requests.get in lamudi):
# a desktop Chrome User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
def equalize_lists(main_list, *lists):
    """Pad each list in ``lists`` with ``None`` up to the length of ``main_list``.

    The lists are modified in place; lists that are already as long as (or
    longer than) ``main_list`` are left untouched. Nothing is returned.
    """
    target = len(main_list)
    for current in lists:
        missing = target - len(current)
        # A non-positive count yields an empty padding list, i.e. a no-op
        current.extend([None] * missing)

In [14]:
def _lamudi_texto(element, tag, css_class):
    """Return the stripped text of the first `tag` with class `css_class` inside `element`, or None."""
    nodo = element.find(tag, class_=css_class)
    return nodo.text.strip() if nodo is not None else None


def _lamudi_url_whatsapp(element):
    """Return the text after 'text=' (up to the first space) in the WhatsApp button's value, or None."""
    boton = element.find("button", class_="js-whatsapp whatsapp serp-buttons")
    if boton is None or 'value' not in boton.attrs:
        return None
    valor = boton['value']
    url_part = valor.split('text=')[1] if 'text=' in valor else None
    return url_part.split(' ')[0] if url_part else None


def lamudi(estado, municipio, tipo="venta"):
    """Scrape Lamudi result pages 1-79 and return the listings as a cleaned DataFrame.

    For each page the listing cards (``div.snippet__content``) provide title,
    price, bedrooms, bathrooms and area; the first ``application/ld+json``
    script provides name, latitude, longitude and URL. Both sources are joined
    on the listing URL, prices are converted to numbers (USD prices are
    multiplied by ``usd()``), and rows without a numeric price are dropped.

    Args:
        estado: state slug inserted into the search URL (used by ``tipo="venta"``).
        municipio: municipality slug inserted into the search URL (used by ``tipo="venta"``).
        tipo: ``"venta"``, ``"renta"`` or ``"terreno"``.

    Returns:
        pandas.DataFrame: columns oferta, recamaras, bathrooms, mts, lat, lon,
        fecha_consulta, fuente, precio. If nothing was scraped, the empty merged
        frame is returned without the cleaning steps applied.

    Raises:
        ValueError: if ``tipo`` is not one of the supported values.
    """
    if tipo == "venta":
        url_basica = "https://www.lamudi.com.mx/{}/{}/for-sale/?propertyTypeGroups=casa%2Cdepartamento"
    elif tipo == "renta":
        # NOTE(review): this URL is hard-coded to Oaxaca and has no placeholders,
        # so `estado` and `municipio` are ignored for rentals.
        url_basica = "https://www.lamudi.com.mx/oaxaca/for-rent/?propertyTypeGroups=casa%2Cdepartamento&geos=1997%2C2021%2C2044%2C2063%2C2066%2C2078%2C2079%2C2082%2C2086%2C2102%2C2110%2C2140%2C2152%2C2159%2C2224%2C2287%2C2334%2C2342%2C2371%2C2381%2C2387%2C2394%2C2402%2C2404%2C2514%2C2531%2C2544%2C2549%2C2552"
    elif tipo == "terreno":
        # NOTE(review): also hard-coded to Oaxaca; `estado`/`municipio` are ignored.
        url_basica = "https://www.lamudi.com.mx/oaxaca/for-sale/?propertyTypeGroups=terreno%2Cdepartamento&geos=1997%2C2021%2C2044%2C2063%2C2066%2C2078%2C2079%2C2082%2C2086%2C2102%2C2110%2C2140%2C2152%2C2159%2C2224%2C2287%2C2334%2C2342%2C2371%2C2381%2C2387%2C2394%2C2402%2C2404%2C2514%2C2531%2C2544%2C2549%2C2552"
    else:
        raise ValueError("Selecciona un tipo de propiedad válido: venta, renta o terreno")

    paginacion = "&page="
    # Accumulate plain records and build each DataFrame once after the loop,
    # instead of re-concatenating a growing frame on every page.
    listados, complementos = [], []

    for i in tqdm(range(1, 80), desc=f"Scrapeando Lamudi en {estado}, {municipio}"):
        url = url_basica.format(estado, municipio) + paginacion + str(i)
        # The timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=30)
        # Random 1-3 second pause between page requests
        time.sleep(random.randint(1, 3))
        soup = BeautifulSoup(response.text, "html.parser")

        for element in soup.find_all("div", class_="snippet__content"):
            listados.append({
                "oferta": _lamudi_texto(element, "span", "snippet__content__title"),
                "precio": _lamudi_texto(element, "div", "snippet__content__price"),
                "recamaras": _lamudi_texto(element, "span", "property__number bedrooms"),
                "bathrooms": _lamudi_texto(element, "span", "property__number bathrooms"),
                "mts": _lamudi_texto(element, "span", "property__number area"),
                "url": _lamudi_url_whatsapp(element),
            })

        complemento = soup.find_all("script", type="application/ld+json")
        if complemento:
            data_json = json.loads(complemento[0].text)[0]['about']
            for item in data_json:
                # An item may lack the 'geo' object altogether; treat that as
                # missing coordinates rather than raising KeyError.
                geo = item.get('geo') or {}
                complementos.append({
                    "oferta": item.get('name'),
                    "lat": geo.get('latitude'),
                    "lon": geo.get('longitude'),
                    "url": item.get('url'),
                })

    # Fixing the columns guarantees the join key exists even when nothing was
    # collected, so the merge cannot fail with KeyError on an empty frame.
    all_data = pd.DataFrame(
        listados, columns=["oferta", "precio", "recamaras", "bathrooms", "mts", "url"])
    all_complemento_data = pd.DataFrame(
        complementos, columns=["oferta", "lat", "lon", "url"])
    # pandas matches null join keys with each other and repeats rows for
    # duplicated keys, so keep a single complement row per non-null URL.
    all_complemento_data = (
        all_complemento_data.dropna(subset=["url"]).drop_duplicates(subset="url"))

    final_data = pd.merge(all_data, all_complemento_data, on="url", how="left")

    if final_data.empty:
        return final_data

    # Keep the card title (oferta_x) as "oferta"; the JSON-LD name and the join key are dropped
    final_data = final_data.drop(columns=["oferta_y", "url"], errors="ignore")
    final_data = final_data.rename(columns={"oferta_x": "oferta"})
    # Query timestamp and data source
    final_data["fecha_consulta"] = pd.to_datetime("today")
    final_data["fuente"] = "lamudi"
    # Clean the title
    final_data["oferta"] = final_data["oferta"].apply(limpia_texto)

    # Split the price text into a numeric amount and an alphabetic currency code
    precio_texto = final_data["precio"].apply(limpia_moneda).str.replace("$", "", regex=False)
    monto = pd.to_numeric(
        precio_texto.str.extract(r'(\d+[\d,.]*)', expand=False), errors="coerce")
    moneda = precio_texto.str.extract(r'([A-Za-z]+)', expand=False)
    es_usd = moneda == 'USD'
    if es_usd.any():
        # Fetch the exchange rate once instead of issuing one HTTP request per USD row
        monto = monto.where(~es_usd, monto * usd())

    # Re-append "precio" as the last column, matching the original column order
    final_data = final_data.drop(columns=["precio"])
    final_data["precio"] = monto.astype(float)
    # Drop rows without a numeric price; copy so later assignments do not
    # target a view of the merged frame
    final_data = final_data[final_data["precio"].notna()].copy()
    # Keep only the first run of digits in the area field
    final_data["mts"] = pd.to_numeric(
        final_data["mts"].str.extract(r'(\d+)', expand=False), errors="coerce").astype(float)
    return final_data

  final_data['amount'] = final_data['precio'].str.extract('(\d+[\d,.]*)')


In [15]:
def limpia_datos(df):
    """Filter listings and add a price-per-square-metre column.

    Steps: reset the index; keep rows with ``precio`` > 0; drop rows whose
    ``oferta`` contains "terreno", "remodelar" or "hectareas"; drop duplicates
    on (oferta, precio, recamaras, bathrooms); compute
    ``precio_m2 = precio / mts``.

    Args:
        df: DataFrame with columns oferta, precio, recamaras, bathrooms and mts.

    Returns:
        pandas.DataFrame: a new filtered frame with the extra ``precio_m2``
        column. The input frame is not modified.
    """
    df = df.reset_index(drop=True)

    # precio > 0 already excludes NaN (the comparison is False for NaN),
    # so no separate notna() filter is needed.
    df = df[df['precio'] > 0]
    # na=False treats a missing title as "no match"; without it the mask
    # contains missing values and the ~ negation raises TypeError.
    excluidas = df['oferta'].str.contains('terreno|remodelar|hectareas', na=False)
    df = df[~excluidas]
    # Drop rows that repeat the same title, price, bedrooms and bathrooms.
    # The explicit copy makes the column assignment below target an
    # independent frame rather than a slice of the caller's data.
    df = df.drop_duplicates(subset=['oferta', 'precio', 'recamaras', 'bathrooms'], keep='first').copy()
    # Price per square metre
    df['precio_m2'] = df['precio'] / df['mts']

    return df

#### Realizar webscraping para viviendas en venta

In [16]:
# Scrape Lamudi with estado="sinaloa", municipio="mazatlan" and tipo="venta"
viviendas=lamudi("sinaloa","mazatlan","venta")

Scrapeando Lamudi en sinaloa, mazatlan: 100%|██████████| 79/79 [03:56<00:00,  2.99s/it]


In [17]:
# Work on a copy of the scraped frame, applying the filters in sequence:
#   1. drop rows whose "oferta" mentions "lote" or "terreno"
#   2. drop rows with a null "lat"
#   3. run the shared cleaning function
vivi_limpia = (
    viviendas.copy()
    .loc[lambda d: ~d["oferta"].str.contains("lote|terreno")]
    .loc[lambda d: d['lat'].notna()]
    .pipe(limpia_datos)
)
vivi_limpia

Unnamed: 0,oferta,recamaras,bathrooms,mts,lat,lon,fecha_consulta,fuente,precio,precio_m2
0,casa en venta en el cid,5,6,995.0,23.26094646,-106.4547592,2024-09-02 13:00:56.917298,lamudi,24500000.0,24623.115578
1,casa en fraccionamiento en venta en villa marina,4,3,270.0,23.29016368,-106.46693848,2024-09-02 13:00:56.917298,lamudi,7950000.0,29444.444444
2,casa en venta en balcones de loma linda,4,5,510.0,23.2028407,-106.4288318,2024-09-02 13:00:56.917298,lamudi,27980000.0,54862.745098
3,departamento en venta en el cid,3,3,163.0,23.25995117,-106.44616744,2024-09-02 13:00:56.917298,lamudi,6500000.0,39877.300613
4,casa en venta en real del valle,2,1,60.0,23.2876619,-106.3990281,2024-09-02 13:00:56.917298,lamudi,1350000.0,22500.000000
...,...,...,...,...,...,...,...,...,...,...
2136,casa en venta en hacienda los mangos,4,2,180.0,23.2604839,-106.4257927,2024-09-02 13:00:56.917298,lamudi,558598.0,3103.322222
2137,casa en venta en hacienda los mangos,5,2,180.0,23.2604839,-106.4257927,2024-09-02 13:00:56.917298,lamudi,558598.0,3103.322222
2139,casa en venta en hacienda del seminario,3,2,160.0,23.2914088,-106.4202731,2024-09-02 13:00:56.917298,lamudi,838000.0,5237.500000
2141,casa en fraccionamiento en venta en los mangos,3,1,140.0,23.2589153,-106.423656,2024-09-02 13:00:56.917298,lamudi,558598.0,3989.985714


In [18]:
# Save the cleaned listings to CSV without the row index.
# NOTE(review): the destination is an absolute, user-specific Windows path, so this
# cell will likely fail on another machine -- consider a configurable output directory.
vivi_limpia.to_csv("C:/Users/claud/Documents/GitHub/vivi_mazatlan/output/viviendas_mazatlan_lamudi.csv",index=False)