# Test Web Scraping  

This code is a draft version of the full web scraping process. The goal is to collect all available data to analyze what information can be gathered through this method and use it to define the next steps of the project.  

**v1 – Last reviewed on: 2025-03-07**  


In [2]:
# Import libraries 
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time

import os
import json

import pandas as pd  # 📌 Library to save data
from datetime import datetime  # 📌 To track execution date

In [3]:
options = Options()
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")


# Get current date for logs and data
today = datetime.today().strftime("%Y-%m-%d")
log_filename = f"log_{today}.txt"

today

'2025-03-07'

In [4]:
# 🔹 Configure WebDriver
def setup_driver():
    """Configures and returns the Selenium WebDriver."""
    print("🌐 Configuring WebDriver...")
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # Eliminar esta línea si prefieres ver el navegador
    driver = webdriver.Chrome(options=options)
    
    print("✅ WebDriver configured successfully.")
    return driver

# Load the webpage
def load_page(driver, url):
    """Loads the given URL in the browser and waits for the page to render."""
    driver.get(url)
    time.sleep(3)  # Allow some time for elements to load

# Scroll down to load all elements
def scroll_page(driver):
    """Scrolls to the bottom of the page to load dynamically loaded elements."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)
        new_height = driver.execute_script("return document.body.scrollHeight")
        
        if new_height == last_height:
            break
        last_height = new_height


### New extract data function 

In [23]:
def extract_data(driver, properties):
    """Extracts data from both highlighted and regular property sections."""
    wait = WebDriverWait(driver, 10)

    try:
        # Scroll to the bottom to load all dynamic content
        scroll_page(driver)

        # Find highlighted properties section
        highlight_section = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "property-list__highlights")))
        extract_properties_from_section(highlight_section, properties, is_highlight=True)

        # Find regular properties section
        results_section = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "property-list__results")))
        extract_properties_from_section(results_section, properties, is_highlight=False)

    except Exception as e:
        print(f"⚠ Error in extract_data(): {e}")

    return properties

def extract_properties_from_section(section, properties, is_highlight):
    """Extract properties from a given section (highlighted or regular listings)."""
    property_cards = section.find_elements(By.CLASS_NAME, "property-card__content")
    print(f"📌 Elements found in section ({'Highlights' if is_highlight else 'Results'}): {len(property_cards)}")

    for card in property_cards:
        try:
            # Extract the link
            link_element = card.find_element(By.TAG_NAME, "a")
            link = link_element.get_attribute("href") if link_element else None

            # Extract the location
            location_element = card.find_element(By.CLASS_NAME, "property-card__detail-top__left")
            location = location_element.text.strip() if location_element else None

            # Extract the price
            price_element = card.find_element(By.CLASS_NAME, "property-card__detail-price")
            price = price_element.text.strip() if price_element else None

            # Extract the property name
            name_element = card.find_element(By.CLASS_NAME, "property-card__detail-title")
            name = name_element.text.strip() if name_element else None

            # Extract property attributes (area, bedrooms, bathrooms, parking, id)
            specs = card.find_elements(By.TAG_NAME, "pt-main-specs")
            area = specs[0].get_attribute("squaremeter") if specs else None
            bedrooms = specs[0].get_attribute("bedrooms") if specs else None
            bathrooms = specs[0].get_attribute("toilets") if specs else None
            parking = specs[0].get_attribute("parking") if specs else None
            prop_id = specs[0].get_attribute("element-id").replace("specs-", "") if specs else None

            # Extract property tags (Project / Highlight)
            project_tag = False
            highlight_tag = False
            tags_list = []

            # Tags from picture section
            picture_tags = card.find_elements(By.CLASS_NAME, "property-card__photo-tags")
            for tag in picture_tags:
                if "projectTag" in tag.get_attribute("outerHTML"):
                    project_tag = True
                if "higlightTag" in tag.get_attribute("outerHTML"):
                    highlight_tag = True

            # Tags from details section (e.g., "Bajó de precio", "Renovado")
            details_tags = card.find_elements(By.CLASS_NAME, "property-card__detail-top__left")
            for tag in details_tags:
                pt_tags = tag.find_elements(By.TAG_NAME, "pt-tag")
                for pt_tag in pt_tags:
                    tag_text = pt_tag.text.strip()
                    if tag_text:
                        tags_list.append(tag_text)

            # Append the data to the properties list
            properties.append({
                "id": prop_id,
                "price": price,
                "location": location,
                "area": area,
                "bedrooms": bedrooms,
                "bathrooms": bathrooms,
                "parking": parking,
                "name": name,
                "project": project_tag,
                "highlighted": highlight_tag,
                "additional_attributes": tags_list,
                "link": link,
                "fecha": time.strftime("%Y-%m-%d")
            })

        except Exception as e:
            print(f"⚠ Error extracting data from a card: {e}")


In [None]:
# 🔹 Main function (SCRAPES DATA ONLY)
def scrape_pages(url):
    """Executes scraping on the first page and returns the data."""
    driver = setup_driver()
    properties = []  # 📌 List to store extracted data

    try:
        load_page(driver, url)
        print("\n📄 Scraping first page...")  # ✅ Registro de progreso
        extract_data(driver, properties)

    except Exception as e:
        print(f"⚠ Error in scrape_pages(): {e}")

    finally:
        driver.quit()  # ✅ Asegura que el WebDriver se cierra correctamente

    return properties  # ✅ Devuelve los datos de la primera página

In [24]:
# Run scraper
start_url = "https://www.metrocuadrado.com/apartamento-apartaestudio-casa-casalote/venta/bogota/?search=form&canary=true"
properties = scrape_pages(start_url)

🌐 Configuring WebDriver...
✅ WebDriver configured successfully.

📄 Scraping first page...
📌 Elements found in section (Highlights): 3
📌 Elements found in section (Results): 54


In [31]:
# 📂 Ruta de la carpeta donde guardar el archivo
data_folder = "data"

# Cambiar el nombre del archivo para que empiece con 'today' y luego 'properties'
file_name = f"{today}_properties.json"  # Combina 'today' con el nombre del archivo
file_path = os.path.join(data_folder, file_name)

# Guardar el archivo JSON sin convertir caracteres a Unicode
with open(file_path, "w", encoding="utf-8") as file:
    json.dump(properties, file, indent=4, ensure_ascii=False)

print(f"File saved at: {file_path}")


File saved at: data\2025-03-07_properties.json


In [32]:
# Convertir la lista de diccionarios en un DataFrame
properties_df = pd.DataFrame(properties)
properties_df.describe()

Unnamed: 0,id,price,location,area,bedrooms,bathrooms,parking,name,project,highlighted,additional_attributes,link,fecha
count,57,57,57,57,57,57,57,57,57,57,57,57,57
unique,57,49,51,50,5,5,5,51,2,2,3,57,1
top,17637-M5235292,$1.650.000.000,COLINA CAMPESTRE | Noroccidente | Bogotá D.C.,187,3,3,2,"Apartamento en Venta, COLINA CAMPESTRE, Bogotá...",False,False,[],https://www.metrocuadrado.com/inmueble/venta-a...,2025-03-07
freq,1,4,6,4,33,20,21,5,53,54,55,1,57


In [27]:
# Para cada columna booleana, ver cuántos True y False hay
bool_columns = properties_df.select_dtypes(include='bool').columns

for column in bool_columns:
    print(f"\nConteo de valores para la columna '{column}':")
    print(properties_df[column].value_counts())


Conteo de valores para la columna 'project':
project
False    53
True      4
Name: count, dtype: int64

Conteo de valores para la columna 'highlighted':
highlighted
False    54
True      3
Name: count, dtype: int64


In [28]:
# Filtrar las filas donde la columna 'tags' no esté vacía
filtered_properties_df = properties_df[properties_df['additional_attributes'].apply(lambda x: len(x) > 0)]

# Ver el resultado
filtered_properties_df


Unnamed: 0,id,price,location,area,bedrooms,bathrooms,parking,name,project,highlighted,additional_attributes,link,fecha
7,19155-M5607323,$287.000.000,GILMAR | Noroccidente | Bogotá D.C.\nRenovado\...,63,3,2,0,"Apartamento en Venta, GILMAR, Bogotá D.C.",False,False,"[Renovado, Bajó de precio]",https://www.metrocuadrado.com/inmueble/venta-a...,2025-03-07
8,19155-M5599949,$530.000.000,SAN JOSE DE BAVARIA | Noroccidente | Bogotá D....,65,2,2,1,"Apartamento en Venta, SAN JOSE DE BAVARIA, Bog...",False,False,[Bajó de precio],https://www.metrocuadrado.com/inmueble/venta-a...,2025-03-07


### Project properties

In [None]:
url_project_test = ""



### Regular projects (no new ones)

In [35]:
url_regularProperty_test = "https://www.metrocuadrado.com/inmueble/venta-apartamento-bogota-la-soledad-norte-3-habitaciones-2-banos-1-garajes/18222-M5606032?canary=true"


In [36]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configura las opciones de Chrome
chrome_options = Options()
chrome_options.add_argument("--headless")  # Para ejecutar sin abrir el navegador

# Inicia el navegador
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# URL de la página (cámbiala por la página que necesitas)
url_regularProperty_test

# Abre la página
driver.get(url_regularProperty_test)

# Espera hasta que el contenido esté cargado
wait = WebDriverWait(driver, 10)

# Función para hacer clic en un selector y extraer la información
def expand_section_and_extract_data(section_name):
    # Encuentra el botón de la sección (por ejemplo, 'Del sector', 'Exteriores', 'Zonas comunes')
    section_button = wait.until(EC.element_to_be_clickable((By.XPATH, f"//span[contains(text(), '{section_name}')]")))
    section_button.click()

    # Espera que la sección se expanda o que el botón de "Ver más" esté disponible
    try:
        # Si existe un botón "Ver más", haz clic en él
        see_more_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//a[contains(text(), 'Ver más')]")))
        see_more_button.click()

        # Espera hasta que los elementos adicionales estén visibles
        wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[contains(@class, 'Row-sc')]")))
    except Exception as e:
        print(f"No se encontró el botón 'Ver más' para {section_name}, o ya estaba cargado.")

    # Extrae los elementos dentro de la sección expandidada
    items = driver.find_elements(By.XPATH, f"//div[contains(@class, 'card-header') and contains(text(), '{section_name}')]/following-sibling::div//p")
    items_list = [item.text for item in items]
    
    return items_list

# Extraer la información de "Del sector"
del_sector_info = expand_section_and_extract_data("Del sector")
print(f"Información de 'Del sector': {del_sector_info}")

# Extraer la información de "Zonas comunes"
zonas_comunes_info = expand_section_and_extract_data("Zonas comunes")
print(f"Información de 'Zonas comunes': {zonas_comunes_info}")

# Extraer la información de "Exteriores"
exteriores_info = expand_section_and_extract_data("Exteriores")
print(f"Información de 'Exteriores': {exteriores_info}")

# Cierra el navegador
driver.quit()


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00D80B43+25139]
	(No symbol) [0x00D113F4]
	(No symbol) [0x00BF04E3]
	(No symbol) [0x00C383D7]
	(No symbol) [0x00C3872B]
	(No symbol) [0x00C81002]
	(No symbol) [0x00C5D014]
	(No symbol) [0x00C7E778]
	(No symbol) [0x00C5CDC6]
	(No symbol) [0x00C2BDE9]
	(No symbol) [0x00C2D124]
	GetHandleVerifier [0x01084373+3185251]
	GetHandleVerifier [0x010A291A+3309578]
	GetHandleVerifier [0x0109CF42+3286578]
	GetHandleVerifier [0x00E17AE0+643536]
	(No symbol) [0x00D1A20D]
	(No symbol) [0x00D170B8]
	(No symbol) [0x00D17257]
	(No symbol) [0x00D09E00]
	BaseThreadInitThunk [0x76CE5D49+25]
	RtlInitializeExceptionChain [0x77C9CDEB+107]
	RtlGetAppContainerNamedObjectPath [0x77C9CD71+561]
