In [9]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import random
from pathlib import Path
import re

In [None]:
current_file = Path.cwd()
parent_directory = current_file.parent

# Base URL without the pagination parameter
contract_type = 'venta'
investment_type = 'departamento'
region = 'metropolitana'
base_url = f"https://www.portalinmobiliario.com/{contract_type}/{investment_type}/{region}/"
start = 1  # Starting point for pagination
increment = 48  # Step size for pagination
max_pages = 500  # Set a limit to avoid infinite loops

# List to store all extracted links
all_links = []

def fetch_with_rate_limit(url, max_retries=3):
    """Fetches a URL with retry and rate limiting."""
    for _ in range(max_retries):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response
            elif response.status_code in (429, 503):  # Too Many Requests or Service Unavailable
                retry_after = response.headers.get("Retry-After")
                retry_after = int(retry_after) if retry_after and retry_after.isdigit() else random.uniform(1, 3)
                print(f"Rate limited. Retrying after {retry_after} seconds...")
                time.sleep(retry_after)
            else:
                print(f"Unexpected status: {response.status_code}. Retrying...")
                time.sleep(random.uniform(1, 5))
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            time.sleep(random.uniform(1, 3))
    return None

def scrape_page(url):
    """Scrapes a single page and returns links."""
    response = fetch_with_rate_limit(url)
    if response:
        soup = BeautifulSoup(response.content, "html.parser")
        list_items = soup.select("main > div > div:nth-of-type(3) > section > ol > li")
        links = [
            item.find("a", class_="ui-search-result__image ui-search-link")['href']
            for item in list_items if item.find("a", class_="ui-search-result__image ui-search-link")
        ]
        return links
    return []

# Main scraping loop
for _ in range(max_pages):
    current_url = f"{base_url}_Desde_{start}_OrderId_PRICE_NoIndex_True"
    links = scrape_page(current_url)

    if not links:  # Stop if no links are found
        print("No more items found. Ending scrape.")
        break

    all_links.extend(links)
    print(f"Scraped {len(links)} items from page starting at {start}.")
    start += increment
    time.sleep(random.uniform(2, 5))  # Random delay to reduce server load

cleaned_links = [
    re.search(r'(https://www\.portalinmobiliario\.com/MLC-\d+)', url).group(1)
    for url in all_links if re.search(r'(https://www\.portalinmobiliario\.com/MLC-\d+)', url)
]

# Create DataFrame and save results
df = pd.DataFrame({
    "contract_type": contract_type,
    "investment_type": investment_type,
    "region": region,
    "url": cleaned_links
})

output_file = parent_directory / "data" / "raw" / 'scraped_links_portal_inmob.csv'
df.to_csv(output_file, index=False)
print("CSV file saved")

print(f"Total Links Collected: {len(all_links)}")

Scraped 48 items from page starting at 1.
Scraped 48 items from page starting at 49.
Scraped 48 items from page starting at 97.
Scraped 48 items from page starting at 145.
Scraped 48 items from page starting at 193.
Scraped 48 items from page starting at 241.
Scraped 48 items from page starting at 289.
Scraped 48 items from page starting at 337.
Scraped 48 items from page starting at 385.
Scraped 48 items from page starting at 433.
Scraped 48 items from page starting at 481.
Scraped 48 items from page starting at 529.
Scraped 48 items from page starting at 577.
Scraped 48 items from page starting at 625.
Unexpected status: 404. Retrying...
Scraped 48 items from page starting at 673.
Scraped 48 items from page starting at 721.
Scraped 48 items from page starting at 769.
Scraped 48 items from page starting at 817.
Scraped 48 items from page starting at 865.
Scraped 48 items from page starting at 913.
Scraped 48 items from page starting at 961.
Scraped 48 items from page starting at 1009.

In [11]:
from pathlib import Path

current_file = Path.cwd()
parent_directory = current_file.parent
links_path = parent_directory / "data" / "raw" / 'scraped_links_portal_inmob.csv'
df = pd.read_csv(links_path)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import time

# Set up Selenium WebDriver
driver = webdriver.Chrome()  # Replace with the appropriate WebDriver for your browser

# Initialize an empty list to store extracted data
scraped_data = []

# Iterate over each URL in the DataFrame
for index, row in df.iterrows():
    url = row['url']
    url = r'https://www.portalinmobiliario.com/MLC-2719611378-excelente-depto-en-santiago-sur-_JM'
    print(f"Scraping: {url}")
    try:
        # Load the page using Selenium
        driver.get(url)

        # Wait for the page to fully load (adjust timeout and conditions as needed)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#header > div > div.ui-pdp-header__title-container > h1"))
        )

        # Get the rendered page source and parse it with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Extract data using the provided selectors
        title = soup.select_one("#header > div > div.ui-pdp-header__title-container > h1")
        subtitle = soup.select_one("#header > div > div.ui-pdp-header__subtitle > span")
        price = soup.select_one("#price > div > div > div > span > span > span.andes-money-amount__fraction")
        common_expenses = soup.select_one("#maintenance_fee_vis > p")
        squared_meters = soup.select_one("#highlighted_specs_res > div > div:nth-child(1) > span")
        dorms = soup.select_one("#highlighted_specs_res > div > div:nth-child(2) > span")
        bathrooms = soup.select_one("#highlighted_specs_res > div > div:nth-child(3) > span")
        location = soup.select_one("#location > div > div.ui-pdp-media.ui-vip-location__subtitle.ui-pdp-color--BLACK > div > p")

        # Extract coordinates from the map image srcset
        map_img = soup.select_one("#ui-vip-location__map > div > img")
        coordinates = None
        if map_img and 'srcset' in map_img.attrs:
            srcset = map_img['srcset']
            if "center=" in srcset:
                coordinates = srcset.split("center=")[1].split("&")[0]  # Extract lat,lng
                coordinates = coordinates.replace("%2C", ",")

        # Extract tables
        tables = soup.find_all("tbody", class_="andes-table__body")

        table_data = {}
        for i, table in enumerate(tables, start=1):
            rows = table.find_all("tr", class_="andes-table__row ui-vpp-striped-specs__row")
            table_entries = []
            for row in rows:
                header = row.find("th", class_="andes-table__header").text.strip() if row.find("th", class_="andes-table__header") else None
                value = row.find("td", class_="andes-table__column").text.strip() if row.find("td", class_="andes-table__column") else None
                if header and value:
                    table_entries.append({"property": header, "value": value})
            table_data[f"table_{i}"] = table_entries  # Store each table with a unique key

        # Extract description
        description = soup.select_one("#description > div > div > div > p")

        # Extract verified seller info
        verified_seller = soup.select_one("#header > div > div.ui-pdp-seller-validated > p > a")

        # Extract image URL
        image = soup.select_one("#gallery > div > div > span:nth-child(3) > figure > img")
        image_url = image['src'] if image else None

        # Append the extracted data to the list
        scraped_data.append({
            "url": url,
            "title": title.text.strip() if title else None,
            "subtitle": subtitle.text.strip() if subtitle else None,
            "price": int(price.text.strip().replace(".", "")) if price else None,
            "common_expenses": common_expenses.text.strip() if common_expenses else None,
            "squared_meters": squared_meters.text.strip() if squared_meters else None,
            "dorms": dorms.text.strip() if dorms else None,
            "bathrooms": bathrooms.text.strip() if bathrooms else None,
            "location": location.text.strip() if location else None,
            "coordinates": coordinates,
            "description": description.text.strip() if description else None,
            "verified_seller": verified_seller.text.strip() if verified_seller else None,
            "image_url": image_url,
            **table_data
        })

    except Exception as e:
        print(f"Error scraping {url}: {e}")

    break

# Quit the Selenium driver
driver.quit()

# Convert the scraped data to a DataFrame
scraped_df = pd.DataFrame(scraped_data)


# Save the scraped data to a CSV file
output_file = parent_directory / "data" / "raw" / 'scraped_apartments_portal_inmob.csv'
scraped_df.to_csv(output_file, index=False)
print(f"Scraping completed and data saved to {output_file}.")


Scraping: https://www.portalinmobiliario.com/MLC-2719611378-excelente-depto-en-santiago-sur-_JM
Scraping completed and data saved to scraped_apartments.csv.


In [86]:
# Set up Selenium WebDriver
url = r'https://www.portalinmobiliario.com/MLC-1532065067-departamento-nuevo-_JM'

response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

# Extract table data (reuse your existing table-scraping logic here)
sections = soup.find_all("div", class_="ui-vpp-striped-specs__table")
print(f"Found {len(sections)} sections with tables.")

all_table_data = []
for section in sections:
    category = section.find("h3", class_="ui-vpp-striped-specs__header")
    category_text = category.text.strip() if category else "Unknown Category"

    table = section.find("table", class_="andes-table")
    if not table:
        print(f"No table found in section: {category_text}")
        continue

    rows = table.find_all("tr", class_="andes-table__row ui-vpp-striped-specs__row")
    for row in rows:
        header = row.find("th", class_="andes-table__header")
        value = row.find("td", class_="andes-table__column")

        header_text = header.text.strip() if header else "N/A"
        value_text = value.text.strip() if value else "N/A"

        all_table_data.append({
            "Category": category_text,
            "Property": header_text,
            "Value": value_text
        })

df = pd.DataFrame(all_table_data)


Found 0 sections with tables.


In [None]:
# Set up Selenium WebDriver
driver = webdriver.Chrome()  # Ensure the appropriate WebDriver is installed

# Target URL
url = r'https://www.portalinmobiliario.com/MLC-1532065067'
driver.get(url)

try:
    # Define the maximum number of attempts
    max_attempts = 5
    attempts = 0
    element_found = False

    while attempts < max_attempts and not element_found:
        try:
            # Wait for the cookie disclaimer to be present
            cookie_banner = WebDriverWait(driver, 3).until(
                EC.presence_of_element_located((By.ID, "newCookieDisclaimerBanner"))
            )
            # Attempt to close the banner
            close_button = cookie_banner.find_element(By.TAG_NAME, "button")  # Update selector as needed
            close_button.click()
            print("Cookie banner dismissed.")
        except Exception as e:
            print("No cookie banner found or failed to dismiss it:", e)

        try:
            # Wait for the button to be present
            button = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "button.ui-pdp-collapsable__action"))
            )
            # Click the button to expand the content
            button.click()
            element_found = True

        except Exception as e:
            print(str(e))
            print(f"Attempt {attempts + 1}: Button not found, refreshing page...")
            attempts += 1
            driver.refresh()  # Refresh the page
            time.sleep(2)  # Wait for the page to reload

    if not element_found:
        print("Button not found after maximum attempts.")
    else:
        print("Button clicked successfully. Proceeding...")
        time.sleep(5)

    # Further processing, such as parsing the page or waiting for the next element
    soup = BeautifulSoup(driver.page_source, "html.parser")

    sections = soup.find_all("div", class_="ui-vpp-striped-specs__table")
    print(f"Found {len(sections)} sections with tables.")

    all_table_data = []
    for section in sections:
        category = section.find("h3", class_="ui-vpp-striped-specs__header")
        category_text = category.text.strip() if category else "Unknown Category"

        table = section.find("table", class_="andes-table")
        if not table:
            print(f"No table found in section: {category_text}")
            continue

        rows = table.find_all("tr", class_="andes-table__row ui-vpp-striped-specs__row")
        for row in rows:
            header = row.find("th", class_="andes-table__header")
            value = row.find("td", class_="andes-table__column")

            header_text = header.text.strip() if header else "N/A"
            value_text = value.text.strip() if value else "N/A"

            all_table_data.append({
                "Category": category_text,
                "Property": header_text,
                "Value": value_text
            })

    df = pd.DataFrame(all_table_data)

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Quit the WebDriver
    driver.quit()


Cookie banner dismissed.
Message: 
Stacktrace:
0   chromedriver                        0x000000010298fac4 cxxbridge1$str$ptr + 3651580
1   chromedriver                        0x0000000102988314 cxxbridge1$str$ptr + 3620940
2   chromedriver                        0x00000001023f04b4 cxxbridge1$string$len + 89224
3   chromedriver                        0x0000000102434898 cxxbridge1$string$len + 368748
4   chromedriver                        0x000000010246e0fc cxxbridge1$string$len + 604368
5   chromedriver                        0x00000001024290b0 cxxbridge1$string$len + 321668
6   chromedriver                        0x0000000102429d00 cxxbridge1$string$len + 324820
7   chromedriver                        0x000000010295ae08 cxxbridge1$str$ptr + 3435328
8   chromedriver                        0x000000010295e120 cxxbridge1$str$ptr + 3448408
9   chromedriver                        0x000000010294217c cxxbridge1$str$ptr + 3333812
10  chromedriver                        0x000000010295e9e0 cxxbr

In [119]:
df

Unnamed: 0,Category,Property,Value
0,Principales,Superficie total,4 m²
1,Principales,Superficie útil,60 m²
2,Principales,Superficie de terraza,0 m²
3,Principales,Ambientes,0
4,Principales,Dormitorios,2
5,Principales,Baños,2
6,Principales,Estacionamientos,0
7,Principales,Bodegas,0
8,Principales,Cantidad de pisos,20
9,Principales,Departamentos por piso,1101
