In [46]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import random
from pathlib import Path
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options


# Paths are resolved relative to the directory the notebook kernel runs in.
# NOTE: despite its name, `current_file` holds that working *directory*;
# `parent_directory` is the folder one level above it.
current_file = Path.cwd()
parent_directory = Path.cwd().parent

In [47]:
# Search facets; joined below into the listing URL (the pagination suffix is
# appended later, when each page is requested).
transaction_type = 'venta'
property_type = 'departamento'
listing_category = 'propiedades-usadas'
region_name = 'metropolitana'
base_url = "/".join([
    "https://www.portalinmobiliario.com",
    transaction_type,
    property_type,
    listing_category,
    region_name,
    "",  # trailing empty segment keeps the final "/"
])

# Pagination controls
start = 1        # offset of the first result to request
increment = 48   # offset step between consecutive result pages
max_pages = 500  # hard cap on the number of pages requested

# Accumulates every link extracted across all pages
all_links = []

def fetch_with_rate_limit(url, max_retries=3, timeout=10):
    """Fetch ``url`` with a bounded number of attempts and back-off between them.

    Args:
        url: Address to request with ``requests.get``.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so that a
            stalled connection fails instead of blocking the scrape forever.

    Returns:
        The response object when an attempt returns HTTP 200, otherwise
        ``None`` once every attempt has been used up.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response
            elif response.status_code in (429, 503):  # Too Many Requests or Service Unavailable
                # Honour a numeric Retry-After header; otherwise wait a short random time.
                retry_after = response.headers.get("Retry-After")
                retry_after = int(retry_after) if retry_after and retry_after.isdigit() else random.uniform(1, 3)
                print(f"Rate limited. Retrying after {retry_after} seconds...")
                time.sleep(retry_after)
            else:
                # Any other status is retried as well after a random pause.
                print(f"Unexpected status: {response.status_code}. Retrying...")
                time.sleep(random.uniform(1, 5))
        except requests.exceptions.RequestException as e:
            # Timeouts raised by requests derive from RequestException and land here.
            print(f"Request failed: {e}")
            time.sleep(random.uniform(1, 3))
    return None

def scrape_page(url):
    """Scrape one search-results page and return the listing links found on it.

    Args:
        url: Address of the results page; fetched through ``fetch_with_rate_limit``.

    Returns:
        A list with the ``href`` of each result's image anchor, in page order.
        Empty when the page could not be fetched or holds no matching anchors.
    """
    response = fetch_with_rate_limit(url)
    if not response:
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    list_items = soup.select("main > div > div:nth-of-type(3) > section > ol > li")

    links = []
    for item in list_items:
        # Look the anchor up once per item instead of once to test and once to read.
        anchor = item.find("a", class_="ui-search-result__image ui-search-link")
        if anchor is None:
            continue
        # .get() avoids a KeyError (and losing the whole page) if an anchor has no href.
        href = anchor.get("href")
        if href is not None:
            links.append(href)
    return links

# Main scraping loop: request result pages at increasing offsets until a page
# yields no links or `max_pages` pages have been requested.
for _ in range(max_pages):
    current_url = f"{base_url}_Desde_{start}_OrderId_PRICE_NoIndex_True"
    links = scrape_page(current_url)

    if not links:  # Stop if no links are found
        print("No more items found. Ending scrape.")
        break

    all_links.extend(links)
    print(f"Scraped {len(links)} items from page starting at {start}.")
    start += increment
    time.sleep(random.uniform(2, 5))  # Random delay to reduce server load

# Reduce each link to its canonical ".../MLC-<digits>" form. The pattern is
# compiled once and searched once per link; links that do not match are dropped.
listing_url_pattern = re.compile(r'(https://www\.portalinmobiliario\.com/MLC-\d+)')
listing_matches = (listing_url_pattern.search(link) for link in all_links)
# dict.fromkeys removes repeated listings while keeping first-seen order, so a
# listing that shows up on more than one page is only stored once.
cleaned_links = list(dict.fromkeys(match.group(1) for match in listing_matches if match))

# Create DataFrame and save results (scalar columns are repeated for every url)
df = pd.DataFrame({
    "transaction_type": transaction_type,
    "property_type": property_type,
    "listing_category": listing_category,
    "region_name": region_name,
    "url": cleaned_links
})

output_file = parent_directory / "data" / "raw" / 'scraped_links_portal_inmob.csv'
# Make sure the target folder exists so the scrape is not lost at the last step.
output_file.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_file, index=False)
print("CSV file saved")

print(f"Total Links Collected: {len(all_links)}")

Scraped 48 items from page starting at 1.
Scraped 48 items from page starting at 49.
Scraped 48 items from page starting at 97.
Scraped 48 items from page starting at 145.
Scraped 48 items from page starting at 193.
Scraped 48 items from page starting at 241.
Unexpected status: 404. Retrying...
Scraped 48 items from page starting at 289.
Scraped 48 items from page starting at 337.
Scraped 48 items from page starting at 385.
Scraped 48 items from page starting at 433.
Scraped 48 items from page starting at 481.
Scraped 48 items from page starting at 529.
Scraped 48 items from page starting at 577.
Scraped 48 items from page starting at 625.
Scraped 48 items from page starting at 673.
Scraped 48 items from page starting at 721.
Scraped 48 items from page starting at 769.
Scraped 48 items from page starting at 817.
Scraped 48 items from page starting at 865.
Unexpected status: 404. Retrying...
Scraped 48 items from page starting at 913.
Scraped 48 items from page starting at 961.
Scraped 

In [None]:
# Load the collected listing links and drop the ones that a previous run has
# already scraped, so the detail scrape can be resumed.
current_file = Path.cwd()
parent_directory = current_file.parent
links_path = parent_directory / "data" / "raw" / 'scraped_links_portal_inmob.csv'
df = pd.read_csv(links_path)

# NOTE: `links_path` is rebound to the apartments CSV here, exactly as this cell
# has always done.
links_path = parent_directory / "data" / "raw" / 'scraped_apartments_portal_inmob.csv'
try:
    scraped_df = pd.read_csv(links_path)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only the "nothing scraped yet" cases are tolerated; any other failure
    # (e.g. an unreadable file) is raised instead of being reported as a missing CSV.
    print("no scraped_df csv")
else:
    df = df[~df['url'].isin(scraped_df['url'])]

In [64]:
options = Options()
options.add_argument("--headless")

# Initialize an empty list to store extracted data
scraped_data = []


def _text_or_none(node):
    """Return the stripped text of a parsed element, or None when it was not found."""
    return node.text.strip() if node is not None else None


def _extract_coordinates(soup):
    """Return the "lat,lng" string taken from the map image srcset, or None."""
    map_img = soup.select_one("#ui-vip-location__map > div > img")
    if map_img is None or 'srcset' not in map_img.attrs:
        return None
    srcset = map_img['srcset']
    if "center=" not in srcset:
        return None
    # The value sits between "center=" and the next "&"; "%2C" is an encoded comma.
    return srcset.split("center=")[1].split("&")[0].replace("%2C", ",")


def _extract_spec_tables(soup):
    """Flatten every striped specs table row into a {header: value} dict."""
    specs = {}
    for table in soup.find_all("tbody", class_="andes-table__body"):
        for spec_row in table.find_all("tr", class_="andes-table__row ui-vpp-striped-specs__row"):
            header = _text_or_none(spec_row.find("th", class_="andes-table__header"))
            value = _text_or_none(spec_row.find("td", class_="andes-table__column"))
            if header and value:
                specs[header] = value
    return specs


def _parse_listing(soup, url):
    """Build the record for one listing page from its parsed HTML."""
    # Extract data using the provided selectors
    title = soup.select_one("#header > div > div.ui-pdp-header__title-container > h1")
    subtitle = soup.select_one("#header > div > div.ui-pdp-header__subtitle > span")
    price = soup.select_one("#price > div > div > div > span > span > span.andes-money-amount__fraction")
    currency = soup.select_one("#price > div > div > div > span > span > span.andes-money-amount__currency-symbol")
    common_expenses = soup.select_one("#maintenance_fee_vis > p")
    squared_meters = soup.select_one("#highlighted_specs_res > div > div:nth-child(1) > span")
    dorms = soup.select_one("#highlighted_specs_res > div > div:nth-child(2) > span")
    bathrooms = soup.select_one("#highlighted_specs_res > div > div:nth-child(3) > span")
    location = soup.select_one("#location > div > div.ui-pdp-media.ui-vip-location__subtitle.ui-pdp-color--BLACK > div > p")
    description = soup.select_one("#description > div > div > div > p")
    verified_seller = soup.select_one("#header > div > div.ui-pdp-seller-validated > p > a")
    image = soup.select_one("#gallery > div > div > span:nth-child(3) > figure > img")

    data_entry = {
        "url": url,
        "title": _text_or_none(title),
        "subtitle": _text_or_none(subtitle),
        # "." is removed before converting the price text to an integer
        "price": int(price.text.strip().replace(".", "")) if price is not None else None,
        "currency": _text_or_none(currency),
        "common_expenses": _text_or_none(common_expenses),
        "squared_meters": _text_or_none(squared_meters),
        "dorms": _text_or_none(dorms),
        "bathrooms": _text_or_none(bathrooms),
        "location": _text_or_none(location),
        "coordinates": _extract_coordinates(soup),
        "description": _text_or_none(description),
        "verified_seller": _text_or_none(verified_seller),
        # .get() avoids a KeyError when the <img> has no src attribute
        "image_url": image.get('src') if image is not None else None,
    }
    # Table rows become extra columns, keyed by their header text
    data_entry.update(_extract_spec_tables(soup))
    return data_entry


# Iterate over each URL in the DataFrame
for url in df['url']:
    print(f"Scraping: {url}")
    driver = None
    try:
        # Load the page using Selenium to bypass anti-bot measures.
        # A fresh browser is still started for every URL, as before.
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        # Must refresh to bypass anti-bot measures
        driver.refresh()
        WebDriverWait(driver, 3).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "ui-vpp-striped-specs__table"))
        )
        # Get the rendered page source and parse it with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")
        scraped_data.append(_parse_listing(soup, url))

    except Exception as e:
        print(f"Error scraping {url}: {e}")

    finally:
        # Quit the browser started for this URL. Previously only the last one was
        # quit after the loop, leaving the others running, and that trailing quit
        # raised NameError when there were no URLs to process.
        if driver is not None:
            driver.quit()

Scraping: https://www.portalinmobiliario.com/MLC-2719611378
Scraping: https://www.portalinmobiliario.com/MLC-1532065067
Scraping: https://www.portalinmobiliario.com/MLC-2720332534
Scraping: https://www.portalinmobiliario.com/MLC-1551782849
Scraping: https://www.portalinmobiliario.com/MLC-2773813104
Scraping: https://www.portalinmobiliario.com/MLC-2764840298
Scraping: https://www.portalinmobiliario.com/MLC-2781898154
Scraping: https://www.portalinmobiliario.com/MLC-2760740166
Scraping: https://www.portalinmobiliario.com/MLC-2788388754
Scraping: https://www.portalinmobiliario.com/MLC-2792728652
Scraping: https://www.portalinmobiliario.com/MLC-1535010059
Scraping: https://www.portalinmobiliario.com/MLC-1554644913
Scraping: https://www.portalinmobiliario.com/MLC-2720383180
Scraping: https://www.portalinmobiliario.com/MLC-2732138746
Scraping: https://www.portalinmobiliario.com/MLC-2761937086
Scraping: https://www.portalinmobiliario.com/MLC-1554908281
Scraping: https://www.portalinmobiliario

In [65]:
# Merge the rows scraped in this session with any previously loaded results
# and persist them.
new_data_df = pd.DataFrame(scraped_data)
try:
    # `scraped_df` only exists if an earlier cell managed to load a previous scrape;
    # when it does, append the new rows and keep the first row seen for each url.
    scraped_df = pd.concat([scraped_df, new_data_df], ignore_index=True).drop_duplicates(subset="url")
except NameError:
    # If `scraped_df` doesn't exist, the new rows are the whole result
    scraped_df = new_data_df

# Save the scraped data to a CSV file
output_file = parent_directory / "data" / "raw" / 'scraped_apartments_portal_inmob.csv'
if scraped_df.empty:
    # Writing a frame with no rows (and possibly no columns) would leave a file
    # that pd.read_csv cannot parse on the next run, so nothing is written.
    print("No scraped data to save.")
else:
    # Create the target folder if needed so the results are not lost at save time.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    scraped_df.to_csv(output_file, index=False)
    print(f"Scraping completed and data saved to {output_file}.")

Scraping completed and data saved to /Users/felipemediavillalevinson/Documents/Hawspred/data/raw/scraped_apartments_portal_inmob.csv.


In [None]:
# options = Options()
# options.add_argument("--start-maximized")
# driver = webdriver.Chrome(options=options)
# driver.quit()