In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random

# Accumulator for every listing URL harvested across all pages
all_links = list()

# Search-results address; the pagination suffix is appended to it per request
base_url = "https://www.portalinmobiliario.com/venta/departamento/metropolitana/"

# Paging controls
max_pages = 20  # hard cap on how many result pages are requested
increment = 48  # amount the offset advances from one page to the next
start = 1  # offset used for the first page

def fetch_with_rate_limit(url, max_retries=5, timeout=10):
    """Fetch ``url`` with GET, pausing and retrying when an attempt fails.

    Parameters
    ----------
    url : str
        Address to request.
    max_retries : int, optional
        Maximum number of attempts before giving up (default 5).
    timeout : float, optional
        Seconds to wait for the server on each attempt (default 10). Without
        a timeout a stalled connection can block the call indefinitely.

    Returns
    -------
    requests.Response
        The first response whose status code is 200.

    Raises
    ------
    Exception
        With the message "Max retries exceeded." when no attempt returned
        200. If the last failure was a network error it is chained as the
        cause.
    """
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Connection resets, DNS failures and timeouts are retried like
            # any other failed attempt instead of aborting immediately.
            last_error = exc
            print(f"Request failed: {exc}. Retrying...")
            delay = random.uniform(1, 5)
        else:
            last_error = None
            if response.status_code == 200:
                return response
            if response.status_code in (429, 503):  # Too Many Requests or Service Unavailable
                # Retry-After is either a number of seconds or an HTTP-date.
                # Only the numeric form is honoured; a missing header, a date
                # or garbage falls back to a short random pause rather than
                # crashing in int().
                try:
                    retry_after = max(0, int(response.headers.get("Retry-After")))
                except (TypeError, ValueError):
                    retry_after = round(random.uniform(1, 3), 1)
                print(f"Rate limited. Retrying after {retry_after} seconds...")
                delay = retry_after
            else:
                print(f"Unexpected status: {response.status_code}. Retrying...")
                delay = random.uniform(1, 5)  # Random delay to reduce server load

        # No point in waiting after the final attempt: we are about to give up.
        if attempt < max_retries:
            time.sleep(delay)

    raise Exception("Max retries exceeded.") from last_error

# Walk the paginated search results, collecting the link of every listing.
for _ in range(max_pages):
    # Construct the URL for the current page
    current_url = f"{base_url}_Desde_{start}_OrderId_PRICE_NoIndex_True"

    # fetch_with_rate_limit only ever returns a 200 response; every failure
    # surfaces as an exception. Catch it here so the links gathered so far
    # are still reported below instead of the cell aborting mid-run.
    try:
        response = fetch_with_rate_limit(current_url)
    except Exception as exc:
        print(f"Failed to fetch page at: {current_url}")
        print(exc)
        break

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all <li> elements in the specified section
    list_items = soup.select("main > div > div:nth-of-type(3) > section > ol > li")

    # A page with no matching items means there is nothing more to collect
    # with this selector, so stop instead of requesting further pages.
    if not list_items:
        print(f"No items found on page starting at {start}. Stopping.")
        break

    # Extract links from each list item
    for item in list_items:
        anchor = item.find("a", class_="ui-search-result__image ui-search-link")
        if anchor is not None and anchor.has_attr("href"):
            all_links.append(anchor["href"])

    print(f"Scraped {len(list_items)} items from page starting at {start}.")

    # Increment for the next page
    start += increment
    time.sleep(random.uniform(2, 5))  # Random delay to reduce server load

# Print all collected links
print(f"Total Links Collected: {len(all_links)}")
for link in all_links:
    print(link)


Scraped 48 items from page starting at 1.
Scraped 48 items from page starting at 49.
Scraped 48 items from page starting at 97.
Scraped 48 items from page starting at 145.
Scraped 48 items from page starting at 193.
Scraped 48 items from page starting at 241.
Scraped 48 items from page starting at 289.
Scraped 48 items from page starting at 337.
Scraped 48 items from page starting at 385.
Scraped 48 items from page starting at 433.
Total Links Collected: 480
https://www.portalinmobiliario.com/MLC-2719611378-excelente-depto-en-santiago-sur-_JM#position=1&search_layout=grid&type=item&tracking_id=6b45bc96-ed14-4b7b-8e2e-84d48ec46631
https://www.portalinmobiliario.com/MLC-1532065067-departamento-nuevo-_JM#position=2&search_layout=grid&type=item&tracking_id=6b45bc96-ed14-4b7b-8e2e-84d48ec46631
https://www.portalinmobiliario.com/MLC-2720332534-vendo-promesa-de-compraventa-depto-mariposa-recoleta-_JM#position=3&search_layout=grid&type=item&tracking_id=6b45bc96-ed14-4b7b-8e2e-84d48ec46631
http

In [20]:
# Display the HTTP status code of whatever `response` object is currently in the kernel.
# NOTE(review): this relies on `response` left over from an earlier cell — confirm it survives Restart & Run All.
response.status_code

200