In [None]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import random
from pathlib import Path
import re

# Locate this code on disk. `__file__` is not defined when the code is run as
# a notebook cell, so fall back to the current working directory (the same
# anchor the later save cell uses) instead of failing with NameError.
try:
    current_file = Path(__file__)
except NameError:
    current_file = Path.cwd()
parent_directory = current_file.parent

# Search filters; each one becomes a path segment of the listing URL.
contract_type = 'venta'
investment_type = 'departamento'
region = 'metropolitana'
# Base URL without the pagination parameter
base_url = f"https://www.portalinmobiliario.com/{contract_type}/{investment_type}/{region}/"
start = 1  # Offset of the first item on the page currently being requested
increment = 48  # Offset step between consecutive pages
max_pages = 500  # Upper bound on pages requested, so the loop always terminates

# Accumulates the links extracted from every page
all_links = []

def fetch_with_rate_limit(url, max_retries=3, timeout=10):
    """Fetch ``url`` with GET, retrying on errors and rate limiting.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts before giving up.
        timeout: Seconds to wait for the server before an attempt is
            treated as failed. Without it a stalled connection would block
            the whole scrape indefinitely.

    Returns:
        The ``requests.Response`` of the first attempt answered with HTTP
        200, or ``None`` when every attempt failed.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.exceptions.RequestException as e:
            # Covers connection errors and timeouts; pause briefly and retry.
            print(f"Request failed: {e}")
            time.sleep(random.uniform(1, 3))
            continue

        if response.status_code == 200:
            return response

        if response.status_code in (429, 503):  # Too Many Requests or Service Unavailable
            # Honour a numeric Retry-After header; for a missing or
            # non-numeric value fall back to a short random pause.
            retry_after = response.headers.get("Retry-After")
            retry_after = int(retry_after) if retry_after and retry_after.isdigit() else random.uniform(1, 3)
            print(f"Rate limited. Retrying after {retry_after} seconds...")
            time.sleep(retry_after)
        else:
            print(f"Unexpected status: {response.status_code}. Retrying...")
            time.sleep(random.uniform(1, 5))
    return None

def scrape_page(url):
    """Scrape one results page and return the listing links found on it.

    Args:
        url: Address of the results page to download.

    Returns:
        A list with the ``href`` of the image link of each result item.
        Items without that link, or whose link has no ``href``, are skipped.
        The list is empty when the page could not be fetched.
    """
    response = fetch_with_rate_limit(url)
    if not response:
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    list_items = soup.select("main > div > div:nth-of-type(3) > section > ol > li")

    links = []
    for item in list_items:
        # Look the anchor up once per item instead of once for the filter
        # and again for the value.
        anchor = item.find("a", class_="ui-search-result__image ui-search-link")
        if anchor is None:
            continue
        # .get avoids a KeyError on an anchor that carries no href attribute.
        href = anchor.get("href")
        if href:
            links.append(href)
    return links

# Main scraping loop: request consecutive result pages until one yields no
# links or the page limit is reached.
for _ in range(max_pages):
    current_url = f"{base_url}_Desde_{start}_OrderId_PRICE_NoIndex_True"
    links = scrape_page(current_url)

    # scrape_page also returns an empty list when the download failed, so
    # this stops on a failed fetch as well as on a genuinely empty page.
    if not links:
        print("No more items found. Ending scrape.")
        break

    all_links.extend(links)
    print(f"Scraped {len(links)} items from page starting at {start}.")
    start += increment
    time.sleep(random.uniform(2, 5))  # Random delay to reduce server load

# Reduce each link to its canonical listing URL, dropping links that do not
# match. The pattern is compiled once and each link is searched only once.
listing_url_pattern = re.compile(r'(https://www\.portalinmobiliario\.com/MLC-\d+)')
cleaned_links = [
    match.group(1)
    for match in map(listing_url_pattern.search, all_links)
    if match
]

# Create DataFrame and save results
# NOTE(review): the "url" column holds the raw links; cleaned_links is
# computed above but not written out - confirm which one is intended.
df = pd.DataFrame({
    "contract_type": contract_type,
    "investment_type": investment_type,
    "region": region,
    "url": all_links
})


output_file = "scraped_links.csv"
df.to_csv(output_file, index=False)

print(f"Total Links Collected: {len(all_links)}")


Scraped 48 items from page starting at 1.
Scraped 48 items from page starting at 49.
Scraped 48 items from page starting at 97.
Scraped 48 items from page starting at 145.
Scraped 48 items from page starting at 193.
Scraped 48 items from page starting at 241.
Scraped 48 items from page starting at 289.
Scraped 48 items from page starting at 337.
Scraped 48 items from page starting at 385.
Scraped 48 items from page starting at 433.
Scraped 48 items from page starting at 481.
Scraped 48 items from page starting at 529.
Scraped 48 items from page starting at 577.
Scraped 48 items from page starting at 625.
Scraped 48 items from page starting at 673.
Scraped 48 items from page starting at 721.
Scraped 48 items from page starting at 769.
Scraped 48 items from page starting at 817.
Scraped 48 items from page starting at 865.
Scraped 48 items from page starting at 913.
Scraped 48 items from page starting at 961.
Scraped 48 items from page starting at 1009.
Scraped 48 items from page starting

In [31]:
from pathlib import Path

# Build the output path relative to the parent of the current working
# directory.
current_file = Path.cwd()
parent_directory = current_file.parent

output_file = parent_directory / "data" / "raw" / 'scraped_links_portal_inmob.csv'
# to_csv does not create missing folders, so make sure data/raw exists
# before writing; otherwise the scraped results in df cannot be saved.
output_file.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_file, index=False)