In [1]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import random
from pathlib import Path
import re

In [4]:
current_file = Path.cwd()
parent_directory = current_file.parent

# Base URL without the pagination parameter
contract_type = 'venta'
investment_type = 'departamento'
region = 'metropolitana'
base_url = f"https://www.portalinmobiliario.com/{contract_type}/{investment_type}/{region}/"
start = 1  # Starting point for pagination
increment = 48  # Step size for pagination
max_pages = 500  # Set a limit to avoid infinite loops

# List to store all extracted links
all_links = []

def fetch_with_rate_limit(url, max_retries=3):
    """Fetches a URL with retry and rate limiting."""
    for _ in range(max_retries):
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response
            elif response.status_code in (429, 503):  # Too Many Requests or Service Unavailable
                retry_after = response.headers.get("Retry-After")
                retry_after = int(retry_after) if retry_after and retry_after.isdigit() else random.uniform(1, 3)
                print(f"Rate limited. Retrying after {retry_after} seconds...")
                time.sleep(retry_after)
            else:
                print(f"Unexpected status: {response.status_code}. Retrying...")
                time.sleep(random.uniform(1, 5))
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            time.sleep(random.uniform(1, 3))
    return None

def scrape_page(url):
    """Scrapes a single page and returns links."""
    response = fetch_with_rate_limit(url)
    if response:
        soup = BeautifulSoup(response.content, "html.parser")
        list_items = soup.select("main > div > div:nth-of-type(3) > section > ol > li")
        links = [
            item.find("a", class_="ui-search-result__image ui-search-link")['href']
            for item in list_items if item.find("a", class_="ui-search-result__image ui-search-link")
        ]
        return links
    return []

# Main scraping loop
for _ in range(max_pages):
    current_url = f"{base_url}_Desde_{start}_OrderId_PRICE_NoIndex_True"
    links = scrape_page(current_url)

    if not links:  # Stop if no links are found
        print("No more items found. Ending scrape.")
        break

    all_links.extend(links)
    print(f"Scraped {len(links)} items from page starting at {start}.")
    start += increment
    time.sleep(random.uniform(2, 5))  # Random delay to reduce server load

cleaned_links = [
    re.search(r'(https://www\.portalinmobiliario\.com/MLC-\d+)', url).group(1)
    for url in all_links if re.search(r'(https://www\.portalinmobiliario\.com/MLC-\d+)', url)
]

# Create DataFrame and save results
df = pd.DataFrame({
    "contract_type": contract_type,
    "investment_type": investment_type,
    "region": region,
    "url": cleaned_links
})


output_file = parent_directory / "data" / "raw" / 'scraped_links_portal_inmob.csv'
df.to_csv(output_file, index=False)

print(f"Total Links Collected: {len(all_links)}")


Scraped 48 items from page starting at 1.
Scraped 48 items from page starting at 49.
Scraped 48 items from page starting at 97.
Scraped 48 items from page starting at 145.
Scraped 48 items from page starting at 193.
Unexpected status: 404. Retrying...
Scraped 48 items from page starting at 241.
Scraped 48 items from page starting at 289.
Scraped 48 items from page starting at 337.
Scraped 48 items from page starting at 385.
Scraped 48 items from page starting at 433.
Scraped 48 items from page starting at 481.
Unexpected status: 404. Retrying...
Scraped 48 items from page starting at 529.
Scraped 48 items from page starting at 577.
Scraped 48 items from page starting at 625.
Scraped 48 items from page starting at 673.
Unexpected status: 404. Retrying...
Scraped 48 items from page starting at 721.
Scraped 48 items from page starting at 769.
Scraped 48 items from page starting at 817.
Scraped 48 items from page starting at 865.
Scraped 48 items from page starting at 913.
Scraped 48 items

In [2]:
from pathlib import Path

current_file = Path.cwd()
parent_directory = current_file.parent
links_path = parent_directory / "data" / "raw" / 'scraped_links_portal_inmob.csv'
df = pd.read_csv(links_path)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Initialize an empty list to store extracted data
scraped_data = []

# Iterate over each URL in the DataFrame
for index, row in df.iterrows():
    url = row['url']
    url = r'https://www.portalinmobiliario.com/MLC-1544145281'
    print(f"Scraping: {url}")
    try:
        # Fetch the page content
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract data using the provided selectors
        title = soup.select_one("#header > div > div.ui-pdp-header__title-container > h1")
        subtitle = soup.select_one("#header > div > div.ui-pdp-header__subtitle > span")
        price = soup.select_one("#price > div > div > div > span > span > span.andes-money-amount__fraction")
        common_expenses = soup.select_one("#maintenance_fee_vis > p")
        squared_meters = soup.select_one("#highlighted_specs_res > div > div:nth-child(1) > span")
        dorms = soup.select_one("#highlighted_specs_res > div > div:nth-child(2) > span")
        bathrooms = soup.select_one("#highlighted_specs_res > div > div:nth-child(3) > span")
        location = soup.select_one("#location > div > div.ui-pdp-media.ui-vip-location__subtitle.ui-pdp-color--BLACK > div > p")

        # Extract coordinates from the map image srcset
        map_img = soup.select_one("#ui-vip-location__map > div > img")
        coordinates = None
        if map_img and 'srcset' in map_img.attrs:
            srcset = map_img['srcset']
            if "center=" in srcset:
                coordinates = srcset.split("center=")[1].split("&")[0]  # Extract lat,lng
                coordinates = coordinates.replace("%2C", ",")

        # Extract tables
        table1 = soup.select_one("#highlighted_specs_attrs > div > div > div > div:nth-child(1) > div:nth-child(1) > table")
        table2 = soup.select_one("#highlighted_specs_attrs > div > div > div > div:nth-child(2) > div:nth-child(1) > table")
        table3 = soup.select_one("#highlighted_specs_attrs > div > div > div > div:nth-child(1) > div:nth-child(2) > table")
        table4 = soup.select_one("#highlighted_specs_attrs > div > div > div > div:nth-child(2) > div:nth-child(2) > table")
        table5 = soup.select_one("#highlighted_specs_attrs > div > div > div > div:nth-child(2) > div:nth-child(3) > table")
        table6 = soup.select_one("#highlighted_specs_attrs > div > div > div > div:nth-child(2) > div:nth-child(4) > table")

        # Extract description
        description = soup.select_one("#description > div > div > div > p")

        # Extract verified seller info
        verified_seller = soup.select_one("#header > div > div.ui-pdp-seller-validated > p > a")

        # Extract image URL
        image = soup.select_one("#gallery > div > div > span:nth-child(3) > figure > img")
        image_url = image['src'] if image else None

        # Append the extracted data to the list
        scraped_data.append({
            "url": url,
            "title": title.text.strip() if title else None,
            "subtitle": subtitle.text.strip() if subtitle else None,
            "price": int(price.text.strip().replace(".","")) if price else None,
            "common_expenses": common_expenses.text.strip() if common_expenses else None,
            "squared_meters": squared_meters.text.strip() if squared_meters else None,
            "dorms": dorms.text.strip() if dorms else None,
            "bathrooms": bathrooms.text.strip() if bathrooms else None,
            "location": location.text.strip() if location else None,
            "coordinates": coordinates,
            "table1": str(table1) if table1 else None,
            "table2": str(table2) if table2 else None,
            "table3": str(table3) if table3 else None,
            "table4": str(table4) if table4 else None,
            "table5": str(table5) if table5 else None,
            "table6": str(table6) if table6 else None,
            "description": description.text.strip() if description else None,
            "verified_seller": verified_seller.text.strip() if verified_seller else None,
            "image_url": image_url
        })

    except Exception as e:
        print(f"Error scraping {url}: {e}")
    break



# Convert the scraped data to a DataFrame
scraped_df = pd.DataFrame(scraped_data)

output_file = parent_directory / "data" / "raw" / 'scraped_apartments_portal_inmob.csv'

# Save the scraped data to a CSV
scraped_df.to_csv(output_file, index=False)
print("Scraping completed and data saved to 'scraped_apartments.csv'.")


Scraping: https://www.portalinmobiliario.com/MLC-1544145281
Scraping completed and data saved to 'scraped_apartments.csv'.
