In [5]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time, random, requests, json, re

In [6]:
# Shared HTTP session; fetch() below sends every request through it
s = requests.Session()

# Browser-like default headers attached to the session
s.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)
s.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
s.headers["Accept-Language"] = "en-US,en;q=0.9"

# Function to fetch a URL with retries
def fetch(url, max_tries=10):
    """GET `url` through the shared session `s`, retrying with exponential backoff.

    Parameters
    ----------
    url : str
        Address to request.
    max_tries : int, default 10
        Maximum number of attempts before giving up.

    Returns
    -------
    requests.Response
        The first response whose status code is 200 or 304. A randomized
        polite pause is taken before returning it.

    Raises
    ------
    requests.HTTPError
        Via `raise_for_status()` for an error status other than 429/503.
    requests.ConnectionError, requests.Timeout
        If the final attempt still fails at the network level.
    RuntimeError
        If no attempt produced a 200/304 response.
    """
    delay = 3.0
    for attempt in range(max_tries):
        is_last_attempt = attempt == max_tries - 1
        try:
            r = s.get(url, timeout=45)
        except (requests.ConnectionError, requests.Timeout):
            # Network hiccups are retried like a 429/503; on the last attempt
            # the original exception propagates so callers see the real cause.
            if is_last_attempt:
                raise
        else:
            if r.status_code in (200, 304):
                # polite delay between successful fetches
                time.sleep(delay + random.random() * 2)
                return r
            if r.status_code not in (429, 503):
                # Raises for 4xx/5xx; any other unexpected status falls
                # through and is retried after the backoff pause below.
                r.raise_for_status()

        # Back off before the next attempt; sleeping after the final attempt
        # would only postpone the error below.
        if not is_last_attempt:
            time.sleep(delay)
            delay *= 2
    raise RuntimeError(f"Failed after {max_tries} tries: {url}")

# Function to extract string from a BeautifulSoup object
def get_text(object, tag, attributes, text=None):
    """Return the stripped `.string` of the first matching descendant, or None.

    Parameters
    ----------
    object
        Node to search in; anything exposing a BeautifulSoup-style `find()`.
        May be None, in which case None is returned.
    tag : str
        Tag name to look for.
    attributes : dict
        Attribute filters, forwarded to `find()` as `attrs`.
    text : str or compiled pattern, optional
        Filter on the tag's string content.

    Returns
    -------
    str or None
        The stripped string, or None if any step of the lookup fails
        (no node to search, no match, or nothing to strip).
    """
    try:
        # `string=` replaces the deprecated `text=` keyword of find(), which
        # emitted a warning on every call.
        return object.find(tag, attrs=attributes, string=text).string.strip()
    except Exception:
        # Best-effort extraction: a missing field is reported as None.
        return None

# Function for error handling
def maybe(function):
    """Call `function()` and return its result, or None if it raises.

    Only `Exception` subclasses are suppressed, so KeyboardInterrupt and
    SystemExit still propagate and a long scraping run can be interrupted.
    """
    try:
        return function()
    except Exception:
        return None

In [7]:
base_url = "https://www.lamudi.com.ph"

# Condos for sale in Makati (Metro Manila), foreclosures excluded, newest first,
# with a minimum price of 1 peso and a minimum floor area of 1 sqm.
# Built from base_url so the host is defined in one place only.
list_url = base_url + "/buy/metro-manila/makati/condo/?foreclosures=excluded&min-price=1&minArea=1&sorting=newest"

# Inclusive range of result pages to scrape
start_page = 1
end_page = 10

# Lamudi only allows navigation for up to 50 pages (with 30 listings per page) for a total of 3000 listings.
# NOTE(review): 50 pages x 30 listings is 1,500, not 3,000 - confirm which figure is right.
# Scraping methodology: Scrape 50 pages per city

In [8]:
# Scrape every listing on pages start_page..end_page (inclusive), collecting
# each record in `results` and writing it as one JSON line to the output file.

results = []      # parsed records, kept in memory for the DataFrame
batch = []        # JSON lines waiting to be written
batch_size = 50   # number of lines written to disk at a time

# Opening in 'w' mode creates or clears the output file, so no separate
# truncation step is needed.
with open('lamudi-listings.txt', 'w', encoding="utf-8") as f:
    try:
        for page in range(start_page, end_page + 1):
            # The first page has no "&page=" parameter
            list_response = fetch(list_url + (("&page=" + str(page)) if page > 1 else ""))
            list_soup = BeautifulSoup(list_response.content, "html.parser")
            listings = list_soup.find_all('div', class_="snippet__content")

            for listing in listings:

                # A snippet without a link has no detail page to fetch
                link = listing.find('a', href=True)
                if link is None:
                    continue

                response = fetch(base_url + link['href'])
                soup = BeautifulSoup(response.content, "html.parser")
                # May be None; get_text() and maybe() then yield None for every field
                details = soup.find('div', class_="adform__detail")

                # Extract essential details from the listing
                title = get_text(details, 'div', {"class": "main-title"})
                project_unit = get_text(details, 'a', {"id": "project-unit__title", "class": "link"})
                project_name = get_text(details, 'span', {"class": "place-features__values", "data-test": "project-name-value"})
                price = get_text(details, 'div', {"class": "prices-and-fees__price"})
                location = get_text(details, 'div', {"class": "view-map__text"})
                bedrooms = get_text(details, 'div', {"class": "details-item-value", "data-test": "bedrooms-value"})
                bathrooms = get_text(details, 'div', {"class": "details-item-value", "data-test": "full-bathrooms-value"})
                area = get_text(details, 'div', {"class": "details-item-value", "data-test": "area-value"})
                floor_area = get_text(details, 'span', {"class": "place-features__values", "data-test": "floor-area-value"})
                floor = get_text(details, 'span', {"class": "place-features__values", "data-test": "floor-value"})
                condition = get_text(details, 'div', {"class": "facilities__item"}, text=re.compile(r'.*?furnished'))
                property_type = get_text(details, 'span', {"class": "place-features__values", "data-test": "property-type-value"})
                offer_type = get_text(details, 'span', {"class": "place-features__values", "data-test": "operation-type-value"})
                construction_year = get_text(details, 'span', {"class": "place-features__values", "data-test": "construction-year-value"})
                parking_spaces = get_text(details, 'span', {"class": "place-features__values", "data-test": "parking-spaces-value"})
                ownership_type = get_text(details, 'span', {"class": "place-features__values", "data-test": "ownership-value"})
                publish_date = get_text(details, 'div', {"class": "date"})
                project_link = maybe(lambda: details.find('a', {"class": "detail-page-project__link"}, href=True)['href'])

                # Do not include listings with missing essential details
                # if price == None or floor_area == None or location == None or project_name == None:
                #    continue

                record = {
                    'title': title,
                    'project_unit' : project_unit,
                    'project_name': project_name,
                    'area' : area,
                    'floor_area': floor_area,
                    'price': price,
                    'location': location,
                    # 'full_address': project_name + ", " + location + ", " + "Metro Manila, Philippines",
                    'bedrooms': bedrooms,
                    'bathrooms': bathrooms,
                    'floor': floor,
                    'condition': condition,
                    'property_type': property_type,
                    'offer_type': offer_type,
                    'construction_year': construction_year,
                    'parking_spaces': parking_spaces,
                    'ownership_type': ownership_type,
                    'project_link': project_link,
                    # The date element reads "<date> - Published by <name>"; split it into two fields
                    'publish_date': maybe(lambda: publish_date.split(" - Published by ")[0]),
                    'publish_by':  maybe(lambda: publish_date.split(" - Published by ")[1])
                }

                # Skip records with all None values
                if all(value is None for value in record.values()):
                    continue

                # Add to results for DataFrame
                results.append(record)

                # Add the record as a JSON line to the batch for file writing
                batch.append(json.dumps(record, ensure_ascii=False))

                # Save to file every batch_size items
                if len(batch) >= batch_size:
                    f.write('\n'.join(batch) + '\n')
                    f.flush()
                    batch = []
    finally:
        # Write whatever is still pending, both at the normal end of the run
        # and when a fetch fails or the run is interrupted, so that scraped
        # records are not lost.
        if batch:
            f.write('\n'.join(batch) + '\n')
            batch = []

results

  return object.find(tag, attrs=attributes, text=text).string.strip()


[{'title': 'For Sale: Lincoln Tower 2BR (SMKT-PL-KM)',
  'project_unit': None,
  'project_name': None,
  'area': '118 sqm',
  'floor_area': '118 sqm',
  'price': '₱ 46,000,000',
  'location': 'Guadalupe Viejo, Makati',
  'bedrooms': '2 bedrooms',
  'bathrooms': '2 bathrooms',
  'floor': None,
  'condition': 'Partly furnished',
  'property_type': 'Condo',
  'offer_type': 'For Sale',
  'construction_year': None,
  'parking_spaces': '1',
  'ownership_type': None,
  'project_link': None,
  'publish_date': '1 hour ago',
  'publish_by': 'Andrew Koa'},
 {'title': 'ONE BEDROOM PROSCENIUM FOR SALE',
  'project_unit': 'Proscenium at Rockwell',
  'project_name': 'Proscenium at Rockwell',
  'area': '65 sqm',
  'floor_area': '65 sqm',
  'price': '₱ 23,500,000',
  'location': 'Guadalupe Viejo, Makati',
  'bedrooms': '1 bedroom',
  'bathrooms': '1 bathroom',
  'floor': None,
  'condition': 'Partly furnished',
  'property_type': 'Condo',
  'offer_type': 'For Sale',
  'construction_year': None,
  'park