In [5]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import time, random, requests, json, re

In [6]:
# Shared HTTP session; fetch() sends every request through it.
s = requests.Session()

# Browser-like default headers, attached to the session one key at a time
s.headers["User-Agent"] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
)
s.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
s.headers["Accept-Language"] = "en-US,en;q=0.9"

# Function to fetch a URL with retries
def fetch(url, max_tries=10):
    """GET ``url`` through the shared session ``s``, retrying with backoff.

    Args:
        url: Address to request.
        max_tries: Maximum number of attempts before giving up.

    Returns:
        The response object of the first attempt answered with status
        200 or 304. A randomized pause is taken before returning so that
        consecutive successful calls are spaced out.

    Raises:
        requests.HTTPError: For an error status other than 429/503
            (raised by ``raise_for_status``); these are not retried.
        requests.ConnectionError, requests.Timeout: When the final attempt
            still fails at the network level.
        RuntimeError: When every attempt was throttled (429/503) or
            answered with an unexpected non-error status.
    """
    delay = 3.0
    for attempt in range(max_tries):
        is_last_attempt = attempt == max_tries - 1

        try:
            r = s.get(url, timeout=45)
        except (requests.ConnectionError, requests.Timeout):
            # Transient network failure: back off and retry; once the
            # attempts are used up, let the original exception surface.
            if is_last_attempt:
                raise
            time.sleep(delay)
            delay *= 2
            continue

        if r.status_code in (200, 304):
            # polite delay between successful fetches
            time.sleep(delay + random.random() * 2)
            return r

        if r.status_code not in (429, 503):
            # Any other 4xx/5xx is treated as permanent and raises here.
            r.raise_for_status()

        # Reaching this point means "too many requests"/"temporarily
        # blocked" (429/503) or a non-error status that is neither 200 nor
        # 304. Back off before the next attempt instead of re-requesting
        # immediately; skip the wait when no attempt is left.
        if not is_last_attempt:
            time.sleep(delay)
            delay *= 2

    raise RuntimeError(f"Failed after {max_tries} tries: {url}")

# Function to extract string from a BeautifulSoup object
def get_text(object, tag, attributes, text=None):
    """Return the stripped string of the first matching element, or None.

    Args:
        object: Parsed document or element exposing a BeautifulSoup-style
            ``find`` method.
        tag: Tag name to search for.
        attributes: Attribute filter, forwarded to ``find`` as ``attrs``.
        text: Optional filter on the element's string content.

    Returns:
        The matched element's ``.string`` with surrounding whitespace
        removed, or ``None`` when the lookup fails for any reason (for
        example nothing matched, or the element's ``.string`` is ``None``).
    """
    try:
        # ``string=`` is the current keyword for the content filter; the
        # older ``text=`` spelling is deprecated and triggers a warning.
        return object.find(tag, attrs=attributes, string=text).string.strip()
    except Exception:
        # Best-effort helper: a missing field yields None instead of
        # aborting the caller.
        return None

# Function for error handling
def maybe(function):
    """Call ``function`` with no arguments and return its result, or None.

    Args:
        function: Zero-argument callable to evaluate.

    Returns:
        Whatever ``function()`` returns, or ``None`` if it raises an
        ordinary exception.

    Only ``Exception`` subclasses are suppressed. ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate, so a long-running loop that uses this
    helper can be interrupted.
    """
    try:
        return function()
    except Exception:
        return None

In [7]:
# Base URL of the paginated listing index
list_url = "https://www.dotproperty.com.ph/condos/all/metro-manila"

# First and last listing page to scrape
start_page, end_page = 31, 33

In [8]:
# Scrape every project linked from listing pages start_page..end_page
# (inclusive) and store one JSON object per line in dotproperty-projects.txt.
results = []      # every scraped record, kept in memory for a DataFrame
batch = []        # JSON lines that have not been written to disk yet
batch_size = 50   # number of records buffered between writes


def _flush_batch(out_file, pending):
    """Write the buffered JSON lines to ``out_file`` and empty the buffer."""
    if pending:
        out_file.write('\n'.join(pending) + '\n')
        out_file.flush()
        pending.clear()


# Mode 'w' creates the file, or clears the output of a previous run.
with open('dotproperty-projects.txt', 'w', encoding="utf-8") as f:
    try:
        for page in range(start_page, end_page + 1):
            # The first page of the index carries no ?page= parameter.
            page_url = list_url + (("?page=" + str(page)) if page > 1 else "")
            list_response = fetch(page_url)
            list_soup = BeautifulSoup(list_response.content, "html.parser")
            listings = list_soup.find_all('article', attrs={"class": re.compile(r"col-xs-6 projects-list(.*?)")})

            for listing in listings:
                link = listing.find('a', attrs={"itemprop": "url"}, href=True)
                if link is None:
                    # No detail link on this card: skip it rather than
                    # crash the whole run on None['href'].
                    continue

                response = fetch(link['href'])
                soup = BeautifulSoup(response.content, "html.parser")

                project_name = get_text(soup, 'h1', {"itemprop": "name", "class": "page-title"})
                location = get_text(soup, 'div', {"class": "view-on-map-info-location__text"})

                record = {
                    'project_name': project_name,
                    'location': location
                }

                # Add to results for DataFrame
                results.append(record)

                # Add the JSON line to the batch for file writing
                batch.append(json.dumps(record, ensure_ascii=False))

                # Save to file every batch_size items
                if len(batch) >= batch_size:
                    _flush_batch(f, batch)
    finally:
        # Write whatever is still buffered. This runs when the loop ends
        # normally (including a last page without listings) and when an
        # exception interrupts the scrape, so collected records are not lost.
        _flush_batch(f, batch)

results

  return object.find(tag, attrs=attributes, text=text).string.strip()


[{'project_name': 'Vista Shaw',
  'location': 'Addition Hills, Mandaluyong, Metro Manila'},
 {'project_name': 'Mosaic', 'location': 'Valenzuela, Makati, Metro Manila'},
 {'project_name': 'The Symphony Towers',
  'location': 'Binagbag, Agdangan, Quezon'},
 {'project_name': 'The Currency',
  'location': 'San Antonio, Pasig, Metro Manila'},
 {'project_name': 'Laureano Di Trevi',
  'location': 'Pio Del Pilar, Makati, Metro Manila'},
 {'project_name': 'Crown Tower', 'location': 'Manila, Metro Manila'},
 {'project_name': 'KL Tower', 'location': 'Valenzuela, Makati, Metro Manila'},
 {'project_name': 'Avant at The Fort',
  'location': 'BGC, Taguig, Metro Manila'},
 {'project_name': 'Sorrento Oasis',
  'location': 'Rosario, Pasig, Metro Manila'},
 {'project_name': 'Maui Oasis', 'location': 'Maybunga, Pasig, Metro Manila'},
 {'project_name': 'Capri Oasis', 'location': 'Maybunga, Pasig, Metro Manila'},
 {'project_name': 'Bali Oasis Phase 2',
  'location': 'Santolan, Pasig, Metro Manila'},
 {'proj