In [5]:
# Added multithreading: processing speed has improved considerably; 100 pages (~2,500 listings) are scraped in around 25 minutes.


import requests
from bs4 import BeautifulSoup
import re
import time
import pandas as pd
import json
from datetime import datetime, timedelta
import concurrent.futures

# Request configuration: the headers below make the scraper present itself
# like a standard desktop web browser.
BASE_URL = "https://www.zameen.com"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept": (
        "text/html,application/xhtml+xml,"
        "application/xml;q=0.9,*/*;q=0.8"
    ),
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# Functions
def _price_number(text):
    """Return the single number contained in *text* as a float, or None.

    A clean numeric string is converted directly. Otherwise the text is
    searched for numbers and the value is used only when exactly one is
    present, so ambiguous labels are rejected rather than guessed at.
    """
    text = text.strip()
    try:
        return float(text)
    except ValueError:
        numbers = re.findall(r'[-+]?(?:\d+\.?\d*|\.\d+)', text)
        return float(numbers[0]) if len(numbers) == 1 else None


def convert_price(price_str):
    """Convert a price label such as "2.5 Crore" into a whole-number amount.

    Recognised unit words and their multipliers: Crore (10,000,000),
    Lakh (100,000), Million (1,000,000), Arab (1,000,000,000) and
    Thousand (1,000). Thousands separators (commas) are ignored. A label
    without a unit word is stripped of everything except digits and dots
    and rounded to the nearest integer.

    Args:
        price_str: The raw price text; any value is accepted and passed
            through ``str()`` first.

    Returns:
        The rounded amount as an int, or ``0.0`` when the input is empty
        or no usable number can be extracted.
    """
    price_str = str(price_str).replace(",", "").strip()
    if not price_str:
        return 0.0

    # Checked in this order; the first unit word found in the label wins.
    unit_multipliers = (
        ("Crore", 10_000_000),
        ("Lakh", 100_000),
        ("Million", 1_000_000),
        ("Arab", 1_000_000_000),
        ("Thousand", 1_000),
    )
    for unit_word, multiplier in unit_multipliers:
        if unit_word not in price_str:
            continue
        amount = _price_number(price_str.replace(unit_word, ''))
        if amount is None:
            # Unit word present but no usable number: fall back to 0.0
            # instead of letting float() raise.
            return 0.0
        try:
            return round(amount * multiplier)
        except (ValueError, OverflowError):
            # round() rejects NaN / infinity.
            return 0.0

    try:
        return round(float(re.sub(r'[^\d.]', '', price_str)))
    except ValueError:
        return 0.0

def _size_number(text):
    """Return the single number contained in *text* as a float, or None.

    A clean numeric string is converted directly. Otherwise the value is
    used only when exactly one number appears in the text.
    """
    text = text.strip()
    try:
        return float(text)
    except ValueError:
        numbers = re.findall(r'[-+]?(?:\d+\.?\d*|\.\d+)', text)
        return float(numbers[0]) if len(numbers) == 1 else None


def convert_size(size_str):
    """Convert an area label into Marla.

    Conversions used: 1 Kanal = 20 Marla, 1 Marla = 225 sq. ft.,
    1 sq. yd. = 9 sq. ft., 1 sq. m. = 10.7639 sq. ft. A bare number
    (no unit word) is treated as square feet. Commas are ignored.

    Args:
        size_str: The raw area text; any value is accepted and passed
            through ``str()`` first.

    Returns:
        For "Marla" and "Kanal" labels, an int rounded to the nearest whole
        Marla; for the square-unit labels and bare numbers, a float rounded
        to two decimals. ``0.0`` when the input is empty or no usable
        number can be extracted.
    """
    size_str = str(size_str).replace(",", "").strip()
    if not size_str:
        return 0.0

    # NOTE(review): Marla/Kanal values are rounded to whole Marla (so a
    # fractional label such as "2.5 Marla" loses its fraction). Kept as-is
    # for backward compatibility with existing output.
    # Checked in this order; the first unit word found in the label wins.
    unit_converters = (
        ("Marla", lambda v: round(v)),
        ("Kanal", lambda v: round(v * 20)),
        ("Sq. Yd.", lambda v: round(v * (9 / 225), 2)),
        ("Sq. Ft.", lambda v: round(v / 225, 2)),
        ("Sq. M.", lambda v: round(v * 10.7639 / 225, 2)),
    )
    for unit_word, to_marla in unit_converters:
        if unit_word not in size_str:
            continue
        value = _size_number(size_str.replace(unit_word, ''))
        if value is None:
            # Unit word present but no usable number: fall back to 0.0
            # instead of letting float() raise.
            return 0.0
        try:
            return to_marla(value)
        except (ValueError, OverflowError):
            # round() without ndigits rejects NaN / infinity.
            return 0.0

    # No recognised unit word: interpret a bare number as square feet.
    try:
        return round(float(size_str) / 225, 2)
    except ValueError:
        return 0.0

def convert_relative_date_to_absolute(relative_date_str):
    """Turn a relative age such as "3 days ago" into a 'YYYY-MM-DD' date.

    The date is computed against the local clock (``datetime.now()``).
    Recognised forms: "just now", "few seconds ago", "yesterday", and
    "<count> <unit> ..." where the count is an integer (or "a"/"an",
    meaning 1) and the unit contains one of second, minute, hour, day,
    week, month (counted as 30 days) or year (counted as 365 days).

    Args:
        relative_date_str: The relative-date text; matching is
            case-insensitive.

    Returns:
        The computed date as a 'YYYY-MM-DD' string, or None when the input
        is empty or not in a recognised form.
    """
    if not relative_date_str:
        return None
    text = relative_date_str.lower().strip()
    now = datetime.now()

    if "just now" in text or "few seconds ago" in text:
        return now.strftime('%Y-%m-%d')
    if "yesterday" in text:
        return (now - timedelta(days=1)).strftime('%Y-%m-%d')

    parts = text.split()
    if len(parts) < 2:
        return None
    count_token, unit = parts[0], parts[1]

    if count_token in ("a", "an"):
        # "an hour ago" / "a day ago" mean a count of one.
        value = 1
    else:
        try:
            value = int(count_token)
        except ValueError:
            return None

    # Checked in this order; the first keyword contained in the unit wins.
    unit_steps = (
        ("second", timedelta(seconds=1)),
        ("minute", timedelta(minutes=1)),
        ("hour", timedelta(hours=1)),
        ("day", timedelta(days=1)),
        ("week", timedelta(weeks=1)),
        ("month", timedelta(days=30)),
        ("year", timedelta(days=365)),
    )
    for keyword, step in unit_steps:
        if keyword in unit:
            try:
                # A zero offset ("0 minutes ago") is valid and maps to today;
                # it must not be mistaken for "no match".
                return (now - value * step).strftime('%Y-%m-%d')
            except OverflowError:
                # Count too large to represent as a date.
                return None
    return None



Fetching page 1: https://www.zameen.com/Homes/Lahore-1-1.html
Fetching page 2: https://www.zameen.com/Homes/Lahore-1-2.html
Fetching page 3: https://www.zameen.com/Homes/Lahore-1-3.html
Fetching page 4: https://www.zameen.com/Homes/Lahore-1-4.html
Fetching page 5: https://www.zameen.com/Homes/Lahore-1-5.html
Fetching page 6: https://www.zameen.com/Homes/Lahore-1-6.html
Fetching page 7: https://www.zameen.com/Homes/Lahore-1-7.html
Fetching page 8: https://www.zameen.com/Homes/Lahore-1-8.html
Fetching page 9: https://www.zameen.com/Homes/Lahore-1-9.html
Fetching page 10: https://www.zameen.com/Homes/Lahore-1-10.html
Fetching page 11: https://www.zameen.com/Homes/Lahore-1-11.html
Fetching page 12: https://www.zameen.com/Homes/Lahore-1-12.html
Fetching page 13: https://www.zameen.com/Homes/Lahore-1-13.html
Fetching page 14: https://www.zameen.com/Homes/Lahore-1-14.html
Fetching page 15: https://www.zameen.com/Homes/Lahore-1-15.html
Fetching page 16: https://www.zameen.com/Homes/Lahore-1-16.html

In [None]:
# Core Scraping Function

def get_detail_page_data_from_dataLayer(detail_url):
    """
    Download a detail page and return the object passed to its first
    window['dataLayer'].push(...) call, parsed from JSON.

    Every failure (network/HTTP error, push call not found, invalid JSON,
    or any other exception) is reported with print() and yields None.
    """
    push_call = re.compile(r"window\['dataLayer'\]\.push\((\{.*?\})\);", re.DOTALL)

    try:
        page = requests.get(detail_url, headers=headers, timeout=10)
        page.raise_for_status()

        found = push_call.search(page.text)
        if found is None:
            print(f"  window['dataLayer'] push pattern not found in {detail_url}")
            return None

        payload = found.group(1)
        try:
            return json.loads(payload)
        except json.JSONDecodeError as e:
            print(f"  Error decoding dataLayer JSON for {detail_url}: {e}")
            return None

    except requests.exceptions.RequestException as e:
        print(f"Error fetching detail page {detail_url}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred for {detail_url}: {e}")

    return None

def get_listing_links(page):
    """
    Fetch one search-result page and return the absolute URLs of the
    property detail pages it links to, without duplicates and in the order
    they first appear.

    A request/HTTP failure is printed and results in an empty list.
    """
    search_url = f'https://www.zameen.com/Homes/Lahore-1-{page}.html'
    print(f"Fetching page {page}: {search_url}")
    try:
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        listing_anchors = soup.find_all(
            'a',
            attrs={'aria-label': 'Listing link'},
            href=re.compile(r'/Property/.*\.html'),
        )
        # dict.fromkeys drops repeats while keeping first-seen order.
        links = list(dict.fromkeys(BASE_URL + anchor['href'] for anchor in listing_anchors))

        if len(links) == 0:
            print(f"  No new listing links found on page {page}. This might indicate end of results or a change in URL patterns.")

        return links
    except requests.exceptions.RequestException as e:
        print(f"Error fetching listing page {search_url}: {e}")
        return []

def _labelled_span_text(soup, label):
    """Return the stripped text of the first <span aria-label=label>, or None."""
    tag = soup.find('span', attrs={'aria-label': label})
    return tag.text.strip() if tag else None


def _first_int_in(text):
    """Return the first run of digits in *text* as an int, or None."""
    if not text:
        return None
    digits = re.search(r'(\d+)', text)
    return int(digits.group(1)) if digits else None


def _float_or_none(value):
    """Return float(value), or None when value is None or not convertible."""
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def parse_html_details(soup, data_layer_data):
    """
    Build the record for one property from its parsed detail page and the
    page's dataLayer object.

    Args:
        soup: BeautifulSoup object for the detail page. Values are read
            from <span> elements identified by their aria-label
            ('Price', 'Beds', 'Baths', 'Area', 'Type', 'Creation date').
        data_layer_data: Mapping parsed from the page's dataLayer, or a
            falsy value when unavailable. Keys read: 'ad_id', 'latitude',
            'longitude', 'loc_neighbourhood_name', 'loc_name',
            'loc_city_name'.

    Returns:
        dict with keys Ad_ID, Price, Area, Bedrooms, Bathrooms, Latitude,
        Longitude, Property_Type, Location_Detail and Date_Added. Fields
        that cannot be determined stay None, except Price and Area, which
        hold whatever convert_price / convert_size return for a missing
        value.
    """
    details = { 'Ad_ID': None, 'Price': None, 'Area': None, 'Bedrooms': None, 'Bathrooms': None,
                'Latitude': None, 'Longitude': None, 'Property_Type': None,
                'Location_Detail': None, 'Date_Added': None }

    # Fields read from the page markup.
    details['Price'] = convert_price(_labelled_span_text(soup, 'Price'))
    details['Bedrooms'] = _first_int_in(_labelled_span_text(soup, 'Beds'))
    details['Bathrooms'] = _first_int_in(_labelled_span_text(soup, 'Baths'))
    details['Area'] = convert_size(_labelled_span_text(soup, 'Area'))
    details['Property_Type'] = _labelled_span_text(soup, 'Type')
    details['Date_Added'] = convert_relative_date_to_absolute(
        _labelled_span_text(soup, 'Creation date'))

    if not data_layer_data:
        return details

    # Fields read from the dataLayer object.
    if data_layer_data.get('ad_id'):
        details['Ad_ID'] = data_layer_data['ad_id']

    # A malformed coordinate becomes None instead of raising, so the rest
    # of the record is still returned.
    details['Latitude'] = _float_or_none(data_layer_data.get('latitude'))
    details['Longitude'] = _float_or_none(data_layer_data.get('longitude'))

    # Location is assembled from neighbourhood, locality and city, skipping
    # empty parts and repeats.
    neighbourhood = data_layer_data.get('loc_neighbourhood_name')
    locality = data_layer_data.get('loc_name')
    city = data_layer_data.get('loc_city_name')
    loc_components = []
    if neighbourhood:
        loc_components.append(neighbourhood)
    if locality and locality != neighbourhood:
        loc_components.append(locality)
    if city and city not in loc_components:
        loc_components.append(city)
    details['Location_Detail'] = ', '.join(loc_components).strip() if loc_components else None
    return details



In [None]:

# Main Execution Function
def main(max_pages=100, max_workers=8, output_filename='zameen_listings_final_All.csv'):
    """
    Scrape listing pages, process every detail page concurrently and write
    the collected records to a CSV file.

    Args:
        max_pages: Number of search-result pages to walk (pages 1..max_pages).
        max_workers: Size of the thread pool used for detail pages.
        output_filename: Path of the CSV file to write.

    The CSV contains the columns Ad_ID, Property_Type, Location_Detail,
    Price, Area, Bedrooms, Bathrooms, Latitude, Longitude and Date_Added.
    The detail-page URL is tracked while processing but not written out.
    """
    output_columns = [
        'Ad_ID', 'Property_Type', 'Location_Detail', 'Price', 'Area',
        'Bedrooms', 'Bathrooms', 'Latitude', 'Longitude', 'Date_Added'
    ]

    collected_urls = []
    for page in range(1, max_pages + 1):
        collected_urls.extend(get_listing_links(page))
        time.sleep(2)  # pause between consecutive listing-page requests

    # The same listing can show up on more than one result page; keep each
    # URL once (first-seen order) so it is not fetched and stored twice.
    all_detail_urls = list(dict.fromkeys(collected_urls))

    print(f"\nCollected {len(all_detail_urls)} unique detail page URLs.")
    print("Starting concurrent processing of detail pages...")

    all_results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(process_single_listing, url): url for url in all_detail_urls}
        # No sleep in this loop: every task is already queued on the pool,
        # so pausing here would only delay collecting results.
        for future in concurrent.futures.as_completed(future_to_url):
            url = future_to_url[future]
            try:
                result_dict = future.result()
            except Exception as exc:
                print(f'Error processing {url}: {exc}')
                # Keep a placeholder row so the failed listing is still counted.
                placeholder = dict.fromkeys(output_columns)
                placeholder['URL'] = url
                all_results.append(placeholder)
            else:
                if result_dict:
                    all_results.append(result_dict)

    # reindex fixes the column order, creates any column that is missing
    # and leaves out everything else (including 'URL').
    df = pd.DataFrame(all_results).reindex(columns=output_columns)

    df.to_csv(output_filename, index=False, encoding='utf-8')
    print(f"\n✅ Data saved to '{output_filename}'.")

# Helper function for Multithreading
def process_single_listing(url):
    """
    Fetch one detail page and return its parsed record.

    The page is downloaded once; both the dataLayer object and the HTML
    fields are taken from that single response.

    Args:
        url: Absolute URL of the property detail page.

    Returns:
        The dict produced by parse_html_details with an added 'URL' key,
        or None when the request or the parsing fails (the error is
        printed).
    """
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        html = response.text

        data_layer_data = _data_layer_from_html(html, url)
        soup = BeautifulSoup(html, 'html.parser')

        details = parse_html_details(soup, data_layer_data)
        details['URL'] = url
        return details

    except requests.exceptions.RequestException as e:
        print(f"  Thread Error: HTTP/Network issue for {url}: {e}")
        return None
    except Exception as e:
        print(f"  Thread Error: Unexpected error processing {url}: {e}")
        return None


def _data_layer_from_html(html, url):
    """Return the object of the first window['dataLayer'].push(...) call in
    *html*, parsed from JSON, or None (with a printed note) when the call is
    absent or its payload is not valid JSON."""
    found = re.search(r"window\['dataLayer'\]\.push\((\{.*?\})\);", html, re.DOTALL)
    if found is None:
        print(f"  window['dataLayer'] push pattern not found in {url}")
        return None
    try:
        return json.loads(found.group(1))
    except json.JSONDecodeError as e:
        print(f"  Error decoding dataLayer JSON for {url}: {e}")
        return None


# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()