In [8]:
# with progress monitoring
import json
import time
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException, StaleElementReferenceException
from requests.exceptions import ProxyError, SSLError

# Pool of User-Agent header strings; process_urls cycles through them,
# assigning the next one to each new browser session.
user_agents = [
    # Desktop Chrome 91 (Windows, macOS, Linux)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    # Desktop Firefox 89 (Windows, macOS, Ubuntu)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    # Desktop Safari 14 (macOS)
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    # Edge 91 (Windows)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64',
    # Opera 77 (Windows, macOS)
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 OPR/77.0.4054.90',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 OPR/77.0.4054.90',
    # Mobile browsers (iPhone Safari, Android Chrome)
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1',
    'Mozilla/5.0 (Linux; Android 10; SM-G975F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Mobile Safari/537.36'
]

def configure_driver(user_agent):
    """Start a Firefox WebDriver session that identifies itself as `user_agent`.

    Args:
        user_agent: User-Agent header string the browser should send.

    Returns:
        A new `webdriver.Firefox` instance; the caller is responsible for
        calling `quit()` on it.
    """
    options = Options()
    # Firefox takes its User-Agent override from this preference. The
    # `user-agent=...` command-line form is a Chrome switch and does not
    # change what Firefox sends.
    options.set_preference('general.useragent.override', user_agent)
    return webdriver.Firefox(options=options)

def _element_text(driver, by, selector, strip_tokens=()):
    """Return the cleaned text of the first element matching `selector`, or None if absent.

    Each string in `strip_tokens` is removed from the text (in order) before
    surrounding whitespace is stripped.
    """
    try:
        text = driver.find_element(by, selector).text
    except NoSuchElementException:
        return None
    for token in strip_tokens:
        text = text.replace(token, '')
    return text.strip()


def _labelled_values(driver, container_css, label_css, value_css):
    """Return {label text: value text} for every container holding both child elements."""
    pairs = {}
    for container in driver.find_elements(By.CSS_SELECTOR, container_css):
        try:
            label = container.find_element(By.CSS_SELECTOR, label_css).text.strip()
            value = container.find_element(By.CSS_SELECTOR, value_css).text.strip()
        except NoSuchElementException:
            # A container missing either child is skipped on its own so the
            # remaining containers on the page are still collected.
            continue
        pairs[label] = value
    return pairs


def extract_data(driver):
    """Scrape the property fields from the page currently loaded in `driver`.

    Args:
        driver: Selenium WebDriver already navigated to a listing page.

    Returns:
        dict with the fixed keys 'Rent', 'Total SqFt', 'Deposit',
        'Livability Score' and 'Transit Score' (None when the element is not
        on the page), plus one entry per label/value pair found in the two
        detail sections. If a TimeoutException is raised, the error is
        printed and whatever was collected so far is returned (an empty dict
        when the initial wait for <body> times out).

    Raises:
        Selenium exceptions other than NoSuchElementException and
        TimeoutException (e.g. StaleElementReferenceException) propagate to
        the caller.
    """
    data = {}
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
        time.sleep(1)  # Wait additional time for the page to fully load

        # Headline figures; currency symbol, '/M' suffix and thousands
        # separators are stripped so only the digits remain.
        data['Rent'] = _element_text(driver, By.ID, 'rent-maintenance', ('₹', '/M', ','))
        data['Total SqFt'] = _element_text(driver, By.ID, 'square-ft', (',',))
        # NOTE(review): the deposit is read from the element with id 'emi' —
        # confirm this is the intended field.
        data['Deposit'] = _element_text(driver, By.ID, 'emi', ('₹', ','))

        # Two sections of label/value pairs; the keys come from the page itself.
        data.update(_labelled_values(driver, 'div.nb__3Z_gh', 'h5.nb__X_Hde', 'h4.nb__GDnvX'))
        data.update(_labelled_values(driver, 'div.nb__3ocPe', 'h5.nb__1IoiM', 'h5.font-semi-bold.nb__1IoiM'))

        # The scores are located via their tooltip text. The spelling
        # "Livabilty" is part of the attribute value being matched
        # (presumably mirroring the site's markup) and must not be corrected.
        data['Livability Score'] = _element_text(
            driver,
            By.CSS_SELECTOR,
            'div[data-original-title="Livabilty Score is a measure of proximity of the property to essential facilities and amenities on a scale of 0 to 10"] .nb__3lxrH',
        )
        data['Transit Score'] = _element_text(
            driver,
            By.CSS_SELECTOR,
            'div[data-original-title="Transit Score is a measure of how well a property is served by public transit on a scale from 0 to 10"] .nb__3lxrH',
        )
    except TimeoutException as e:
        print(f"Error extracting data: {e}")

    return data

def process_url(driver, location, url, retries=3):
    """Load `url`, scroll to the bottom of the page and scrape it, retrying on failure.

    Args:
        driver: Selenium WebDriver used to load the page.
        location: Label stored under the 'Location' key of the result.
        url: Page address; also stored under the 'URL' key of the result.
        retries: Maximum number of attempts.

    Returns:
        The dict produced by extract_data with 'URL' and 'Location' added, or
        None when every attempt failed.
    """
    for attempt in range(retries):
        try:
            driver.get(url)
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))

            time.sleep(1)  # Wait for the page to load

            # Scroll down to load all elements
            driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
            time.sleep(1)

            data = extract_data(driver)
            if data:
                data['URL'] = url
                data['Location'] = location
                return data
            # extract_data returns an empty dict when its wait timed out.
            # Treat that as a failed attempt so the URL is retried instead of
            # being recorded as processed with no scraped fields.
            failure = 'no data extracted from page'
        except (WebDriverException, TimeoutException, NoSuchElementException, StaleElementReferenceException, ProxyError, SSLError) as e:
            failure = e

        print(f"Error processing URL {url}: {failure}")
        if attempt < retries - 1:
            print(f"Retrying... ({attempt + 1}/{retries})")
        else:
            print(f"Failed after {retries} attempts")
    return None

def _write_json_atomically(payload, filename):
    """Write `payload` as indented JSON to `filename` without ever leaving a partial file.

    The JSON is written to a sibling temporary file first and then moved over
    the target with os.replace, so an interruption or serialization error
    leaves the previous contents of `filename` untouched.
    """
    tmp_name = f"{filename}.tmp"
    try:
        with open(tmp_name, 'w') as f:
            json.dump(payload, f, indent=4)
        os.replace(tmp_name, filename)
    finally:
        # Only still present when the write or the replace failed.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)


def save_intermediate_data(data, processed_urls, data_filename='property_data.json', progress_filename='processed_urls.json'):
    """Checkpoint the scraped records and the list of URLs already handled.

    Args:
        data: JSON-serializable collection of scraped records.
        processed_urls: Iterable of URL strings (list or set) already processed.
        data_filename: Destination for `data`.
        progress_filename: Destination for `processed_urls`.

    Raises:
        TypeError: if `data` or `processed_urls` contains a value json cannot
            serialize; the existing files are left unchanged in that case.
    """
    _write_json_atomically(data, data_filename)
    # list() lets callers pass the set they track progress in directly.
    _write_json_atomically(list(processed_urls), progress_filename)

def load_processed_urls(filename='processed_urls.json'):
    """Return the set of URLs recorded in `filename`, or an empty set if the file is absent.

    Args:
        filename: Path to a JSON file containing an array of URL strings.

    Returns:
        set built from the decoded JSON array.
    """
    if not os.path.exists(filename):
        return set()

    with open(filename, 'r') as progress_file:
        recorded = json.load(progress_file)
    return set(recorded)

def _load_previous_results(processed_urls, data_filename='property_data.json'):
    """Return records saved by an earlier run whose 'URL' is already marked processed.

    Returns an empty list when there is no recorded progress, when the data
    file does not exist, or when it cannot be read or decoded.
    """
    if not processed_urls or not os.path.exists(data_filename):
        return []
    try:
        with open(data_filename, 'r') as f:
            saved = json.load(f)
    except (OSError, json.JSONDecodeError) as e:
        print(f"Could not reload previous results from {data_filename}: {e}")
        return []
    # Keep only records that match the progress file so the two stay consistent.
    return [record for record in saved
            if isinstance(record, dict) and record.get('URL') in processed_urls]


def process_urls(url_list, urls_per_session=10, save_interval=5):
    """Scrape every not-yet-processed URL, rotating browser sessions and checkpointing.

    Args:
        url_list: Sequence of (location, url) pairs.
        urls_per_session: Number of entries of `url_list` handled per browser
            session; each session gets the next entry of `user_agents`.
        save_interval: A checkpoint is written after every `save_interval`-th
            batch (counted by position in `url_list`).

    Returns:
        List of scraped records, including records reloaded from a previous
        run's 'property_data.json' for URLs listed in 'processed_urls.json'.
    """
    processed_urls = load_processed_urls()
    # Start from what an earlier run already scraped. Otherwise the final save
    # would overwrite those records while their URLs are skipped below.
    all_data = _load_previous_results(processed_urls)
    user_agent_index = 0

    try:
        for i in range(0, len(url_list), urls_per_session):
            batch = url_list[i:i+urls_per_session]
            # Do not launch a browser for a batch that is already fully processed.
            if all(url in processed_urls for _, url in batch):
                continue

            user_agent = user_agents[user_agent_index]
            user_agent_index = (user_agent_index + 1) % len(user_agents)

            driver = configure_driver(user_agent)
            try:
                for location, url in batch:
                    if url in processed_urls:
                        continue

                    data = process_url(driver, location, url)
                    if data:
                        all_data.append(data)
                        processed_urls.add(url)

                # Save data at regular intervals
                if (i // urls_per_session + 1) % save_interval == 0:
                    save_intermediate_data(all_data, list(processed_urls))
            except Exception as e:
                print(f"Error processing batch: {e}")
            finally:
                driver.quit()
    finally:
        # Final save of the collected data. It runs even when the loop is
        # interrupted (e.g. KeyboardInterrupt) so finished work is not lost.
        save_intermediate_data(all_data, list(processed_urls))
    return all_data

def save_to_json(data, filename):
    """Write `data` to `filename` as JSON indented by four spaces, replacing any existing file.

    Args:
        data: JSON-serializable object.
        filename: Path of the file to (over)write.
    """
    with open(filename, 'w') as sink:
        json.dump(obj=data, fp=sink, indent=4)

# Load URLs from JSON file.
# process_urls unpacks every entry as a (location, url) pair, so the file is
# expected to hold a JSON array of two-element arrays.
url_file = 'urls_5th_aug.json'
with open(url_file, 'r') as file:
    url_list = json.load(file)

# Process the URLs and save the data.
# process_urls already writes 'property_data.json' in its final save; the
# explicit save_to_json call below rewrites that same file with the returned list.
data = process_urls(url_list)
save_to_json(data, 'property_data.json')


In [7]:
import pandas as pd
import json

# Load JSON data from the file (an array of per-listing dicts)
with open('property_data.json', 'r') as f:
    data = json.load(f)

# Identify all unique keys in the JSON data
all_keys = set()
for item in data:
    all_keys.update(item.keys())

# Build the DataFrame in a single call: the constructor takes the union of
# the record keys as columns and fills missing fields with NaN.
# DataFrame.append (used row by row before) was removed in pandas 2.0 and
# copied the whole frame on every call.
df = pd.DataFrame(data)

# Every key seen in the records must have become a column.
assert set(df.columns) == all_keys, "DataFrame columns do not match record keys"

# Display the DataFrame
print(df)
# Optionally, save the DataFrame to a CSV file
df.to_csv('realtor_dataset.csv', index=False)
