In [1]:
import time
import random
import os

import numpy as np
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options

#import undetected_chromedriver as uc
import seleniumwire.undetected_chromedriver as uc

In [2]:
def wait_for_page_to_load(driver, wait):
    """Block until ``document.readyState`` is ``"complete"`` and log the outcome.

    Parameters
    ----------
    driver
        WebDriver-like object; only ``driver.title`` is read here (the title
        is captured *before* waiting and used in the log line).
    wait
        Object exposing ``until(callable)`` (e.g. a ``WebDriverWait``). The
        callable receives the driver and runs a small JavaScript snippet.

    Returns
    -------
    None
        The result is only printed; failures are reported, not raised.
    """
    title = driver.title
    try:
        wait.until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
    # Best-effort: any ordinary error (timeout, driver error) is just logged.
    # `Exception` rather than a bare `except:` so that Ctrl-C / kernel
    # interrupt (KeyboardInterrupt) and SystemExit still propagate.
    except Exception:
        print(f"Page Loading failed: {title}")
    else:
        print(f"Page Loading successful: {title}")

In [3]:
def _text_or_nan(parent, class_name, index=0):
    """Return the text of the ``index``-th descendant of ``parent`` having
    ``class_name``, or ``np.nan`` when there is no such element or the lookup
    fails for any ordinary reason."""
    try:
        # find_elements returns an empty list when nothing matches, so a
        # missing element surfaces as IndexError here and is mapped to NaN.
        return parent.find_elements(By.CLASS_NAME, class_name)[index].text
    except Exception:
        return np.nan


def grab_data(driver):
    """Collect one record per listing card currently present in the page.

    Every element with class ``tupleNew__contentWrap`` is treated as one
    listing. For each listing the text of several child elements is read;
    a field whose element is missing is recorded as ``np.nan`` instead of
    aborting the scrape.

    Parameters
    ----------
    driver
        WebDriver-like object exposing ``find_elements(by, value)``.

    Returns
    -------
    pandas.DataFrame
        One row per listing with columns ``name``, ``location``, ``price``,
        ``area_type``, ``area`` and ``bhk``. With no listings the frame is
        empty (and has no columns).
    """
    records = []
    for row in driver.find_elements(By.CLASS_NAME, 'tupleNew__contentWrap'):
        records.append({
            'name': _text_or_nan(row, 'tupleNew__headingNrera'),
            'location': _text_or_nan(row, 'tupleNew__propType'),
            # first price element of the card
            'price': _text_or_nan(row, 'tupleNew__priceValWrap'),
            'area_type': _text_or_nan(row, 'tupleNew__areaType'),
            # the same class is used for both values: first match is read as
            # the area, second as the BHK entry; either may be absent
            'area': _text_or_nan(row, 'tupleNew__totolAreaWrap', 0),
            'bhk': _text_or_nan(row, 'tupleNew__totolAreaWrap', 1),
        })

    return pd.DataFrame(records)

In [4]:
def set_filters(driver):
    """Scroll the page once down and up, then click each filter control in order.

    Every control is located by an absolute XPath and clicked through
    JavaScript (``arguments[0].click()``), followed by a fixed pause. The
    XPaths are tied to the exact page layout: if any one of them does not
    match, ``find_element`` raises and the remaining filters are not applied.

    Parameters
    ----------
    driver
        WebDriver instance with the results page already loaded.

    Returns
    -------
    The same ``driver`` object that was passed in.
    """
    # scroll to the bottom and back to the top so the page elements load
    warmup_scrolls = (
        ("window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });", 2),
        ("window.scrollTo({ top: -document.body.scrollHeight, behavior: 'smooth' });", 3),
    )
    for script, pause in warmup_scrolls:
        driver.execute_script(script)
        time.sleep(pause)

    # (label, absolute xpath, seconds to sleep after the click)
    filter_steps = (
        ("ready_to_move",
         '/html/body/div[1]/div/div/div[4]/div[3]/div[1]/div[3]/section/div/div/div/div/div[1]/div/div[5]/span[2]',
         2),
        ("verified",
         '/html/body/div[1]/div/div/div[4]/div[3]/div[1]/div[3]/section/div/div/div/div/div[1]/div/div[3]/span[2]',
         2),
        ("residential_apartment",
         '/html/body/div[1]/div/div/div[4]/div[2]/div/div[5]/div/div/div[2]/div[1]',
         2),
        # resale takes two clicks: the container first, then an entry inside it
        ("resale",
         '/html/body/div[1]/div/div/div[4]/div[2]/div/div[12]/div/div/div',
         1),
        ("resale",
         '/html/body/div[1]/div/div/div[4]/div[2]/div/div[12]/div/div/div[2]/div[1]',
         2),
        ("with_photos",
         '/html/body/div[1]/div/div/div[4]/div[3]/div[1]/div[3]/section/div/div/div/div/div[1]/div/div[6]/span[2]',
         2),
        ("2bhk",
         '/html/body/div[1]/div/div/div[4]/div[2]/div/div[6]/div/div/div[2]/div[2]/span[2]',
         2),
        ("3bhk",
         '/html/body/div[1]/div/div/div[4]/div[2]/div/div[6]/div/div/div[2]/div[3]/span[2]',
         2),
        ("4bhk",
         '/html/body/div[1]/div/div/div[4]/div[2]/div/div[6]/div/div/div[2]/div[4]/span[2]',
         2),
        ("max_budget_dropdown",
         '/html/body/div[1]/div/div/div[4]/div[2]/div/div[4]/div/div[2]/div/div[3]/div[1]',
         1),
        ("max_budget",
         '/html/body/div[1]/div/div/div[4]/div[2]/div/div[4]/div/div[2]/div/div[3]/div[2]/div/div[2]/div/ul/li[71]',
         2),
    )
    for _label, xpath, pause in filter_steps:
        target = driver.find_element(By.XPATH, xpath)
        driver.execute_script("arguments[0].click();", target)
        time.sleep(pause)

    return driver

In [5]:
def is_captcha_present(driver):
    """Heuristically tell whether the current page is a captcha / block page.

    Returns True when the word "captcha" occurs anywhere in the page source
    (case-insensitive) or when the page title contains "access denied"
    (case-insensitive); otherwise False. The title is only consulted when
    the page source does not match.
    """
    source_text = driver.page_source.lower()
    if "captcha" in source_text:
        return True
    return "access denied" in driver.title.lower()

In [8]:
def get_proxy_options(proxy_string, use_auth=False):
    """Build a selenium-wire options dict from a colon-separated proxy string.

    Parameters
    ----------
    proxy_string : str
        Either ``"ip:port"`` or ``"ip:port:user:pass"``. Anything after the
        third colon is treated as part of the password.
    use_auth : bool, default False
        When False (the default, same result as before this parameter
        existed) any user/password fields are ignored and the proxy URL is
        ``http://ip:port``. When True and both fields are present, the URL
        becomes ``http://user:pass@ip:port`` with the credentials
        percent-encoded.

    Returns
    -------
    dict
        ``{'proxy': {'http': url, 'https': url, 'no_proxy': ...}}`` — the
        same URL is used for both schemes and localhost is excluded.

    Raises
    ------
    IndexError
        If ``proxy_string`` contains no ``:`` separator.
    """
    from urllib.parse import quote

    # maxsplit=3 keeps a password that itself contains ':' in one piece;
    # the first two fields are unaffected.
    parts = proxy_string.split(':', 3)
    ip, port = parts[0], parts[1]

    if use_auth and len(parts) == 4:
        user = quote(parts[2], safe='')
        password = quote(parts[3], safe='')
        proxy_url = f"http://{user}:{password}@{ip}:{port}"
    else:
        proxy_url = f"http://{ip}:{port}"

    return {
        'proxy': {
            'http': proxy_url,
            'https': proxy_url,
            'no_proxy': 'localhost,127.0.0.1'
        }
    }

In [7]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Proxy endpoints ("ip:port"). Credentials are deliberately NOT stored in the
# notebook; if the proxies need authentication, export PROXY_USER and
# PROXY_PASS before starting the kernel and they are appended below in the
# "ip:port:user:pass" form that get_proxy_options() accepts.
PROXY_ENDPOINTS = [
    "142.111.48.253:7030",
    "23.95.150.145:6114",
    "198.23.239.134:6540",
    "107.172.163.27:6543",
    "198.105.121.200:6462",
    "64.137.96.74:6641",
    "84.247.60.125:6095",
    "216.10.27.159:6837",
    "23.26.71.145:5628",
    "23.27.208.120:5830",
    # ... add as many as you have
]
_proxy_user = os.environ.get("PROXY_USER")
_proxy_pass = os.environ.get("PROXY_PASS")
PROXY_LIST = [
    f"{endpoint}:{_proxy_user}:{_proxy_pass}" if _proxy_user and _proxy_pass else endpoint
    for endpoint in PROXY_ENDPOINTS
]

# alternative entry point:
# "https://www.99acres.com/search/property/buy/gurgaon?city=8&preference=S&area_unit=1&res_com=R"
base_url = "https://www.99acres.com/property-in-gurgaon-ffid"

OUTPUT_CSV = "99_acres_scraped2.csv"   # accumulated scraped rows
URL_STATE_CSV = "url.csv"              # single-cell file holding the next URL to visit
NEXT_LINK_XPATH = '/html/body/div[1]/div/div/div[4]/div[3]/div[3]/div[4]/a'
LAST_PAGE = 250

# `page` is only a progress label / iteration budget; the actual resume
# position is the URL stored in URL_STATE_CSV.
start_page = 1

# Resume from the stored URL when there is one, otherwise start at base_url.
if os.path.exists(URL_STATE_CSV):
    df_url = pd.read_csv(URL_STATE_CSV)
    saved_url = df_url['url'].iloc[0] if len(df_url) else None
else:
    saved_url = None
# isinstance check also rejects NaN coming from an empty cell
url = saved_url if isinstance(saved_url, str) and saved_url else base_url


def _build_chrome_options():
    """Return a fresh Options object (a new one is created for every driver)."""
    chrome_options = Options()
    chrome_options.add_argument("--disable-http2")
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--ignore-certificate-errors")
    chrome_options.add_argument("--enable-features=NetworkServiceInProcess")
    chrome_options.add_argument("--disable-features=NetworkService")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
    )
    #chrome_options.add_argument("--headless")
    chrome_options.page_load_strategy = 'eager'
    return chrome_options


# ---------------------------------------------------------------------------
# Scraping loop: one browser session (with a random proxy) per result page
# ---------------------------------------------------------------------------
for page in range(start_page, LAST_PAGE + 1):
    #starting driver with a random proxy ip
    new_proxy = random.choice(PROXY_LIST)
    driver = uc.Chrome(
        options=_build_chrome_options(),
        seleniumwire_options=get_proxy_options(new_proxy)
    )
    # try/finally so the browser is closed even when a step below raises
    # (e.g. a filter XPath that no longer matches).
    try:
        driver.maximize_window()
        wait = WebDriverWait(driver, 10)

        driver.get(url)
        wait_for_page_to_load(driver, wait)

        if is_captcha_present(driver):
            print("CAPTCHA DETECTED! SCRIPT PAUSED.")
            print("Please manually solve the CAPTCHA in the browser window.")
            input("Press Enter here in the console once you have solved it...")
            print("Resuming scraping...")

        #random sleep after loading the website
        sleep_time = random.uniform(3, 12)
        print(f"Sleeping for {sleep_time:.2f} seconds...")
        time.sleep(sleep_time)

        scroll_height = random.randint(300, 1000)
        driver.execute_script(f"""window.scrollTo({{top: {scroll_height}, behavior: 'smooth'}});""")
        time.sleep(random.uniform(3, 7))

        driver = set_filters(driver)

        #random sleep after applying the filters
        sleep_time = random.uniform(3, 12)
        print(f"Sleeping for {sleep_time:.2f} seconds...")
        time.sleep(sleep_time)

        #scrolling loop: keep scrolling to the bottom until the page height stops growing
        prev_height = driver.execute_script('return document.body.scrollHeight')
        while True:
            driver.execute_script("window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });")
            time.sleep(2)
            new_height = driver.execute_script('return document.body.scrollHeight')
            if prev_height == new_height:
                break
            prev_height = new_height

        #scraping the data
        print(f"Page {page} : Scraping data...")
        df_page = grab_data(driver)

        # find_elements (plural) returns [] instead of raising, so a missing
        # "next" link does not throw away the rows scraped above.
        next_links = driver.find_elements(By.XPATH, NEXT_LINK_XPATH)
        next_url = next_links[0].get_attribute("href") if next_links else None
    finally:
        driver.quit()

    #appending the new rows to the already scraped data (file may not exist yet)
    if not df_page.empty:
        if os.path.exists(OUTPUT_CSV):
            df = pd.concat([pd.read_csv(OUTPUT_CSV), df_page], ignore_index=True)
        else:
            df = df_page
        df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

    if not next_url:
        print(f"Page {page} : no next-page link found, stopping.")
        break

    #keeping info of next page link (index=False so no extra column piles up on each run)
    url = next_url
    df_url = pd.DataFrame({'url': [url]})
    df_url.to_csv(URL_STATE_CSV, index=False)
    


Page Loading successful: www.99acres.com
Sleeping for 10.31 seconds...


NoSuchElementException: Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html/body/div[1]/div/div/div[4]/div[3]/div[1]/div[3]/section/div/div/div/div/div[1]/div/div[5]/span[2]"}
  (Session info: chrome=143.0.7499.170); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#nosuchelementexception
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x10a1213
	0x10a1254
	0xe8e6dd
	0xed93a5
	0xed977b
	0xf20382
	0xefb534
	0xf1db13
	0xefb2e6
	0xecd321
	0xece1d4
	0x12f5254
	0x12f080b
	0x130d0ea
	0x10bb118
	0x10c311d
	0x10a9518
	0x10a96d9
	0x1093a68
	0x76e55d49
	0x77c1d5db
	0x77c1d561
