In [1]:
import time
import os

import numpy as np
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options

import undetected_chromedriver as uc

In [2]:
def wait_for_page_to_load(driver, wait):
    title = driver.title
    try:
        wait.until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
    except:
        print(f"Page Loading failed: {title}")
    else:
        print(f"Page Loading successful: {title}")

In [3]:
def grab_data(driver):    
    data = []
    rows = driver.find_elements(By.CLASS_NAME, 'tupleNew__contentWrap')
    for row in rows:
        #property name
        try:
            name = row.find_element(By.CLASS_NAME, "tupleNew__headingNrera").text
        except:
            name = np.nan

        try:
            location = row.find_element(By.CLASS_NAME, 'tupleNew__propType').text
        except:
            location = np.nan

        #property price
        try:
            price_element = row.find_elements(By.CLASS_NAME, 'tupleNew__priceValWrap')
        except:
            price = np.nan
        else:
            price = price_element[0].text

        #Area Type
        try:
            area_type = row.find_element(By.CLASS_NAME, 'tupleNew__areaType').text
        except:
            area_type = np.nan

        #area
        try:
            area_bhk = row.find_elements(By.CLASS_NAME, 'tupleNew__totolAreaWrap')
        except:
            area, bhk = np.nan, np.nan
        else:
            area = area_bhk[0].text
            bhk = area_bhk[1].text

        property = {
            'name' : name,
            'location' : location,
            'price' : price,
            'area_type' : area_type,
            'area' : area,
            'bhk' : bhk
        }

        data.append(property)
        
    df_page = pd.DataFrame(data)

    return df_page

In [19]:
city = "Gurgaon"

chrome_options = Options()
chrome_options.add_argument("--disable-http2")
#chrome_options.add_argument("--incognito")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--ignore-certificate-errors")
chrome_options.add_argument("--enable-features=NetworkServiceInProcess")
chrome_options.add_argument("--disable-features=NetworkService")
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
)
#chrome_options.add_argument("--headless")
#hanging issue fix - starts
chrome_options.page_load_strategy = 'eager'
#hanging issue fix - ends

#hanging issue fix - starts
#driver = webdriver.Chrome(options=chrome_options)
driver = uc.Chrome(options=chrome_options)
#hanging issue fix - ends
driver.maximize_window()
wait = WebDriverWait(driver, 10)

url = "https://www.99acres.com/"
driver.get(url)
wait_for_page_to_load(driver, wait)
time.sleep(3)




#search bar element
try:
    search_bar_path = '/html/body/div[1]/div/div[3]/form/div/div[1]/div[2]/div/div/div[1]/div[1]/div[2]/div/div/input'
    search_bar = wait.until(EC.presence_of_element_located((By.XPATH, search_bar_path)))
except:
    print("Timeout: search bar not located.")
else:
    search_bar.clear()
    search_bar.send_keys(city)
    time.sleep(1)
    
#search suggestion valid option element
try:
    valid_option_path = "/html/body/div[1]/div/div[3]/form/div[1]/div[1]/div[2]/div/div/div[1]/div[1]/div[2]/div[2]/ul/li[1]"
    valid_option = wait.until(EC.element_to_be_clickable((By.XPATH, valid_option_path)))
except:
    print("Timeout: suggestion not popped up.")
else:
    valid_option.click()
    time.sleep(1)

#clicking search button
try:
    search_button_path = "/html/body/div[1]/div/div[3]/form/div[1]/div[1]/div[2]/div/div/div[1]/div[3]/button"
    search_button = wait.until(EC.element_to_be_clickable((By.XPATH, search_button_path)))
except:
    print("Timeout: search button not clickable.")
else:
    search_button.click()
    time.sleep(3)




#scrolling page for elements to load
for i in range(3):
    driver.execute_script("window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });")
    time.sleep(2)
driver.execute_script("window.scrollTo({ top: -document.body.scrollHeight, behavior: 'smooth' });")
time.sleep(3)




#1)ready_to_move
element_path = '/html/body/div[1]/div/div/div[4]/div[3]/div[1]/div[3]/section/div/div/div/div/div[1]/div/div[5]/span[2]'
element = driver.find_element(By.XPATH, element_path)
element.click()
time.sleep(2)

#2)residential_apartment
element_path = '/html/body/div[1]/div/div/div[4]/div[2]/div/div[5]/div/div/div[2]/div[1]'
element = driver.find_element(By.XPATH, element_path)
element.click()
time.sleep(2)

#3)resale
element_path = '/html/body/div[1]/div/div/div[4]/div[2]/div/div[12]/div/div/div'
element = driver.find_element(By.XPATH, element_path)
element.click()
time.sleep(1)

element_path = '/html/body/div[1]/div/div/div[4]/div[2]/div/div[12]/div/div/div[2]/div[1]'
element = driver.find_element(By.XPATH, element_path)
element.click()
time.sleep(2)

#4)with_photos
element_path = '/html/body/div[1]/div/div/div[4]/div[3]/div[1]/div[3]/section/div/div/div/div/div[1]/div/div[6]/span[2]'
element = driver.find_element(By.XPATH, element_path)
element.click()
time.sleep(2)

#5)2bhk
element_path = '/html/body/div[1]/div/div/div[4]/div[2]/div/div[6]/div/div/div[2]/div[2]/span[2]'
element = driver.find_element(By.XPATH, element_path)
element.click()
time.sleep(2)

#6)3bhk
element_path = '/html/body/div[1]/div/div/div[4]/div[2]/div/div[6]/div/div/div[2]/div[3]/span[2]'
element = driver.find_element(By.XPATH, element_path)
element.click()
time.sleep(2)

#7)4bhk
element_path = '/html/body/div[1]/div/div/div[4]/div[2]/div/div[6]/div/div/div[2]/div[4]/span[2]'
element = driver.find_element(By.XPATH, element_path)
element.click()
time.sleep(2)

#8)max_budget_dropdown
element_path = '/html/body/div[1]/div/div/div[4]/div[2]/div/div[4]/div/div[2]/div/div[3]/div[1]'
element = driver.find_element(By.XPATH, element_path)
element.click()
time.sleep(1)

#9)max_budget
element_path = '/html/body/div[1]/div/div/div[4]/div[2]/div/div[4]/div/div[2]/div/div[3]/div[2]/div/div[2]/div/ul/li[71]'
element = driver.find_element(By.XPATH, element_path)
element.click()
time.sleep(2)

    

#scroll down the page to load completely
page=4
df = pd.DataFrame()

#page navigation loop
while True:
    page += 1
    next_page_button_path = '/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[3]/div[4]/a[1]'
    element_found = False
    last_page = False
    page_end = False
    
    #scrolling loop
    prev_height = driver.execute_script('return document.body.scrollHeight')
    while True:
        driver.execute_script("window.scrollTo({ top: document.body.scrollHeight, behavior: 'smooth' });")
        time.sleep(2)
        new_height = driver.execute_script('return document.body.scrollHeight')
        if prev_height == new_height:
            break
        prev_height = new_height
    driver.execute_script("window.scrollBy({ top: -2400, left: 0, behavior: 'smooth' });")
    
    try:
        next_page_button = driver.find_element(By.XPATH, next_page_button_path)
    except:
        print("Timeout: next page button not found. Last Page.")
    else:
        time.sleep(1)
        #scraping the data
        print("Scraping data.")
        df_page = grab_data(driver)
        df = pd.concat([df,df_page], ignore_index=True)
        #click next button
        try:
            print(f"Clicking Next Page button from page {page}.")
            #hanging issue fix - starts
            #wait.until(EC.element_to_be_clickable(next_page_button)).click()
            print(f"{next_page_button.get_attribute("href")}")
            driver.execute_script("arguments[0].click();", next_page_button)
            #hanging issue fix - ends
        except:
            print("Timeout: next page button not clickable.")
        else:
            wait_for_page_to_load(driver, wait)
            time.sleep(3)

#scraping data from last page
df_page = grab_data(driver)
df = pd.concat([df,df_page], ignore_index=True)

Page Loading successful: India Real Estate Property Site - Buy Sell Rent Properties Portal - 99acres.com
Scraping data.
Clicking Next Page button from page 5.
None
Page Loading successful: Property in Gurgaon - Real Estate in Gurgaon
Scraping data.
Clicking Next Page button from page 6.
None
Page Loading successful: Property in Gurgaon - Real Estate in Gurgaon
Scraping data.
Clicking Next Page button from page 7.
None
Page Loading successful: Property in Gurgaon - Real Estate in Gurgaon
Scraping data.
Clicking Next Page button from page 8.
None
Page Loading successful: Property in Gurgaon - Real Estate in Gurgaon
Scraping data.
Clicking Next Page button from page 9.
None
Page Loading successful: Property in Gurgaon - Real Estate in Gurgaon
Scraping data.
Clicking Next Page button from page 10.
None
Page Loading successful: Property in Gurgaon - Real Estate in Gurgaon
Scraping data.
Clicking Next Page button from page 11.
None
Page Loading successful: Property in Gurgaon - Real Estate i

KeyboardInterrupt: 

In [22]:
df.to_csv("99_acres_scraped.csv", index=False, encoding="utf-8")

In [21]:
df

Unnamed: 0,name,location,price,area_type,area,bhk
0,Hero Homes,"3 BHK Flat in Sector 104, Gurgaon",₹1.86 Cr,Super Built-up Area,"1,359 sqft\n(126 sqm)",3 BHK\n(2 Baths)
1,ILD Greens\n4.0,"3 BHK Flat in Sector 37C, Gurgaon",₹1.48 Cr,Super Built-up Area,"1,974 sqft\n(183 sqm)",3 BHK\n(3 Baths)
2,Unitech Uniworld Gardens\n3.8,"3 BHK Flat in Sector 47, Gurgaon",₹3.3 Cr,Built-up Area,"2,027 sqft\n(188 sqm)",3 BHK\n(4 Baths)
3,Smart World Gems,"3 BHK Flat in Sector 89, Gurgaon",₹1.67 Cr,Super Built-up Area,"1,494 sqft\n(139 sqm)",3 BHK\n(3 Baths)
4,Anant Raj,"3 BHK Flat in Sector 63A, Gurgaon",₹2.3 Cr,Carpet Area,"1,378 sqft\n(128 sqm)",3 BHK\n(3 Baths)
...,...,...,...,...,...,...
2593,Tulip Purple\n3.9,"4 BHK Flat in Sector 69, Gurgaon",₹2.73 Cr,Super Built-up Area,"2,400 sqft\n(223 sqm)",4 BHK\n(5 Baths)
2594,Tulip Yellow\n3.5,"3 BHK Flat in Sector 69, Gurgaon",₹2.22 Cr,Super Built-up Area,"1,704 sqft\n(158 sqm)",3 BHK\n(3 Baths)
2595,Tulip Yellow\n3.5,"3 BHK Flat in Sector 69, Gurgaon",₹2.3 Cr,Super Built-up Area,"1,704 sqft\n(158 sqm)",3 BHK\n(3 Baths)
2596,Tulip Yellow\n3.5,"3 BHK Flat in Sector 69, Gurgaon",₹2.32 Cr,Super Built-up Area,"1,704 sqft\n(158 sqm)",3 BHK\n(3 Baths)


In [11]:
current_url = driver.current_url
current_url

'https://www.99acres.com/search/property/buy/gurgaon?city=8&preference=S&area_unit=1&res_com=R'

In [9]:
import re

In [10]:
base_url = re.sub(r'&page=\d+', '', current_url)
base_url = re.sub(r'\?page=\d+', '', base_url)
base_url

'https://www.99acres.com/search/property/buy/gurgaon?city=8&preference=S&area_unit=1&res_com=R'

In [17]:
driver

<undetected_chromedriver.Chrome (session="c5c6b4ee110d48c53ece7bed67343b44")>

InvalidSessionIdException: Message: invalid session id; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#invalidsessionidexception
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0xcd12d3
	0xcd1314
	0xabe52b
	0xafc635
	0xb2b3a6
	0xb26ed1
	0xb26846
	0xa8ebfd
	0xa8f18e
	0xa8f65d
	0xf25314
	0xf208cb
	0xf3d1aa
	0xceb1d8
	0xcf31dd
	0xa8e7ab
	0xa8ddf7
	0x10717af
	0x76e55d49
	0x77c1d5db
	0x77c1d561
