In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
import time
import pandas as pd
import numpy as np

In [2]:
def wait_for_page_to_load(driver, wait):
    title = driver.title
    try:
        wait.until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
    except:
        print(f"the webpage \"{title}\" did not get fully loaded")
    else:
        print(f"the webpage {title} did get fully loaded")
        

In [20]:
# options
chrome_options = Options()
chrome_options.add_argument("--disable-http2")
chrome_options.add_argument("--incognito")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--ignore-certificate-errors")
chrome_options.add_argument("--enable-features=NetworkServiceInProcess")
chrome_options.add_argument("--disable-features=NetworkService")
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"
)

# Use the correct path to your downloaded chromedriver
service = Service("C:\\webdrivers\\chromedriver.exe")

# Now start the browser using the specified service
driver = webdriver.Chrome(service=service, options=chrome_options) 

driver.maximize_window()

# explicit wait
wait = WebDriverWait(driver, 5)

# accessing the target webpage
url = "https://www.99acres.com/"
driver.get(url)
wait_for_page_to_load(driver, wait)

# identify and enter text into search bar
try:
	search_bar = wait.until(
		EC.presence_of_element_located((By.XPATH, '//*[@id="keyword2"]'))
	)
except:
	print("Timeout while locating Search Bar.\n")
else:
	search_bar.send_keys("Chennai")
	time.sleep(2)

# selecting valid option from list
try:
	valid_option = wait.until(
		EC.element_to_be_clickable((By.XPATH, '//*[@id="0"]'))
	)
except:
	print("Timeout while locating valid search option.\n")
else:
	valid_option.click()
	time.sleep(2)

# click on Search button
try:
	search_button = wait.until(
		EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform_search_btn"]'))
	)
except:
	print("Timeout while clicking on \"Search\" button.\n")
else:
	search_button.click()
	wait_for_page_to_load(driver, wait)

# adjust the Budget slider
try:
	slider = wait.until(
		EC.element_to_be_clickable((By.XPATH, '//*[@id="budgetLeftFilter_max_node"]'))
	)
except:
	print("Timeout while clicking on Budget slider circle.\n")
else:
	actions = ActionChains(driver)
	(
		actions
		.click_and_hold(slider)
		.move_by_offset(-73, 0)
		.release()
		.perform()
	)
	time.sleep(2)

# filter results to show genuine listings
# 1. Verified
verified = wait.until(
	EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[3]/span[1]'))
)
verified.click()
time.sleep(1)

# 2. Ready To Move
ready_to_move = wait.until(
	EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[5]/span[1]'))
)
ready_to_move.click()
time.sleep(1)

# moving to the right side to unhide remaining filters
while True:
	try:
		filter_right_button = wait.until(
			EC.presence_of_element_located((By.XPATH, "//i[contains(@class,'iconS_Common_24 icon_upArrow cc__rightArrow')]"))
		)
	except:
		print("Timeout because we have uncovered all filters.\n")
		break
	else:
		filter_right_button.click()
		time.sleep(1)

# 3. With Photos
with_photos = wait.until(
	EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[6]/span[1]'))
)
with_photos.click()
time.sleep(1)

# 4. With Videos
with_videos = wait.until(
	EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[7]/span[1]'))
)
with_videos.click()
time.sleep(3)

# navigate pages and extract data
data = []
page_count = 0
while True:
	page_count += 1
	try:
		next_page_button = driver.find_element(By.XPATH, "//a[normalize-space()='Next Page >']")
	except:
		print(f"Timeout because we have navigated all the {page_count} pages.\n")
		break
	else:
		try:
			driver.execute_script("window.scrollBy(0, arguments[0].getBoundingClientRect().top - 100);", next_page_button)
			time.sleep(2)
	
			# scraping the data
			rows = driver.find_elements(By.CLASS_NAME, "tupleNew__TupleContent")
			for row in rows:
				# property name
				try:
					name = row.find_element(By.CLASS_NAME, "tupleNew__headingNrera").text
				except:
					name = np.nan

				# property location
				try:
					location = row.find_element(By.CLASS_NAME, "tupleNew__propType").text
				except:
					location = np.nan

				# property price
				try:
					price = row.find_element(By.CLASS_NAME, "tupleNew__priceValWrap").text
				except:
					price = np.nan

				# property area and bhk
				try:
					elements = row.find_elements(By.CLASS_NAME, "tupleNew__area1Type")
				except:
					area, bhk = [np.nan, np.nan]
				else:
					area, bhk = [ele.text for ele in elements]
					
				property = {
					"name": name,
					"location": location,
					"price": price,
					"area": area,
					"bhk": bhk
				}
				data.append(property)
			
			wait.until(
				EC.element_to_be_clickable((By.XPATH, "//a[normalize-space()='Next Page >']"))
			).click()
			time.sleep(50)
		except:
			print("Timeout while clicking on \"Next Page\".\n")

# scraping data from the last page
rows = driver.find_elements(By.CLASS_NAME, "tupleNew__TupleContent")
for row in rows:
	# property name
	try:
		name = row.find_element(By.CLASS_NAME, "tupleNew__headingNrera").text
	except:
		name = np.nan

	# property location
	try:
		location = row.find_element(By.CLASS_NAME, "tupleNew__propType").text
	except:
		location = np.nan

	# property price
	try:
		price = row.find_element(By.CLASS_NAME, "tupleNew__priceValWrap").text
	except:
		price = np.nan

	# property area and bhk
	try:
		elements = row.find_elements(By.CLASS_NAME, "tupleNew__area1Type")
	except:
		area, bhk = [np.nan, np.nan]
	else:
		area, bhk = [ele.text for ele in elements]
					
	property = {
		"name": name,
		"location": location,
		"price": price,
		"area": area,
		"bhk": bhk
	}
	data.append(property)



the webpage India Real Estate Property Site - Buy Sell Rent Properties Portal - 99acres.com did get fully loaded
the webpage Property in Chennai - Real Estate in Chennai did get fully loaded
Timeout because we have uncovered all filters.

Timeout while clicking on "Next Page".

Timeout while clicking on "Next Page".

Timeout because we have navigated all the 51 pages.



In [21]:
data

[{'name': 'on request',
  'location': '2 BHK Flat in Pallikaranai, Chennai',
  'price': '₹48 Lac',
  'area': '764 sqft',
  'bhk': '2 BHK'},
 {'name': 'RC Adena',
  'location': '2 BHK Flat in Thirumullaivoyal, Chennai',
  'price': '₹45 Lac',
  'area': '820 sqft',
  'bhk': '2 BHK'},
 {'name': 'Radiance Suprema',
  'location': '2 BHK Flat in Madhavaram, Chennai',
  'price': '₹1 Cr',
  'area': '1,102 sqft',
  'bhk': '2 BHK'},
 {'name': 'on request',
  'location': '2 BHK Flat in Pallikaranai, Chennai',
  'price': '₹57.7 Lac',
  'area': '932 sqft',
  'bhk': '2 BHK'},
 {'name': 'Shriram Park 63\n3.8',
  'location': '3 BHK Flat in Perungalathur, Chennai',
  'price': '₹1.5 Cr',
  'area': '1,970 sqft',
  'bhk': '3 BHK'},
 {'name': 'Babu enclave',
  'location': '3 BHK Flat in Keelkattalai, Chennai',
  'price': '₹93.79 Lac',
  'area': '1,218 sqft',
  'bhk': '3 BHK'},
 {'name': 'Adhith enclave',
  'location': '3 BHK Flat in Veeramani Nagar, Kovilambakkam',
  'price': '₹89.17 Lac',
  'area': '1,158 

In [25]:
df = pd.DataFrame(data)
df

Unnamed: 0,name,location,price,area,bhk
0,on request,"2 BHK Flat in Pallikaranai, Chennai",₹48 Lac,764 sqft,2 BHK
1,RC Adena,"2 BHK Flat in Thirumullaivoyal, Chennai",₹45 Lac,820 sqft,2 BHK
2,Radiance Suprema,"2 BHK Flat in Madhavaram, Chennai",₹1 Cr,"1,102 sqft",2 BHK
3,on request,"2 BHK Flat in Pallikaranai, Chennai",₹57.7 Lac,932 sqft,2 BHK
4,Shriram Park 63\n3.8,"3 BHK Flat in Perungalathur, Chennai",₹1.5 Cr,"1,970 sqft",3 BHK
...,...,...,...,...,...
291,TDC VATIKA,"3 BHK Flat in Bhuvaneshwari Nagar, Velachery",₹69 Lac,990 sqft,3 BHK
292,Malles Ahaana,"2 BHK Flat in Pallikaranai, Chennai",₹65 Lac,989 sqft,2 BHK
293,lake vista's,"3 BHK Flat in Pallikaranai, Chennai",₹1.2 Cr,"2,139 sqft",3 BHK
294,"Moulivakkam, Chennai, Chennai West","3 Bedroom House in Moulivakkam, Chennai",₹1.4 Cr,900 sqft,3 BHK


In [27]:
df.to_excel("raw_chennai.xlsx")