In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import time


# Overview

This code sets up the necessary tools and headers to scrape web data. It imports libraries for sending HTTP requests, parsing HTML, and handling data. Additionally, it defines a user agent header to mimic a web browser, which can be useful to avoid getting blocked by some websites.

## import requests

This line imports the requests module, which is a popular Python module used to send HTTP requests to websites.

## from bs4 import BeautifulSoup 
This line imports BeautifulSoup from the bs4 module. BeautifulSoup is a library that is used for web scraping purposes to pull the data out of HTML and XML files. It creates a parse tree that can be used to extract data in a hierarchical and more readable manner.

## import os

This line imports the os module, which provides a way of interacting with the operating system. This could be used for tasks like creating directories, reading environment variables, etc.

## headers = {...}

This line defines a dictionary called headers with a 'User-Agent' key. The value of this key is a string that represents a user agent string.

The user agent string is used to tell the server about the browser and operating system of the user. Some websites serve different content based on the user agent or even block certain user agents (often to prevent scraping). By defining a common browser's user agent string, this code is trying to mimic a real browser request to potentially avoid blocks or get the same content a real user would see.



Extracting Flats/Apartments
- Your_Project_Directory
  - Data
    - City
      - Flats
      - Societies
      - Residential
      - Independent House

In [None]:
# City to scrape - change this as required.
# The value must match the slug 99acres uses in its URLs; for example, the
# Chandigarh flats listing is https://www.99acres.com/flats-in-chandigarh-ffid
# so the value used here is 'chandigarh'.
City = 'chandigarh'

In [None]:
# Browser-like request headers for 99acres.
# They mirror what a desktop Chrome browser sends so that requests look like
# ordinary page navigation. The referer is built from `City`, which must be
# defined before this cell runs.
headers = {
    'authority': 'www.99acres.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'en-US,en;q=0.9',
    'cache-control': 'no-cache',
    'dnt': '1',
    'pragma': 'no-cache',
    'referer': f'https://www.99acres.com/flats-in-{City}-ffid-page',
    'sec-ch-ua': '"Chromium";v="107", "Not;A=Brand";v="8"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"macOS"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    # The AppleWebKit and Safari tokens carry the same build number (537.36)
    # in genuine Chrome user-agent strings; a mismatch is easy to fingerprint.
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
}

In [None]:
# Skip this cell if the folder structure has already been created.

import os

# Root of the project on the mounted drive
project_dir = '/content/drive/MyDrive/DSMP/Case Studies/Real estate/'

# Folder tree to create, relative to project_dir (parents listed first)
city_dir = f'Data/{City}'
subdirectories = ['Data', city_dir] + [
    f'{city_dir}/{category}'
    for category in ('Flats', 'Societies', 'Residential', 'Independent House')
]

# Walk the tree and create whatever is missing, reporting each outcome
for subdir in subdirectories:
    dir_path = os.path.join(project_dir, subdir)
    if os.path.exists(dir_path):
        print(f"Directory already exists: {dir_path}")
        continue
    os.makedirs(dir_path)
    print(f"Created directory: {dir_path}")

# At this point every folder listed above exists.

In [None]:
import requests
import pandas as pd
import time
import random
import os
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from requests.exceptions import ReadTimeout, RequestException

# ---------------- CONFIG ----------------
City = "chandigarh"

# Number of listing pages covered by one run, counted from the start page.
PAGES_PER_RUN = 10

# A listing page that keeps timing out is retried this many times; after that
# the run stops and saves what it has instead of looping forever.
MAX_LISTING_RETRIES = 3

ua = UserAgent()

def get_headers():
    """Return request headers with a randomly chosen User-Agent string."""
    return {
        "User-Agent": ua.random,
        "Accept-Language": "en-IN,en;q=0.9",
        "Referer": "https://www.google.com/"
    }

def safe_text(selector):
    """Return the stripped text of the first element matching ``selector``.

    The lookup runs against the detail page currently held in the
    module-level ``dSoup``. Returns "" when no element matches.
    """
    node = dSoup.select_one(selector)
    return node.text.strip() if node is not None else ""

# ---------------- INPUT ----------------
start = int(input("Enter page number to start from: "))
end = start + PAGES_PER_RUN

# ---------------- SESSION ----------------
# One session for the whole run; its headers are chosen once, up front.
session = requests.Session()
session.headers.update(get_headers())

req = 0                 # number of detail pages fetched
pageNumber = start
listing_retries = 0     # consecutive timeouts on the current listing page
rows = []               # one dict per scraped property; turned into `flats` at the end

# ---------------- SCRAPER ----------------
try:
    while pageNumber < end:
        print(f"\nScraping page {pageNumber}")
        url = f"https://www.99acres.com/flats-in-{City}-ffid-page-{pageNumber}"

        try:
            page = session.get(url, timeout=(5, 15))
        except ReadTimeout:
            listing_retries += 1
            if listing_retries > MAX_LISTING_RETRIES:
                print(f"Page {pageNumber} still timing out after "
                      f"{MAX_LISTING_RETRIES} retries, stopping")
                break
            print("Listing page timeout, sleeping...")
            time.sleep(30)
            continue
        listing_retries = 0

        soup = BeautifulSoup(page.content, "html.parser")
        sections = soup.select_one('div[data-label="SEARCH"]')

        # No search container means the site served something other than a
        # results page; stop and save what has been collected.
        if not sections:
            print("Blocked / empty page detected")
            break

        properties = sections.select('section[data-hydration-on-demand="true"]')
        scraped = 0

        for prop in properties:
            # A listing card without a name link (with an href) or a society
            # heading cannot be followed up, so it is skipped.
            name_link = prop.select_one('a.srpTuple__propertyName')
            society_tag = prop.select_one('#srp_tuple_society_heading')
            if name_link is None or society_tag is None:
                continue
            link = name_link.get('href')
            if not link:
                continue
            property_name = name_link.text.strip()
            society = society_tag.text.strip()

            # ---- DETAIL PAGE ----
            try:
                dpage = session.get(link, timeout=(5, 15))
            except ReadTimeout:
                print(f"Detail page timeout, skipping: {link}")
                time.sleep(15)
                continue

            dSoup = BeautifulSoup(dpage.content, "html.parser")
            req += 1

            property_data = {
                "property_name": property_name,
                "link": link,
                "society": society,
                "price": safe_text("#pdPrice2"),
                # NOTE(review): this id looks like a listing-page (SRP) element
                # but is looked up on the detail page - confirm it is present there.
                "area": safe_text("#srp_tuple_price_per_unit_area"),
                "areaWithType": safe_text("#factArea"),
                "bedRoom": safe_text("#bedRoomNum"),
                "bathroom": safe_text("#bathroomNum"),
                "balcony": safe_text("#balconyNum"),
                "additionalRoom": safe_text("#additionalRooms"),
                "address": safe_text("#address"),
                "floorNum": safe_text("#floorNumLabel"),
                "facing": safe_text("#facingLabel"),
                "agePossession": safe_text("#agePossessionLbl"),
                "property_id": safe_text("#Prop_Id")
            }

            # Multi-valued fields: an empty list when the section is absent.
            property_data["nearbyLocations"] = [
                tag.text.strip() for tag in dSoup.select(
                    "div.NearByLocation__tagWrap span.NearByLocation__infoText"
                )
            ]

            property_data["description"] = safe_text("#description")

            property_data["furnishDetails"] = [
                tag.text.strip() for tag in dSoup.select("#FurnishDetails li")
            ]

            property_data["features"] = [
                tag.text.strip() for tag in dSoup.select("#features li")
            ]

            rows.append(property_data)

            scraped += 1
            # Random pause between detail pages to keep the request rate low.
            time.sleep(random.uniform(2, 6))

        print(f"Page {pageNumber} → {scraped} properties scraped")
        pageNumber += 1
        # Longer random pause between listing pages.
        time.sleep(random.uniform(15, 30))

except Exception as e:
    print("ERROR OCCURRED:", e)

finally:
    # ---------------- SAVE DATA ----------------
    # Runs on normal completion, on errors and on Ctrl-C, so collected rows
    # are always written out.
    flats = pd.DataFrame(rows)
    save_path = f"flats_{City}_page_{start}_to_{pageNumber-1}.csv"
    flats.to_csv(save_path, index=False)
    print(f"\nData saved to: {save_path}")


In [None]:
!pip install fake-useragent


# Overview

The code scrapes property data from the website "99acres.com" for flats in the city set in `City` (Chandigarh in this notebook). It navigates through a range of listing pages, opens each property's detail page to extract its details, and saves the data to a CSV file. The script is designed to handle potential errors gracefully: missing fields become empty values, a timed-out request triggers a pause rather than a crash, and pauses between requests reduce the risk of getting blocked by the website.

## Initialization of Variables:

start is entered by the user and end is set to start + 10, so each run covers ten listing pages.
save_path is the name of the CSV file the data is written to; it is built from the city and the page range when the run ends.
pageNumber starts from the initial value of start and is incremented to navigate through the pages.
req counts the number of property detail pages requested.
## Loop for Page Navigation:

The while loop is used to navigate through each page in the range from start to end.
Inside this loop, the URL of the page to be scraped is constructed using the pageNumber.
An HTTP GET request is made to retrieve the content of the page, and the content is then parsed using BeautifulSoup.
## Loop for Property Extraction:

The nested for loop navigates through individual property sections on the current page.
The script attempts to extract the property name, its link, and its society name.
If any of these attributes are missing, it skips to the next property.
## Detail Extraction:

For each property, an HTTP request is made to its detail page.
The code then attempts to extract various property details like price, area, bedroom count, bathroom count, balcony count, address, and many other attributes. If any attribute is missing, the code handles it gracefully, assigning an empty string or an empty list as appropriate.
## Creating and Saving Data:

All extracted details are stored in a dictionary named property_data.
Each dictionary becomes one row of the main DataFrame flats.
When the loop finishes, or stops early because of an error or a blocked page, flats is written once to a CSV file named after the city and the page range that was scraped.
## Request Management:

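To avoid making too many rapid requests (which can lead to IP bans), the script introduces randomized pauses.
After each property it waits 2–6 seconds, and after each listing page it waits 15–30 seconds. A timed-out request triggers a longer fixed pause (30 seconds for a listing page, 15 seconds for a detail page).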
## Page Counter and Loop Increment:

After scraping all properties on a page, the code prints the page number and the number of properties processed.
pageNumber is incremented to move to the next page.

In [None]:
# Combine every CSV file in a folder into a single CSV file.

def combine_csv_files(folder_path, combined_file_path):
    """Merge all CSV files in ``folder_path`` into ``combined_file_path``.

    Files are read in sorted name order and their rows are stacked with a
    fresh, continuous index. The source files are deleted only after the
    combined file has been written, so a failed write never loses data.
    If ``combined_file_path`` is itself one of the files in the folder, its
    existing rows are carried over and the file is kept.

    Args:
        folder_path: Folder containing the CSV files to combine.
        combined_file_path: Path the combined CSV is written to.

    CSV files with no content are skipped and left on disk. When there is
    nothing to combine, an empty CSV is written.
    """
    frames = []
    merged_paths = []

    # Sorted so the row order of the result is the same on every run.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(folder_path, file_name)
        print(file_path)

        try:
            frames.append(pd.read_csv(file_path))
        except pd.errors.EmptyDataError:
            # e.g. the output of a run that saved no rows
            print(f"Skipping empty file: {file_path}")
            continue
        merged_paths.append(file_path)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Write first, delete afterwards.
    combined_data.to_csv(combined_file_path, index=False)

    for file_path in merged_paths:
        # Never delete the file that was just written.
        if not os.path.samefile(file_path, combined_file_path):
            os.remove(file_path)

# Example usage:

# Folder holding the per-run CSV files (replace with your own path)
folder_path = '/content/drive/MyDrive/DSMP/Case Studies/Real estate/flats_appartment'

# Destination of the merged file (replace with your own path)
combined_file_path = '/content/drive/MyDrive/DSMP/Case Studies/Real estate/flats_appartment/flats.csv'

combine_csv_files(
    folder_path=folder_path,
    combined_file_path=combined_file_path,
)

In [None]:
# Overview: 
The function combine_csv_files combines all the CSV files located in a specified folder into a single CSV file. After appending the data from each individual file to the combined file, the original file is deleted.

## Function Definition:

### combine_csv_files(folder_path, combined_file_path):

- folder_path: Path to the folder containing the CSV files you want to combine.
- combined_file_path: Path where the combined CSV file should be saved.
### Initialize an Empty DataFrame:

combined_data = pd.DataFrame(): An empty DataFrame combined_data is created to hold all the data from the individual CSV files.
### Iterate Through CSV Files:

The for loop iterates over each file in the directory specified by folder_path.
Within the loop, the code checks if the current file ends with .csv to ensure that only CSV files are processed.
### Read and Append Data:

file_path = os.path.join(folder_path, file_name): Constructs the full path to the current CSV file.
df = pd.read_csv(file_path): Reads the data from the current CSV file into a DataFrame df.
combined_data = combined_data.append(df, ignore_index=True): Appends the data from df to the combined_data DataFrame. The ignore_index=True parameter ensures that the index is reset and continuous in the combined data.
### Delete the Original CSV File:

os.remove(file_path): Deletes the original CSV file after its data has been appended to the combined data. This step helps in conserving storage space.
### Save the Combined Data:

combined_data.to_csv(combined_file_path, index=False): Writes the combined_data DataFrame to a new CSV file at the specified combined_file_path. The parameter index=False ensures that the DataFrame's index is not written to the CSV.
### Example Usage:

The provided paths (folder_path and combined_file_path) specify the location of the individual CSV files and the path for the combined CSV file, respectively.
Calling the combine_csv_files function with these paths will combine all CSV files in the specified folder and save the combined data to the desired location.