# Data Collection

## Data Sources
- Real estate website - [MagicBricks](https://www.magicbricks.com)

## Data Types
- Property Name
- Property Title
- Property Type (Flat)
- Property Size (Carpet Area)
- Furnishing
- BHK
- City / Locality
- Price
- Price (SQFT)

## Method
- Web Scraping (BeautifulSoup and Selenium)

In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import numpy as np
import pandas as pd
import os

# Search dimensions: every combination of city, bedroom count, property type
# and budget range below is substituted into base_url to form one search URL.
cityname_values = [
    'New-Delhi',
    'South-area-New-Delhi',
    'East-area-New-Delhi',
    'West-area-New-Delhi',
    'Central-area-New-Delhi',
    'North-area-New-Delhi',
]
bedroom_values = list(range(1, 11))
proptype_values = [
    "Multistorey-Apartment,Builder-Floor-Apartment,Penthouse,Studio-Apartment",
]

# Budget ranges as (min, max) tuples, expressed in Lacs (see the "-Lacs"
# suffix in base_url). Consecutive edges are paired so that each range
# starts where the previous one ends.
_budget_edges = [
    0, 25, 50, 75, 100, 125, 150, 175, 200, 225, 250, 275, 300,
    350, 400, 450, 700, 800, 900, 1000, 100000,
]
budget_values = list(zip(_budget_edges, _budget_edges[1:]))

# URL template; positional placeholders are, in order:
# bedroom, property type, city name, minimum budget, maximum budget.
base_url = "https://www.magicbricks.com/property-for-sale/residential-real-estate?bedroom={}&proptype={}&cityName={}&BudgetMin={}-Lacs&BudgetMax={}-Lacs"

# Accumulates one DataFrame per successfully scraped search page.
data_frames = []

# Seconds to wait after each scroll before re-measuring the page height.
SCROLL_PAUSE_TIME = 5


def extract_city_or_locality(title):
    """Return the last two comma-separated parts of a listing title.

    Args:
        title: The scraped property title, or NaN when the title could not
            be scraped.

    Returns:
        The last two comma-separated parts joined by ',' and stripped, or
        NaN when the title is not a string or has fewer than two parts.
    """
    # A title that failed to scrape is stored as np.nan (a float), which has
    # no .split(); without this guard one bad card discarded the whole page.
    if not isinstance(title, str):
        return np.nan
    parts = title.split(',')
    if len(parts) >= 2:
        return ','.join(parts[-2:]).strip()
    return np.nan


def _first_text(node, tag, css_class):
    """Return the stripped text of the first matching child, or NaN if absent."""
    match = node.find(tag, class_=css_class)
    return match.text.strip() if match is not None else np.nan


def _text_at(nodes, index):
    """Return the stripped text of nodes[index], or NaN if the index is out of range."""
    return nodes[index].text.strip() if index < len(nodes) else np.nan


def _scroll_to_bottom(browser, pause):
    """Scroll to the page bottom repeatedly until the page height stops growing."""
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the page time to load more data before measuring again.
        time.sleep(pause)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            return
        last_height = new_height


def _parse_listing_page(soup, bhk_value, proptype):
    """Build a DataFrame of the listings found in one parsed search page.

    Args:
        soup: BeautifulSoup document of the fully scrolled search page.
        bhk_value: Bedroom count used in the search; recorded in 'BHK'.
        proptype: Property-type filter used in the search; recorded in
            'Property Type'.

    Returns:
        A DataFrame with one row per listing card.

    Raises:
        ValueError: Raised by pandas when the number of price nodes differs
            from the number of listing cards (columns of unequal length).
    """
    property_name = []
    property_title = []
    property_size = []
    furnishing = []

    cards = soup.find_all('div', class_="mb-srp__card__container")
    for card in cards:
        property_name.append(_first_text(card, 'a', "mb-srp__card__society--name"))
        property_title.append(_first_text(card, 'h2', "mb-srp__card--title"))

        # Summary values are positional: index 0 is read as the property size
        # and index 4 as the furnishing status.
        summary_values = card.find_all('div', class_="mb-srp__card__summary--value")
        property_size.append(_text_at(summary_values, 0))
        furnishing.append(_text_at(summary_values, 4))

    # Prices are looked up page-wide (not per card) and are paired with the
    # cards purely by position when the DataFrame is built.
    price_total = []
    price_per_sqft = []
    for price_node in soup.find_all('div', class_="mb-srp__card__price"):
        price_total.append(_first_text(price_node, 'div', "mb-srp__card__price--amount"))
        price_per_sqft.append(_first_text(price_node, 'div', "mb-srp__card__price--size"))

    return pd.DataFrame({
        'Property Name': property_name,
        'Property Title': property_title,
        # Property type and BHK are not scraped; they come from the search
        # filters that produced this page.
        'Property Type': [proptype] * len(cards),
        'City/Locality': [extract_city_or_locality(title) for title in property_title],
        'BHK': [bhk_value] * len(cards),
        'Property Size': property_size,
        'Furnishing': furnishing,
        'Price Total': price_total,
        'Price per Sqft': price_per_sqft
    })


# Set up the Safari driver
driver = webdriver.Safari()

# Dictionary to hold the accumulated HTML content for each BHK value
html_content_dict = {bedroom: "" for bedroom in bedroom_values}

# List to hold the search parameters of pages that could not be processed
missed_data_frames = []

try:
    # Visit one search URL per (city, bedroom, property type, budget) combination
    for cityname in cityname_values:
        for bhk_value in bedroom_values:
            for proptype in proptype_values:
                for min_budget, max_budget in budget_values:
                    url = base_url.format(bhk_value, proptype, cityname, min_budget, max_budget)

                    try:
                        driver.get(url)
                        _scroll_to_bottom(driver, SCROLL_PAUSE_TIME)

                        # Parse once and reuse the tree for both the raw HTML
                        # archive and the structured extraction.
                        soup = BeautifulSoup(driver.page_source, 'html.parser')
                        html_content_dict[bhk_value] += str(soup)

                        data_frames.append(_parse_listing_page(soup, bhk_value, proptype))

                    except Exception as e:
                        # Best effort: record the failed combination and move on
                        # so one bad page does not abort the whole crawl.
                        print(f"Error processing URL: {url}")
                        print(f"Error: {e}")
                        missed_data_frames.append((cityname, bhk_value, proptype, min_budget, max_budget))
finally:
    # Close the driver even if the crawl is interrupted part-way through
    driver.quit()

# Output locations: raw HTML dumps (one file per BHK value) and the merged CSV
directory = "/Users/guliaharsh021/Downloads/DA Documents /Projects/Project 1/Data Collection/Flats Data/HTML"
csv_path = '/Users/guliaharsh021/Downloads/DA Documents /Projects/Project 1/Data Collection/Flats Data/flats_data.csv'

# Create the output directory up front; otherwise a missing folder raises
# FileNotFoundError here, after the entire scrape has already finished.
os.makedirs(directory, exist_ok=True)

# Save the accumulated HTML content, one text file per BHK value
for bhk_value, html_content in html_content_dict.items():
    file_path = os.path.join(directory, f'flats_{bhk_value}bhk_data.txt')
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(html_content)

# Merge all per-page data frames into one and save it as CSV
if data_frames:
    final_df = pd.concat(data_frames, ignore_index=True)
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    final_df.to_csv(csv_path, index=False)
else:
    print("No data frames were created successfully.")

# Report the search combinations whose pages could not be processed
if missed_data_frames:
    print("Missed the following data frames:")
    for item in missed_data_frames:
        print(item)
else:
    print("No data frames were missed.")



No data frames were missed.
