<a href="https://colab.research.google.com/github/chetankhairnar05/Python_Automation/blob/main/web_scrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# This is the most reliable setup for Selenium in Google Colab.
# It uses a library specifically designed to handle Colab's environment.
# Run all these commands in a single cell in your Colab notebook.

!pip install google-colab-selenium pandas openpyxl

import pandas as pd
import time
import os
from google_colab_selenium import Chrome
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# XPath matching one product card: any <div> carrying a data-id attribute.
_PRODUCT_CARD_XPATH = '//div[@data-id]'

# Output key -> XPath, evaluated relative to a single product card.
# starts-with() is used so that a class attribute with extra trailing
# classes still matches. Insertion order is the column order of each record.
_FIELD_XPATHS = {
    'name': './/div[starts-with(@class, "_4rR01T")]',
    'price': './/div[starts-with(@class, "_30jeq3")]',
    'original_price': './/div[starts-with(@class, "_3I9_wc")]',
    'rating': './/div[starts-with(@class, "_3LWZlK")]',
    'rating_count': './/span[starts-with(@class, "_2_R_DZ")]',
}


def _open_page(url):
    """Start a browser and load *url*; return the driver, or None on failure."""
    driver = None
    try:
        # The Chrome() function from the library handles all the setup,
        # making it much more reliable in Colab's environment.
        driver = Chrome()
        driver.get(url)

        # Allow more time for the page to load, especially for dynamic content on e-commerce sites
        time.sleep(10)
    except Exception as e:
        print(f"Failed to initialize WebDriver or load the page. Please check your URL and Colab environment. Error: {e}")
        # The browser may already be running (e.g. Chrome() worked but get()
        # failed); shut it down so the process is not leaked.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"Could not close the browser cleanly: {quit_error}")
        return None
    return driver


def _extract_text(card, xpath, field):
    """Return the text of the first element under *card* matching *xpath*, or None."""
    try:
        return card.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        # Expected case: this card simply does not have the field.
        return None
    except Exception as e:
        # Stay best-effort (one bad field must not abort the whole scrape),
        # but report the problem instead of hiding it.
        print(f"Could not read '{field}' from a product card: {e}")
        return None


def scrape_webpage(url):
    """
    Scrapes product information from a product listing page.

    Product cards are located as ``div`` elements that have a ``data-id``
    attribute, and each field is read from a child element whose ``class``
    starts with a hard-coded prefix. The function therefore only yields data
    on pages that use this markup.

    Args:
        url (str): The URL of the webpage to scrape.

    Returns:
        list: A list of dictionaries, one per product, with the keys
              'name', 'price', 'original_price', 'rating' and
              'rating_count'. A field that is absent from a card is None.
              Cards without a name are skipped. An empty list is returned
              when the browser cannot be started, the page cannot be
              loaded, or no product cards are found.
    """
    print("Initializing web driver with a Colab-specific stable setup...")

    driver = _open_page(url)
    if driver is None:
        return []

    products_data = []

    try:
        print("Finding product cards...")
        product_cards = driver.find_elements(By.XPATH, _PRODUCT_CARD_XPATH)

        print(f"Found {len(product_cards)} product cards.")

        if not product_cards:
            print("No product cards were found with the specified XPath.")
            return []

        for i, card in enumerate(product_cards):
            data = {
                field: _extract_text(card, xpath, field)
                for field, xpath in _FIELD_XPATHS.items()
            }

            if data['name']: # Only add products that have at least a name
                products_data.append(data)
                print(f"Scraped product {i+1}: {data['name']}")

    except Exception as e:
        # Keep whatever was collected before the failure.
        print(f"An error occurred during scraping: {e}")
    finally:
        # Runs on every exit path above, including the early return.
        print("Closing the browser...")
        driver.quit()

    return products_data

def save_to_excel(data, filename="scraped_products.xlsx"):
    """
    Write scraped records to an Excel workbook.

    Args:
        data (list): Dictionaries to write. A falsy value (such as an
            empty list or None) writes nothing and only prints a notice.
        filename (str): Path of the workbook to create.
    """
    # Guard clause: with nothing to write, report it and stop.
    if not data:
        print("\nNo data to save.")
        return

    # index=False keeps the DataFrame's row index out of the sheet.
    pd.DataFrame(data).to_excel(filename, index=False)
    print(f"\nData successfully saved to {os.path.abspath(filename)}")

# --- Main script execution ---
if __name__ == "__main__":
    # Page to scrape when this file is run as a script / notebook cell.
    webpage_url = "https://www.flipkart.com/samsung-mobile-store?otracker=nmenu_sub_Electronics_0_Samsung"

    scraped_data = scrape_webpage(webpage_url)

    # Handle the empty result first; otherwise write the workbook.
    if not scraped_data:
        print("No product data was scraped.")
    else:
        save_to_excel(scraped_data)
