## Import essential modules/libraries

In [5]:
# Install webdriver-manager; it provides the ChromeDriverManager that launch_chrome uses to obtain a chromedriver.
%pip install webdriver-manager

Collecting webdriver-managerNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading webdriver_manager-3.8.6-py2.py3-none-any.whl (27 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.0.0 webdriver-manager-3.8.6


In [1]:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

## Function to launch browser and get a given link

In [2]:
def launch_chrome(link, headless=False):
    """Start a Chrome session and navigate it to ``link``.

    Args:
        link: URL to load once the browser has started.
        headless: When true, Chrome is started with the ``--headless`` argument.

    Returns:
        The ``webdriver.Chrome`` instance after ``link`` has been requested.
        The caller owns it and must call ``quit()`` when finished.

    Raises:
        Any exception raised by ``driver.get``; the browser is shut down
        before the exception propagates.
    """
    ops = Options()  # create object of Options

    if headless:
        ops.add_argument('--headless')  # headless browser testing

    # prevents browser from closing when function is returned
    ops.add_experimental_option("detach", True)

    driver = webdriver.Chrome(options=ops, service=Service(ChromeDriverManager().install()))

    try:
        driver.get(link)
    except Exception:
        # The driver is never handed to the caller on failure, so this is the
        # only place that can still close the browser that was just started.
        driver.quit()
        raise
    return driver

Flow:
1. Open the category listing: https://www.amazon.in/s?rh=n%3A6612025031&fs=true&ref=lp_6612025031_sar
2. Build the links to result pages 1 to 257.
3. For each result page:
4.  collect the link to every product listed on it.
5. On each product page, scrape the
    "Product Name", "Price", "Rating" and "Seller Name" using their respective XPaths.
6. Store the results in a CSV file.

In [9]:
def get_link_divs(driver):
    """Return the anchor elements that link to products on a results page.

    The results container is located by an absolute XPath; the anchors are
    then found beneath it with a relative XPath ending in ``h2/a``.

    Args:
        driver: A Selenium driver already showing a search results page.

    Returns:
        The list of matching anchor elements.
    """
    container_xpath = '//*[@id="search"]/div[1]/div[1]/div/span[1]/div[1]'
    anchor_xpath = './/div/div/div/div/div/div/div/h2/a'

    results_container = driver.find_element(By.XPATH, container_xpath)
    product_anchors = results_container.find_elements(By.XPATH, anchor_xpath)
    return product_anchors


## Function to get data from each product link

In [19]:
# Locators for the fields read from a product page.
# Every entry is an XPath except 'price', which is a class name
# (get_data looks it up with By.CLASS_NAME).
paths = dict(
    title='//*[@id="productTitle"]',
    price='a-price-whole',
    rating='//*[@id="acrPopover"]/span[1]/a/span',
    availability='//*[@id="availability_feature_div"]',
    seller='//*[@id="merchant-info"]/a[1]/span',
)

def _first_text(driver, by, locator):
    """Return the text of the first element matching the locator, or None if nothing matches."""
    # find_elements returns an empty list instead of raising when nothing matches
    matches = driver.find_elements(by, locator)
    return matches[0].text if matches else None


def get_data(driver):
    """Read the product fields from the product page currently loaded in ``driver``.

    Fields whose element is not present on the page come back as ``None``
    instead of raising, so one incomplete product page does not abort a
    whole scraping run.

    Args:
        driver: A Selenium driver already showing a product page.

    Returns:
        A tuple ``(title, rating, available, price, seller)``. ``available``
        is a bool; ``price`` and ``seller`` are ``None`` when the product is
        not available or the element is missing.
    """
    title = _first_text(driver, By.XPATH, paths['title'])
    rating = _first_text(driver, By.XPATH, paths['rating'])

    # The availability block has text 'Currently unavailable.' when out of stock,
    # so any non-empty text is treated as "unavailable"; empty or missing means available.
    availability_text = _first_text(driver, By.XPATH, paths['availability'])
    available = not availability_text

    if available:
        price = _first_text(driver, By.CLASS_NAME, paths['price'])
        seller = _first_text(driver, By.XPATH, paths['seller'])
    else:
        price, seller = None, None

    # title may be None when the title element is missing, so guard the slice
    print((title or '')[:10], ' | ', rating, ' | ', available, ' | ', price, ' | ', seller)
    return (title, rating, available, price, seller)

In [None]:
# Smoke test: run get_data against one known product page in a visible (non-headless) browser.
tdriver = launch_chrome('https://www.amazon.in/MI-Lithium-Wireless-10000mAh-Charging/dp/B0BC1SCV6K/ref=sr_1_25?qid=1685635841&s=electronics&sr=1-25')
try:
    get_data(tdriver)
finally:
    # close the browser even when get_data raises
    tdriver.quit()

In [20]:
def scrape_pages(i, j):
    """Scrape every product on listing pages ``i`` .. ``j - 1`` and append the rows to ``amazon.csv``.

    For each listing page a headless browser is opened to collect the product
    links; each product is then opened in its own headless browser and read
    with ``get_data``. Every browser is closed again even when scraping fails
    part-way through.

    Args:
        i: First listing page number to scrape (inclusive).
        j: Listing page number to stop at (exclusive, as in ``range(i, j)``).

    Returns:
        None. One CSV row per product is appended to ``amazon.csv``; a header
        row is written first when the file is still empty.
    """
    keys = ['page_no', 'num', 'title', 'rating', 'available',
            'price', 'seller', 'page_link', 'product_link']

    # append so that successive calls for different page ranges accumulate in one file
    with open('amazon.csv', 'a', encoding='utf8', newline='') as csv_file:
        dict_writer = csv.DictWriter(csv_file, keys)

        # In append mode the stream starts at the end of the file, so position 0
        # means the file is new or empty: write the header exactly once.
        if csv_file.tell() == 0:
            dict_writer.writeheader()

        for page_no in range(i, j):  # listing pages i .. j-1
            page_link = f"https://www.amazon.in/s?i=electronics&rh=n%3A6612025031&fs=true&page={page_no}&qid=1685600480&ref=sr_pg_{page_no}"

            # launch headless browser on the listing page
            driver = launch_chrome(page_link, headless=True)
            try:
                print('On page: ', page_link)

                anchor_tags = get_link_divs(driver)

                print("Products on this page: ",len(anchor_tags))

                for num, a in enumerate(anchor_tags):

                    # launch another driver called product (i.e. pointed at the product link)
                    link = a.get_attribute('href')
                    product = launch_chrome(link, headless=True)
                    try:
                        print('No. ', num, 'Product: ', link)
                        title, rating, available, price, seller = get_data(product)
                        data = {'page_no': page_no,
                                'num': num,
                                'title': title,
                                'rating': rating,
                                'available': available,
                                'price': price,
                                'seller': seller,
                                'page_link': page_link,
                                'product_link': link}

                        # add row in csv file
                        dict_writer.writerow(data)
                    finally:
                        # close the product browser even if scraping or writing failed
                        product.quit()
            finally:
                # close the listing browser even if a product on this page failed
                driver.quit()


In [7]:
# Scrape listing page 1 only: scrape_pages iterates range(1, 2), so the end bound is exclusive.
scrape_pages(1, 2)

On page:  https://www.amazon.in/s?i=electronics&rh=n%3A6612025031&fs=true&page=1&qid=1685600480&ref=sr_pg_1
No.  0 Product:  https://www.amazon.in/20000mAh-Sandstone-Triple-Charging-Delivery/dp/B08HV83HL3/ref=sr_1_1?qid=1685610088&s=electronics&sr=1-1
>>  MI Power B 4.2 True 2,149 Cocoblu Retail
No.  1 Product:  https://www.amazon.in/10000mAH-Li-Polymer-Power-Charging-Midnight/dp/B08HVL8QN3/ref=sr_1_2?qid=1685610088&s=electronics&sr=1-2
>>  Mi 10000mA 4.2 True 1,299 Cocoblu Retail
No.  2 Product:  https://www.amazon.in/Pocket-10000mAh-Triple-Charging-Delivery/dp/B08MC57J31/ref=sr_1_3?qid=1685610088&s=electronics&sr=1-3
>>  MI 10000mA 4.3 True 1,699 Cocoblu Retail
No.  3 Product:  https://www.amazon.in/Ambrane-Multi-Layer-Protection-Li-Polymer-Stylo-10k/dp/B0993BB11X/ref=sr_1_4?qid=1685610088&s=electronics&sr=1-4
>>  Ambrane 10 3.9 True 999 Cocoblu Retail
No.  4 Product:  https://www.amazon.in/Ambrane-20000mAh-Lithium-Polymer-Stylo-20K/dp/B07RD611Z8/ref=sr_1_5?qid=1685610088&s=electroni

In [21]:
# Scrape listing page 2 only: scrape_pages iterates range(2, 3), so the end bound is exclusive.
scrape_pages(2, 3)

On page:  https://www.amazon.in/s?i=electronics&rh=n%3A6612025031&fs=true&page=2&qid=1685600480&ref=sr_pg_2
Products on this page:  24
No.  0 Product:  https://www.amazon.in/Redmi-10000-Charging-Power-Black/dp/B0851WMSDS/ref=sr_1_25?qid=1685637217&s=electronics&sr=1-25
Redmi Lith  |  4.2  |  True  |  1,199  |  Cocoblu Retail
No.  1 Product:  https://www.amazon.in/MI-Lithium-Wireless-10000mAh-Charging/dp/B0BC1SCV6K/ref=sr_1_26?qid=1685637217&s=electronics&sr=1-26
MI Lithium  |  4.0  |  True  |  2,499  |  Cocoblu Retail
No.  2 Product:  https://www.amazon.in/Belkin-Charges-Devices-simultaneously-Samsung/dp/B092TNMFYB/ref=sr_1_27?qid=1685637217&s=electronics&sr=1-27
Belkin 100  |  4.2  |  True  |  1,699  |  Cocoblu Retail
No.  3 Product:  https://www.amazon.in/10000mAH-Lithium-Polymer-Charging-Included/dp/B09X5SPFZ2/ref=sr_1_28?qid=1685637217&s=electronics&sr=1-28
Amazon Bas  |  3.8  |  True  |  1,199  |  Appario Retail Private Ltd
No.  4 Product:  https://www.amazon.in/URBN-Li-Polymer-Ch