In [1]:
pip install fake-useragent

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
from fake_useragent import UserAgent  # Install this library using: pip install fake-useragent

def _text_or_na(node):
    """Return the stripped text of a parsed element, or 'N/A' if the lookup found nothing."""
    return node.text.strip() if node is not None else 'N/A'


def scrape_amazon(search_keyword, num_pages, timeout=10, delay=2):
    """Scrape amazon.in search-result pages into a DataFrame.

    Pages 1..num_pages of the search for ``search_keyword`` are requested one
    after another. A page whose request fails is reported with ``print`` and
    skipped; the remaining pages are still fetched.

    Parameters
    ----------
    search_keyword : str
        Search term, sent as the ``k`` query parameter.
    num_pages : int
        Number of result pages to request, starting at page 1.
    timeout : float, optional
        Seconds passed to ``requests.get`` as ``timeout`` so a stalled
        connection cannot block the notebook indefinitely.
    delay : float, optional
        Seconds to wait between two consecutive page requests.

    Returns
    -------
    pandas.DataFrame
        One row per result card with the string columns ``brand_name``,
        ``rating``, ``reviews`` and ``price``. A field that is missing from a
        card is stored as ``'N/A'``. The columns are present even when no card
        was found.
    """
    base_url = 'https://www.amazon.in/s'
    columns = ['brand_name', 'rating', 'reviews', 'price']
    # NOTE(review): this matches the card's complete class attribute verbatim,
    # including what looks like a generated token ("puis-v1x9..."). If the scrape
    # suddenly returns no rows, check whether the page markup changed.
    card_class = 'puis-card-container s-card-container s-overflow-hidden aok-relative puis-include-content-margin puis puis-v1x9xtdrqwkf2b2sy3gv3bhvhpn s-latency-cf-section puis-card-border'
    data = []

    # A single random User-Agent is chosen per call and reused for every page.
    headers = {'User-Agent': UserAgent().random}

    for page in range(1, num_pages + 1):
        # Pause between requests (also after a failed one) to avoid overloading
        # the server; no pause before the first page or after the last one.
        if page > 1:
            time.sleep(delay)

        params = {'k': search_keyword, 'page': page}

        try:
            webpage = requests.get(base_url, params=params, headers=headers, timeout=timeout)
            webpage.raise_for_status()  # Raise an HTTPError for bad responses
        except requests.exceptions.HTTPError as errh:
            print ("HTTP Error:", errh)
            continue
        except requests.exceptions.ConnectionError as errc:
            print ("Error Connecting:", errc)
            continue
        except requests.exceptions.Timeout as errt:
            print ("Timeout Error:", errt)
            continue
        except requests.exceptions.RequestException as err:
            print ("Something went wrong:", err)
            continue

        soup = BeautifulSoup(webpage.content, 'html.parser')

        for card in soup.find_all('div', class_=card_class):
            # Each element is looked up once; a card lacking any of them
            # (including the <h2> title) yields 'N/A' instead of raising.
            data.append({
                'brand_name': _text_or_na(card.find('h2')),
                'rating': _text_or_na(card.find('span', class_='a-icon-alt')),
                'reviews': _text_or_na(card.find('span', class_='a-size-base s-underline-text')),
                'price': _text_or_na(card.find('span', class_='a-price-whole')),
            })

    # Passing the column list keeps the schema stable when nothing was scraped.
    return pd.DataFrame(data, columns=columns)

# Demo run: scrape the first three result pages for the keyword "TV".
search_keyword, num_pages_to_scrape = 'TV', 3
df = scrape_amazon(search_keyword=search_keyword, num_pages=num_pages_to_scrape)
# Leaving the frame as the last expression renders it as the cell output.
df

Unnamed: 0,brand_name,rating,reviews,price
0,Mi 189.34cm (75 inches) Q1 Series 4K Smart QLE...,4.3 out of 5 stars,562,144999
1,Panasonic 80 cm (32 inches) HD Ready Smart LED...,4.3 out of 5 stars,3416,15990
2,iFFALCON 80.04 cm (32 inches) Bezel-Less S Ser...,4.1 out of 5 stars,3561,7990
3,Samsung 80 cm (32 inches) HD Ready Smart LED T...,4.2 out of 5 stars,12955,14990
4,Redmi 80 cm (32 inches) F Series HD Ready Smar...,4.2 out of 5 stars,59955,13999
...,...,...,...,...
61,Acer,4.1 out of 5 stars,5188,29999
62,VW,4.1 out of 5 stars,3593,14999
63,KODAK,4.4 out of 5 stars,1084,23999
64,Acer,4.1 out of 5 stars,5188,17999
