In [1]:
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
#function to extract product title
def get_title(soup):
    try:
        #outer tag object
        title = soup.find("span", attrs = {"id":'productTitle'})

        #inner navigatable string object
        title_value = title.text

        #title as a string value
        title_string = title_value.strip()

    except AtrributeError:
        title_tring = ""

    return title_string


# function to extract product price
def get_price(soup):
    try:
        price = soup.find("span", attrs={'class':'a-offscreen'}).string.strip()

    except AttributeError:
        try:
            # if there is some deal price
            price = soup.find("span", attrs = {'class':'a-price-whole'}).string.strip()
        except:
            price = ""
    return price


# function to extract product rating
def get_rating(soup):
    try:
        rating = soup.find("i", attrs = {'class':'a-icon a-icon-star a-star-4-5 cm-cr-review-stars-spacing-big'}).string.strip()

    except AttributeError:
        try:
            rating = soup.find("span",attrs = {'class':'a-icon-alt'}).string.strip()
        except:
            rating = ""
    return rating

# function to extract number of user reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs = {'id':'acrCustomerReviewText'}).string.strip()

    except AttributeError:
        review_count = ""

    return review_count

#function to extract Availability status
def get_specifications(soup):
    try:
        specification = soup.find("table", attrs ={"class":"a-normal a-spacing-micro"}).text.strip()
    except AttributeError:
        specification = ""

    return specification

In [3]:
if __name__ == '__main__':
    # user agent
    # headers for request
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36','Accept-Language':'en-US, en:q=0.5'})

    # webpage url
    BASE_URL = "https://www.amazon.com/s?k=laptop"

    # data dictionary
    d = {"title":[], "price":[], "rating":[], "reviews":[], "specifications":[]}

    #loop through the page
    for page in range(1, 11):
        print(f"Scraping page {page}...")
        URL = f"{BASE_URL}&page={page}"

        # http request
        webpage = requests.get(URL, headers=HEADERS)
    
        #soup object containing all data
        soup = BeautifulSoup(webpage.content, "html.parser")
    
        # fetch links as list of tag objects
        links = soup.find_all("a",attrs={'class':'a-link-normal s-line-clamp-2 s-link-style a-text-normal'})
        links_list = [link.get('href') for link in links]

    
  
        #loop for extracting product details from each link
        for link in links_list:
            new_url = "https://www.amazon.com" + link
            new_webpage = requests.get(new_url, headers = HEADERS)  
            new_soup = BeautifulSoup(new_webpage.content, "html.parser")
    
            #function calls to display all necessary product information
            d['title'].append(get_title(new_soup))
            d['price'].append(get_price(new_soup))
            d['rating'].append(get_rating(new_soup))
            d['reviews'].append(get_review_count(new_soup))
            d['specifications'].append(get_specifications(new_soup))

    amazon_laptop_df = pd.DataFrame.from_dict(d)
                                             
    amazon_laptop_df['title'] = amazon_laptop_df['title'].replace('', np.nan)
    amazon_laptop_df = amazon_laptop_df.dropna(subset=['title'])
    #amazon_laptop_df.to_csv("amazon_laptop_df.csv", sep=';', header=True, index=False)

    amazon_laptop_df.to_excel("amazon_laptop_df.xlsx",index=False)

        

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...


In [4]:
# Display the scraped products table (rows with an empty title were already dropped).
amazon_laptop_df

Unnamed: 0,title,price,rating,reviews,specifications
0,Amazon Fire Max 11 tablet (newest model) vivid...,Page 1 of 1,4.4 out of 5 stars,"10,819 ratings",
1,Amazon Fire 7 tablet (newest model) 7” display...,Page 1 of 1,4.4 out of 5 stars,"18,163 ratings",
2,Acer Aspire 3 A315-24P-R7VH Slim Laptop | 15.6...,$279.99,4.4 out of 5 stars,"39,484 ratings",Brand acer Model Name Laptop Scree...
3,"HP 14 Laptop, Intel Celeron N4020, 4 GB RAM, 6...",$178.48,4.0 out of 5 stars,"2,015 ratings",Brand HP Model Name 14-dq0040nr Sc...
4,"Laptop, 2024 Gaming Laptop Computer with Intel...","$1,399.99",5.0 out of 5 stars,4 ratings,Brand KAIGERR Model Name AX15 Scre...
...,...,...,...,...,...
215,"OTVOC Laptop 15.6 inch Windows 11, Celeron N51...",$279.99,4.5 out of 5 stars,9 ratings,Brand OTVOC Model Name VocBook 15 Pro ...
216,"HP 14"" Ultral Light Laptop for Students and Bu...",$239.96,3.7 out of 5 stars,202 ratings,Brand HP Model Name HP Screen Size...
217,"ACEMAGIC 2024 Laptop, 16.1-inch FHD Display La...",Page 1 of 1,4.3 out of 5 stars,75 ratings,Brand ACEMAGIC Model Name AX16PRO ...
218,"HP Stream 14"" HD Laptop, Intel Celeron N4120, ...",$399.00,4.5 out of 5 stars,2 ratings,Brand HP Model Name HP Stream Scre...


In [4]:
pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.
