In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from datetime import datetime

HEADERS = {
    'authority': 'www.amazon.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="102", "Google Chrome";v="102"',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}

In [2]:
# Function to extract Product Title
def get_title(soup):

    try:
        # Outer Tag Object
        title = soup.find("span", attrs={"id":'productTitle'})
        
        # Inner NavigatableString Object
        title_value = title.text

        # Title as a string value
        title_string = title_value.strip()

    except AttributeError:
        title_string = ""

    return title_string

# Function to extract Product Price
def get_price(soup):

    try:
        price = soup.find("span", attrs={'class':'aok-offscreen'}).string

    except AttributeError:

        try:
            # If there is some deal price
            price = soup.find("span", attrs={'id':'priceblock_dealprice'}).string.strip()

        except:
            price = ""

    return price

# Function to extract Product Rating
def get_rating(soup):

    try:
        rating = soup.find("i", attrs={'class':'a-icon a-icon-star a-star-4-5'}).string.strip()
    
    except AttributeError:
        try:
            rating = soup.find("span", attrs={'class':'a-icon-alt'}).string.strip()
        except:
            rating = ""	

    return rating

# Function to extract Number of User Reviews
def get_review_count(soup):
    try:
        review_count = soup.find("span", attrs={'id':'acrCustomerReviewText'}).string.strip()

    except AttributeError:
        review_count = ""	

    return review_count

# Function to extract Availability Status
def get_availability(soup):
    try:
        available = soup.find("div", attrs={'id':'availability'})
        available = available.find("span").string.strip()

    except AttributeError:
        available = "Not Available"	

    return available

def get_description(soup):
    try:
        available = soup.find('div', class_='a-expander-content a-expander-partial-collapse-content').text

    except AttributeError:
        available = "Not Available"	

    return available

def get_review_link(soup):
    try:
        a_tag = soup.find('a', class_='a-link-emphasis a-text-bold')
        available = 'https://www.amazon.com' + a_tag.get('href')

    except AttributeError:
        available = "Not Available"	

    return available

def get_sponsered_brand(soup):
    try:
        span_element = soup.find('span', class_='a-color-base')
        if span_element is not None:
            amazon_sponsored = span_element.text.strip()
        else:
            amazon_sponsored = ""
    except AttributeError:
        amazon_sponsored = ""
    
    return amazon_sponsored

In [3]:
# Extra Data as Html object from amazon Review page
def reviewsHtml(url, len_page):
    
    # Empty List define to store all pages html data
    soups = []
    # Request make for each page
    response = requests.get(url, headers=HEADERS)
    
    # Save Html object by using BeautifulSoup4 and lxml parser
    soup = BeautifulSoup(response.text, 'lxml')
    # Add single Html page data in master soups list
    soups.append(soup)
    # Loop for gather all 3000 reviews from 300 pages via range
    for page_no in range(1, len_page + 1):

        url = url + f'&page_number{page_no+1}'
        # parameter set as page no to the requests body
        params = {
            'ie': 'UTF8',
            'reviewerType': 'all_reviews',
            'filterByStar': 'critical',
            'pageNumber': page_no,
        }
        try:
            # Request make for each page
            response = requests.get(url, headers=HEADERS)
            
            # Save Html object by using BeautifulSoup4 and lxml parser
            soup = BeautifulSoup(response.text, 'lxml')

            # Add single Html page data in master soups list
            soups.append(soup)
        except: break
        
    return soups

In [4]:
# Grab Reviews name, description, date, stars, title from HTML
def getReviews(html_data):

    # Create Empty list to Hold all data
    data_dicts = []
    
    # Select all Reviews BOX html using css selector
    boxes = html_data.select('div[data-hook="review"]')
    
    # Iterate all Reviews BOX 
    for box in boxes:
        
        # Select Name using css selector and cleaning text using strip()
        # If Value is empty define value with 'N/A' for all.
        try:
            name = box.select_one('[class="a-profile-name"]').text.strip()
        except Exception as e:
            name = 'N/A'

        try:
            stars = box.select_one('[data-hook="review-star-rating"]').text.strip().split(' out')[0]
        except Exception as e:
            stars = 'N/A'   

        try:
            title = box.select_one('[data-hook="review-title"]').text.strip()
        except Exception as e:
            title = 'N/A'

        try:
            # Convert date str to dd/mm/yyy format
            datetime_str = box.select_one('[data-hook="review-date"]').text.strip().split(' on ')[-1]
            date = datetime.strptime(datetime_str, '%B %d, %Y').strftime("%d/%m/%Y")
        except Exception as e:
            date = 'N/A'

        try:
            description = box.select_one('[data-hook="review-body"]').text.strip()
        except Exception as e:
            description = 'N/A'

        try:
            verified = box.select_one('[data-hook="avp-badge"]').text.strip()
        except Exception as e:
            verified = 'N/A'

        # create Dictionary with al review data 
        data_dict = {
            'Name' : name,
            'Stars' : stars,
            'Title' : title,
            'Date' : date,
            'Description' : description,
            'Verified' : verified
        }

        # Add Dictionary in master empty List
        data_dicts.append(data_dict)
    
    return data_dicts

In [5]:
def get_all_reviews(reviews_url):
    len_page = 2

    try:
        # Grab all HTML
        html_datas = reviewsHtml(reviews_url, len_page)

        # Empty List to Hold all reviews data
        reviews = []

        # Iterate all Html page 
        for html_data in html_datas:
            
            # Grab review data
            review = getReviews(html_data)
            
            # add review data in reviews empty list
            reviews += review

    except: reviews = 'Not Available'

    return reviews

In [89]:
if __name__ == '__main__':

    # add your user agent 
    HEADERS = ({'User-Agent':'Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.0) Opera 7.03 [en]', 'Accept-Language': 'en-US, en;q=0.5'})

    # The base webpage URL
    URL = "https://www.amazon.com/s?k="

    # Get user input for the search query
    search = input("What do you want to search on Amazon? ")
    search = search.replace(" ", "+")
    
    # Combine the base URL with the modified search query
    URL = URL + search
    
    # HTTP Request
    webpage = requests.get(URL, headers=HEADERS)

    # Soup Object containing all data
    soup = BeautifulSoup(webpage.content, "html.parser")
    a_tag = soup.find('a', class_='s-pagination-item s-pagination-next s-pagination-button s-pagination-separator')
    pages = []
    page = URL
    pages.append(page)
    more_pages = True
    while more_pages:
        try:
            # HTTP Request
            webpage = requests.get(page, headers=HEADERS)

            # Soup Object containing all data
            soup = BeautifulSoup(webpage.content, "html.parser")
            a_tag = soup.find('a', class_='s-pagination-item s-pagination-next s-pagination-button s-pagination-separator')
            page = 'https://www.amazon.com' + a_tag.get('href')
            pages.append(page)

        except: more_pages = False

    links = []
    for p in pages:
        # Fetch links as List of Tag Objects
        # links_page = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'})
        # sponsored = soup.find_all("span", attrs={'class':'a-color-secondary'})

        # HTTP Request
        webpage = requests.get(p, headers=HEADERS)

        # Soup Object containing all data
        soup = BeautifulSoup(webpage.content, "html.parser")

        links_page = soup.find_all("div", attrs={'class':'puis-card-container s-card-container s-overflow-hidden aok-relative puis-expand-height puis-include-content-margin puis puis-v36g8q2u37vpji29sg9uhoxaczm s-latency-cf-section puis-card-border'})
        links = links + links_page
    
    # Store the links
    links_list = []
    sponsored_list = []

    # Loop for extracting links from Tag Objects
    for link in links:
            link_element = link.find('a', class_='a-link-normal s-no-outline')
            links_list.append(link_element.get('href') if link_element else None)

            text_element = link.find('span', class_='a-color-secondary')
            sponsored_list.append(text_element.get_text(strip=True) if text_element else "Not Sponsored")

    d = {"title":[], "price":[], "rating":[], "num_reviews":[],"availability":[], "product_link":[], "description":[], "sponsored":[], "reviews_link":[], "reviews":[]}
    
    # Loop for extracting product details from each link 
    id = 0
    for link in links_list:
        product_link = "https://www.amazon.com" + link
        new_webpage = requests.get("https://www.amazon.com" + link, headers=HEADERS)

        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Function calls to display all necessary product information
        d['title'].append(get_title(new_soup))
        d['product_link'].append(product_link)
        d['price'].append(get_price(new_soup))
        d['rating'].append(get_rating(new_soup))
        d['num_reviews'].append(get_review_count(new_soup))
        d['availability'].append(get_availability(new_soup))
        d['description'].append(get_description(new_soup))
        d['sponsored'].append(sponsored_list[id])
        d['reviews_link'].append(get_review_link(new_soup))
        d['reviews'].append(get_all_reviews(get_review_link(new_soup)))
        id +=1

    amazon_df = pd.DataFrame.from_dict(d)
    amazon_df['title'].replace('', np.nan, inplace=True)
    amazon_df = amazon_df.dropna(subset=['title'])
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)


In [90]:
pages

['https://www.amazon.com/s?k=women+jacket']

In [91]:
# pd.set_option('display.max_rows', 20)

print(amazon_df.shape)
amazon_df.head(20)

(15, 10)


Unnamed: 0,title,price,rating,num_reviews,availability,product_link,description,sponsored,reviews_link,reviews
0,Amazon Essentials Women's Lightweight Long-Sle...,$24.50,4.5 out of 5 stars,"20,589 ratings",In Stock,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Product details \nFabric type \n \nShell...,Featured from Amazon brands,https://www.amazon.com/Amazon-Essentials-Light...,[]
1,MOERDENG Women's Waterproof Ski Jacket Warm Wi...,$48.99 with 46 percent savings,4.5 out of 5 stars,"34,402 ratings",In Stock,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Product details \nFabric type \n \n100% ...,Sponsored,https://www.amazon.com/MOERDENG-Waterproof-Mou...,[]
2,Vetinee Women’s Oversized Button Up Frayed Hem...,$46.96 with 8 percent savings,4.2 out of 5 stars,"2,450 ratings",In Stock,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Product details \nFabric type \n \n80% C...,Sponsored,https://www.amazon.com/Vetinee-Oversized-Boyfr...,[]
3,"90 Degree By Reflex Women’s Lightweight, Full ...",$39.99,4.5 out of 5 stars,"7,847 ratings",In Stock,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Product details \nFabric type \n \n72% N...,Sponsored,https://www.amazon.com/90-Degree-Reflex-Brushe...,[]
7,Amazon Essentials Women's Lightweight Long-Sle...,About this item,4.5 out of 5 stars,"20,589 ratings",Not Available,https://www.amazon.com/Amazon-Essentials-Light...,Product details \nFabric type \n \nShell...,100+ bought in past month,https://www.amazon.com/Amazon-Essentials-Light...,[]
10,THE NORTH FACE Teen Glacier Lightweight Full Z...,$36.00 with 39 percent savings,4.4 out of 5 stars,123 ratings,In Stock,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Product details \nFabric type \n \n100% ...,Sponsored,https://www.amazon.com/NORTH-FACE-Glacier-Ligh...,"[{'Name': 'The Review', 'Stars': '5.0', 'Title..."
11,ECOWISH Women Cardigan Jacket Coat - Fall 2023...,$49.98,4.2 out of 5 stars,30 ratings,In Stock,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Product details \nFabric type \n \n100% ...,Sponsored,https://www.amazon.com/ECOWISH-Women-Cardigan-...,[]
12,SheKiss Womens Camouflage Shacket Jacket Coats...,$42.99,4.1 out of 5 stars,"2,304 ratings",In Stock,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Product details \nFabric type \n \nMilit...,Sponsored,https://www.amazon.com/Military-Jackets-Camouf...,[]
13,CHICWISH Women's Classy Light Tan/Black Open F...,$49.90 with 29 percent savings,4.1 out of 5 stars,"3,822 ratings",In Stock,https://www.amazon.com/sspa/click?ie=UTF8&spc=...,Product details \nFabric type \n \n100% ...,Sponsored,https://www.amazon.com/Chicwish-Womens-Classy-...,[]
16,Columbia Women's Fast Trek Ii Jacket,About this item,Previous page of related Sponsored Products,,Not Available,https://www.amazon.com/Columbia-Womens-Jacket-...,Product details \nFabric type \n \n100% ...,50+ bought in past month,https://www.amazon.com/Columbia-Womens-Fast-Tr...,"[{'Name': 'SZ', 'Stars': '5.0', 'Title': '5.0 ..."


In [88]:
amazon_df.to_csv(f'amazon_data_{search}.csv')

In [None]:
import requests
from pprint import pprint

search = input('What do you want to search')

# Structure payload.
payload = {
    'source': 'amazon_search',
    'domain': 'com',
    'query': search,
    'start_page': 1,
    'pages': 6,
    'parse': True,
    'context': [
        {'key': 'category_id', 'value': 16391693031}
    ],
}

# Get response.
response = requests.request(
    'POST',
    'https://realtime.oxylabs.io/v1/queries',
    auth=('jnossa', 'yF3QT4vqS_MrB7'),
    json=payload,
)

# Print prettified response to stdout.
pprint(response.json())

{'job': {'_links': [{'href': 'http://data.oxylabs.io/v1/queries/7171795812823870465',
                     'method': 'GET',
                     'rel': 'self'},
                    {'href': 'http://data.oxylabs.io/v1/queries/7171795812823870465/results',
                     'method': 'GET',
                     'rel': 'results'},
                    {'href': 'http://data.oxylabs.io/v1/queries/7171795812823870465/results?type=raw',
                     'method': 'GET',
                     'rel': 'results-html'},
                    {'href': 'http://data.oxylabs.io/v1/queries/7171795812823870465/results?type=parsed',
                     'method': 'GET',
                     'rel': 'results-parsed'}],
         'browser_instructions': None,
         'callback_url': 'https://realtime.oxylabs.io:443/api/done',
         'client_id': 49775,
         'client_notes': None,
         'content_encoding': 'utf-8',
         'context': [{'key': 'force_headers', 'value': None},
                     

In [None]:
import pandas as pd

# Extract JSON content
json_content = response.json()

attributes = ['asin', 'is_amazons_choice', 'is_sponsored', 'best_seller', 'manufacturer', 'pos', 'price', 'rating', 'reviews_count', 'title', 'url', 'page_nbr', 'subgroup']

data = []
for page in json_content['results']:
    page_nbr = page['content']['page']
    for subgroup in page['content']['results']:

        if subgroup == 'paid': 
            # Extract ASINs from the list of dictionaries
            asin_list = [item['asin'] for item in page['content']['results'][subgroup]]
            is_amazons_choice_list = [item['is_amazons_choice'] for item in page['content']['results'][subgroup]]
            is_sponsored_list = [item['is_sponsored'] for item in page['content']['results'][subgroup]]
            best_seller_list = [item['best_seller'] for item in page['content']['results'][subgroup]]
            manufacturer_list = [item['manufacturer'] for item in page['content']['results'][subgroup]]
            pos_list = [item['rel_pos'] if 'rel_pos' in item else item['pos'] for item in page['content']['results'][subgroup]]
            price_list = [pd.NA if 'rel_pos' in item else item['price'] for item in page['content']['results'][subgroup]]
            rating_list = [item['rating'] for item in page['content']['results'][subgroup]]
            reviews_count_list = [item['reviews_count'] for item in page['content']['results'][subgroup]]
            title_list = [item['title'] for item in page['content']['results'][subgroup]]
            url_list = [item['url'] for item in page['content']['results'][subgroup]]

        else:
            # Extract ASINs from the list of dictionaries
            asin_list = [item['asin'] for item in page['content']['results'][subgroup]]
            is_amazons_choice_list = [item['is_amazons_choice'] for item in page['content']['results'][subgroup]]
            is_sponsored_list = [item['is_sponsored'] for item in page['content']['results'][subgroup]]
            best_seller_list = [item['best_seller'] for item in page['content']['results'][subgroup]]
            manufacturer_list = [item['manufacturer'] for item in page['content']['results'][subgroup]]
            pos_list = [item['pos'] for item in page['content']['results'][subgroup]]
            price_list = [item['price'] for item in page['content']['results'][subgroup]]
            rating_list = [item['rating'] for item in page['content']['results'][subgroup]]
            reviews_count_list = [item['reviews_count'] for item in page['content']['results'][subgroup]]
            title_list = [item['title'] for item in page['content']['results'][subgroup]]
            url_list = [item['url'] for item in page['content']['results'][subgroup]]

        for i in range(len(asin_list)):
            row = {
                'asin': asin_list[i],
                'is_amazons_choice': is_amazons_choice_list[i],
                'is_sponsored': is_sponsored_list[i],
                'best_seller': best_seller_list[i],
                'manufacturer': manufacturer_list[i],
                'pos': pos_list[i],
                'price': price_list[i],
                'rating': rating_list[i],
                'reviews_count': reviews_count_list[i],
                'title': title_list[i],
                'url': url_list[i],
                'page_nbr': page_nbr,
                'subgroup': subgroup
            }
            data.append(row)

df = pd.DataFrame(data, columns=attributes)
print(df.shape)
df.head()

(306, 13)


Unnamed: 0,asin,is_amazons_choice,is_sponsored,best_seller,manufacturer,pos,price,rating,reviews_count,title,url,page_nbr,subgroup
0,B06Y2R5J2Q,False,True,False,,1,,4.6,78,"Koobay 16.5"" Rose Gold Copper Clothes Hangers ...",https://aax-us-iad.amazon.com/x/c/RMD39H3NPt8e...,1,paid
1,B076M396J5,False,True,False,,2,,4.6,145,"25Pack Koobay 16.5"" Metal Hook Wire Rose Gold ...",https://aax-us-iad.amazon.com/x/c/RMD39H3NPt8e...,1,paid
2,B06ZY941C9,False,True,False,,3,,4.6,84,"30Pack KOOBAY 13.7"" Rose Gold Metal Heavy Duty...",https://aax-us-iad.amazon.com/x/c/RMD39H3NPt8e...,1,paid
3,B07RDM1P5H,False,False,True,,2,15.96,4.7,34658,Clothes Hangers Plastic 40 Pack - Black Plasti...,/Neaterize-Plastic-Clothes-Lightweight-Availab...,1,organic
4,B01G3WS3PW,False,False,True,,3,21.99,4.7,32400,Utopia Home Premium Velvet Hangers 50 Pack - N...,/Utopia-Home-Premium-Non-Slip-Hangers/dp/B01G3...,1,organic


In [None]:
product_list = df['asin'].tolist()

for product in product_list:
    # Structure payload.
    payload = {
        'source': 'amazon_reviews',
        'domain': 'com',
        'query': product,
        'parse': True,
    }

    # Get response.
    response_reviews = requests.request(
        'POST',
        'https://realtime.oxylabs.io/v1/queries',
        auth=('jnossa', 'yF3QT4vqS_MrB7'),
        json=payload,
    )
    # Extract JSON content
    json_reviews = response_reviews.json()

    attributes = ['title', 'author', 'rating', 'content', 'timestamp', 'is_verified']

    data = []
    # Extract titles from the list of dictionaries
    title_list = [item['title'] for item in json_reviews['results'][0]['content']['reviews']]
    author_list = [item['author'] for item in json_reviews['results'][0]['content']['reviews']]
    rating_list = [item['rating'] for item in json_reviews['results'][0]['content']['reviews']]
    content_list = [item['content'] for item in json_reviews['results'][0]['content']['reviews']]
    timestamp_list = [item['timestamp'] for item in json_reviews['results'][0]['content']['reviews']]
    is_verified_list = [item['is_verified'] for item in json_reviews['results'][0]['content']['reviews']]

    for i in range(len(title_list)):
        row = {
            'title': title_list[i],
            'author': author_list[i],
            'rating': rating_list[i],
            'content': content_list[i],
            'timestamp': timestamp_list[i],
            'is_verified': is_verified_list[i]
        }
        data.append(row)

    df_reviews = pd.DataFrame(data, columns=attributes)

    # Locate the row based on the ID value
    index_to_update = df.loc[df['asin'] == product].index
    result_dict = df_reviews.to_dict(orient='records')

    # Update the 'review' feature for the located row
    df.loc[index_to_update, 'reviews'] = str(result_dict)

safe_search = search.replace(' ', '_')
df.to_csv(f'{safe_search}.csv')

  df.loc[index_to_update, 'reviews'] = str(result_dict)


In [None]:
import tkinter as tk
from tkinter import messagebox

# Your script here

# Notify when the script is done
root = tk.Tk()
root.withdraw()  # Hide the main window

messagebox.showinfo("Notification", "Your script has finished running!")

'ok'