In [1]:
from bs4 import BeautifulSoup
from requests import get
from fake_useragent import UserAgent
import re

In [2]:
def get_soup(pid: str, timeout: float = 30.0) -> BeautifulSoup:
    """Download an Amazon Brazil product page and parse it.

    Args:
        pid: Product identifier interpolated into the ``/dp/`` URL.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The response body parsed with the ``html.parser`` backend.

    Raises:
        AssertionError: If the response status code is not 200.
        requests.exceptions.RequestException: On connection errors or when
            ``timeout`` elapses.
    """
    url = f'https://amazon.com.br/dp/{pid}'
    headers = {
        'Accept': '*/*',
        # A fresh random User-Agent is drawn for every request.
        'User-Agent': UserAgent().random,
        'Accept-Language': 'pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3'
    }
    page = get(url, headers=headers, timeout=timeout)

    # Raised explicitly (rather than via `assert`) so the check still runs
    # when Python is started with -O; the exception type is unchanged.
    if page.status_code != 200:
        raise AssertionError('Status Code must be 200')

    return BeautifulSoup(page.content, 'html.parser')

In [3]:
def get_title(soup: BeautifulSoup):
    """Return the text of the page's ``<title>`` tag.

    Args:
        soup: Parsed document to read the title from.

    Returns:
        The value of ``soup.title.string``.

    Raises:
        AssertionError: If the document has no ``<title>`` tag, or the tag
            has no single string value.
    """
    title_tag = soup.title
    # `soup.title` itself is None when the tag is missing; guard it so the
    # caller gets the intended error instead of an AttributeError.
    title = None if title_tag is None else title_tag.string
    # Raised explicitly so the check is not stripped under `python -O`.
    if title is None:
        raise AssertionError('title must not be None')
    return title

In [4]:
def get_reviews(soup: BeautifulSoup):
    """Return the customer-review count shown on a product page.

    The count is read from the element with id ``acrCustomerReviewText``.
    The number must be the first thing in the element's text, optionally
    preceded by whitespace and/or an opening parenthesis. ``.`` and ``,``
    inside the number are treated as thousands separators.

    Args:
        soup: Parsed product page.

    Returns:
        The review count as an ``int``, or ``0`` when the element is missing
        or its text does not start with a number.
    """
    counter = soup.find(id='acrCustomerReviewText')
    if counter is None:
        return 0

    # Require a leading digit so separator-only text ("...", "***") can no
    # longer reach int() and raise ValueError.
    found = re.match(r"[\s(]*(\d[\d.,]*)", counter.get_text())
    if found is None:
        return 0

    # Drop thousands separators; at least one digit is guaranteed to remain.
    return int(re.sub(r"[.,]", "", found.group(1)))

In [5]:
def get_category(soup: BeautifulSoup, delimiter: str = " > "):
    """Return the breadcrumb category path of a product page.

    Labels are extracted from the HTML of the element with id
    ``wayfinding-breadcrumbs_container``, de-duplicated, and joined in the
    order they appear in the markup.

    Args:
        soup: Parsed product page.
        delimiter: String placed between consecutive labels.

    Returns:
        The joined labels, or ``'Not Defined'`` when the container is
        missing or yields no non-blank label.
    """
    container = soup.find(id='wayfinding-breadcrumbs_container')
    if container is None:
        return 'Not Defined'

    # Captures text that starts on a new line right after a tag and consists
    # only of word characters, whitespace and "/".
    raw_labels = re.findall(r">\n([\s/\n\w]+)<", str(container))
    stripped = (label.strip() for label in raw_labels)

    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # shuffle the breadcrumb levels unpredictably between interpreter runs.
    labels = list(dict.fromkeys(label for label in stripped if label))
    if not labels:
        return "Not Defined"

    return delimiter.join(labels)

In [6]:
def get_is_prime(soup: BeautifulSoup) -> str:
    """Heuristically decide whether a product page looks Prime-eligible.

    Three checks run in order and the first hit wins:

    1. an element with id ``primeSavingsUpsellCaption_feature_div`` exists;
    2. the first node matched by the buy-box selector contains "amazon";
    3. the first node matched by the delivery-message selector contains
       "grátis".

    Text comparisons are done on the stripped, lower-cased node text.

    Args:
        soup: Parsed product page.

    Returns:
        The string ``"true"`` or ``"false"`` (not a ``bool``).
    """
    if soup.find(id="primeSavingsUpsellCaption_feature_div") is not None:
        return "true"

    text_checks = (
        (
            "div.tabular-buybox-text:nth-child(4) div:nth-child(1) span:nth-child(1)",
            "amazon",
        ),
        (
            "div#mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE span",
            # "grátis", spelled with an escape so a file re-encoding cannot
            # corrupt it again. The previous mojibake literal contained an
            # upper-case character and could never match lower-cased text.
            "gr\u00e1tis",
        ),
    )
    for selector, keyword in text_checks:
        nodes = soup.select(selector)
        if nodes and keyword in nodes[0].text.strip().lower():
            return "true"

    return "false"

In [7]:
# Fetch and parse one product page (performs a live HTTP request); the cells
# below all read from this `soup`.
soup = get_soup('B000F7823U')

In [8]:
# Show the text of the page's <title> tag.
get_title(soup)

'NORPRO Norpro Assadeira de Biscoito S/S 35 x 30 cm, 1 EA, 3861 | Amazon.com.br'

In [9]:
# Show the parsed review count; 0 means no count was found on the page.
get_reviews(soup)

0

In [10]:
# Show the breadcrumb category path, joined with the default " > " delimiter.
get_category(soup)

'Cozinha > Compras Internacionais em Cozinha > Kitchen NAFN'

In [11]:
# Show the Prime heuristic result, returned as the string "true" or "false".
get_is_prime(soup)

'true'