In [1]:
from bs4 import BeautifulSoup
from requests import get
from fake_useragent import UserAgent
import re
from bs4.element import Tag
from json import loads

In [2]:
def get_soup(pid: str) -> BeautifulSoup:
    url = f'https://amazon.com.br/dp/{pid}'
    user_agent = UserAgent()
    random_user_agent = user_agent.random
    header = {
        'Accept': '*/*',
        'User-Agent': random_user_agent,
        'Accept-Language': 'pt-BR,pt;q=0.8,en-US;q=0.5,en;q=0.3'
    }
    page = get(url, headers=header)
    assert page.status_code == 200, 'Status Code must be 200'
    
    return BeautifulSoup(page.content, 'html.parser')

In [3]:
def get_title(soup: BeautifulSoup):
    title = soup.find(id='productTitle')
    if title: return title.get_text().strip()
    return "Not Defined"

In [4]:
def get_reviews(soup: BeautifulSoup):
    title = soup.find(id='acrCustomerReviewText')
    if title is None: return 0
    reviews = re.search("^[\d*\.]+", title.get_text())
    if not reviews: return 0
    return int(reviews.group().replace('.', ''))

In [5]:
def get_category(soup: BeautifulSoup, delimiter: str = " > "):
    element = soup.find(id='wayfinding-breadcrumbs_container')
    if element is None: return 'Not Defined'
    element = str(element)
    inner_text = re.findall(r">\n([\s/\n\w]+)<", element)
    if inner_text:
        return delimiter.join(
            set([txt.strip() for txt in inner_text if txt.strip() != ""])
        )

    return "Not Defined"

In [6]:
def get_free_shipping(soup: BeautifulSoup) -> str:
    prime_div = soup.find(id="primeSavingsUpsellCaption_feature_div")
    if prime_div is not None:
        return "true"

    prime_div = soup.select("div.tabular-buybox-text:nth-child(4) div:nth-child(1) span:nth-child(1)")
    if prime_div:
        prime_div = prime_div[0].text
        if "amazon" in prime_div.strip().lower(): return "true"
    
    prime_div = soup.select("div#mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE span")
    if prime_div:
        prime_div = prime_div[0].text
        if "grátis" in prime_div.strip().lower(): return "true"
        
    return "false"

In [7]:
def convert_price_to_number(price_string: str) -> float:
    price = re.search("[\d\,\.]+", price_string)
    return float(price.group().replace('.', '').replace(',', '.'))

In [8]:
def loop_price_selectors(element: Tag):
    '''
    Com o element selecionado e existente, roda um loop nos possíveis seletores
    que podem possuir o preço, e se esse seletor existir, retorna o conteúdo (innerText) 
    do seletor.
    '''
    possible_price_selectors =['span.a-offscreen']
    for selector in possible_price_selectors:
        has_content = element.select(selector)
        if has_content: return has_content[0].text
    return None

In [9]:
def get_price(soup: BeautifulSoup) -> float:
    price_container_element = soup.find(id='corePrice_feature_div')
    if price_container_element:
        price = loop_price_selectors(price_container_element)
        if price: return convert_price_to_number(price)
    del price_container_element
    
    # no caso de livros, #corePrice_feature_div existe, mas não contem dados relevantes, o elemento a ser capturado é outro
    book_price = soup.find(id='price')
    if book_price: return convert_price_to_number(book_price.text)
    del book_price
    
    # para ebooks kindle, o processo é semelhante ao de livros
    ebook_price = soup.find(id='kindle-price')
    if ebook_price: return convert_price_to_number(ebook_price.text)
    
    return 0.0

In [10]:
def convert_discount_to_number(discount_string: str) -> int:
    discount = re.search("\d{1,2}%", discount_string)
    return int(discount.group().replace('%',''))

In [11]:
def get_discount(soup: BeautifulSoup) -> int:
    discount_basic_element = soup.find('span', {'class': 'savingPriceOverride'})
    if discount_basic_element: return convert_discount_to_number(discount_basic_element.text)
    del discount_basic_element
    
    # livros fisicos
    book_discount = soup.find(id='savingsPercentage')
    if book_discount: return convert_discount_to_number(book_discount.text)
    del book_discount
    
    # ebooks
    ebook_discount = soup.select('p.ebooks-price-savings')
    if ebook_discount: return convert_discount_to_number(ebook_discount[0].text)
    del ebook_discount
    
    # preços e discontos em <table>
    table_discount = soup.select('tr td.a-span12.a-color-price.a-size-base span.a-color-price')
    if table_discount: return convert_discount_to_number(table_discount[0].text)

    
    return 0
    

In [12]:
def get_previous_price(soup: BeautifulSoup):
    basis_price_element = soup.find('span', {'class': 'basisPrice'})
    if basis_price_element:
        previous_price = basis_price_element.find('span', {'class': 'a-offscreen'})
        if previous_price: return convert_price_to_number(previous_price.text)
    del basis_price_element
    
    # ebooks
    basis_price_element = soup.find(id='digital-list-price')
    if basis_price_element: return convert_price_to_number(basis_price_element.text)
    del basis_price_element
    
    # livros físicos
    basis_price_element = soup.find(id='listPrice')
    if basis_price_element: return convert_price_to_number(basis_price_element.text)
    del basis_price_element
    
    return None

In [13]:
def get_biggest_image(element: Tag):
    images = element.get('data-a-dynamic-image', None)
    if images:
        images = list(loads(images).keys())
        return images[-1]
    return None

In [14]:
def get_image_url(soup: BeautifulSoup) -> str:
    #v1
    image_element = soup.find('img', {'class': 'a-dynamic-image'})
    if image_element:
        image_url = image_element.get('data-old-hires', None)
        if image_url: return image_url
    del image_element
    
    # v2
    image_element = soup.find(id='landingImage')
    if image_element:
        image_url = image_element.get('data-old-hires', None)
        if image_url: return image_url
    del image_element
    
    # v3
    image_element = soup.find(id='landingImage')
    if image_element: 
        image_url = get_biggest_image(image_element)
        if image_url: return image_url
        
    # livros físicos
    image_element = soup.find(id='ebooksImgBlkFront')
    if image_element: 
        image_url = get_biggest_image(image_element)
        if image_url: return image_url
    del image_element
    

    return None

In [15]:
soup = get_soup('B07XB3JDCB')

In [17]:
get_image_url(soup)

'https://m.media-amazon.com/images/I/819srW8JiRL._AC_SL1500_.jpg'

In [18]:
get_previous_price(soup)

137.74

In [19]:
get_discount(soup)

6

In [20]:
get_price(soup)

129.9

In [21]:
get_title(soup)

'Barbie Conjunto de Brinquedo Maiô Rosa e Piscina para crianças a partir de 3 anos'

In [22]:
get_reviews(soup)

1375

In [23]:
get_category(soup)

'Bonecas > Bonecas e Acessórios > Brinquedos e Jogos'

In [24]:
get_is_prime(soup)

'true'