# <span style="color:#3399ff">Beautiful Soup - Exemplos</span>

In [38]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime 
import numpy as np

In [3]:
# Small in-memory HTML sample used by the parsing examples that follow.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [4]:
# Parse the sample document with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html_doc, features='html.parser')

In [5]:
# Echo the parsed tree so the cell displays it.
soup

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [6]:
# Attribute access by tag name: show the document's <title> tag.
print(soup.title)

<title>The Dormouse's story</title>


In [7]:
# Show the <head> tag together with its children.
print(soup.head)

<head><title>The Dormouse's story</title></head>


In [8]:
# Tag-name access can be chained: show the first <p> inside <body>.
print(soup.body.p)

<p class="title"><b>The Dormouse's story</b></p>


In [9]:
# Every <p> tag in the document.
soup.find_all(name='p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [10]:
# <p> tags whose CSS class is "title" (class filter spelled out explicitly).
soup.find_all('p', class_='title')

[<p class="title"><b>The Dormouse's story</b></p>]

In [11]:
# <p> tags whose CSS class is "story".
soup.find_all(name='p', attrs={'class': 'story'})

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [12]:
# Anchor tags carrying the "sister" CSS class.
soup.find_all(name='a', attrs={'class': 'sister'})

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [13]:
# Anchor tags whose id attribute is "link1".
soup.find_all(name='a', attrs={'id': 'link1'})

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [14]:
# Text node of the first anchor with id "link1".
soup.find_all(name='a', attrs={'id': 'link1'})[0].string

'Elsie'

In [15]:
# Same anchor, this time extracting its text via get_text().
soup.find_all(name='a', attrs={'id': 'link1'})[0].get_text()

'Elsie'

# <span style="color:#3399ff">Beautiful Soup - Prática I</span>

In [16]:
# Fields to collect for each product:
# product_id
# product_name
# product_type
# product_composition
# product_price

In [17]:
# Product listing page to scrape (men's jeans).
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# Send a browser-like User-Agent header with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel, and
# fail here on an HTTP error status instead of parsing an error page later.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

In [18]:
# Parse the downloaded listing page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=page.text, features='html.parser')

In [19]:
# First <ul> whose class attribute is "products-listing small".
products = soup.find(name='ul', attrs={'class': 'products-listing small'})

In [20]:
# One <article class="hm-product-item"> per product inside the listing.
product_list = products.find_all(name='article', attrs={'class': 'hm-product-item'})

In [21]:
# Number of product articles found in the listing.
len(product_list)

36

In [22]:
# Article code of every product card, read from its data-articlecode attribute.
product_id = [card.get('data-articlecode') for card in product_list]

# Category of every product card, read from its data-category attribute.
product_category = [card.get('data-category') for card in product_list]

In [23]:
# Product names: text of each <a class="link"> inside the listing.
# Note that product_list is rebound to these anchors here.
product_list = products.find_all(name='a', attrs={'class': 'link'})
product_name = [anchor.get_text() for anchor in product_list]

In [24]:
# Product prices: text of each <span class="price regular"> inside the listing.
# Note that product_list is rebound to these spans here.
product_list = products.find_all(name='span', attrs={'class': 'price regular'})
product_price = [price_tag.get_text() for price_tag in product_list]

In [25]:
# Assemble the scraped columns into one frame. Building from a dict names each
# column directly and raises ValueError if the lists differ in length, instead
# of silently padding the shorter ones and pairing values from different rows.
data = pd.DataFrame({
    'product_id': product_id,
    'product_category': product_category,
    'product_name': product_name,
    'product_price': product_price,
})
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99
1,985159001,men_jeans_skinny,Skinny Jeans,$ 24.99
2,690449036,men_jeans_ripped,Skinny Jeans,$ 39.99
3,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99
4,690449056,men_jeans_ripped,Skinny Jeans,$ 39.99


In [26]:
# Stamp every row with the local time at which this cell ran.
data['scrapy_datetime'] = f"{datetime.now():%Y-%m-%d %H:%M:%S}"

In [27]:
# Preview the first rows, now including the scrapy_datetime column.
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-01-13 08:45:28
1,985159001,men_jeans_skinny,Skinny Jeans,$ 24.99,2022-01-13 08:45:28
2,690449036,men_jeans_ripped,Skinny Jeans,$ 39.99,2022-01-13 08:45:28
3,690449022,men_jeans_ripped,Skinny Jeans,$ 39.99,2022-01-13 08:45:28
4,690449056,men_jeans_ripped,Skinny Jeans,$ 39.99,2022-01-13 08:45:28


# <span style="color:#3399ff">Beautiful Soup - Prática II</span>

- Sempre olhar a paginação na vitrine de produtos.

In [28]:
# Product listing page to scrape (men's jeans), sent with a browser-like User-Agent.
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel, and
# fail here on an HTTP error status instead of parsing an error page later.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

In [30]:
# Parse the downloaded listing page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=page.text, features='html.parser')

In [32]:
# Read the data-total attribute of the first <h2 class="load-more-heading">.
total_item = (
    soup
    .find_all(name='h2', attrs={'class': 'load-more-heading'})[0]
    .get('data-total')
)

In [33]:
# The attribute value comes back as a string.
total_item

'58'

In [35]:
# Convert the string count to an integer for arithmetic.
int(total_item)

58

- De quantas páginas preciso para capturar todos os itens?

In [42]:
# Pages needed to cover every item, at 36 items per page (the number of
# products found on the first page above). Round UP: a partially filled last
# page still has to be requested, and rounding to nearest would drop it
# whenever the remainder is less than half a page (e.g. 40 items -> 1 page).
page_number = np.ceil(int(total_item) / 36)
page_number

2.0

In [43]:
# Request enough items in a single page to cover all of the computed pages.
url02 = f"{url}?page-size={int(page_number * 36)}"
url02

'https://www2.hm.com/en_us/men/products/jeans.html?page-size=72'