# <span style="color:#3399ff">Beautiful Soup - Exemplos</span>

In [63]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime 

In [2]:
# Sample document used by the examples below. It stops after the last <p>,
# without closing </body> or </html> tags.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [3]:
# Parse the sample markup with the 'html.parser' backend
soup = BeautifulSoup(markup=html_doc, features='html.parser')

In [4]:
# Display the parsed tree. The output ends with </body></html>, which
# html_doc itself does not contain.
soup

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [5]:
# A tag can be reached by using its name as an attribute: the <title> element
print(soup.title)

<title>The Dormouse's story</title>


In [6]:
# The <head> element, printed together with its nested <title>
print(soup.head)

<head><title>The Dormouse's story</title></head>


In [7]:
# Tag-name access can be chained; this prints only the first <p> inside <body>
print(soup.body.p)

<p class="title"><b>The Dormouse's story</b></p>


In [8]:
# find_all returns a list with every <p> tag in the document
soup.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [9]:
# A string passed as the second positional argument filters by CSS class
soup.find_all('p', 'title')

[<p class="title"><b>The Dormouse's story</b></p>]

In [10]:
# The same kind of class filter, written with the class_ keyword
soup.find_all('p', class_='story')

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [11]:
# Every <a> tag whose class is "sister"
soup.find_all('a', class_='sister')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [12]:
# Other attributes can be used as keyword filters too - here id="link1"
soup.find_all('a', id='link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [13]:
# Take the single match out of the result list and read its text via .string
soup.find_all('a', id='link1')[0].string

'Elsie'

In [14]:
# get_text() yields the same text for this tag
soup.find_all('a', id='link1')[0].get_text()

'Elsie'

# <span style="color:#3399ff">Beautiful Soup - Prática I</span>

In [50]:
# Fields of interest for each product:
# id
# product_name
# product_type
# product_composition
# price

In [51]:
# H&M (en_us) listing page for men's jeans
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# Send a browser-like User-Agent with the request
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Without a timeout, requests waits indefinitely on a stalled connection.
page = requests.get(url, headers=headers, timeout=30)

# Fail here on a 4xx/5xx response instead of parsing an error page further down.
page.raise_for_status()

In [52]:
# Parse the body of the downloaded page
soup = BeautifulSoup(markup=page.text, features='html.parser')

In [53]:
# Locate the <ul> that contains the product entries. A multi-word string given
# to class_ is matched against the exact value of the class attribute.
products = soup.find('ul', class_='products-listing small')

# find() returns None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError on None.
assert products is not None, "product listing <ul> not found - the page layout may have changed"

In [54]:
# Collect every <article> with class "hm-product-item" inside the listing
product_list = products.find_all('article', class_='hm-product-item')

In [55]:
# Number of product <article> elements found
len(product_list)

36

In [56]:
# Article code, read from each <article>'s data-articlecode attribute
product_id = [article.get('data-articlecode') for article in product_list]

# Category, read from each <article>'s data-category attribute
product_category = [article.get('data-category') for article in product_list]

In [59]:
# product_name
# The links get their own variable so that product_list keeps pointing at the
# <article> elements and the cells that read it give the same result on re-run.
product_links = products.find_all('a', class_='link')
product_name = [link.get_text() for link in product_links]

In [60]:
# product_price
# The price spans get their own variable so that product_list is not rebound
# to a different kind of element.
price_spans = products.find_all('span', class_='price regular')
product_price = [span.get_text() for span in price_spans]

In [62]:
# Build the table column by column. Given a dict of lists, pandas raises a
# ValueError when the lists differ in length, instead of padding the shorter
# ones with missing values and hiding a mismatch between ids, names and prices.
data = pd.DataFrame({
    'product_id': product_id,
    'product_category': product_category,
    'product_name': product_name,
    'product_price': product_price,
})
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99
1,985159001,men_jeans_skinny,Skinny Jeans,$ 24.99
2,690449036,men_jeans_ripped,Skinny Jeans,$ 39.99
3,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99
4,1024256003,men_jeans_slim,Slim Jeans,$ 19.99


In [66]:
# Scrape timestamp: the current local time formatted as text, written to every row
scrape_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
data['scrapy_datetime'] = scrape_timestamp

In [67]:
# Preview the first rows, now including the scrapy_datetime column
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-01-12 07:04:14
1,985159001,men_jeans_skinny,Skinny Jeans,$ 24.99,2022-01-12 07:04:14
2,690449036,men_jeans_ripped,Skinny Jeans,$ 39.99,2022-01-12 07:04:14
3,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2022-01-12 07:04:14
4,1024256003,men_jeans_slim,Slim Jeans,$ 19.99,2022-01-12 07:04:14
