# <span style="color:#3399ff">Beautiful Soup - Exemplos</span>

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime 
import numpy as np

In [None]:
# Small HTML sample used by the Beautiful Soup examples in the cells below.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [None]:
# Parse the sample HTML string with the 'html.parser' backend.
soup = BeautifulSoup(html_doc, 'html.parser')

In [None]:
# Display the parsed document.
soup

In [None]:
# Print the document's <title> tag.
print(soup.title)

In [None]:
# Print the document's <head> tag.
print(soup.head)

In [None]:
# Print the <p> tag reached by navigating body -> p.
print(soup.body.p)

In [None]:
# List every <p> tag in the document.
soup.find_all('p')

In [None]:
# <p> tags filtered with 'title' passed as the second positional argument
# (compare with the class_= keyword form used in the following cells).
soup.find_all('p', 'title')

In [None]:
# <p> tags filtered by class 'story' (keyword form).
soup.find_all('p', class_='story')

In [None]:
# <a> tags filtered by class 'sister'.
soup.find_all('a', class_='sister')

In [None]:
# <a> tags filtered by id 'link1'.
soup.find_all('a', id='link1')

In [None]:
# Text of the first id='link1' match, read through the .string attribute.
soup.find_all('a', id='link1')[0].string

In [None]:
# Same element as the previous cell, read through get_text() instead of .string.
soup.find_all('a', id='link1')[0].get_text()

# <span style="color:#3399ff">Beautiful Soup - Prática I</span>

In [None]:
# Fields to collect for each product:
# id
# product_name
# product_type
# product_composition
# price

In [None]:
# Product listing (showcase) page to scrape.
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# Browser-like User-Agent sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# requests has no default timeout: without one, a stalled connection would
# block this cell (and the kernel) forever.
page = requests.get(url, headers=headers, timeout=30)

In [None]:
# Parse the downloaded listing HTML.
soup = BeautifulSoup(page.text, 'html.parser')

In [None]:
# <ul> container of the product listing (first match only).
# NOTE(review): presumably None when the class is absent from the page, in
# which case the following cells would fail — confirm the markup is unchanged.
products = soup.find('ul', class_='products-listing small')

In [None]:
# Every <article class="hm-product-item"> inside the listing container.
product_list = products.find_all('article', class_='hm-product-item')

In [None]:
# Number of product articles found on this page.
len(product_list)

In [None]:
# product_id: value of each article's 'data-articlecode' attribute
product_id = [article.get('data-articlecode') for article in product_list]

# product_category: value of each article's 'data-category' attribute
product_category = [article.get('data-category') for article in product_list]

In [None]:
# product_name: text of every <a class="link"> inside the listing.
# Note: this rebinds product_list, so it no longer holds the <article> tags.
product_list = products.find_all('a', class_='link')
product_name = [link.get_text() for link in product_list]

In [None]:
# product_price: text of every <span class="price regular"> inside the listing.
# Note: product_list is rebound again, this time to the price spans.
product_list = products.find_all('span', class_='price regular')
product_price = [span.get_text() for span in product_list]

In [None]:
# Each scraped list is one row labelled with its field name; transposing
# turns those labels into the column headers (one product per row).
data = pd.DataFrame(
    [product_id, product_category, product_name, product_price],
    index=['product_id', 'product_category', 'product_name', 'product_price'],
).T
data.head()

In [None]:
# Scrape timestamp: the same 'YYYY-MM-DD HH:MM:SS' string is written to every row.
data['scrapy_datetime'] = f"{datetime.now():%Y-%m-%d %H:%M:%S}"

In [None]:
# Preview the table, now including the scrapy_datetime column.
data.head()

# <span style="color:#3399ff">Beautiful Soup - Prática II</span>

- Sempre verificar a paginação da vitrine de produtos: a primeira página pode não trazer todos os itens.

In [None]:
# Same listing page as before, fetched again to inspect its pagination.
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# requests has no default timeout: without one, a stalled connection would
# block this cell (and the kernel) forever.
page = requests.get(url, headers=headers, timeout=30)

In [None]:
# Parse the downloaded listing HTML.
soup = BeautifulSoup(page.text, 'html.parser')

In [None]:
# Total number of items, read from the 'data-total' attribute of the first
# <h2 class="load-more-heading">. Later cells convert it with int().
total_item = soup.find_all('h2', class_='load-more-heading')[0].get('data-total')

In [None]:
# Inspect the raw attribute value.
total_item

In [None]:
# Same value converted to an integer.
int(total_item)

- De quantas páginas preciso para capturar todos os itens?

In [None]:
# Number of pages needed to cover every item.
# Round UP: a partially filled last page still has to be requested. np.round
# would drop it whenever the remainder is under half a page (and rounds exact
# halves to the nearest even number), so items would be missed.
items_per_page = 36  # presumably the listing's page size — confirm against the site
page_number = int(np.ceil(int(total_item) / items_per_page))
page_number

In [None]:
# Listing URL with a 'page-size' query parameter of page_number * 36 items.
url02 = f"{url}?page-size={int(page_number * 36)}"
url02

# <span style="color:#3399ff">Beautiful Soup - Prática III</span>

- Aqui, vamos coletar as informações para cada produto.

- Primeiro, testando com um único produto:

In [None]:
# Single product page used to prototype the per-product extraction.
url = 'https://www2.hm.com/en_us/productpage.1024256001.html'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# requests has no default timeout: without one, a stalled connection would
# block this cell (and the kernel) forever.
page = requests.get(url, headers=headers, timeout=30)

In [None]:
# Parse the downloaded product-page HTML.
soup = BeautifulSoup(page.text, 'html.parser')

In [None]:
# Check the type of the parsed object.
type(soup)

In [None]:
# color name - first position: 'data-color' attribute of the first
# <a class="filter-option miniature"> match
soup.find_all('a', class_='filter-option miniature')[0].get('data-color')

In [None]:
# Every <a class="filter-option miniature"> anchor on the product page.
product_color_list = soup.find_all('a', class_='filter-option miniature')

# color name and product_id come from each anchor's data attributes.
color_name = [anchor.get('data-color') for anchor in product_color_list]
product_id = [anchor.get('data-articlecode') for anchor in product_color_list]

# Rows are labelled with their field names, then transposed so that each
# anchor becomes one row with 'product_id' and 'color_name' columns.
df_color = pd.DataFrame(
    [product_id, color_name],
    index=['product_id', 'color_name'],
).T
df_color

In [None]:
# Inspect the first colour anchor.
product_color_list[0]

In [None]:
# Inspect the extracted colour names.
color_name

In [None]:
# Inspect the extracted article codes.
product_id

In [None]:
# composition: every <div class="pdp-description-list-item"> on the product page
soup.find_all('div', class_='pdp-description-list-item')

In [None]:
# Type of the second description item.
type(soup.find_all('div', class_='pdp-description-list-item')[1])

In [None]:
# Text of the second description item, split on newlines (empty strings are
# still present at this point).
soup.find_all('div', class_='pdp-description-list-item')[1].get_text().split('\n')

- **Retirando essas posições vazias:**

In [None]:
# Same lines with the empty strings dropped (filter(None, ...) keeps only
# truthy entries).
list(filter(None, soup.find_all('div', class_='pdp-description-list-item')[1].get_text().split('\n')))

- **Construindo linha e coluna:**

In [3]:
# Every description item on the product page.
product_composition_list = soup.find_all('div', class_='pdp-description-list-item')

# For each item keep only its non-empty text lines.
product_composition = [
    [line for line in item.get_text().split('\n') if line]
    for item in product_composition_list
]

# One description item per column after transposing (see the output below).
df_aux = pd.DataFrame(product_composition).T
df_aux

Unnamed: 0,0,1,2,3
0,Size,Fit,Composition,Art. No.
1,"The model is 185cm/6'1"" and wears a size 31/32",Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001
2,,,"Shell: Cotton 99%, Spandex 1%",


In [None]:
# First row of df_aux; the next cell uses it as the column header.
df_aux.iloc[0]

In [None]:
# Rename the columns: use the values of the first row as the column labels.
# The row itself is still present in the frame after this step.
df_aux.columns = df_aux.iloc[0]
df_aux

In [None]:
# Delete the first row (its values are now the column labels) and renumber
# the remaining rows from 0. drop=True discards the old row labels instead of
# inserting them as a stray 'index' column.
df_aux = df_aux.iloc[1:].reset_index(drop=True)
df_aux

In [None]:
# Keep only the columns needed downstream.
df_aux = df_aux.loc[:, ['Fit', 'Composition', 'Art. No.']]
df_aux

- **Substituindo os valores None pelo valor da linha anterior:**

In [None]:
# Frame before the missing values are filled.
df_aux

In [None]:
# Forward fill: each missing cell takes the value from the row above it.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
df_aux = df_aux.ffill()
df_aux

In [None]:
# Work on an independent copy: a plain assignment would only alias df_aux,
# so the columns added to df_composition later would also appear in df_aux.
df_composition = df_aux.copy()
df_composition

In [None]:
# Split the article number: everything except the last three characters is
# the style id, the last three characters are the color id.
df_composition['style_id'] = df_composition['Art. No.'].map(lambda art_no: art_no[:-3])
df_composition['color_id'] = df_composition['Art. No.'].map(lambda art_no: art_no[-3:])
df_composition

In [None]:
# Split the article code: everything except the last three characters is the
# style id, the last three characters are the color id.
df_color['style_id'] = df_color['product_id'].map(lambda code: code[:-3])
df_color['color_id'] = df_color['product_id'].map(lambda code: code[-3:])
df_color

In [None]:
# Left join on style_id: every colour row is kept and receives the Fit and
# Composition values of its style.
df_color.merge(
    df_composition[['style_id', 'Fit', 'Composition']],
    on='style_id',
    how='left',
)

# <span style="color:#3399ff">Juntando todos os passos anteriores </span>

## <span style="color:#ff8000">0. Packages </span>

In [1]:
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

## <span style="color:#ff8000">1. One Product </span>

In [25]:
# API request for a single product page.
url = 'https://www2.hm.com/en_us/productpage.1024256001.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
# requests has no default timeout: without one, a stalled connection would
# block this cell (and the kernel) forever.
page = requests.get(url, headers=headers, timeout=30)

# Beautiful Soup object
soup = BeautifulSoup(page.text, 'html.parser')

# ================== color name =====================================
# Each <a class="filter-option miniature"> anchor carries a colour name and an
# article code in its data attributes. The anchors are looked up once (the
# earlier duplicated lookup that produced the same color_name was removed).
product_color_list = soup.find_all('a', class_='filter-option miniature')
color_name = [p.get('data-color') for p in product_color_list]
product_id = [p.get('data-articlecode') for p in product_color_list]

# One row per anchor.
df_color = pd.DataFrame([product_id, color_name]).T
df_color.columns = ['product_id', 'color_name']

# style id + color id: the article code minus its last three characters is
# the style id; the last three characters are the color id.
df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

# ================== composition =====================================
# Non-empty text lines of every description item on the page.
product_composition_list = soup.find_all('div', class_='pdp-description-list-item')
product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]

# One description item per column; the first row holds the field labels and
# becomes the header.
df_composition = pd.DataFrame(product_composition).T
df_composition.columns = df_composition.iloc[0]

# Drop the header row and forward-fill the cells of shorter items.
# .ffill() replaces the deprecated fillna(method='ffill').
df_composition = df_composition.iloc[1:].ffill()
# .copy() makes the column subset an independent frame, so the assignments
# below do not trigger SettingWithCopyWarning.
df_composition = df_composition[['Fit', 'Composition', 'Art. No.']].copy()

# style id + color id, split the same way as for df_color.
df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

# Merge data color + composition. The left join on style_id pairs every colour
# row with every composition row of the same style, so each colour appears
# once per composition line.
data_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id')


In [26]:
# Final table: one row per (colour variant, composition line) pair.
data_sku

Unnamed: 0,product_id,color_name,style_id,color_id,Fit,Composition
0,1024256002,Light denim blue,1024256,2,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
1,1024256002,Light denim blue,1024256,2,Slim fit,"Shell: Cotton 99%, Spandex 1%"
2,1024256003,Light denim blue,1024256,3,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
3,1024256003,Light denim blue,1024256,3,Slim fit,"Shell: Cotton 99%, Spandex 1%"
4,1024256004,Denim blue,1024256,4,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
5,1024256004,Denim blue,1024256,4,Slim fit,"Shell: Cotton 99%, Spandex 1%"
6,1024256005,Dark blue,1024256,5,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
7,1024256005,Dark blue,1024256,5,Slim fit,"Shell: Cotton 99%, Spandex 1%"
8,1024256006,Dark denim blue,1024256,6,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
9,1024256006,Dark denim blue,1024256,6,Slim fit,"Shell: Cotton 99%, Spandex 1%"


## <span style="color:#ff8000">2. Multiple Products </span>