# <span style="color:#3399ff">Beautiful Soup - Exemplos</span>

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime 
import numpy as np

In [2]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [3]:
soup = BeautifulSoup(html_doc, 'html.parser')

In [4]:
soup

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [5]:
print(soup.title)

<title>The Dormouse's story</title>


In [6]:
print(soup.head)

<head><title>The Dormouse's story</title></head>


In [7]:
print(soup.body.p)

<p class="title"><b>The Dormouse's story</b></p>


In [8]:
soup.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [9]:
soup.find_all('p', 'title')

[<p class="title"><b>The Dormouse's story</b></p>]

In [10]:
soup.find_all('p', class_='story')

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [11]:
soup.find_all('a', class_='sister')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [12]:
soup.find_all('a', id='link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [13]:
soup.find_all('a', id='link1')[0].string

'Elsie'

In [14]:
soup.find_all('a', id='link1')[0].get_text()

'Elsie'

# <span style="color:#3399ff">Beautiful Soup - Prática I</span>

In [15]:
# id
# product_name
# product_type
# product_composition
# preco

In [16]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url, headers=headers)

In [17]:
soup = BeautifulSoup(page.text, 'html.parser')

In [18]:
products = soup.find('ul', class_='products-listing small')

In [19]:
product_list = products.find_all('article', class_='hm-product-item')

In [20]:
len(product_list)

36

In [21]:
# product_id
product_id = [p.get('data-articlecode') for p in product_list]

# product_category
product_category = [p.get('data-category') for p in product_list]

In [22]:
# product_name
product_list = products.find_all('a', class_='link')
product_name = [p.get_text() for p in product_list] 

In [23]:
# product_price
product_list = products.find_all('span', class_='price regular')
product_price = [p.get_text() for p in product_list]

In [24]:
data = pd.DataFrame([product_id , product_category, product_name, product_price]).T
data.columns = ['product_id' , 'product_category', 'product_name', 'product_price']
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price
0,1008549001,men_jeans_regular,Regular Jeans,$ 19.99
1,1013317006,men_jeans_regular,Hybrid Regular Tapered Joggers,$ 39.99
2,811993040,men_jeans_regular,Regular Jeans,$ 29.99
3,1025726003,men_jeans_relaxed,Relaxed Jeans,$ 39.99
4,979945001,men_jeans_loose,Loose Jeans,$ 29.99


In [25]:
# scrapy datetime
data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [26]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,1008549001,men_jeans_regular,Regular Jeans,$ 19.99,2022-01-29 13:15:13
1,1013317006,men_jeans_regular,Hybrid Regular Tapered Joggers,$ 39.99,2022-01-29 13:15:13
2,811993040,men_jeans_regular,Regular Jeans,$ 29.99,2022-01-29 13:15:13
3,1025726003,men_jeans_relaxed,Relaxed Jeans,$ 39.99,2022-01-29 13:15:13
4,979945001,men_jeans_loose,Loose Jeans,$ 29.99,2022-01-29 13:15:13


# <span style="color:#3399ff">Beautiful Soup - Prática II</span>

- Sempre olhar a paginação na vitrine de produtos.

In [27]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url, headers=headers)

In [28]:
soup = BeautifulSoup(page.text, 'html.parser')

In [29]:
total_item = soup.find_all('h2', class_='load-more-heading')[0].get('data-total')

In [30]:
total_item

'71'

In [31]:
int(total_item)

71

- De quantas páginas preciso para capturar todos os itens?

In [32]:
page_number = np.round(int(total_item)/36)
page_number

2.0

In [33]:
url02 = url + '?page-size=' + str(int(page_number*36))
url02

'https://www2.hm.com/en_us/men/products/jeans.html?page-size=72'

# <span style="color:#3399ff">Beautiful Soup - Prática III</span>

- Aqui, vamos coletar as informações para cada produto.

- Fazendo para um produto para testar:

In [34]:
url = 'https://www2.hm.com/en_us/productpage.1024256001.html'
    
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url, headers=headers)

In [35]:
soup = BeautifulSoup(page.text, 'html.parser')

In [36]:
type(soup)

bs4.BeautifulSoup

In [37]:
# color name - primeira posição
soup.find_all('a', class_='filter-option miniature')[0].get('data-color')

'Light denim blue'

In [38]:
# color name
product_color_list = soup.find_all('a', class_='filter-option miniature')
color_name = [p.get('data-color') for p in product_color_list]

# product_id
product_id = [p.get('data-articlecode') for p in product_color_list]

df_color = pd.DataFrame([product_id, color_name]).T
df_color.columns = ['product_id', 'color_name']
df_color

Unnamed: 0,product_id,color_name
0,1024256002,Light denim blue
1,1024256003,Light denim blue
2,1024256004,Denim blue
3,1024256005,Dark blue
4,1024256006,Dark denim blue
5,1024256007,Dark gray


In [39]:
product_color_list[0]

<a aria-checked="false" class="filter-option miniature" data-articlecode="1024256002" data-color="Light denim blue" data-sizes="" href="/en_us/productpage.1024256002.html" id="filter-colour-1024256002" role="radio" title="Light denim blue">
<noscript data-alt="Light denim blue" data-src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2Fda%2Fba%2Fdabaf3f73477f46b068e2e02033aa0222206ef7f.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5B%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B2%5D&amp;call=url[file:/product/miniature]">
<img alt="Light denim blue" src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2Fda%2Fba%2Fdabaf3f73477f46b068e2e02033aa0222206ef7f.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5B%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B2%5D&amp;call=url[file:/product/miniature]"/>
</noscript>
<span></span>
</a>

In [40]:
color_name

['Light denim blue',
 'Light denim blue',
 'Denim blue',
 'Dark blue',
 'Dark denim blue',
 'Dark gray']

In [41]:
product_id

['1024256002',
 '1024256003',
 '1024256004',
 '1024256005',
 '1024256006',
 '1024256007']

In [42]:
# composition
soup.find_all('div', class_='pdp-description-list-item')

[<div class="pdp-description-list-item">
 <dt>Size</dt>
 <dd>The model is 185cm/6'1" and wears a size 31/32</dd>
 </div>,
 <div class="pdp-description-list-item">
 <dt>Fit</dt>
 <dd>
 <ul>
 <li>Slim fit</li>
 </ul>
 </dd>
 </div>,
 <div class="pdp-description-list-item">
 <dt>Composition</dt>
 <dd>
 <ul>
 <li>Pocket lining: Polyester 65%, Cotton 35%</li>
 <li>Shell: Cotton 99%, Spandex 1%</li>
 </ul>
 </dd>
 </div>,
 <div class="pdp-description-list-item">
 <dt>Art. No.</dt>
 <dd>1024256001</dd>
 </div>]

In [43]:
type(soup.find_all('div', class_='pdp-description-list-item')[1])

bs4.element.Tag

In [44]:
soup.find_all('div', class_='pdp-description-list-item')[1].get_text().split('\n')

['', 'Fit', '', '', 'Slim fit', '', '', '']

- **Retirando essas posições vazias:**

In [45]:
list(filter(None, soup.find_all('div', class_='pdp-description-list-item')[1].get_text().split('\n')))

['Fit', 'Slim fit']

- **Construindo linha e coluna:**

In [46]:
product_composition_list = soup.find_all('div', class_='pdp-description-list-item')
product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]
df_aux = pd.DataFrame(product_composition).T
df_aux

Unnamed: 0,0,1,2,3
0,Size,Fit,Composition,Art. No.
1,"The model is 185cm/6'1"" and wears a size 31/32",Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001
2,,,"Shell: Cotton 99%, Spandex 1%",


In [47]:
df_aux.iloc[0]

0           Size
1            Fit
2    Composition
3       Art. No.
Name: 0, dtype: object

In [48]:
# rename data frame
df_aux.columns = df_aux.iloc[0]
df_aux

Unnamed: 0,Size,Fit,Composition,Art. No.
0,Size,Fit,Composition,Art. No.
1,"The model is 185cm/6'1"" and wears a size 31/32",Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001
2,,,"Shell: Cotton 99%, Spandex 1%",


In [49]:
# delete first row
df_aux = df_aux.iloc[1:].copy().reset_index()
df_aux

Unnamed: 0,index,Size,Fit,Composition,Art. No.
0,1,"The model is 185cm/6'1"" and wears a size 31/32",Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001.0
1,2,,,"Shell: Cotton 99%, Spandex 1%",


In [50]:
df_aux = df_aux[['Fit', 'Composition', 'Art. No.']]
df_aux

Unnamed: 0,Fit,Composition,Art. No.
0,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001.0
1,,"Shell: Cotton 99%, Spandex 1%",


- **Substituindo o None:**

In [51]:
df_aux

Unnamed: 0,Fit,Composition,Art. No.
0,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001.0
1,,"Shell: Cotton 99%, Spandex 1%",


In [52]:
df_aux = df_aux.fillna(method='ffill')
df_aux

Unnamed: 0,Fit,Composition,Art. No.
0,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001
1,Slim fit,"Shell: Cotton 99%, Spandex 1%",1024256001


In [53]:
df_composition = df_aux
df_composition

Unnamed: 0,Fit,Composition,Art. No.
0,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001
1,Slim fit,"Shell: Cotton 99%, Spandex 1%",1024256001


In [54]:
# generate style id + color id
df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])
df_composition

Unnamed: 0,Fit,Composition,Art. No.,style_id,color_id
0,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001,1024256,1
1,Slim fit,"Shell: Cotton 99%, Spandex 1%",1024256001,1024256,1


In [55]:
# generate style id + color id
df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])
df_color

Unnamed: 0,product_id,color_name,style_id,color_id
0,1024256002,Light denim blue,1024256,2
1,1024256003,Light denim blue,1024256,3
2,1024256004,Denim blue,1024256,4
3,1024256005,Dark blue,1024256,5
4,1024256006,Dark denim blue,1024256,6
5,1024256007,Dark gray,1024256,7


In [56]:
# Join
pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id')

Unnamed: 0,product_id,color_name,style_id,color_id,Fit,Composition
0,1024256002,Light denim blue,1024256,2,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
1,1024256002,Light denim blue,1024256,2,Slim fit,"Shell: Cotton 99%, Spandex 1%"
2,1024256003,Light denim blue,1024256,3,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
3,1024256003,Light denim blue,1024256,3,Slim fit,"Shell: Cotton 99%, Spandex 1%"
4,1024256004,Denim blue,1024256,4,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
5,1024256004,Denim blue,1024256,4,Slim fit,"Shell: Cotton 99%, Spandex 1%"
6,1024256005,Dark blue,1024256,5,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
7,1024256005,Dark blue,1024256,5,Slim fit,"Shell: Cotton 99%, Spandex 1%"
8,1024256006,Dark denim blue,1024256,6,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
9,1024256006,Dark denim blue,1024256,6,Slim fit,"Shell: Cotton 99%, Spandex 1%"


# <span style="color:#3399ff">Juntando todos os passos anteriores </span>

## <span style="color:#ff8000">0. Packages </span>

In [57]:
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

## <span style="color:#ff8000">1. One Product </span>

In [58]:
# API Requests
url = 'https://www2.hm.com/en_us/productpage.1024256001.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get( url, headers=headers )


# Beautiful Soup object
soup = BeautifulSoup(page.text, 'html.parser')

# ================== color name =====================================
product_list = soup.find_all('a', class_='filter-option miniature')
color_name = [p.get('data-color') for p in product_list]

# color name
product_color_list = soup.find_all('a', class_='filter-option miniature')
color_name = [p.get('data-color') for p in product_color_list]

# product_id
product_id = [p.get('data-articlecode') for p in product_color_list]

df_color = pd.DataFrame([product_id, color_name]).T
df_color.columns = ['product_id', 'color_name']

# generate style id + color id
df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

# ================== composition =====================================
product_composition_list = soup.find_all('div', class_='pdp-description-list-item')
product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]


# rename dataframe
df_composition = pd.DataFrame(product_composition).T
df_composition.columns = df_composition.iloc[0]


# delete first row
df_composition = df_composition.iloc[1:].fillna(method='ffill')
df_composition = df_composition[['Fit', 'Composition', 'Art. No.']]

# generate style id + color id
df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

# Merge data color + composition
data_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id')


In [59]:
data_sku

Unnamed: 0,product_id,color_name,style_id,color_id,Fit,Composition
0,1024256002,Light denim blue,1024256,2,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
1,1024256002,Light denim blue,1024256,2,Slim fit,"Shell: Cotton 99%, Spandex 1%"
2,1024256003,Light denim blue,1024256,3,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
3,1024256003,Light denim blue,1024256,3,Slim fit,"Shell: Cotton 99%, Spandex 1%"
4,1024256004,Denim blue,1024256,4,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
5,1024256004,Denim blue,1024256,4,Slim fit,"Shell: Cotton 99%, Spandex 1%"
6,1024256005,Dark blue,1024256,5,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
7,1024256005,Dark blue,1024256,5,Slim fit,"Shell: Cotton 99%, Spandex 1%"
8,1024256006,Dark denim blue,1024256,6,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
9,1024256006,Dark denim blue,1024256,6,Slim fit,"Shell: Cotton 99%, Spandex 1%"


## <span style="color:#ff8000">2. Multiple Products </span>

In [60]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,1008549001,men_jeans_regular,Regular Jeans,$ 19.99,2022-01-29 13:15:13
1,1013317006,men_jeans_regular,Hybrid Regular Tapered Joggers,$ 39.99,2022-01-29 13:15:13
2,811993040,men_jeans_regular,Regular Jeans,$ 29.99,2022-01-29 13:15:13
3,1025726003,men_jeans_relaxed,Relaxed Jeans,$ 39.99,2022-01-29 13:15:13
4,979945001,men_jeans_loose,Loose Jeans,$ 29.99,2022-01-29 13:15:13


In [61]:
# ===================================== TESTES ====================================
for i in range(len(data)):

    # API Requests
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id'] + '.html'
    print(url)

https://www2.hm.com/en_us/productpage.1008549001.html
https://www2.hm.com/en_us/productpage.1013317006.html
https://www2.hm.com/en_us/productpage.0811993040.html
https://www2.hm.com/en_us/productpage.1025726003.html
https://www2.hm.com/en_us/productpage.0979945001.html
https://www2.hm.com/en_us/productpage.1008549004.html
https://www2.hm.com/en_us/productpage.0875105016.html
https://www2.hm.com/en_us/productpage.0875105018.html
https://www2.hm.com/en_us/productpage.1013317002.html
https://www2.hm.com/en_us/productpage.1025726002.html
https://www2.hm.com/en_us/productpage.0979945002.html
https://www2.hm.com/en_us/productpage.0811993037.html
https://www2.hm.com/en_us/productpage.1024256001.html
https://www2.hm.com/en_us/productpage.0985159001.html
https://www2.hm.com/en_us/productpage.0690449056.html
https://www2.hm.com/en_us/productpage.0690449043.html
https://www2.hm.com/en_us/productpage.1004199005.html
https://www2.hm.com/en_us/productpage.0690449022.html
https://www2.hm.com/en_us/pr

In [63]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# empty data frame to store all the needed information
df_details = pd.DataFrame()


for i in range(len(data)):

    # API Requests
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id'] + '.html'
    page = requests.get( url, headers=headers )


    # Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ================== color name =====================================
    product_list = soup.find_all('a', class_='filter-option miniature')
    color_name = [p.get('data-color') for p in product_list]

    # color name
    product_color_list = soup.find_all('a', class_='filter-option miniature')
    color_name = [p.get('data-color') for p in product_color_list]

    # product_id
    product_id = [p.get('data-articlecode') for p in product_color_list]

    df_color = pd.DataFrame([product_id, color_name]).T
    df_color.columns = ['product_id', 'color_name']

    # generate style id + color id
    df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

    # ================== composition =====================================
    product_composition_list = soup.find_all('div', class_='pdp-description-list-item')
    product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]


    # rename dataframe
    df_composition = pd.DataFrame(product_composition).T
    df_composition.columns = df_composition.iloc[0]


    # delete first row
    df_composition = df_composition.iloc[1:].fillna(method='ffill')
    df_composition = df_composition[['Fit', 'Composition', 'Art. No.']]

    # generate style id + color id
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

    # Merge data color + composition
    data_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id')
    
    # Products' details
    df_details = pd.concat([df_details, data_sku], axis=0)
    
# Join showroom data + details
data['style_id'] = data['product_id'].apply(lambda x: x[:-3])
data['color_id'] = data['product_id'].apply(lambda x: x[-3:])

data_raw = pd.merge(data, df_details[['style_id', 'color_name', 'Fit', 'Composition']], how='left', on='style_id')


In [64]:
data_raw.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,Fit,Composition
0,1008549001,men_jeans_regular,Regular Jeans,$ 19.99,2022-01-29 13:15:13,1008549,1,Denim blue,Regular fit,"Shell: Cotton 98%, Spandex 2%"
1,1008549001,men_jeans_regular,Regular Jeans,$ 19.99,2022-01-29 13:15:13,1008549,1,Denim blue,Regular fit,"Pocket lining: Polyester 65%, Cotton 35%"
2,1008549001,men_jeans_regular,Regular Jeans,$ 19.99,2022-01-29 13:15:13,1008549,1,Dark blue,Regular fit,"Shell: Cotton 98%, Spandex 2%"
3,1008549001,men_jeans_regular,Regular Jeans,$ 19.99,2022-01-29 13:15:13,1008549,1,Dark blue,Regular fit,"Pocket lining: Polyester 65%, Cotton 35%"
4,1008549001,men_jeans_regular,Regular Jeans,$ 19.99,2022-01-29 13:15:13,1008549,1,Black,Regular fit,"Shell: Cotton 98%, Spandex 2%"


In [65]:
# Salvando a base de dados em csv
data_raw.to_csv('products_hm.csv')