In [1]:
import requests

import pandas as pd
import numpy as np

from datetime import datetime
from bs4 import BeautifulSoup

In [2]:
# Pagination
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15'}
pagination_page = requests.get(url, headers = headers)

soup = BeautifulSoup(pagination_page.text, 'html.parser')

total_item = soup.find_all('h2', class_='load-more-heading')[0].get('data-total')
itens_per_page = 36
page_number = np.ceil(int(total_item) / itens_per_page)
url_pagination = url + '?page-size=' + str(int(page_number * itens_per_page))
url_pagination

'https://www2.hm.com/en_us/men/products/jeans.html?page-size=108'

In [3]:
main_page = requests.get(url_pagination, headers = headers)
soup = BeautifulSoup(main_page.text, 'html.parser')
products = soup.find('ul', class_='products-listing small')

In [4]:
products_list_article = products.find_all('article', class_ = 'hm-product-item')

#product_id
product_id = [p.get('data-articlecode') for p in products_list_article]

#product_category
product_category = [p.get('data-category') for p in products_list_article]

In [5]:
#product_name
products_list_link = products.find_all('a', class_ = 'link')
product_name = [p.get_text() for p in products_list_link]

In [6]:
#product_price
product_list_spam = products.find_all('span', class_ = 'price regular')
product_price = [p.get_text() for p in product_list_spam]

In [7]:
data = pd.DataFrame([product_id, product_category, product_name, product_price]).T
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']

In [8]:
#scrapy datetime
data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d- %H:%M:%S')

In [61]:
data['style_code'] = data['product_id'].apply(lambda x: x[:-3])
data['color_id'] = data['product_id'].apply(lambda x: x[-3:]) 

In [62]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_code,color_id
0,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-28- 09:17:23,985197,1
1,985159001,men_jeans_skinny,Skinny Jeans,$ 19.99,2021-09-28- 09:17:23,985159,1
2,985197003,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-28- 09:17:23,985197,3
3,985197005,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-28- 09:17:23,985197,5
4,985197007,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-28- 09:17:23,985197,7


In [68]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15'}

# Empty DataFrame
df_details = pd.DataFrame()

# Unique columns for all products
aux = []
cols = ['Art. No.', 'Composition', 'Fit', 'More sustainable materials', 'Size']
df_pattern = pd.DataFrame(columns = cols)

for i in range(len(data)):

    # API request 
    url_products = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id'] + '.html'
    page_product = requests.get(url_products, headers = headers)

    # Beautiful Soup Object
    soup = BeautifulSoup(page_product.text, 'html.parser')

    # product_color
    product_list = soup.find_all('a', class_ = 'filter-option miniature')
    
    if len(product_list) > 0:
        
        color_name = [p.get('data-color') for p in product_list]
        product_id = [p.get('data-articlecode') for p in product_list]
        df_color = pd.DataFrame([product_id, color_name]).T
        df_color.columns = ['product_id', 'color_name']
        #================== Create Style Code and Color Code ==================#
        df_color['style_code'] = df_color['product_id'].apply(lambda x: x[:-3])
        df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:]) 

    # product_composition
    product_composition_list = soup.find_all('div', class_ = 'pdp-description-list-item')
    
    if len(product_composition_list) > 0:
        product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]
        df_composition = pd.DataFrame(product_composition).T
        df_composition.columns = df_composition.iloc[0]
        df_composition = df_composition.iloc[1:].fillna(method = 'ffill')

        # Garantee the same number of columns
        df_composition = pd.concat([df_pattern, df_composition], axis = 0)

        #================== Create Style Code and Color Code ==================#
        df_composition['style_code'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
        df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:]) 
    
    aux = aux + df_composition.columns.tolist()

    ## Merging DataFrames
    data_sku = pd.merge(df_color, df_composition[['style_code', 'Art. No.', 'Composition', 'Fit', 'More sustainable materials', 'Size']], how = 'left', on = 'style_code')
    
    df_details = pd.concat([df_details, data_sku], axis = 0)
    
# Join showroom + details
data_raw = pd.merge(data, df_details[['style_code', 'color_name', 'Fit', 'Composition', 'Size']], how = 'left', on = 'style_code')

In [39]:
set (aux)

{'Art. No.',
 'Composition',
 'Fit',
 'More sustainable materials',
 'Size',
 'color_id',
 'style_code'}

In [71]:
data_raw.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_code,color_id,color_name,Fit,Composition,Size
0,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-28- 09:17:23,985197,1,Midnight blue,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32"
1,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-28- 09:17:23,985197,1,Midnight blue,Slim fit,"Shell: Cotton 98%, Elastane 2%","The model is 189cm/6'2"" and wears a size 32/32"
2,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-28- 09:17:23,985197,1,Denim blue,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32"
3,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-28- 09:17:23,985197,1,Denim blue,Slim fit,"Shell: Cotton 98%, Elastane 2%","The model is 189cm/6'2"" and wears a size 32/32"
4,985197001,men_jeans_slim,Slim Jeans,$ 19.99,2021-09-28- 09:17:23,985197,1,Dark denim blue,Slim fit,Pocket lining: Cotton 100%,"The model is 189cm/6'2"" and wears a size 32/32"
