In [23]:
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
import numpy as np

In [6]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# the header is added so the request is treated like a web browser
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5), AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url, headers=headers)

In [7]:
soup = BeautifulSoup( page.text, 'html.parser' )

In [11]:
products = soup.find('ul', class_='products-listing small')

In [12]:
product_list = products.find_all('article', class_='hm-product-item')

# product id
product_id = [p.get('data-articlecode') for p in product_list]

# product category
product_category = [p.get('data-category') for p in product_list]

In [13]:
# product name
product_list = products.find_all('a', class_='link')
product_name = [p.get_text() for p in product_list]

In [14]:
# product price
product_list = products.find_all('span', class_='price regular')
product_price = [p.get_text() for p in product_list]

In [19]:
# we use the transposed matrix so items are correctly aligned
data = pd.DataFrame([product_id, product_category, product_name, product_price]).T
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']

data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [20]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-11-02 14:27:28
1,751994034,men_jeans_slim,Slim Jeans,$ 29.99,2021-11-02 14:27:28
2,1004476004,men_jeans_slim,Freefit® Slim Jeans,$ 49.99,2021-11-02 14:27:28
3,938875001,men_jeans_slim,Slim Tapered Jeans,$ 39.99,2021-11-02 14:27:28
4,985197005,men_jeans_slim,Slim Jeans,$ 19.99,2021-11-02 14:27:28


In [21]:
# we take the first element with class data-total which represents how many items there are
# in the page
total_item = soup.find_all('h2', class_='load-more-heading')[0].get('data-total')
total_item

'90'

In [24]:
# how many pages we have
# we divide by 36 because it's the number of items displayed in the page
page_number = np.round(int(total_item)/36)

In [25]:
# the second url will be created with the page-size argument followed by the number of pages
# times how many items are displayed per page
url02 = url + '?page-size=' + str(int(page_number * 36))

### Fecthing information about a single product

In [30]:
url = 'https://www2.hm.com/en_us/productpage.0636207010.html'

# the header is added so the request is treated like a web browser
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5), AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url, headers=headers)

soup = BeautifulSoup( page.text, 'html.parser' )

In [35]:
product_list = soup.find_all('a', class_='filter-option miniature')
color_name = [p.get('data-color') for p in product_list]
product_id = [p.get('data-articlecode') for p in product_list]

In [69]:
df_color = pd.DataFrame([product_id, color_name]).T
df_color.columns = ['product_id', 'color_name']

# generate style id + color id

df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[3:])

In [38]:
df_color.head()

Unnamed: 0,product_id,color_name
0,636207001,Dark denim blue
1,636207002,Dark gray denim
2,636207004,Denim blue
3,636207005,Gray
4,636207006,Black


In [56]:
product_composition_list = soup.find_all('div', class_='pdp-description-list-item')

product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]

In [71]:
df_composition = pd.DataFrame(product_composition).T
# setting the first line to be the column name
df_composition.columns = df_composition.iloc[0]
# removing the first line and filling the second with the information from the first
df_composition = df_composition.iloc[1:].fillna(method='ffill')

# generate style id + color id

df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[3:])

In [72]:
# merge data color + composition

data_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id')

In [73]:
data_sku

Unnamed: 0,product_id,color_name,style_id,color_id,Fit,Composition
0,636207001,Dark denim blue,636207,6207001,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
1,636207001,Dark denim blue,636207,6207001,Slim fit,Pocket lining: Cotton 100%
2,636207002,Dark gray denim,636207,6207002,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
3,636207002,Dark gray denim,636207,6207002,Slim fit,Pocket lining: Cotton 100%
4,636207004,Denim blue,636207,6207004,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
5,636207004,Denim blue,636207,6207004,Slim fit,Pocket lining: Cotton 100%
6,636207005,Gray,636207,6207005,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
7,636207005,Gray,636207,6207005,Slim fit,Pocket lining: Cotton 100%
8,636207006,Black,636207,6207006,Slim fit,"Cotton 88%, Polyester 10%, Spandex 2%"
9,636207006,Black,636207,6207006,Slim fit,Pocket lining: Cotton 100%


### Fecthing information about all the products

In [91]:
# empty df
df_details = pd.DataFrame()

# unique columns for all products
aux = []

cols = ['Art. No.', 'Composition', 'Fit', 'Product safety', 'Size']
df_pattern = pd.DataFrame(columns=cols)

for i in range(len(data)):
    # API requests
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id'] + '.html'
    page = requests.get(url, headers=headers)
    
    # Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')
    
    # ==================== color name =================================
    
    product_list = soup.find_all('a', class_='filter-option miniature')
    color_name = [p.get('data-color') for p in product_list]
    
    # product id
    product_id = [p.get('data-articlecode') for p in product_list]
    
    df_color = pd.DataFrame([product_id, color_name]).T
    df_color.columns = ['product_id', 'color_name']

    # generate style id + color id

    df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['product_id'].apply(lambda x: x[3:])
    
    product_composition_list = soup.find_all('div', class_='pdp-description-list-item')
    product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]
    
    # reaname dataframe
    df_composition = pd.DataFrame(product_composition).T
    
    # setting the first line to be the column name
    df_composition.columns = df_composition.iloc[0]
    
    # removing the first line and filling the second with the information from the first
    df_composition = df_composition.iloc[1:].fillna(method='ffill')
    
    # garantee the same number of columns
    df_composition = pd.concat([df_pattern, df_composition], axis=0)

    # generate style id + color id
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[3:])
    
    aux = aux + df_composition.columns.tolist()
    
    # merge data color + decomposition
    data_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition',
                                                  'Size', 'Product safety']], how='left', on='style_id')
    # all details products
    df_details = pd.concat([df_details, data_sku], axis=0)
    
# Join showroom data + details
data['style_id'] = data['product_id'].apply(lambda x: x[:-3])
data['color_id'] = data['product_id'].apply(lambda x: x[-3:])

data_raw = pd.merge(data, df_details[['style_id', 'color_name', 'Fit',
                                      'Composition', 'Size', 'Product safety']], how='left', on='style_id')

In [92]:
# to know which columns are unique
set(aux)

{'Art. No.',
 'Composition',
 'Fit',
 'More sustainable materials',
 'Product safety',
 'Size',
 'color_id',
 'style_id'}

In [93]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id
0,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99,2021-11-02 14:27:28,690449,51
1,751994034,men_jeans_slim,Slim Jeans,$ 29.99,2021-11-02 14:27:28,751994,34
2,1004476004,men_jeans_slim,Freefit® Slim Jeans,$ 49.99,2021-11-02 14:27:28,1004476,4
3,938875001,men_jeans_slim,Slim Tapered Jeans,$ 39.99,2021-11-02 14:27:28,938875,1
4,985197005,men_jeans_slim,Slim Jeans,$ 19.99,2021-11-02 14:27:28,985197,5
