In [6]:
from bs4 import BeautifulSoup
from datetime import datetime
import requests
import pandas as pd
import numpy as np

# Competitor's Web Scraping

## H&M - Men - Jeans - main page

In [8]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

In [9]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0'}
# encontro esse user-agent da linha acima em http://developers.whatismybrowser.com/

In [10]:
page = requests.get(url, headers=headers)

In [11]:
soup = BeautifulSoup(page.text, 'html.parser')

In [12]:
products = soup.find('ul', class_='products-listing small')  # not find_all because I want only the first 

In [13]:
product_list = products.find_all('article', class_='hm-product-item')

In [14]:
# fazer um laco for que percorre a lista aplicando o metodo get sobre o "data-articlecode"
# escrevendo o for numa linha unica de codigo que vai retornar uma lista

# product_id
product_id = [p.get('data-articlecode') for p in product_list]

# product_category
product_category = [p.get('data-category') for p in product_list]

In [15]:
# product_name
product_list = products.find_all('a', class_='link')
product_name = [p.get_text() for p in product_list]

In [16]:
# price
product_list = products.find_all('span', class_='price regular')
product_price = [p.get_text() for p in product_list]

In [17]:
data = pd.DataFrame([product_id, product_category, product_name, product_price]).T
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']

In [18]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price
0,1024256001,men_jeans_slim,Slim Jeans,$ 19.99
1,1008549006,men_jeans_regular,Regular Jeans,$ 19.99
2,1024256002,men_jeans_slim,Slim Jeans,$ 19.99
3,690449043,men_jeans_ripped,Skinny Jeans,$ 39.99
4,690449051,men_jeans_ripped,Skinny Jeans,$ 39.99


In [19]:
# Para os itens product_color e product_composition temos que fazer a requisicao em outras paginas
# pois a vitrine (pagina principal) nao mostra todas as cores nem a composicao

## H&M - Men - Jeans - product individual pages

In [20]:
# API Requests

# example of a product page
url_3 = 'https://www2.hm.com/en_us/productpage.0985159001.html'

page = requests.get(url_3, headers=headers)

# Beautiful Soup object
soup = BeautifulSoup(page.text, 'html.parser')

# ================ Product Color ==================

gen_color_list = soup.find_all('a', class_='filter-option miniature')

# product colors
product_colors = [c.get('data-color') for c in gen_color_list]

# product id
product_id = [c.get('data-articlecode') for c in gen_color_list]

# dataframe color
df_color = pd.DataFrame(list(zip(product_id, product_colors)))
df_color.columns = ['product_id', 'product_colors']
df_color.head()

# generate style id + color id
df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])
df_color.head()

# ================ Product Composition ==================

gen_composition_list = soup.find_all('div', class_='details-attributes-list-item')
gen_composition = [list(filter(None, d.get_text().split('\n'))) for d in gen_composition_list]
gen_composition

del gen_composition[5]

df_composition = pd.DataFrame(gen_composition).T

# rename column names
df_composition.columns = df_composition.iloc[0]

# delete first row
df_composition = df_composition.iloc[1:]

df_composition = df_composition.loc[:, ['Fit', 'Composition', 'Art. No.']]

# fill NA with info from the row above
df_composition = df_composition.fillna(method='ffill')

# extract style id and article id from product_id
# generate style id + composition id
df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
df_composition['article_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])
df_composition.head()

# Merge color and composition dataframes
data_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id')
data_sku.head()


Unnamed: 0,product_id,product_colors,style_id,color_id,Fit,Composition
0,985159002,Denim blue,985159,2,Skinny fit,"Shell: Cotton 99%, Spandex 1%"
1,985159002,Denim blue,985159,2,Skinny fit,Pocket lining: Cotton 100%
2,985159003,Dark gray,985159,3,Skinny fit,"Shell: Cotton 99%, Spandex 1%"
3,985159003,Dark gray,985159,3,Skinny fit,Pocket lining: Cotton 100%
4,985159004,Light denim blue,985159,4,Skinny fit,"Shell: Cotton 99%, Spandex 1%"


### H&M - Men - Jeans - multiple pages¶

In [21]:
# product_url = ['https://www2.hm.com/en_us/productpage.' + str(p) + '.html' for p in product_id]

In [22]:
# empty dataframe
df_details = pd.DataFrame()

for i in range(len(data)):
   
    # API Requests
    url = 'https://www2.hm.com/en_us/productpage.' + data.loc[i, 'product_id'] + '.html'
    page = requests.get(url, headers=headers)

    # Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ================ Product Color ==================

    gen_color_list = soup.find_all('a', class_='filter-option miniature')

    # product colors
    product_colors = [c.get('data-color') for c in gen_color_list]

    # product id
    product_id = [c.get('data-articlecode') for c in gen_color_list]

    # dataframe color
    df_color = pd.DataFrame(list(zip(product_id, product_colors)))
    df_color.columns = ['product_id', 'product_colors']

    # generate style id + color id
    df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])
    

    # ================ Product Composition ==================

    gen_composition_list = soup.find_all('div', class_='details-attributes-list-item')
    gen_composition = [list(filter(None, d.get_text().split('\n'))) for d in gen_composition_list]

    del gen_composition[5]

    df_composition = pd.DataFrame(gen_composition).T

    # rename column names
    df_composition.columns = df_composition.iloc[0]

    # delete first row
    df_composition = df_composition.iloc[1:]

    df_composition = df_composition.loc[:, ['Fit', 'Composition', 'Art. No.']]

    # fill NA with info from the row above
    df_composition = df_composition.fillna(method='ffill')

    # extract style id and article id from product_id
    # generate style id + composition id
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['article_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])
    df_composition.head()

    # Merge color and composition dataframes
    data_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id')

    # all product details
    df_details = pd.concat([df_details, data_sku], axis=0)
    

In [23]:
df_details.head()

Unnamed: 0,product_id,product_colors,style_id,color_id,Fit,Composition
0,1024256002,Light denim blue,1024256,2,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
1,1024256002,Light denim blue,1024256,2,Slim fit,"Shell: Cotton 99%, Spandex 1%"
2,1024256002,Light denim blue,1024256,2,Slim fit,"Shell: Cotton 99%, Spandex 1%"
3,1024256002,Light denim blue,1024256,2,Slim fit,"Shell: Cotton 99%, Spandex 1%"
4,1024256002,Light denim blue,1024256,2,Slim fit,"Shell: Cotton 99%, Spandex 1%"
