# Scraping H&M product images
***

### 0. Import all necessary libraries

In [2]:
import numpy as np
import time
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import urllib
import pandas as pd
import cv2

### 1. Get image links for all products

In [2]:
# Request the H&M listing page (women's new arrivals, product type "bag",
# 36 items per page starting at offset 0) and parse the returned HTML.
url = "https://www2.hm.com/en_us/women/new-arrivals/shoes-accessories.html?sort=stock&productTypes=bag&image-size=small&image=stillLife&offset=0&page-size=36"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
hm_page = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx answer instead of silently parsing an error page.
hm_page.raise_for_status()
soup = BeautifulSoup(hm_page.content, 'lxml')

In [3]:
# Pull the product image tags and the regular-price tags out of the parsed page.
# find_all() is the current BeautifulSoup name; findAll() is a deprecated alias.
match = soup.find_all('img', {"class": "item-image"})  # every product <img> tag
links = [img['data-altimage'] for img in match]  # image link of each product
desc = [img['alt'] for img in match]  # alt text doubles as the item description
match2 = soup.find_all('span', {'class': 'price regular'})  # every regular-price <span>
prices = [span.text for span in match2]  # price text of each tag
# NOTE(review): prices come from a separate query and are later paired with the
# images purely by position; a product without a 'price regular' span would
# shift every following price by one row -- confirm against the page markup.

In [4]:
# Turn every scraped link into a complete URL: swap the '[m]' token for '[y]',
# replace 'style' with 'main', and prepend the 'http:' scheme that the
# scraped links are missing.
links_ = [
    'http:' + raw_link.replace('[m]', '[y]').replace('style', 'main')
    for raw_link in links
]

In [5]:
# Build one dictionary per product from the three parallel lists.
# zip() pairs the lists by position and stops at the shortest one, so warn
# when the lists disagree in length instead of failing halfway through.
if not (len(links_) == len(desc) == len(prices)):
    print("Warning: {} links, {} descriptions, {} prices - rows may be misaligned".format(
        len(links_), len(desc), len(prices)))
all_data = [
    {"link": link, "description": description, "price": price}
    for link, description, price in zip(links_, desc, prices)
]
all_data

[{'link': 'http://lp2.hm.com/hmgoepprod?set=source[/14/19/1419903843a7b4bbd0b497c8189e43061fb18056.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[y],hmver[1]&call=url[file:/product/main]',
  'description': 'Bucket Bag with Suede Details',
  'price': '$49.99'},
 {'link': 'http://lp2.hm.com/hmgoepprod?set=source[/9e/36/9e362586554b101dcc2745227fec6a93bc9a5c45.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[y],hmver[1]&call=url[file:/product/main]',
  'description': 'Straw Shoulder Bag',
  'price': '$19.99'},
 {'link': 'http://lp2.hm.com/hmgoepprod?set=source[/19/58/19581a1612861d63e710ab471eac47fc8b2e515d.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[y],hmver[1]&call=url[file:/product/main]',
  'description': 'Paper Straw Shopper',
  'price': '$29.99'},
 {'link': 'http://lp2.hm.com/hmgoepprod?set=source[/21/8c/218cea81e997d7f320a8bd005662667462532d8a.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[y],hmver[1]&call=url[file:/product/main]',
 

In [6]:
# Convert the list of product dictionaries into a DataFrame
# (one row per product, one column per dictionary key) and display it.
df_data = pd.DataFrame(data=all_data)
df_data

Unnamed: 0,description,link,price
0,Bucket Bag with Suede Details,http://lp2.hm.com/hmgoepprod?set=source[/14/19...,$49.99
1,Straw Shoulder Bag,http://lp2.hm.com/hmgoepprod?set=source[/9e/36...,$19.99
2,Paper Straw Shopper,http://lp2.hm.com/hmgoepprod?set=source[/19/58...,$29.99
3,Mini Shopper,http://lp2.hm.com/hmgoepprod?set=source[/21/8c...,$19.99
4,Net Bag,http://lp2.hm.com/hmgoepprod?set=source[/b2/bf...,$14.99
5,Suede Shopper,http://lp2.hm.com/hmgoepprod?set=source[/90/8c...,$69.99
6,Round Straw Bag,http://lp2.hm.com/hmgoepprod?set=source[/ce/67...,$17.99
7,Braided Handbag,http://lp2.hm.com/hmgoepprod?set=source[/36/ed...,$59.99
8,Straw Bag with Fabric Bag,http://lp2.hm.com/hmgoepprod?set=source[/e2/2f...,$29.99
9,Suede Shoulder Bag,http://lp2.hm.com/hmgoepprod?set=source[/6f/91...,$49.99


In [7]:
# Persist the product table as CSV so it can be merged with other data later.
# The row index is written as an unnamed first column; the later cell that
# builds the image file names reads it back as 'Unnamed: 0'.
df_data.to_csv('hm_data.csv', index=True)

### 2. Loop through image links and save each as a .png file

In [9]:
# Download every product image and store it as H&M/hm_<n>.png, where n is the
# 1-based position of the link in links_.
# NOTE(review): the "H&M/" directory is assumed to exist already; if it does
# not, cv2.imwrite reports failure (handled below) rather than creating it.
for i, img_url in enumerate(links_):
    time.sleep(1)  # pause between requests so the server is not hammered
    filename = "hm_{}.png".format(i + 1)
    try:
        # Same HTTP client as the page request; the timeout prevents a stalled
        # download from blocking the whole loop.
        response = requests.get(img_url, timeout=30)
        response.raise_for_status()
        # imdecode expects the encoded file as unsigned bytes; int8 cannot
        # represent byte values 128-255.
        numpy_array = np.frombuffer(response.content, dtype=np.uint8)
        image = cv2.imdecode(numpy_array, cv2.IMREAD_UNCHANGED)
        if image is None:
            # imdecode signals undecodable data by returning None.
            print("Could not decode {} ({})".format(filename, img_url))
            continue
        # imwrite returns False when the file could not be written.
        if cv2.imwrite("H&M/" + filename, image):
            print("Saved " + filename)
        else:
            print("Could not write H&M/" + filename)
    except Exception as e:
        # Best effort: report which image failed and carry on with the rest.
        print("Failed {} ({}): {}".format(filename, img_url, e))

print("Done")


Saved hm_0.png
Saved hm_1.png
Saved hm_2.png
Saved hm_3.png
Saved hm_4.png
Saved hm_5.png
Saved hm_6.png
Saved hm_7.png
Saved hm_8.png
Saved hm_9.png
Saved hm_10.png
Saved hm_11.png
Saved hm_12.png
Saved hm_13.png
Saved hm_14.png
Saved hm_15.png
Saved hm_16.png
Saved hm_17.png
Saved hm_18.png
Saved hm_19.png
Saved hm_20.png
Saved hm_21.png
Saved hm_22.png
Saved hm_23.png
Saved hm_24.png
Saved hm_25.png
Saved hm_26.png
Saved hm_27.png
Saved hm_28.png
Done


In [3]:
# Reload the product table saved earlier; the index column written by to_csv
# comes back as a regular column named 'Unnamed: 0'.
df = pd.read_csv(filepath_or_buffer='hm_data.csv')
df

Unnamed: 0.1,Unnamed: 0,description,link,price
0,0,Bucket Bag with Suede Details,http://lp2.hm.com/hmgoepprod?set=source[/14/19...,$49.99
1,1,Straw Shoulder Bag,http://lp2.hm.com/hmgoepprod?set=source[/9e/36...,$19.99
2,2,Paper Straw Shopper,http://lp2.hm.com/hmgoepprod?set=source[/19/58...,$29.99
3,3,Mini Shopper,http://lp2.hm.com/hmgoepprod?set=source[/21/8c...,$19.99
4,4,Net Bag,http://lp2.hm.com/hmgoepprod?set=source[/b2/bf...,$14.99
5,5,Suede Shopper,http://lp2.hm.com/hmgoepprod?set=source[/90/8c...,$69.99
6,6,Round Straw Bag,http://lp2.hm.com/hmgoepprod?set=source[/ce/67...,$17.99
7,7,Braided Handbag,http://lp2.hm.com/hmgoepprod?set=source[/36/ed...,$59.99
8,8,Straw Bag with Fabric Bag,http://lp2.hm.com/hmgoepprod?set=source[/e2/2f...,$29.99
9,9,Suede Shoulder Bag,http://lp2.hm.com/hmgoepprod?set=source[/6f/91...,$49.99


In [4]:
# Tag every row with the brand and attach the name of its downloaded image.
df['brand'] = 'H&M'
# 'Unnamed: 0' holds the 0-based row number from the CSV, while the image
# files are numbered from 1, hence the +1.
df['img-file'] = 'hm_' + (df['Unnamed: 0'] + 1).astype(str) + '.png'
# The helper column is no longer needed once the file names exist.
df = df.drop('Unnamed: 0', axis=1)
df

Unnamed: 0,description,link,price,brand,img-file
0,Bucket Bag with Suede Details,http://lp2.hm.com/hmgoepprod?set=source[/14/19...,$49.99,H&M,hm_1.png
1,Straw Shoulder Bag,http://lp2.hm.com/hmgoepprod?set=source[/9e/36...,$19.99,H&M,hm_2.png
2,Paper Straw Shopper,http://lp2.hm.com/hmgoepprod?set=source[/19/58...,$29.99,H&M,hm_3.png
3,Mini Shopper,http://lp2.hm.com/hmgoepprod?set=source[/21/8c...,$19.99,H&M,hm_4.png
4,Net Bag,http://lp2.hm.com/hmgoepprod?set=source[/b2/bf...,$14.99,H&M,hm_5.png
5,Suede Shopper,http://lp2.hm.com/hmgoepprod?set=source[/90/8c...,$69.99,H&M,hm_6.png
6,Round Straw Bag,http://lp2.hm.com/hmgoepprod?set=source[/ce/67...,$17.99,H&M,hm_7.png
7,Braided Handbag,http://lp2.hm.com/hmgoepprod?set=source[/36/ed...,$59.99,H&M,hm_8.png
8,Straw Bag with Fabric Bag,http://lp2.hm.com/hmgoepprod?set=source[/e2/2f...,$29.99,H&M,hm_9.png
9,Suede Shoulder Bag,http://lp2.hm.com/hmgoepprod?set=source[/6f/91...,$49.99,H&M,hm_10.png


In [5]:
# Overwrite hm_data.csv with the enriched table (brand and img-file added).
# The row index is written again as an unnamed first column, so re-reading
# the file yields an 'Unnamed: 0' column just like the first save did.
df.to_csv('hm_data.csv', index=True)