# Scraping Sandqvist product images
***

### 0. Import all necessary libraries

In [1]:
import numpy as np
import time
from bs4 import BeautifulSoup as bs
import requests
import re
import pandas as pd
import urllib
import cv2
from selenium import webdriver
!pip install webdriver_manager
from webdriver_manager.chrome import ChromeDriverManager



### 1. Get the image link, description and price for all products

In [2]:
# Category listing pages to scrape: one shop URL per category slug.
urls = ["https://www.sandqvist.com/en/shop#categories~sor=shop/" + slug
        for slug in ("tote_bags", "shoulder-bags")]

# Empty accumulators filled in by later cells (raw image HTML, alt texts and
# price strings); `all_` is not used in the visible part of the notebook.
all_, links_, desc_, prices_ = [], [], [], []

In [3]:
# Resolve the chromedriver binary once; the path is the same for every URL.
driver_path = ChromeDriverManager().install()

for url in urls:
    # A fresh Chrome session is started for each category page.
    browser = webdriver.Chrome(driver_path)
    try:
        browser.get(url)
        # Fixed pause so the JavaScript-rendered product list can load.
        time.sleep(5)

        # NOTE: no scrolling is performed here; only the products present in
        # the DOM after the pause above are collected.
        elements = browser.find_elements_by_class_name(name='productList-image')
        # Raw inner HTML of every image container; parsed with regexes later.
        links_.extend([e.get_attribute('innerHTML') for e in elements])
        desc_.extend([e.get_attribute('alt') for e in elements])
        elements2 = browser.find_elements_by_class_name(name='productList-price')
        prices_.extend([e.text for e in elements2])

        # Keep the rendered source of the page that was just visited.
        source_data = browser.page_source
    finally:
        # quit() ends the whole session and the chromedriver process, even
        # when scraping fails; close() would only close the current window.
        browser.quit()


Checking for mac64 chromedriver:2.46 in cache
Driver found in /Users/gracejeong/.wdm/chromedriver/2.46/mac64/chromedriver

Checking for mac64 chromedriver:2.46 in cache
Driver found in /Users/gracejeong/.wdm/chromedriver/2.46/mac64/chromedriver


In [4]:
len(links_)

80

In [5]:
# Pull the image URL out of each product's raw innerHTML snippet.
# Raw string literal: "\=" in a normal string is an invalid escape sequence.
# The lazy group stops at the first '" alt' instead of running to the last one.
src_pattern = re.compile(r'src="(.*?)" alt')

links = []

for snippet in links_:
    found = src_pattern.search(snippet)
    if found is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError('no image src found in: {!r}'.format(snippet))
    # Image links are stored with the http scheme instead of https.
    links.append(found.group(1).replace('https', 'http'))

In [6]:
len(links)

80

In [7]:
# Pull the product description (the image's alt text) out of each raw
# innerHTML snippet.
# Raw string literal: "\=" in a normal string is an invalid escape sequence.
# [^"]* stops at the attribute's closing quote rather than at the last quote
# in the snippet.
alt_pattern = re.compile(r'alt="([^"]*)"')

desc = []

for snippet in links_:
    found = alt_pattern.search(snippet)
    if found is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError('no alt text found in: {!r}'.format(snippet))
    # The description is kept verbatim; the https -> http rewrite used for
    # image links does not belong on descriptive text.
    desc.append(found.group(1))


In [8]:
len(desc)

80

In [9]:
len(prices_)

80

In [10]:
# Clean up prices: take the first number found in each scraped price string.
# The dot is escaped and the fractional part is optional, so one- and
# two-digit prices are parsed and "." no longer matches arbitrary characters.
# NOTE(review): assumes prices carry no thousands separator ("1,299" would be
# read as 1.0) - confirm against the site's price format.
pattern = r'\d+(?:\.\d+)?'
p = re.compile(pattern)

prices = []
for x in prices_:
    found = p.search(x)
    if found is None:
        # Name the offending string instead of failing with an IndexError.
        raise ValueError('no numeric price found in: {!r}'.format(x))
    prices.append(float(found.group(0)))
prices

[319.0,
 319.0,
 369.0,
 159.0,
 159.0,
 169.0,
 169.0,
 125.0,
 199.0,
 319.0,
 319.0,
 319.0,
 319.0,
 369.0,
 369.0,
 289.0,
 369.0,
 379.0,
 289.0,
 125.0,
 375.0,
 375.0,
 375.0,
 350.0,
 389.0,
 235.0,
 235.0,
 235.0,
 125.0,
 125.0,
 369.0,
 275.0,
 275.0,
 275.0,
 139.0,
 139.0,
 125.0,
 125.0,
 125.0,
 125.0,
 319.0,
 209.0,
 209.0,
 209.0,
 429.0,
 429.0,
 429.0,
 239.0,
 239.0,
 139.0,
 209.0,
 209.0,
 239.0,
 239.0,
 159.0,
 189.0,
 319.0,
 319.0,
 319.0,
 319.0,
 319.0,
 319.0,
 319.0,
 319.0,
 239.0,
 429.0,
 429.0,
 239.0,
 239.0,
 209.0,
 209.0,
 189.0,
 209.0,
 159.0,
 159.0,
 375.0,
 375.0,
 375.0,
 350.0,
 350.0]

In [11]:
# Build one dictionary per product, ready to be loaded into a DataFrame.
# The three lists are matched up by position, so they must be equally long;
# otherwise descriptions, links and prices would end up on the wrong products.
if not (len(desc) == len(links) == len(prices)):
    raise ValueError(
        'scraped lists differ in length: {} descriptions, {} links, {} prices'
        .format(len(desc), len(links), len(prices)))

all_data = [
    {'description': description,
     'link': link,
     'price': price,
     'brand': 'Sandqvist'}
    for description, link, price in zip(desc, links, prices)
]

In [12]:
# Turn the list of product dictionaries into a DataFrame (one row per product).
df_data = pd.DataFrame(data=all_data)
df_data

Unnamed: 0,brand,description,link,price
0,Sandqvist,Iris - Green / Black / White,http://sandqvist.centracdn.net/client/dynamic/...,319.0
1,Sandqvist,Iris - Black,http://sandqvist.centracdn.net/client/dynamic/...,319.0
2,Sandqvist,Andreas - Black,http://sandqvist.centracdn.net/client/dynamic/...,369.0
3,Sandqvist,Marta - Powder,http://sandqvist.centracdn.net/client/dynamic/...,159.0
4,Sandqvist,Marta - Black,http://sandqvist.centracdn.net/client/dynamic/...,159.0
5,Sandqvist,Thea - Powder,http://sandqvist.centracdn.net/client/dynamic/...,169.0
6,Sandqvist,Thea - Black,http://sandqvist.centracdn.net/client/dynamic/...,169.0
7,Sandqvist,Stig Tote bag - Burgundy,http://sandqvist.centracdn.net/client/dynamic/...,125.0
8,Sandqvist,Tyre - Navy,http://sandqvist.centracdn.net/client/dynamic/...,199.0
9,Sandqvist,Stina - Beige,http://sandqvist.centracdn.net/client/dynamic/...,319.0


In [13]:
df_data.link[0]

'http://sandqvist.centracdn.net/client/dynamic/images/1952_cecbfdc1e6-iris-greenblackwhite-01.jpg'

In [14]:
# Row positions 0..n-1, one per product row in df_data.
num = list(range(df_data.shape[0]))

In [15]:
# Attach the row numbers as an extra column; the Series is unnamed, so the
# new column gets the label 0.
row_numbers = pd.Series(num)
df_data = pd.concat([df_data, row_numbers], axis='columns')
df_data

Unnamed: 0,brand,description,link,price,0
0,Sandqvist,Iris - Green / Black / White,http://sandqvist.centracdn.net/client/dynamic/...,319.0,0
1,Sandqvist,Iris - Black,http://sandqvist.centracdn.net/client/dynamic/...,319.0,1
2,Sandqvist,Andreas - Black,http://sandqvist.centracdn.net/client/dynamic/...,369.0,2
3,Sandqvist,Marta - Powder,http://sandqvist.centracdn.net/client/dynamic/...,159.0,3
4,Sandqvist,Marta - Black,http://sandqvist.centracdn.net/client/dynamic/...,159.0,4
5,Sandqvist,Thea - Powder,http://sandqvist.centracdn.net/client/dynamic/...,169.0,5
6,Sandqvist,Thea - Black,http://sandqvist.centracdn.net/client/dynamic/...,169.0,6
7,Sandqvist,Stig Tote bag - Burgundy,http://sandqvist.centracdn.net/client/dynamic/...,125.0,7
8,Sandqvist,Tyre - Navy,http://sandqvist.centracdn.net/client/dynamic/...,199.0,8
9,Sandqvist,Stina - Beige,http://sandqvist.centracdn.net/client/dynamic/...,319.0,9


In [16]:
# Derive the local image file name from the 0-based row number stored in
# column 0, counting from 1: row 0 -> sq_1.jpg, row 1 -> sq_2.jpg, ...
df_data['img-file'] = df_data[0].map(
    lambda row_number: 'sq_{}.jpg'.format(int(row_number) + 1))
df_data

Unnamed: 0,brand,description,link,price,0,img-file
0,Sandqvist,Iris - Green / Black / White,http://sandqvist.centracdn.net/client/dynamic/...,319.0,0,sq_1.jpg
1,Sandqvist,Iris - Black,http://sandqvist.centracdn.net/client/dynamic/...,319.0,1,sq_2.jpg
2,Sandqvist,Andreas - Black,http://sandqvist.centracdn.net/client/dynamic/...,369.0,2,sq_3.jpg
3,Sandqvist,Marta - Powder,http://sandqvist.centracdn.net/client/dynamic/...,159.0,3,sq_4.jpg
4,Sandqvist,Marta - Black,http://sandqvist.centracdn.net/client/dynamic/...,159.0,4,sq_5.jpg
5,Sandqvist,Thea - Powder,http://sandqvist.centracdn.net/client/dynamic/...,169.0,5,sq_6.jpg
6,Sandqvist,Thea - Black,http://sandqvist.centracdn.net/client/dynamic/...,169.0,6,sq_7.jpg
7,Sandqvist,Stig Tote bag - Burgundy,http://sandqvist.centracdn.net/client/dynamic/...,125.0,7,sq_8.jpg
8,Sandqvist,Tyre - Navy,http://sandqvist.centracdn.net/client/dynamic/...,199.0,8,sq_9.jpg
9,Sandqvist,Stina - Beige,http://sandqvist.centracdn.net/client/dynamic/...,319.0,9,sq_10.jpg


In [17]:
# Remove the helper row-number column (label 0) from df_data in place.
del df_data[0]

In [18]:
df_data

Unnamed: 0,brand,description,link,price,img-file
0,Sandqvist,Iris - Green / Black / White,http://sandqvist.centracdn.net/client/dynamic/...,319.0,sq_1.jpg
1,Sandqvist,Iris - Black,http://sandqvist.centracdn.net/client/dynamic/...,319.0,sq_2.jpg
2,Sandqvist,Andreas - Black,http://sandqvist.centracdn.net/client/dynamic/...,369.0,sq_3.jpg
3,Sandqvist,Marta - Powder,http://sandqvist.centracdn.net/client/dynamic/...,159.0,sq_4.jpg
4,Sandqvist,Marta - Black,http://sandqvist.centracdn.net/client/dynamic/...,159.0,sq_5.jpg
5,Sandqvist,Thea - Powder,http://sandqvist.centracdn.net/client/dynamic/...,169.0,sq_6.jpg
6,Sandqvist,Thea - Black,http://sandqvist.centracdn.net/client/dynamic/...,169.0,sq_7.jpg
7,Sandqvist,Stig Tote bag - Burgundy,http://sandqvist.centracdn.net/client/dynamic/...,125.0,sq_8.jpg
8,Sandqvist,Tyre - Navy,http://sandqvist.centracdn.net/client/dynamic/...,199.0,sq_9.jpg
9,Sandqvist,Stina - Beige,http://sandqvist.centracdn.net/client/dynamic/...,319.0,sq_10.jpg


In [21]:
df_data[df_data['img-file'] == 'sq_32.jpg']

Unnamed: 0,brand,description,link,price,img-file
31,Sandqvist,Jussi - Beluga,http://sandqvist.centracdn.net/client/dynamic/...,275.0,sq_32.jpg


In [22]:
# Write the product table to CSV so it can be merged with other data later.
# The row index is written too, as the first (unnamed) column.
df_data.to_csv('sandqvist_data.csv', index=True)

### 2. Loop through image links and save each image as a .jpg file

In [33]:
# Download every product image and save it locally as a JPEG named
# sq_<row number + 1>.jpg inside the Sandqvist/ folder.
import os
from PIL import Image
import requests
from io import BytesIO

# Create the target folder up front; without it every save would fail and
# the error would only show up as a printed message.
os.makedirs('Sandqvist', exist_ok=True)

for i, url in enumerate(df_data.link):
    file_name = 'sq_' + str(i + 1) + '.jpg'
    try:
        # The timeout keeps one stalled download from hanging the whole loop.
        response = requests.get(url, timeout=30)
        # Treat HTTP error pages as failures instead of parsing them as images.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        # JPEG cannot store an alpha channel or a palette; convert those modes.
        if img.mode in ('RGBA', 'LA', 'P'):
            img = img.convert('RGB')
        image_save_loc = 'Sandqvist/' + file_name
        img.save(image_save_loc, "JPEG")
        print("Saved " + file_name)
    except Exception as e:
        # Best effort: report which file failed and carry on with the rest.
        print("Failed " + file_name + ": " + str(e))

print("Done")


Saved sq_1.jpg
Saved sq_2.jpg
Saved sq_3.jpg
Saved sq_4.jpg
Saved sq_5.jpg
Saved sq_6.jpg
Saved sq_7.jpg
Saved sq_8.jpg
Saved sq_9.jpg
Saved sq_10.jpg
Saved sq_11.jpg
Saved sq_12.jpg
Saved sq_13.jpg
Saved sq_14.jpg
Saved sq_15.jpg
Saved sq_16.jpg
Saved sq_17.jpg
Saved sq_18.jpg
Saved sq_19.jpg
Saved sq_20.jpg
Saved sq_21.jpg
Saved sq_22.jpg
Saved sq_23.jpg
Saved sq_24.jpg
Saved sq_25.jpg
Saved sq_26.jpg
Saved sq_27.jpg
Saved sq_28.jpg
Saved sq_29.jpg
Saved sq_30.jpg
Saved sq_31.jpg
Saved sq_32.jpg
Saved sq_33.jpg
Saved sq_34.jpg
Saved sq_35.jpg
Saved sq_36.jpg
Saved sq_37.jpg
Saved sq_38.jpg
Saved sq_39.jpg
Saved sq_40.jpg
Saved sq_41.jpg
Saved sq_42.jpg
Saved sq_43.jpg
Saved sq_44.jpg
Saved sq_45.jpg
Saved sq_46.jpg
Saved sq_47.jpg
Saved sq_48.jpg
Saved sq_49.jpg
Saved sq_50.jpg
Saved sq_51.jpg
Saved sq_52.jpg
Saved sq_53.jpg
Saved sq_54.jpg
Saved sq_55.jpg
Saved sq_56.jpg
Saved sq_57.jpg
Saved sq_58.jpg
Saved sq_59.jpg
Saved sq_60.jpg
Saved sq_61.jpg
Saved sq_62.jpg
Saved sq_63.jpg
S