# Scraping Dagne Dover product images
***

### 0. Import all necessary libraries

In [1]:
import numpy as np
import time
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import urllib
import pandas as pd
import cv2

### 1. Get image link for all products

In [110]:
# Collection pages to scrape.
urls = ["https://www.dagnedover.com/collections/tote-bags",
        "https://www.dagnedover.com/collections/crossbody"]

links = []   # 'src' attribute of the <img> inside every div.image
desc = []    # raw text of every div.product_loop_title
prices = []  # first <span> inside every div.product_loop_price (None if absent)

for url in urls:
    time.sleep(5)  # pause before every request, including the first one
    # A timeout keeps a stalled connection from hanging the notebook forever.
    page = requests.get(url, timeout=30)
    # Stop on 4xx/5xx responses instead of silently parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'lxml')
    match = soup.find_all('div', {'class': 'image'})  # find all tags with images
    links.extend(x.img['src'] for x in match)
    match2 = soup.find_all('div', {'class': 'product_loop_title'})
    desc.extend(x.text for x in match2)  # get item descriptions
    match3 = soup.find_all('div', {'class': 'product_loop_price'})
    # The span tag itself is stored; its text is extracted in a later step.
    prices.extend(x.span for x in match3)
print('Done')

Done


In [111]:
# Print the size of each scraped list (links, then descriptions, then prices)
# so a mismatch between them is visible at a glance.
for scraped in (links, desc, prices):
    print(len(scraped))

29
29
29


In [112]:
def _tidy_name(raw_name):
    """Remove newline characters from a product name, then trim its ends."""
    return raw_name.replace('\n', "").strip()


# clean text of product name
desc = list(map(_tidy_name, desc))
desc

['Allyn Tote',
 'Allyn Tote',
 'Allyn Tote',
 'Petite Tote',
 'Midi Tote',
 'Classic Tote',
 'Charlie Tote',
 'Charlie Tote',
 'Charlie Tote',
 'Ava Bucket',
 'Ava Bucket',
 'Landon Carryall',
 'Landon Carryall',
 'Landon Carryall',
 'Simone Satchel',
 'Simone Satchel',
 'Simone Satchel',
 'Andra Crossbody',
 'Andra Crossbody',
 'Andra Crossbody',
 'Petite Tote',
 'Midi Tote',
 'Petite Tote',
 'Ace Fanny Pack',
 'Ace Fanny Pack',
 'Ace Fanny Pack',
 'Landon Carryall',
 'Landon Carryall',
 'Landon Carryall']

In [113]:
# Convert every scraped price entry to its string form; a missing entry
# (None) becomes the literal text 'None'.
prices = list(map(str, prices))

In [114]:
# regex to clean up price text: matches each run of characters that directly
# follows a '>' and contains neither '<' nor '>' (the text between tags)
pattern = r'(?<=>)[^><]+'
p = re.compile(pattern)

# create new price list with clean data
prices_ = []

for i in prices:
    if i != 'None':
        # Ignore whitespace-only fragments and trim the rest.
        fragments = [frag.strip() for frag in p.findall(i) if frag.strip()]
    else:
        # 'None' is the string form of a product that had no price tag.
        fragments = []
    # Append exactly one value per product. The lists are later combined by
    # position, so adding zero or several values for one product would shift
    # every following price onto the wrong item.
    prices_.append(fragments[0] if fragments else 'none')

prices_

['$305',
 '$325',
 '$345',
 '$165',
 '$225',
 '$245',
 '$495',
 '$495',
 '$495',
 '$325',
 'none',
 '$185',
 '$185',
 '$215',
 '$395',
 '$395',
 '$395',
 '$185',
 '$155',
 '$195',
 '$165',
 '$225',
 '$145',
 'none',
 '$85',
 '$85',
 '$185',
 '$185',
 '$215']

In [115]:
# create dictionary with all item details to put into dataframe
# The three lists are matched purely by position, so a length mismatch means
# at least one product would get the wrong description or price. Stop with a
# clear message instead of building a misaligned table.
if not (len(links) == len(desc) == len(prices_)):
    raise ValueError(
        "scraped lists are misaligned: {} links, {} descriptions, {} prices".format(
            len(links), len(desc), len(prices_)))

# One dict per product; each dict becomes one dataframe row.
all_data = [
    {'description': name, 'link': link, 'price': price}
    for name, link, price in zip(desc, links, prices_)
]

In [116]:
# Turn the list of per-product dicts into a table: one row per dict,
# one column per dict key.
df_data = pd.DataFrame(data=all_data)
df_data

Unnamed: 0,description,link,price
0,Allyn Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$305
1,Allyn Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$325
2,Allyn Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$345
3,Petite Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$165
4,Midi Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$225
5,Classic Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$245
6,Charlie Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$495
7,Charlie Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$495
8,Charlie Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$495
9,Ava Bucket,//cdn.shopify.com/s/files/1/0260/1439/products...,$325


In [121]:
# clean up dataframe
# Written as a single non-inplace chain: the boolean-mask selection returns a
# new frame, and mutating that result in place is what makes pandas emit
# SettingWithCopyWarning.
df_data = (
    df_data[df_data.price != 'none']        # drop rows without a price
    .drop_duplicates(subset='description')  # keep the first row per product name
    .reset_index(drop=True)                 # renumber rows from 0, discard old index
)
df_data

In [125]:
# Write the cleaned table to disk so it can be merged with other data later.
# index=True is the default, spelled out here: the row index is stored as an
# unnamed first column of the csv.
df_data.to_csv('dd_data.csv', index=True)

### 2. Loop through image links and save as a .png file

In [126]:
# loop through links to request and save image locally
# Imported here because the notebook's import cell only has `import urllib`,
# which does not by itself guarantee that the `request` submodule is loaded.
import os
import urllib.request

out_dir = "DagneDover"
# Create the target folder up front so a missing folder cannot make every
# write fail.
os.makedirs(out_dir, exist_ok=True)

for i, url in enumerate(df_data.link):
    time.sleep(1)  # pause between downloads
    file_name = "dd_{}".format(i+1)+".png"  # files are numbered from 1 in row order
    try:
        # The scraped links are scheme-relative ('//cdn...'); add a scheme.
        url = 'http:' + url
        request = urllib.request.Request(url)
        # The context manager closes the connection; the timeout bounds a stall.
        with urllib.request.urlopen(request, timeout=30) as response:
            binary_str = response.read()
        # Raw bytes are unsigned (0-255), so view them as uint8 rather than int8.
        numpy_array = np.frombuffer(binary_str, dtype=np.uint8)
        image = cv2.imdecode(numpy_array, cv2.IMREAD_UNCHANGED)
        if image is None:
            # imdecode signals undecodable data by returning no image.
            raise ValueError("could not decode image from " + url)
        # imwrite reports failure through its return value; check it so that
        # "Saved" is only printed when a file was actually written.
        if not cv2.imwrite(os.path.join(out_dir, file_name), image):
            raise IOError("could not write " + file_name)
        print("Saved " + file_name)
    except Exception as e:
        # Best effort: report the problem and continue with the next image.
        print(str(e))

print("Done")


Saved dd_1.png
Saved dd_2.png
Saved dd_3.png
Saved dd_4.png
Saved dd_5.png
Saved dd_6.png
Saved dd_7.png
Saved dd_8.png
Saved dd_9.png
Saved dd_10.png
Done


In [2]:
# Load the table saved earlier back from disk.
df = pd.read_csv(filepath_or_buffer='dd_data.csv')
df

Unnamed: 0.1,Unnamed: 0,description,link,price
0,0,Allyn Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$305
1,1,Petite Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$165
2,2,Midi Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$225
3,3,Classic Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$245
4,4,Charlie Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$495
5,5,Ava Bucket,//cdn.shopify.com/s/files/1/0260/1439/products...,$325
6,6,Landon Carryall,//cdn.shopify.com/s/files/1/0260/1439/products...,$185
7,7,Simone Satchel,//cdn.shopify.com/s/files/1/0260/1439/products...,$395
8,8,Andra Crossbody,//cdn.shopify.com/s/files/1/0260/1439/products...,$185
9,9,Ace Fanny Pack,//cdn.shopify.com/s/files/1/0260/1439/products...,$85


In [3]:
# Add a constant 'brand' column so every row carries the brand name.
df = df.assign(brand='DagneDover')

In [4]:
# Build each row's image file name from the 'Unnamed: 0' column:
# value n becomes 'dd_<n+1>.png'.
df['img-file'] = 'dd_' + (df['Unnamed: 0'] + 1).astype(str) + '.png'

In [6]:
# Remove the 'Unnamed: 0' column from df in place.
del df['Unnamed: 0']

In [7]:
# Display the table after the 'brand' and 'img-file' columns were added and
# the 'Unnamed: 0' column was dropped.
df

Unnamed: 0,description,link,price,brand,img-file
0,Allyn Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$305,DagneDover,dd_1.png
1,Petite Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$165,DagneDover,dd_2.png
2,Midi Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$225,DagneDover,dd_3.png
3,Classic Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$245,DagneDover,dd_4.png
4,Charlie Tote,//cdn.shopify.com/s/files/1/0260/1439/products...,$495,DagneDover,dd_5.png
5,Ava Bucket,//cdn.shopify.com/s/files/1/0260/1439/products...,$325,DagneDover,dd_6.png
6,Landon Carryall,//cdn.shopify.com/s/files/1/0260/1439/products...,$185,DagneDover,dd_7.png
7,Simone Satchel,//cdn.shopify.com/s/files/1/0260/1439/products...,$395,DagneDover,dd_8.png
8,Andra Crossbody,//cdn.shopify.com/s/files/1/0260/1439/products...,$185,DagneDover,dd_9.png
9,Ace Fanny Pack,//cdn.shopify.com/s/files/1/0260/1439/products...,$85,DagneDover,dd_10.png


In [8]:
# Write the final table to disk so it can be merged with other data later.
# index=True is the default, spelled out here: the row index is stored as an
# unnamed first column of the csv.
df.to_csv('dd_data.csv', index=True)