# Using Selenium and Requests: _Fjall Raven & Rains_

In this section we will be using Selenium and Beautiful Soup to scrape all of the women's bags from Fjall Raven & Rains. First, we will use Selenium to get every link for the bags, and then use Beautiful Soup to get the correct picture, price, product link, and description.

We have linked some great resources in our other notebooks.

#### Import your needed Libraries:

In [None]:
import time
import requests
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import cv2
import numpy as np 
import pandas as pd
import re
from PIL import Image
import requests
from io import BytesIO

#### Create variables for each of the URLs

We will do this for each of the pages we will be pulling from on _Fjall Raven & Rains_.

In [None]:
# Collection pages that list the backpacks for each brand.
rains = 'https://www.us.rains.com/collections/backpacks'
fjall_raven = 'https://www.fjallraven.us/collections/backpacks'

### Rains:

We will start with Rains, grabbing all of their bags in every color. The names of the picture files, the prices, and the names of the items will be saved in a DataFrame.

In [None]:
# CSS class strings used to locate the Rains product markup.
class_1 = 'flexslider collection'  # divs that are searched for <img> tags
class_2 = 'details_outer'          # divs that are searched for title/price

# Download the Rains collection page and parse it with the lxml parser.
page = requests.get(rains)
soup = BeautifulSoup(page.content, 'lxml')

# One div per match of class_1.
divs_imgs = soup.find_all('div', class_=class_1)

After inspecting, we can tell that these are all of the bags on the page. However, let's drop item 14, as it is not a bookbag. In the list comprehension we use `[::2]` to skip every other item in the list of images for each item. These are the hover images, which we do not need.

In [None]:
# Build one list of <img> tags per slider div, keeping every second tag
# (positions 0, 2, 4, ...).
images = []
for slider_div in divs_imgs:
    slider_imgs = slider_div.find_all('img')
    images.append(slider_imgs[::2])

# Discard the entry at index 13.
del images[13]

In [None]:
# Collapse the per-product lists into a single list of <img> tags.
flat_imgs = []
for product_imgs in images:
    flat_imgs.extend(product_imgs)

# The 'alt' attribute is stored as the color; 'data-src' is
# protocol-relative, so the scheme is prepended to form the URL.
rains_colors = []
rains_urls = []
for img_tag in flat_imgs:
    rains_colors.append(img_tag['alt'])
    rains_urls.append('https:' + img_tag['data-src'])

In [None]:
# All divs whose class attribute equals class_2.
divs = soup.find_all('div', class_=class_2)

In [None]:
# For each details div, read the product title and the raw price text from
# its first <a> element.
name = []
pri = []
for details_div in divs:
    name.append(details_div.find('a')['title'])
    price_span = details_div.find('a').find('span', class_='price_lg')
    pri.append(price_span.text.replace('\n', '').strip())

# Strip the currency markers and convert the price text to float.
prices = []
for price_text in pri:
    prices.append(float(price_text.replace(' USD', '').replace('$', '')))

In [None]:
# Remove index 13 from both lists so they stay in step with each other.
for per_product in (name, prices):
    del per_product[13]

In [None]:
# Number of kept images for each product.
div_num = list(map(len, images))

In [None]:
# Repeat each product's name and price once per image of that product, so
# the repeated values can later be flattened alongside the image lists.
name_count = [[name[idx]] * repeats for idx, repeats in enumerate(div_num)]
prices_count = [[prices[idx]] * repeats for idx, repeats in enumerate(div_num)]

In [None]:
# Flatten the list-of-lists into one flat list each for names and prices.
names_rains = []
prices_rains = []
for flat, nested in ((names_rains, name_count), (prices_rains, prices_count)):
    for group in nested:
        flat.extend(group)

### Create a DataFrame and create a function to download pictures :

Below we will use the lists created above and make a dataframe. Then we will define a function to download the images into our directory, and save the image file name in a new column of the DataFrame.

In [None]:
# Each list becomes one row of the intermediate frame; transposing turns the
# four lists into four columns, which are then given readable labels.
table = [names_rains, prices_rains, rains_colors, rains_urls]
rains_columns = {0: 'Names', 1: 'Prices', 2: 'Color', 3: 'URL'}
rains_df = pd.DataFrame(table).T.rename(columns=rains_columns)

In [None]:
# Preview the first three rows of the Rains table.
rains_df.head(3)

In [None]:
def save_src_image_apply(company,short,url):
    """Download one image and save it as a numbered JPEG file.

    The module-level counter ``c`` is incremented on every call and is used
    to number the file, so ``c`` must be set before the first call.

    Parameters
    ----------
    company : str
        Path prefix the file name is appended to (the callers in this
        notebook pass a directory ending in ``/``).
    short : str
        File-name prefix; the saved file is ``short + '_' + str(c) + '.jpg'``.
    url : str
        Address of the image to download.

    Returns
    -------
    str or Exception
        The file name (without ``company``) on success. On any failure the
        caught exception object is returned instead of being raised, so a
        single bad URL does not abort a ``DataFrame.apply`` over many rows.
    """
    global c

    # Pause between requests to avoid hitting the server in a tight loop.
    time.sleep(1)

    try:
        c += 1
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        # Surface HTTP errors directly instead of as an image-decoding error.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        # JPEG cannot store alpha or palette modes (e.g. RGBA, P, LA);
        # convert those so the save below does not fail.
        if img.mode not in ('RGB', 'L', 'CMYK'):
            img = img.convert('RGB')
        image_save_loc = company+short+'_'+str(c)+'.jpg'
        img.save(image_save_loc, "JPEG")
        print("Saved "+short+"_{}".format(c)+".jpg")
        return short+"_"+ str(c)+".jpg"
    except Exception as e:
        # Best-effort download: report the failure and hand the exception
        # back to the caller rather than raising.
        print("Failed to save image from {}: {!r}".format(url, e))
        return e

#### Save all images using an apply

In [None]:
# Reset the global counter that save_src_image_apply uses to number files.
c = 0

# Download every image in the URL column and store the value returned for
# each row in a new 'img' column.
rains_dir, rains_prefix = '../../Rains_backpacks/', 'rains_'
rains_df['img'] = rains_df['URL'].apply(
    lambda image_url: save_src_image_apply(rains_dir, rains_prefix, image_url)
)

In [None]:
# Write the Rains table to a CSV file two directories above the working directory.
rains_df.to_csv('../../rains_df.csv')

## Fjall Raven

Repeat earlier steps for Fjall Raven backpacks:

In [None]:
# Class of the div that is matched once per product on the listing page.
item_class = 'grid-view-item__link'

# Download the Fjallraven collection page and parse it with lxml.
page_2 = requests.get(fjall_raven)
soup_2 = BeautifulSoup(page_2.content, 'lxml')
divs_items = soup_2.find_all('div', class_=item_class)

Define variables to store the class names for each of the items that we need:

In [None]:
# Class strings used when searching the Fjallraven markup.
title_class = 'h4 grid-view-item__title'      # div holding the product title
price_class = 'h4 grid-view-item__price'      # div holding the product price
thumb_class = 'product-single__thumbnails'    # <ul> of thumbnails on a product page

Use these classes in __list comprehensions__ to loop through the list of divs and further filter the items we need:

In [None]:
# Walk the product divs once and pull out title text, price text, the
# absolute product link, and the <option> tags of the first <select>.
names_FR = []
prices_FR = []
links = []
colors_divs = []
for item_div in divs_items:
    names_FR.append(item_div.find('div', class_=title_class).text)
    prices_FR.append(item_div.find('div', class_=price_class).text)
    links.append('https://www.fjallraven.us/' + item_div.find('a')['href'])
    colors_divs.append(item_div.find('select').find_all('option'))

# Clean the option text: remove newlines and surrounding whitespace.
colors_FR = []
for option_tags in colors_divs:
    colors_FR.append([opt.text.replace('\n', '').strip() for opt in option_tags])

Use each link to grab the images for each color bag:

In [None]:
# Remove index 20 from the name, price and link lists so they stay aligned.
# Only these three lists are trimmed here; colors_FR is left as it is.
for per_product in (names_FR, prices_FR, links):
    del per_product[20]

In [None]:
# Slice step for each product's thumbnail list, by position in `links`.
skips = [2,2,1,1,2,2,1,1,1,1,1,1,1,1,1,1,1,4,3,3,1,1]

# Visit every product page and collect the zoom-image URLs of the kept thumbnails.
imgs_FR = []
for idx, product_url in enumerate(links):
    product_soup = BeautifulSoup(requests.get(product_url).content, 'lxml')
    thumbnails = product_soup.find('ul', class_=thumb_class)
    kept_anchors = thumbnails.find_all('a')[::skips[idx]]
    zoom_urls = []
    for anchor in kept_anchors:
        # 'data-zoom' is protocol-relative, so the scheme is prepended.
        zoom_urls.append('https:' + anchor['data-zoom'])
    imgs_FR.append(zoom_urls)

Use lengths of color and image lists for each item to make price and name lists long enough:

In [None]:
# Repeat each product's name and price once per image collected for it.
names_FR_count = [
    [names_FR[idx]] * len(imgs_FR[idx]) for idx in range(len(names_FR))
]
price_FR_count = [
    [prices_FR[idx]] * len(imgs_FR[idx]) for idx in range(len(names_FR))
]

Flatten all lists from list of lists to just a single list:

In [None]:
# Flatten each list-of-lists into a single flat list.
names_fjall, prices_fjall, colors_fjall, imgs_fjall = [], [], [], []
for flat, nested in (
    (names_fjall, names_FR_count),
    (prices_fjall, price_FR_count),
    (colors_fjall, colors_FR),
    (imgs_fjall, imgs_FR),
):
    for group in nested:
        flat.extend(group)

In [None]:
# Each list is a row of the intermediate frame; transposing makes them
# columns, which are then labelled.
table_FR = [names_fjall, prices_fjall, imgs_fjall]
fjall_columns = {0: 'Names', 1: 'Prices', 2: 'URL'}
fjallraven_df = pd.DataFrame(table_FR).T.rename(columns=fjall_columns)

Clean prices:

In [None]:
# Drop the dollar sign from each price string and convert it to float.
usd_prices = fjallraven_df['Prices'].apply(lambda label: float(label.replace('$', '')))
fjallraven_df['Prices'] = usd_prices


#### Save images and file names:

Use function from before to save images and create the column with the file name:

In [None]:
# Reset the global counter that save_src_image_apply uses to number files.
c = 0

# Download every image in the URL column and keep the value returned for
# each row in a new 'img' column.
fj_dir, fj_prefix = '../../fjall_raven_2/', 'FJ_'
fjallraven_df['img'] = fjallraven_df['URL'].apply(
    lambda image_url: save_src_image_apply(fj_dir, fj_prefix, image_url)
)


In [None]:
# File names (as stored in the 'img' column) of rows to exclude from the
# saved table. NOTE(review): presumably images that were reviewed by hand
# and judged unusable -- confirm the list against the downloaded files.
bad_bags = ['FJ__17.jpg','FJ__18.jpg','FJ__29.jpg','FJ__53.jpg',
            'FJ__54.jpg','FJ__55.jpg','FJ__56.jpg','FJ__57.jpg','FJ__58.jpg',
            'FJ__59.jpg','FJ__60.jpg','FJ__61.jpg','FJ__62.jpg','FJ__63.jpg','FJ__64.jpg',
            'FJ__65.jpg','FJ__66.jpg','FJ__67.jpg','FJ__68.jpg','FJ__69.jpg',
            'FJ__70.jpg','FJ__78.jpg','FJ__83.jpg','FJ__84.jpg','FJ__85.jpg',
            'FJ__104.jpg','FJ__107.jpg','FJ__108.jpg','FJ__111.jpg',
            'FJ__112.jpg','FJ__114.jpg','FJ__115.jpg','FJ__117.jpg','FJ__118.jpg',
            'FJ__120.jpg','FJ__121.jpg','FJ__127.jpg','FJ__137.jpg','FJ__145.jpg','FJ__151.jpg',
            'FJ__152.jpg','FJ__153.jpg','FJ__157.jpg','FJ__158.jpg','FJ__159.jpg',
            'FJ__161.jpg','FJ__162.jpg','FJ__163.jpg','FJ__164.jpg','FJ__165.jpg','FJ__166.jpg']

# Keep only the rows whose image file is not in the exclusion list. This
# defines fjallraven_df_test, which is written below and concatenated later.
fjallraven_df_test = fjallraven_df[fjallraven_df.img.apply(lambda x: x not in bad_bags)]
fjallraven_df_test.to_csv('../../fjallraven_df.csv')

#### Fjall Raven #2

In [None]:
# Second Fjallraven listing: the Kanken collection, sorted by descending price.
fjallrave_2 = 'https://www.fjallraven.us/collections/kanken?page=1&sort_by=price-descending'
item_class = 'grid-view-item__link'

# Download and parse the page, then collect one div per match of item_class.
page_3 = requests.get(fjallrave_2)
soup_3 = BeautifulSoup(page_3.content, 'lxml')
divs_items_2 = soup_3.find_all('div', class_=item_class)

For the most part our code to grab these images will not change. However, we will be adding a try-except. Some of the bags only have one picture and therefore do not have the same classes:

In [None]:
# Walk the product divs once and pull out title text, price text and the
# absolute product link.
names_FR_2 = []
prices_FR_2 = []
links_2 = []
for item_div in divs_items_2:
    names_FR_2.append(item_div.find('div', class_=title_class).text)
    prices_FR_2.append(item_div.find('div', class_=price_class).text)
    links_2.append('https://www.fjallraven.us/' + item_div.find('a')['href'])

In [None]:
# Collect the image URLs for every product in links_2.
imgs_FR_2 = []
# Class of the container used when a product page has no thumbnail list.
single_photo = 'product-single__photos'
# Slice step for each product's thumbnail list, by position in links_2.
skips_2 = [10,10,1,4,1,3,1,2,2,1,2,1,2,1,1,1,1,2,1,2,1,1,2]
for i,link in enumerate(links_2):
    loop_page = requests.get(link)
    soup_loop = BeautifulSoup(loop_page.content, 'lxml')
    try:
        # Usual layout: a <ul> of thumbnail anchors carrying 'data-zoom'.
        img_list = soup_loop.find('ul',{'class':thumb_class})
        img_items = img_list.findAll('a')[::skips_2[i]]
        imgs_FR_2.append(['https:'+x['data-zoom'] for x in img_items])
    except (AttributeError, KeyError, IndexError):
        # Fallback when the thumbnail markup is not usable: no <ul> was found
        # (AttributeError on None), an anchor lacks 'data-zoom' (KeyError), or
        # skips_2 has no entry for this position (IndexError). Only these
        # are caught so that Ctrl-C and unrelated errors are not swallowed.
        img_list = soup_loop.find('div',{'class':single_photo})
        img_items = img_list.findAll('img')
        imgs_FR_2.append(['https:'+x['src'] for x in img_items])


In [None]:
# Repeat each product's name and price once per image collected for it.
names_FR_count_2 = [
    [names_FR_2[idx]] * len(imgs_FR_2[idx]) for idx in range(len(names_FR_2))
]
price_FR_count_2 = [
    [prices_FR_2[idx]] * len(imgs_FR_2[idx]) for idx in range(len(names_FR_2))
]

In [None]:
# Flatten each list-of-lists into a single flat list.
names_fjall_2, prices_fjall_2, imgs_fjall_2 = [], [], []
for flat, nested in (
    (names_fjall_2, names_FR_count_2),
    (prices_fjall_2, price_FR_count_2),
    (imgs_fjall_2, imgs_FR_2),
):
    for group in nested:
        flat.extend(group)

In [None]:
# Each list is a row of the intermediate frame; transposing makes them
# columns, which are then labelled.
table_FR_2 = [names_fjall_2, prices_fjall_2, imgs_fjall_2]
fjall_columns_2 = {0: 'Names', 1: 'Prices', 2: 'URL'}
fjallraven_df_2 = pd.DataFrame(table_FR_2).T.rename(columns=fjall_columns_2)

In [None]:
# Preview the first rows of the second Fjallraven table.
fjallraven_df_2.head()

In [None]:
# Drop the dollar sign from each price string and convert it to float.
usd_prices_2 = fjallraven_df_2['Prices'].apply(lambda label: float(label.replace('$', '')))
fjallraven_df_2['Prices'] = usd_prices_2

In [None]:
# Start the global file counter at 166, so the first file saved here is
# numbered 167. NOTE(review): presumably this continues the numbering of the
# first Fjallraven download into the same directory -- confirm.
c = 166

# Download every image in the URL column and keep the value returned for
# each row in a new 'img' column.
fj_dir, fj_prefix = '../../fjall_raven_2/', 'FJ_'
fjallraven_df_2['img'] = fjallraven_df_2['URL'].apply(
    lambda image_url: save_src_image_apply(fj_dir, fj_prefix, image_url)
)

In [None]:
# Base names (no extension) of the downloaded images whose rows should be
# excluded from the saved table.
bad_bags_2 = ['FJ__177','FJ__178','FJ__179','FJ__180','FJ__181','FJ__182','FJ__183',
              'FJ__184','FJ__185','FJ__186','FJ__187','FJ__188','FJ__189','FJ__190',
              'FJ__191','FJ__192','FJ__193','FJ__194','FJ__195','FJ__196','FJ__197',
              'FJ__198','FJ__199','FJ__200','FJ__201','FJ__202','FJ__204','FJ__205',
              'FJ__206','FJ__207','FJ__208','FJ__209','FJ__210','FJ__211','FJ__234',
              'FJ__235','FJ__259','FJ__260','FJ__261','FJ__278','FJ__279','FJ__280',
              'FJ__296','FJ__297','FJ__298','FJ__314','FJ__315','FJ__316','FJ__317',
              'FJ__323','FJ__325','FJ__327','FJ__329','FJ__332','FJ__339','FJ__341',
              'FJ__343','FJ__345','FJ__348']

# The 'img' column holds full file names such as 'FJ__177.jpg', so the
# extension must be added before comparing; without it no row ever matched
# and the filter removed nothing.
bad_files_2 = [stem + '.jpg' for stem in bad_bags_2]

fjallraven_df_2 = fjallraven_df_2[fjallraven_df_2.img.apply(lambda x: x not in bad_files_2)]
fjallraven_df_2.to_csv('../../fjallraven_df_2.csv')

In [None]:
# Stack the two Fjallraven tables, first scrape on top of the second.
fjall_frames = [fjallraven_df_test, fjallraven_df_2]
result = pd.concat(fjall_frames)

In [None]:
# Replace the index with a fresh 0..n-1 range in place; drop=True discards
# the old index instead of keeping it as a column.
result.reset_index(drop=True,inplace=True)

In [None]:
# Write the combined Fjallraven table to a CSV file two directories above the working directory.
result.to_csv('../../fjallraven_df_full.csv')

In [None]:
# Tag every row with the same brand and source label.
# NOTE(review): in the cell order shown, the combined CSV is written before
# these columns are added, so that file does not contain them -- confirm
# whether that is intended.
for label_column in ('brand', 'source'):
    result[label_column] = 'Fjallraven'

In [None]:
# Preview the first rows of the combined table.
result.head()