# Lecture - Static web scraping 3

Author: Jun Sun (jun.sun@gesis.org)

## selectorlib

In [1]:
# install the selectorlib package
!pip install selectorlib

Collecting selectorlib
  Downloading selectorlib-0.16.0-py2.py3-none-any.whl (5.8 kB)
Collecting parsel>=1.5.1 (from selectorlib)
  Downloading parsel-1.8.1-py2.py3-none-any.whl (17 kB)
Collecting cssselect>=0.9 (from parsel>=1.5.1->selectorlib)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting jmespath (from parsel>=1.5.1->selectorlib)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting w3lib>=1.19.0 (from parsel>=1.5.1->selectorlib)
  Downloading w3lib-2.1.2-py3-none-any.whl (21 kB)
Installing collected packages: w3lib, jmespath, cssselect, parsel, selectorlib
Successfully installed cssselect-1.2.0 jmespath-1.0.1 parsel-1.8.1 selectorlib-0.16.0 w3lib-2.1.2


In [2]:
# import the packages we need
from selectorlib import Extractor
import requests

In [3]:
# starting point of the scrape: page 1 of the shop listing
url = 'https://scrapeme.live/shop/page/1/'

In [4]:
# field definitions for selectorlib; we get the yaml from the selectorlib addon.
# Each field (product_name, price, image, link) has a CSS selector,
# `multiple: true` to collect every match on the page, and a type
# (Text, Image or Link).
yaml = """
  product_name:
      css: 'li.product h2.woocommerce-loop-product__title'
      xpath: null
      multiple: true
      type: Text
  price:
      css: 'li.product span.woocommerce-Price-amount'
      xpath: null
      multiple: true
      type: Text
  image:
      css: 'li.product img.attachment-woocommerce_thumbnail'
      xpath: null
      multiple: true
      type: Image
  link:
      css: 'li.product a.woocommerce-LoopProduct-link'
      xpath: null
      multiple: true
      type: Link
"""


In [5]:
# create a selectorlib extractor from the yaml field definitions
e = Extractor.from_yaml_string(yaml)

In [6]:
# request the URL; the timeout keeps the cell from hanging on an unresponsive
# server, and raise_for_status() stops here on an HTTP error (4xx/5xx)
# instead of handing an error page to the extractor
r = requests.get(url, timeout=30)
r.raise_for_status()

In [7]:
# use the extractor on the page HTML to extract the product names, prices,
# images and links
e.extract(r.text)

{'product_name': ['Bulbasaur',
  'Ivysaur',
  'Venusaur',
  'Charmander',
  'Charmeleon',
  'Charizard',
  'Squirtle',
  'Wartortle',
  'Blastoise',
  'Caterpie',
  'Metapod',
  'Butterfree',
  'Weedle',
  'Kakuna',
  'Beedrill',
  'Pidgey'],
 'price': ['£ 63.00',
  '£ 87.00',
  '£ 105.00',
  '£ 48.00',
  '£ 165.00',
  '£ 156.00',
  '£ 130.00',
  '£ 123.00',
  '£ 76.00',
  '£ 73.00',
  '£ 148.00',
  '£ 162.00',
  '£ 25.00',
  '£ 148.00',
  '£ 168.00',
  '£ 159.00'],
 'image': ['https://scrapeme.live/wp-content/uploads/2018/08/001-350x350.png',
  'https://scrapeme.live/wp-content/uploads/2018/08/002-350x350.png',
  'https://scrapeme.live/wp-content/uploads/2018/08/003-350x350.png',
  'https://scrapeme.live/wp-content/uploads/2018/08/004-350x350.png',
  'https://scrapeme.live/wp-content/uploads/2018/08/005-350x350.png',
  'https://scrapeme.live/wp-content/uploads/2018/08/006-350x350.png',
  'https://scrapeme.live/wp-content/uploads/2018/08/007-350x350.png',
  'https://scrapeme.live/wp-co

## Regular expression

In [8]:
# Import the re package for regular expression handling
import re

In [9]:
# regex for retrieving the stock level: the group (\d+) captures the digits
# directly in front of " in stock"
regex_stock = re.compile(r"(\d+) in stock")

In [10]:
# findall returns the captured groups as strings; the first element of the
# result is the stock level
regex_stock.findall("41 in stock")

['41']

What if something is out of stock?

In [11]:
# no number precedes "in stock" here, so findall returns an empty list,
# but for our purposes the stock level should be 0
regex_stock.findall("out of stock")

[]

In [12]:
# a function with the right behavior
def get_stock_level(str_stock):
  """Return the stock level found in a stock label as an int.

  Args:
    str_stock: Stock text such as "41 in stock" or "out of stock".
      None or an empty string is treated as carrying no stock information.

  Returns:
    The number directly preceding " in stock" (first occurrence), or 0
    when the text is missing or does not contain that pattern.
  """
  # A missing or empty label has nothing to parse; return 0 instead of
  # letting the regex raise TypeError on None.
  if not str_stock:
    return 0
  # search() stops at the first match, the only one this function uses.
  match = re.search(r"(\d+) in stock", str_stock)
  return int(match.group(1)) if match else 0

In [13]:
# try it out on both cases: an item in stock and an item out of stock
print(get_stock_level("41 in stock"))
print(get_stock_level("out of stock"))

41
0


Another example: extracting the numerical value from a price string

In [14]:
# another example of regex for getting the numerical value of price:
# "." matches one character (here the currency symbol), "\s" one whitespace
# character, and the group (\d+\.\d+) captures the decimal number
regex_price = re.compile(r".\s(\d+\.\d+)")
regex_price.findall('£ 87.00')[0]

'87.00'

In [15]:
# the match is still a string ('87.00'), so convert it to a float number
float(regex_price.findall('£ 87.00')[0])

87.0