#  Article dependencies


In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request
from tqdm import tqdm
import requests
import os
import time


In [2]:
from bs4 import BeautifulSoup

#  a small hand-written html document used to demonstrate BeautifulSoup:
#  it holds one title tag, one h1 tag and two p tags
html = """
<html>
    <title>Mock Webpage</title>
    <body>
        <h1>Web Scraping</h1>
        
        <p>This article is all about web scraping</p>

        <p>We will be using BeautifulSoup</p>
    </body>
</html>
"""

#  parse the html string with python's built-in 'html.parser' backend;
#  the resulting beautifulsoup element is queried in the cells that follow
bs = BeautifulSoup(html, 'html.parser')


In [4]:
#  NOTE: these are bare expressions, so when the whole cell is run only the
#  value of the last one is rendered as the cell's output

#  extract the title tag
bs.title
#  extract the h1 tag
bs.h1
#  extract the p tag (attribute access is used for a single tag lookup)
bs.p
#  extract all p tags
bs.find_all('p')
#  extract only the string in the title tag
bs.title.get_text()


'Mock Webpage'

# BeautifulSoup & Web Page Scraping

In [6]:
from urllib.request import urlopen
from urllib.request import Request

url = 'https://blog.paperspace.com/generating-images-with-stable-diffusion/'

#  headers to mimic a web browser
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
           'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
           'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
           'Accept-Encoding': 'none',
           'Accept-Language': 'en-US,en;q=0.8',
           'Connection': 'keep-alive'}

#  build the request carrying the browser-like headers
request = Request(url, headers=headers)

#  open the request and create the beautifulsoup element; the context manager
#  closes the connection as soon as the page has been read
with urlopen(request) as html:
    bs = BeautifulSoup(html.read(), 'html.parser')


# Tags and Attributes

In [7]:
#  copy link from page 2 and edit 2 to 1 to access the first page
url = 'https://www.jumia.com.ng/mlp-fashion-deals/mens-athletic-shoes/?page=1#catalog-listing'

#  headers to mimic a web browser
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
      'Accept-Encoding': 'none',
      'Accept-Language': 'en-US,en;q=0.8',
      'Connection': 'keep-alive'}

request = Request(url, headers=headers)

#  the context manager closes the connection as soon as the page has been read
with urlopen(request) as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

#  extract all img tags with class img
interest = bs.find_all('img', attrs={'class':'img'})


In [9]:
#  extracting links using list comprehension; tags that carry no 'data-src'
#  attribute are skipped instead of raising a KeyError
links = [listing['data-src'] for listing in interest if listing.has_attr('data-src')]

# Downloading from src Links

In [10]:
import requests

#  instantiating counter used as the numerical suffix of each filename
count = 0

#  downloading images
for link in tqdm(links):
  #  fetch first so that a failed request never leaves an empty .jpg behind
  response = requests.get(link)
  #  stop on a 4xx/5xx reply instead of saving an error page as an image
  response.raise_for_status()
  with open(f'athletic_{count}.jpg', 'wb') as f:
    f.write(response.content)
  count += 1

100%|██████████| 40/40 [00:17<00:00,  2.34it/s]


In [11]:
class WebScraper():
    """
    Scrapes images from web pages, one page at a time.

    The methods are meant to be called in order for every page:
    parse_html(url), then extract_src(), then scrape_images(). Each of
    them works on the most recently parsed page.

    Attributes:
      headers: headers sent with every page request.
      tag: html tag searched for on each page.
      attribute: attributes the tag must carry to be selected.
      src_attribute: attribute of the selected tags that holds the image link.
      filepath: path prefix of saved images; files are written as
        '{filepath}_{count}.jpg'.
      count: numerical suffix given to the next saved image.
      bs: one entry per parsed page; a beautifulsoup element, or None when
        the page could not be fetched.
      interest: one list of image links per page.
    """

    def __init__(self, headers, tag: str, attribute: dict,
                 src_attribute: str, filepath: str, count=0):
      self.headers = headers
      self.tag = tag
      self.attribute = attribute
      self.src_attribute = src_attribute
      self.filepath = filepath
      self.count = count
      self.bs = []
      self.interest = []

    def __str__(self):
      """Returns a description of what each attribute is for."""
      display = """      CLASS ATTRIBUTES
      headers: headers used so as to mimic requests coming from web browsers.
      tag: html tags intended for scraping.
      attribute: attributes of the html tags of interest.
      src_attribute: attribute of the html tags which holds the image link.
      filepath: path ending with filenames to use when scraping images.
      count: numerical suffix to differentiate files in the same folder.
      bs: a list of each page's beautifulsoup elements.
      interest: a list of each page's image links."""
      return display

    def __repr__(self):
      """Returns the current value of every attribute."""
      display = f"""      CLASS ATTRIBUTES
      headers: {self.headers}
      tag: {self.tag}
      attribute: {self.attribute}
      src_attribute: {self.src_attribute}
      filepath: {self.filepath}
      count: {self.count}
      bs: {self.bs}
      interest: {self.interest}"""
      return display

    def parse_html(self, url, timeout=30):
      """
      This method requests the webpage from the server and stores its
      beautifulsoup element as the last entry of self.bs.

      url: address of the page to fetch.
      timeout: seconds to wait for the server before giving up.

      Any failure is reported with a printed message and None is stored in
      place of the page, so the following extract_src() call yields no links
      instead of silently re-using the previous page.
      """
      try:
        request = Request(url, headers=self.headers)
        #  the context manager closes the connection once the page is read
        with urlopen(request, timeout=timeout) as response:
          bs = BeautifulSoup(response.read(), 'html.parser')
      except Exception as e:
        print(f'problem with webpage\n{e}')
        bs = None
      self.bs.append(bs)

    def extract_src(self):
      """
      This method extracts the image links of the tags of interest from the
      most recently parsed page and stores them as the last entry of
      self.interest.

      An empty list is stored when no page has been parsed yet or when the
      last page could not be fetched. Tags without the source attribute are
      skipped.
      """
      page = self.bs[-1] if self.bs else None
      if page is None:
        self.interest.append([])
        return
      #  extracting tags of interest
      tags = page.find_all(self.tag, attrs=self.attribute)
      #  .get returns None for a missing attribute instead of raising KeyError
      links = [tag.get(self.src_attribute) for tag in tags]
      self.interest.append([link for link in links if link])

    def scrape_images(self, timeout=30, delay=0.4):
      """
      This method grabs images located in the most recently extracted src
      links and saves them as '{filepath}_{count}.jpg'.

      timeout: seconds to wait for each image before giving up.
      delay: seconds to pause after every request.

      A failed download is reported with a printed message and skipped;
      count only advances when an image was actually saved.
      """
      if not self.interest:
        return
      for link in tqdm(self.interest[-1]):
        try:
          #  fetch first so a failed request never leaves an empty file behind
          response = requests.get(link, timeout=timeout)
          #  a 4xx/5xx reply is an error page, not an image
          response.raise_for_status()
          with open(f'{self.filepath}_{self.count}.jpg', 'wb') as f:
            f.write(response.content)
          self.count += 1
        except Exception as e:
          print(f'problem with image\n{e}')
        finally:
          #  pausing scraping for 0.4secs by default so as to not exceed 200 requests per minute as stipulated in the web page's robots.txt file
          time.sleep(delay)


In [12]:
#  instantiating web scraper class: it selects img tags with class 'img',
#  reads each image link from the 'data-src' attribute and saves the images
#  as 'shoes/athletic/atl_{count}.jpg', starting the suffix at 0.
#  headers is the browser-like headers dict defined in an earlier cell
scraper = WebScraper(headers=headers, tag='img', attribute={'class': 'img'},
                     src_attribute='data-src', filepath='shoes/athletic/atl', count=0)


In [13]:
def my_scraper(scraper, page_range: list,
               url_template: str = 'https://www.jumia.com.ng/mlp-fashion-deals/mens-athletic-shoes/?page={page}#catalog-listing'):
    """
    This function wraps around the web scraper class allowing it to scrape
    multiple pages. The argument page_range takes both a list of two elements
    to define a range of pages or a list of one element to define a single page.

    scraper: object providing parse_html(url=...), extract_src() and
      scrape_images(), which are called in that order for every page.
    page_range: [first, last] (both inclusive) or [page].
    url_template: page address containing a '{page}' placeholder which is
      filled with each page number.

    Raises ValueError when page_range is empty.
    """
    if not page_range:
        raise ValueError('page_range must hold one or two page numbers')

    multiple_pages = len(page_range) > 1
    if multiple_pages:
        pages = range(page_range[0], page_range[1] + 1)
    else:
        pages = [page_range[0]]

    for page in pages:
        scraper.parse_html(url=url_template.format(page=page))
        scraper.extract_src()
        scraper.scrape_images()
        #  per-page progress is only reported when a range of pages is scraped
        if multiple_pages:
            print(f'\npage {page} done.')

    print('All Done!' if multiple_pages else '\nAll Done!')


In [14]:
import os

#  creating directories to hold images; exist_ok keeps the cell re-runnable
#  when the folders are already there
os.makedirs('shoes/athletic', exist_ok=True)

#  scraping the first five pages
my_scraper(scraper=scraper, page_range=[1, 5])


100%|██████████| 40/40 [00:26<00:00,  1.50it/s]



page 1 done.


100%|██████████| 12/12 [00:10<00:00,  1.12it/s]



page 2 done.


0it [00:00, ?it/s]



page 3 done.


0it [00:00, ?it/s]



page 4 done.


0it [00:00, ?it/s]


page 5 done.
All Done!



