In [None]:
#  article dependencies
import cv2
import numpy as np
import os
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.request import Request
import time
from torch.utils.data import Dataset
import torch
from torchvision import transforms
from tqdm import tqdm


In [None]:
class WebScraper():
    """
    Scrapes images from a listing page in three steps: fetch and parse the
    page (parse_html), collect the image links it contains (extract_src),
    then download every link to disk (scrape_images).

    Parameters
    ----------
    headers: dict of HTTP headers sent with the page request.
    tag: html tag to look for (e.g. 'img').
    attribute: dict of attributes the tag must carry (e.g. {'class': 'img'}).
    src_attribute: name of the tag attribute that holds the image link.
    filepath: path prefix; images are saved as '{filepath}_{count}.jpg'.
    count: numerical suffix given to the next saved image.
    timeout: seconds to wait on any single network request before giving up.
    delay: seconds to pause after each download attempt.

    Attributes
    ----------
    bs: one entry per call to parse_html; the parsed page, or None when the
        page could not be fetched.
    interest: one list of image links per call to extract_src.
    """

    def __init__(self, headers, tag: str, attribute: dict,
                 src_attribute: str, filepath: str, count=0,
                 timeout=10, delay=0.4):
        self.headers = headers
        self.tag = tag
        self.attribute = attribute
        self.src_attribute = src_attribute
        self.filepath = filepath
        self.count = count
        self.timeout = timeout
        self.delay = delay
        self.bs = []
        self.interest = []

    def __str__(self):
        display = """      CLASS ATTRIBUTES
      headers: headers used so as to mimic requests coming from web browsers.
      tag: html tags intended for scraping.
      attribute: attributes of the html tags of interest.
      src_attribute: attribute of the html tag that holds the image link.
      filepath: path ending with filenames to use when scraping images.
      count: numerical suffix to differentiate files in the same folder.
      timeout: seconds to wait for a response before giving up.
      delay: seconds to pause between image downloads.
      bs: a list of each page's beautifulsoup elements.
      interest: a list of each page's image links."""
        return display

    def __repr__(self):
        #  pages are summarised by count: printing the parsed pages themselves
        #  would dump the complete html of every page that was scraped
        display = f"""      CLASS ATTRIBUTES
      headers: {self.headers}
      tag: {self.tag}
      attribute: {self.attribute}
      src_attribute: {self.src_attribute}
      filepath: {self.filepath}
      count: {self.count}
      timeout: {self.timeout}
      delay: {self.delay}
      bs: {len(self.bs)} page(s)
      interest: {self.interest}"""
        return display

    def parse_html(self, url):
        """
        This method requests the webpage from the server and stores the
        resulting beautifulsoup element in self.bs.

        If the page cannot be fetched or parsed the problem is printed and
        None is stored instead, so that a later extract_src call does not
        pick up the links of the previously parsed page a second time.
        """
        try:
            request = Request(url, headers=self.headers)
            #  the with-block closes the connection once the body is read
            with urlopen(request, timeout=self.timeout) as html:
                soup = BeautifulSoup(html.read(), 'html.parser')
        except Exception as e:
            print(f'problem with webpage\n{e}')
            soup = None
        self.bs.append(soup)

    def extract_src(self):
        """
        This method extracts the image links of the tags of interest from the
        most recently parsed page and appends them, as one list, to
        self.interest.

        An empty list is appended when there is no usable page (nothing has
        been parsed yet, or the last parse failed). Tags that do not carry
        src_attribute are skipped.
        """
        page = self.bs[-1] if self.bs else None
        if page is None:
            self.interest.append([])
            return

        #  extracting tag of interest
        tags = page.find_all(self.tag, attrs=self.attribute)
        links = (listing.get(self.src_attribute) for listing in tags)
        self.interest.append([link for link in links if link])

    def scrape_images(self):
        """
        This method grabs images located in the most recently extracted src
        links and saves them as '{filepath}_{count}.jpg'.

        A link that fails to download is reported and skipped; count only
        advances for images that were actually written.
        """
        if not self.interest:
            return

        for link in tqdm(self.interest[-1]):
            try:
                #  download first and open the file afterwards, so that a
                #  failed request does not leave an empty .jpg behind
                response = requests.get(link, timeout=self.timeout)
                #  an error page (404, 500, ...) must not be saved as an image
                response.raise_for_status()
                with open(f'{self.filepath}_{self.count}.jpg', 'wb') as f:
                    f.write(response.content)
                self.count += 1
            except Exception as e:
                print(f'problem with image\n{e}')
            finally:
                #  pause after every attempt, successful or not
                time.sleep(self.delay)


In [None]:
def my_scraper(scraper, page_range: list,
               url_template: str = 'https://www.jumia.com.ng/mlp-fashion-deals/mens-athletic-shoes/?page={page}#catalog-listing'):
    """
    This function wraps around the web scraper class allowing it to scrape
    multiple pages. The argument page_range takes both a list of two elements
    to define a range of pages (both ends included) or a list of one element
    to define a single page.

    url_template is the address of the listing to scrape, with '{page}'
    standing in for the page number. It defaults to the men's athletic shoes
    listing; pass another template to scrape a different listing without
    editing this function.
    """
    if len(page_range) > 1:
        first_page, last_page = page_range[0], page_range[1]
        for i in range(first_page, last_page + 1):
            scraper.parse_html(url=url_template.format(page=i))
            scraper.extract_src()
            scraper.scrape_images()
            print(f'\npage {i} done.')
        print('All Done!')
    else:
        scraper.parse_html(url=url_template.format(page=page_range[0]))
        scraper.extract_src()
        scraper.scrape_images()
        print('\nAll Done!')


In [None]:
#  create directories to hold images
#  makedirs also creates the parent 'shoes' folder, and exist_ok lets this
#  cell be re-run without failing on folders that are already there
os.makedirs('shoes/athletic', exist_ok=True)
os.makedirs('shoes/boots', exist_ok=True)


# Scrape the images

In [None]:
#  request headers describing a desktop browser, sent with each page request
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

#  scraper for athletic shoe images: reads the 'data-src' link of every
#  <img class="img"> tag and saves the files as shoes/athletic/atl_<n>.jpg
athletic_scraper = WebScraper(
    headers=headers,
    tag='img',
    attribute={'class': 'img'},
    src_attribute='data-src',
    filepath='shoes/athletic/atl',
    count=0,
)

#  scrape listing pages 1 to 3
my_scraper(scraper=athletic_scraper, page_range=[1, 3])


In [None]:
#  scrape boot images
#  The boots listing lives at its own url. The pages are walked here with that
#  url directly, so this cell gives the right result on a plain top-to-bottom
#  run instead of requiring my_scraper to be edited and re-run by hand first.
BOOTS_URL = 'https://www.jumia.com.ng/mlp-fashion-deals/mens-boots/?page={page}#catalog-listing'
BOOT_PAGES = [1, 3]  # first and last page, both included

boot_scraper = WebScraper(headers=headers, tag='img', attribute={'class': 'img'},
                          src_attribute='data-src', filepath='shoes/boots/boot', count=0)

for page in range(BOOT_PAGES[0], BOOT_PAGES[1] + 1):
    boot_scraper.parse_html(url=BOOTS_URL.format(page=page))
    boot_scraper.extract_src()
    boot_scraper.scrape_images()
    print(f'\npage {page} done.')
print('All Done!')


# Load & Label data

In [None]:
#  defining class to load and label data
class LoadShoeData():
    """
    This class loads in data from each directory in numpy array format then
    returns the loaded dataset.

    Parameters
    ----------
    athletic_dir: directory holding the athletic shoe images.
    boots_dir: directory holding the boot images.
    img_size: side length, in pixels, every image is resized to.

    Attributes
    ----------
    labels: maps each directory to its one-hot label
            (athletic -> [1, 0], boots -> [0, 1]).
    dataset: list of [image, label] pairs built by create_dataset.
    athletic_count / boots_count: number of images loaded per class.
    skipped: paths of the files that could not be read as images.
    """
    def __init__(self, athletic_dir='shoes/athletic', boots_dir='shoes/boots',
                 img_size=100):
        self.athletic = athletic_dir
        self.boots = boots_dir
        self.labels = {self.athletic: np.eye(2, 2)[0], self.boots: np.eye(2, 2)[1]}
        self.img_size = img_size
        self.dataset = []
        self.athletic_count = 0
        self.boots_count = 0
        self.skipped = []

    def create_dataset(self):
        """
        This method reads images as grayscale from directories,
        resizes them and labels them as required.

        Returns a shuffled numpy object array with one [image, label] row per
        image that could be read. Files that cannot be read are recorded in
        self.skipped and reported rather than silently dropped.
        """
        #  starting from a clean slate so that calling this method again does
        #  not append a second copy of every image to the previous result
        self.dataset = []
        self.athletic_count = 0
        self.boots_count = 0
        self.skipped = []

        #  reading from directory
        for key in self.labels:
            print(key)

            #  looping through all files in the directory; sorted so that the
            #  order before shuffling does not depend on the filesystem
            for img_file in tqdm(sorted(os.listdir(key))):
                #  deriving image path
                path = os.path.join(key, img_file)

                try:
                    #  reading image
                    image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
                    if image is None:
                        #  imread hands back None for a file it cannot decode
                        self.skipped.append(path)
                        continue
                    image = cv2.resize(image, (self.img_size, self.img_size))
                except Exception as e:
                    #  one bad file must not abort the load, but it is recorded
                    print(f'problem with {path}\n{e}')
                    self.skipped.append(path)
                    continue

                #  appending image and class label to list
                self.dataset.append([image, self.labels[key]])

                #  incrementing counter
                if key == self.athletic:
                    self.athletic_count += 1
                elif key == self.boots:
                    self.boots_count += 1

        #  shuffling array of images
        np.random.shuffle(self.dataset)

        #  printing to screen
        print(f'\nathletic shoe images: {self.athletic_count}')
        print(f'boot images: {self.boots_count}')
        print(f'total: {self.athletic_count + self.boots_count}')
        if self.skipped:
            print(f'skipped unreadable files: {len(self.skipped)}')
        print('All done!')
        return np.array(self.dataset, dtype='object')

In [None]:
#  load data: build the loader, then read, label and shuffle the images of
#  both class directories into a single object array of [image, label] rows
data = LoadShoeData()

dataset = data.create_dataset()


# Creating a PyTorch dataset

In [None]:
#  extending Dataset class
class ShoeDataset(Dataset):
    """
    PyTorch Dataset over a sequence of [image, label] pairs.

    custom_dataset: indexable collection whose items hold the image at
                    position 0 and the label at position 1.
    transforms: optional callable applied to the image before it is returned.
    """
    def __init__(self, custom_dataset, transforms=None):
        self.transforms = transforms
        self.custom_dataset = custom_dataset

    def __len__(self):
        #  one sample per [image, label] pair
        return len(self.custom_dataset)

    def __getitem__(self, idx):
        #  pulling the image and the label out of the requested pair
        sample = self.custom_dataset[idx]
        image = sample[0]
        label = torch.tensor(sample[1])

        #  without transforms the image is handed back untouched
        if not self.transforms:
            return (image, label)
        return (self.transforms(image), label)


In [None]:
#  creating an instance of the dataset class
#  This cell rebinds `dataset` from the loaded array to its ShoeDataset
#  wrapper. The check keeps a second run of the cell from wrapping the wrapper
#  itself, which would break indexing into the samples.
if not isinstance(dataset, ShoeDataset):
    dataset = ShoeDataset(dataset, transforms=transforms.ToTensor())
