In [None]:
# !python3 -m pip install -r requirements.txt

[Example Google Style Python Docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html)

In [15]:
import collections
import csv
import logging
import random
import re
import time

import bs4
import requests
from fake_headers import Headers
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.remote_connection import LOGGER
from webdriver_manager.chrome import ChromeDriverManager

# Configure the root logger so INFO-level (and above) records are emitted.
logging.basicConfig(level=logging.INFO)
# LOGGER is selenium's remote-connection logger; only let WARNING and above through.
LOGGER.setLevel(logging.WARNING)
# Named logger used for this notebook's own messages.
logger = logging.getLogger('wb')

# One scraped product card. Every field is stored exactly as the parser
# extracted it (a string, or None when the value could not be found).
ParseResult = collections.namedtuple(
    "ParseResult",
    (
        "brand",
        "title",
        "url",
        "review_count",
        "current_price",
    ),
)

# CSV header row. Derived from the record type so the column names can never
# drift out of sync with the ParseResult fields.
HEADERS = ParseResult._fields


In [3]:
class Parser():
    """
    Parser is a utility class to crawl raw data from the e-commerce website
    (e.g. WildBerries): item's brand name, URL, price, etc.

    This class uses selenium package in order to automate web browser interaction.

    Attributes:
        ua: fake_useragent.UserAgent used to pick a random user-agent string.
        options: Chrome options the browser is started with.
        driver: the selenium Chrome WebDriver created in ``__init__``.
        result (list): ParseResult records collected by ``run``.

    Example:
        >>> my_parser = Parser()
        >>> my_parser.run("https://www.wildberries.ru/catalog/zhenshchinam/odezhda/bryuki-i-shorty")
        >>> my_parser.save_result()
        >>> my_parser.stop()
    """

    # Number of SPACE key presses sent after loading a page, and the pause
    # (in seconds) after each one.
    SCROLL_STEPS = 5
    SCROLL_PAUSE_SEC = .5

    def __init__(self):
        """Starts a headless Chrome browser with a random user-agent."""
        self.ua = UserAgent()
        self.options = Options()
        self.options.add_argument(
            "--disable-blink-features=AutomationControlled")
        self.options.add_argument("--headless")
        self.options.add_argument(f"user-agent={self.ua.random}")
        # Pass the options once via `options=`; the legacy `chrome_options=`
        # keyword is deprecated in Selenium 4 and rejected by newer releases.
        self.driver = webdriver.Chrome(
            service=ChromeService(
                ChromeDriverManager().install()),
            options=self.options)

        self.result = []

    def stop(self):
        """Shuts down the browser that was created during the class initialization."""
        # quit() ends the whole WebDriver session (all windows plus the driver
        # process); close() would only close the current window.
        self.driver.quit()

    def _load_page(self, url: str):
        """Returns page contents from a given URL.

        Args:
            url (str): the URL address of page

        Returns:
            str: HTML source code of page
        """
        # Load the URL
        self.driver.get(url)

        # Fake user activity to load JavaScript content
        for _ in range(self.SCROLL_STEPS):
            actions = ActionChains(self.driver)
            actions.send_keys(Keys.SPACE)
            actions.perform()
            time.sleep(self.SCROLL_PAUSE_SEC)

        return self.driver.page_source

    def _parse_page(self, page: str):
        """Parse HTML page and process each item's block.

        Args:
            page (str): HTML source code of page
        """
        soup = bs4.BeautifulSoup(page, 'lxml')
        # Find all div-blocks with class "product-card j-card-item"
        container = soup.select("div.product-card.j-card-item")
        if not container:
            logger.error("Container is empty!")
            return

        for block in container:
            self._parse_block(block=block)

    @staticmethod
    def _first_number(text):
        """Returns the first run of digits in `text`, ignoring whitespace.

        All whitespace is removed before searching, so a value such as
        "1 234" is read as "1234".

        Args:
            text (str | None): raw text of an element

        Returns:
            str | None: the digits as a string, or None when `text` is empty,
            None, or contains no digits
        """
        if not text:
            return None
        match = re.search(r'\d+', "".join(text.split()))
        return match.group(0) if match else None

    def _get_brand_name_block(self, block):
        """Returns the element that wraps both brand name and title, or None.

        Args:
            block: element of a single product card

        Returns:
            The "p.product-card__brand-name" element, or None (after logging
            an error) when it or its parent block is missing.
        """
        brand_block = block.select_one("div.product-card__brand")
        if brand_block is None:
            logger.error("no brand_block")
            return None

        brand_name_block = brand_block.select_one("p.product-card__brand-name")
        if brand_name_block is None:
            logger.error("no brand_name_block")
            return None
        return brand_name_block

    def _get_brand(self, block):
        """Returns the stripped brand name of a product card, or None if absent."""
        brand_name_block = self._get_brand_name_block(block)
        if brand_name_block is None:
            return None

        brand_name = brand_name_block.select_one("span.brand-name")
        if brand_name is None:
            logger.error("no brand_name")
            return None
        return brand_name.text.strip()

    def _get_title(self, block):
        """Returns the product title with '/' removed and stripped, or None if absent."""
        brand_name_block = self._get_brand_name_block(block)
        if brand_name_block is None:
            return None

        title = brand_name_block.select_one("span.goods-name")
        if title is None:
            logger.error("no title")
            return None
        return title.text.replace('/', '').strip()

    def _get_url(self, block):
        """Returns the href of the product card's main link, or None if absent."""
        url_block = block.select_one("a.product-card__main.j-card-link")
        if url_block is None:
            logger.error("no url block")
            return None

        item_url = url_block.get("href")
        if not item_url:
            logger.error("no href")
            return None
        return item_url

    def _get_review_count(self, block):
        """Returns the review count as a string of digits, or None if absent."""
        count_block = block.select_one("span.product-card__count")
        if count_block is None:
            logger.error("no review count block")
            return None

        review_count = self._first_number(count_block.text)
        if review_count is None:
            logger.error("no review count")
        return review_count

    def _get_current_price(self, block):
        """Returns the current price as a string of digits, or None if absent.

        The price is looked up in "ins.price__lower-price" first and in
        "span.price__lower-price" as a fallback.
        """
        for selector in ("ins.price__lower-price", "span.price__lower-price"):
            price_block = block.select_one(selector)
            if price_block is None:
                continue
            current_price = self._first_number(price_block.text)
            if current_price is not None:
                return current_price

        logger.error("no low price found!")
        return None

    def _get_old_price(self, block):
        """Returns the price inside the `<del>` element as digits, or None if absent."""
        old_price_block = block.select_one("del")
        if old_price_block is None:
            logger.error("no old price block")
            return None

        old_price = self._first_number(old_price_block.text)
        if old_price is None:
            logger.error("no old price")
        return old_price

    def _parse_block(self, block):
        """Extracts one product card and appends it to `self.result`.

        Fields that cannot be found are stored as None, so a single
        incomplete card does not abort parsing of the rest of the page.

        Args:
            block: element of a single product card
        """
        # The old price is not collected: ParseResult has no field for it.
        self.result.append(ParseResult(
            brand=self._get_brand(block),
            title=self._get_title(block),
            url=self._get_url(block),
            review_count=self._get_review_count(block),
            current_price=self._get_current_price(block),
        ))

    def save_result(self, path: str = "wildberries.csv"):
        """Appends the collected records to a CSV file.

        The header row is written only when the file is new or empty, so
        calling this method repeatedly on the same path yields one header
        followed by all data rows.

        Args:
            path (str): destination CSV file; created if it does not exist
        """
        # newline="" is what the csv module expects from the file object;
        # an explicit encoding keeps non-ASCII text independent of the locale.
        with open(path, "a", newline="", encoding="utf-8") as f:
            f.seek(0, 2)  # 2 == SEEK_END: make sure we measure from the end
            is_empty = f.tell() == 0

            writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
            if is_empty:
                writer.writerow(HEADERS)
            writer.writerows(self.result)

    def run(self, url):
        """Loads the page at `url` and parses every product card on it.

        Args:
            url (str): the URL address of a catalog page
        """
        page = self._load_page(url=url)
        self._parse_page(page)


Example Links: 
- 1st page: `https://www.wildberries.ru/catalog/zhenshchinam/odezhda/bryuki-i-shorty#c17539910` or `https://www.wildberries.ru/catalog/zhenshchinam/odezhda/bryuki-i-shorty?sort=popular&page=1`
- 2nd page: `https://www.wildberries.ru/catalog/zhenshchinam/odezhda/bryuki-i-shorty?sort=popular&page=2`
- 7th page: `https://www.wildberries.ru/catalog/zhenshchinam/odezhda/bryuki-i-shorty?sort=popular&page=7`

If we request page 101, `https://www.wildberries.ru/catalog/zhenshchinam/odezhda/bryuki-i-shorty?sort=popular&page=101`, the server responds with a `404` status code, so only the first 100 pages of a category can be parsed on the WB website.

In [None]:
from tqdm import tqdm

PAGE_URL = "https://www.wildberries.ru/catalog/zhenshchinam/odezhda/futbolki-i-topy"
WB_MAX_PAGES = 100  # please do not change: the site answers 404 for page numbers above 100
# Bounds (inclusive, in seconds) of the random pause between two pages.
MIN_DELAY_SEC = 1
MAX_DELAY_SEC = 3


for page_num in tqdm(range(1, WB_MAX_PAGES + 1)):
    # create new random user-agent every time
    # to avoid being banned for parsing
    my_parser = Parser()
    try:
        my_parser.run(f"{PAGE_URL}?sort=popular&page={page_num}")
        my_parser.save_result()
    finally:
        # Every iteration starts its own browser, so every iteration must
        # also shut it down -- even when loading or parsing fails.
        my_parser.stop()
    # randint needs both bounds; wait a random whole number of seconds.
    time.sleep(random.randint(MIN_DELAY_SEC, MAX_DELAY_SEC))