In [5]:
import requests
from bs4 import BeautifulSoup
import re
import time
import gzip
import json
import codecs
from tqdm.notebook import tqdm
from multiprocessing import Pool

In [2]:
def get_page_relod(i):
    global url_books_set
    page = requests.get('https://shop.relod.ru/catalog-products/4577/?sort=PROPERTY_RATING&order=desc&PAGEN_1='+str(i))
    soup = BeautifulSoup(page.text, 'lxml')
    for a in soup.find_all('a', class_='bxr-font-color bxr-font-color-hover', href=True):
        with lock:
            with open('url_books.txt', 'a') as file:
                print('https://shop.relod.ru' + a['href'], file=file)

In [3]:
%%time
with Pool(processes=8) as pool:
    pool.map(get_page_relod, range(254))
    
pool.join()

CPU times: user 215 ms, sys: 81.7 ms, total: 297 ms
Wall time: 3min 9s


In [4]:
url_books_set = set()
with open('url_books.txt', 'r') as f: 
    for line in f:
        url_books_set.add(line.strip())
len(url_books_set)

5012

In [None]:
def get_page(url, n_attempts=5, t_sleep=1, **kwargs):
    page = requests.get(result['url'])
    count = 0
    while not page.ok and count < n_attempts:
        time.sleep(t_sleep)
        page = requests.get(result['url'])
        count += 1
    return page

In [5]:
def process_page(url):
    result = dict()

    result['url'] = url
    
    page = get_page(result['url'])
    if page.ok:
        book = requests.get(result['url']).text
    else:
        return page

    soup = BeautifulSoup(book, 'lxml')
    
    result['Название'] = soup.find('h1', itemprop='name').text.strip()

    result['Иллюстрации'] = ['https:' + img['data-src']
                             for img in soup.find_all('img', itemprop='image') if 'resize_cache' not in img['data-src']]

    result['Метки'] = [text for text in soup.find(
        'div', class_='bxr-element-slider').text.split('\n') if text != '']

    result['Описание'] = re.sub('[\r\n\t\f]', '', soup.find(
        'div', class_='bxr-detail').text).replace(u'\xa0', u' ').strip()

    score = soup.find('meta', itemprop='ratingValue')
    if score is None:
        result['Оценка'] = 0
    else:
        result['Оценка'] = float(score['content'])

    votes = soup.find('meta', itemprop='ratingCount')
    if votes is None:
        result['Число голосов'] = 0
    else:
        result['Число голосов'] = int(votes['content'])

    result['Наличие'] = soup.find('div', itemprop='availability').text

    result['Цена'] = float(soup.find('meta', itemprop='price')['content'])

    if 'ПТВ' in result['Метки']:
        regular = regular = re.compile('"PRICE" : (\d+)')
        text = regular.search(book)
        if text is not None:
            result['Цена (скидка)'] = float(regular.search(book).group(1))
        else:
            print('Can\'t find the lower price {}'.format(result['url']))

    table = soup.find('table', class_="bxr-props-table").find_all('td')
    for key, value in zip(table[::2], table[1::2]):
        key_ = key.text.strip().replace('\n', '')
        value_ = value.text.strip().replace('\n', '')
        if key_ == 'Издатель':
            value_ = value_.replace('(сайт издательства)', '')
        result[key_] = value_
    
    return result

In [6]:
from multiprocessing.dummy import Pool, Queue

In [7]:
queue = Queue()   # очередь ссылок на книги

for i in list(url_books_set):
        queue.put(i)

queue.qsize()

5012

In [8]:
def process_page_wrapper(i):
    with gzip.open('data/part_{:05d}.jsonl.gz'.format(i), mode='wb') as f_json:
        f_json = codecs.getwriter('utf8')(f_json)
        while not queue.empty():
            record = process_page(queue.get())
            if type(record) is not dict:
                print('Can\'t reach the page {}'.format(record.url))
                with lock:
                    with open('not_completed.txt', 'a') as file:
                        print(record.url, file=file)
                continue
            
            record_str = json.dumps(record, ensure_ascii=False)
            print(record_str, file=f_json)
            # счетчик должен атомарно обновиться
            with lock:
                pbar.update(1)

In [9]:
with Pool(processes=8) as pool, tqdm(total=queue.qsize()) as pbar:
    lock = pbar.get_lock()
    pool.map(process_page_wrapper, range(pool._processes))

pool.join()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5012.0), HTML(value='')))




In [10]:
import pandas as pd

from itertools import chain         # рекомендуется использовать
from contextlib import ExitStack    # рекомендуется использовать

from typing import Generator, Dict, Any
import os

In [11]:
def records_reader(dirname: str) -> Generator[Dict[str, Any], None, None]:
    with ExitStack() as stack:
        for f_json in chain([stack.enter_context(gzip.open(dirname+'/'+gfile, mode='rb'))
                            for gfile in os.listdir(path=dirname)]):
            
            f_json = codecs.getreader('utf8')(f_json)
            for book in f_json:
                try:
                    yield json.loads(book)
                except:
                    print(book)

In [12]:
df = pd.DataFrame(records_reader('data'))
df.shape

{"url": "https://shop.relod.ru/catalog-products/the_chrysalids/", "Название": "The Chrysalids", "Иллюстрации": ["https://opt-1458870.ssl.1c-bitrix-cdn.ru/upload/iblock/231/231151eda3ea5ffb2a019b23756aae91.jpg?153114449240832"], "Метки": [], "Описание": "David Strorms father doesnt approve of Angus Mortons unusually large horses, calling them blasphemies against nature. Little does he realize that his own son, his niece Rosalind and their friends, have their own secret aberration which would label them as mutants. But as David and Rosalind grow older it becomes more difficult to conceal their differences from the village elders. Soon they face a choice: wait for eventual discovery or flee to the terrifying and mutable Badlands 
 The Chrysalids is a post-nuclear story of genetic mutation in a devastated world, which tells of the lengths the intolerant will go to to keep themselves pure.", "Оценка": 0, "Число голосов": 0, "Наличие": "Под заказ", "Цена": 678.0, "ISBN": "9780141038469",

(5011, 29)

In [13]:
df.to_csv('result.csv', index=False)