In [10]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import re
import os
import urllib
import csv

In [8]:
# Browser-like User-Agent header used for the HTTP requests in this notebook.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/127.0.0.0 Safari/537.36'
    ),
}

In [16]:
# Directories holding the "basic" CSVs (book lists) and the "detail" CSVs
# (scraped per-book information).
basic_path = r"D:\python_project\chaekchecklab\data\basic"
detail_path = r"D:\python_project\chaekchecklab\data\detail"

# os.listdir() returns entries in arbitrary order, while the cells further down
# pair basic_list[i] with detail_list[i] purely by position. Sorting both
# listings makes that pairing deterministic across runs and machines.
# NOTE(review): assumes matching files sort to the same position in both
# directories and that both directories hold the same number of files -- confirm.
basic_list = sorted(os.listdir(basic_path))
detail_list = sorted(os.listdir(detail_path))

In [9]:
def get_detail_info(book_url, timeout=30):
    """Fetch a book detail page and extract author, item, category and intro data.

    Parameters
    ----------
    book_url : str
        URL of the book detail page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30 so a stalled connection cannot block a worker forever.

    Returns
    -------
    list
        ``[book_url, author_list, item_infos, cates, intro]`` where

        * ``author_list`` holds one entry per parsed author link: ``[name, code]``
          when the link has an ``authorNo`` parameter, ``[name]`` otherwise.
          When no author could be parsed from links, it holds the plain text of
          the author elements instead.
        * ``item_infos`` is ``str()`` of the list of ``td`` texts found under
          ``tbody.b_size`` (``'[]'`` when that table is missing).
        * ``cates`` is ``str()`` of the list of category texts.
        * ``intro`` is the stripped introduction text, or ``""`` when missing.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    # Imported locally: the notebook only does ``import urllib``, which does not
    # by itself guarantee that the ``urllib.parse`` submodule is loaded.
    from urllib.parse import unquote_plus

    res = requests.get(book_url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as a book.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    # Authors: prefer the expanded author list, fall back to the compact one.
    author_links = [e['href'] for e in soup.select('span.moreAuthLiCont>ul>li>a')]
    if not author_links:
        author_links = [e['href'] for e in soup.select('span.gd_auth>a')]

    author_list = []
    for author_link in author_links:
        # Stop at '&' so a following query parameter is not glued onto the name.
        name_match = re.search(r'author=([^&]+)', author_link)
        if name_match is None:
            # Link carries no author name; skip it rather than crash.
            continue
        # unquote_plus turns '+' into a space *before* percent-decoding, so an
        # encoded literal plus sign (%2B) survives.
        author_name = unquote_plus(name_match.group(1))

        code_match = re.search(r'authorNo=(\d+)', author_link)
        if code_match is None:
            author_list.append([author_name])
        else:
            author_list.append([author_name, code_match.group(1)])

    # No author parsed from links: fall back to the elements' plain text, trying
    # the expanded list first and the compact one only if that is empty too.
    if not author_list:
        author_list = [e.text for e in soup.select('span.moreAuthLiCont>ul>li')]
        if not author_list:
            author_list = [e.text.strip() for e in soup.select('span.gd_auth')]

    # Item information (the table may be absent on some pages).
    item_box = soup.select_one('tbody.b_size')
    item_infos = [e.text for e in item_box.select('td')] if item_box is not None else []

    # Category
    cates = [e.text.strip().replace('\n>\n', ' > ') for e in soup.select('div#infoset_goodsCate li')]

    # Book introduction (optional).
    intro_box = soup.select_one('div.infoWrap_txtInner')
    intro = intro_box.text.strip() if intro_box is not None else ""

    return [book_url, author_list, str(item_infos), str(cates), intro]

In [23]:
def fill_infos(b, d):
    """Scrape details for books listed in a basic CSV but missing from its detail CSV.

    Parameters
    ----------
    b : str
        File name of the basic CSV inside ``basic_path``; it must contain a
        ``book_url`` column.
    d : str
        File name of the detail CSV inside ``detail_path``; it must contain a
        ``book_url`` column. Newly scraped rows are appended to this file
        without a header.

    Returns
    -------
    int
        Always ``0``.

    Notes
    -----
    If scraping one URL raises, the rows collected so far are still appended to
    the detail CSV before the exception propagates, so a re-run only has to
    fetch the URLs that are still missing.
    """
    b_path = os.path.join(basic_path, b)
    d_path = os.path.join(detail_path, d)

    # Rows without a URL cannot be fetched, so drop them up front.
    basic_urls = pd.read_csv(b_path).book_url.dropna().to_list()
    detail_urls = pd.read_csv(d_path).book_url.to_list()

    # URLs present in the basic file that have no detail row yet.
    remain_list = set(basic_urls) - set(detail_urls)

    added_list = []
    try:
        for book_url in remain_list:
            added_list.append(get_detail_info(book_url))
    finally:
        # Persist whatever was scraped, even when a later page failed; skip the
        # write entirely when there is nothing to add.
        if added_list:
            added_df = pd.DataFrame(added_list)
            added_df.to_csv(d_path, index=False, mode='a', header=False, quoting=csv.QUOTE_MINIMAL)
    return 0

In [20]:
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [24]:
# Run fill_infos over the (basic, detail) file pairs on a thread pool; the
# progress bar advances as results are consumed in input order.
with ThreadPoolExecutor() as executor:
    pending = executor.map(fill_infos, basic_list, detail_list)
    progress = tqdm(pending, total=len(basic_list))
    results = list(progress)


  0%|          | 0/41 [00:00<?, ?it/s]

100%|██████████| 41/41 [04:12<00:00,  6.15s/it]  


In [None]:
# Sanity check: for each basic/detail CSV pair, print whether both files hold
# the same number of book_url rows, followed by the two counts.
for b, d in zip(basic_list, detail_list):
    b_path, d_path = os.path.join(basic_path, b), os.path.join(detail_path, d)

    basic_urls = pd.read_csv(b_path)["book_url"].tolist()
    detail_urls = pd.read_csv(d_path)["book_url"].tolist()

    n_basic, n_detail = len(basic_urls), len(detail_urls)
    print(n_basic == n_detail, n_basic, n_detail)