In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

import time
import pandas as pd
import numpy as np

# 1. Danawa

In [2]:
# 예제 7-10 상품 정보 태그에서 원하는 정보를 추출하는 함수
def danawa_get_prod_items(prod_items):
    prod_data = []
    for prod_item in prod_items:
        if 'product-pot' in prod_item['class']:
            continue  # 공백부분(‘li.prod_item.product-pot’) 제외하기
        try:
            # ① 상품명 가져오기
            title = prod_item.select('p.prod_name > a')[0].text.strip()
            # ② 스펙 목록 가져오기
            spec_list = prod_item.select('div.spec_list')[0].text.strip()
            # ③ 가격 정보 가져오기
            price = prod_item.select('li.rank_one > p.price_sect > a > strong')[0].text.strip().replace(",", "")
            # 상품평
            rate = prod_item.select('div.point_num > strong')[0].text.strip()
            # 참여수
            opinion = prod_item.select('a.click_log_prod_content_count > strong')[0].text.strip()
            
            prod_data.append([title, spec_list, price, rate, opinion])
        except:
            pass  # 진행시 에러가 발생할 경우(광고 상품 등) 넘어가기
    return prod_data

def danawa_get_search_page_url(keyword, page):
    return 'http://search.danawa.com/dsearch.php?query={}&volumeType=allvs&page={}&limit=30&sort=saveDESC&list=list&boost=true&addDelivery=N&tab=goods&tab=goods'.format(keyword, page)

def danawa_start_crawling(keyword, total_page):
    # setting
    driver = webdriver.Chrome(service = Service('./chromedriver.exe'))
    driver.implicitly_wait(3)

    prod_data_total = []

    for page in tqdm(range(1, total_page+1)):

        # open chrome browser
        url = danawa_get_search_page_url(keyword, page)
        driver.get(url)
        time.sleep(5)

        # get html source of the page
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # get info
        prod_items = soup.select('div.main_prodlist > ul.product_list > li.prod_item')
        prod_item_list = danawa_get_prod_items(prod_items)

        prod_data_total = prod_data_total + prod_item_list
        
        return prod_data_total

In [82]:
prod_data_total = danawa_start_crawling('트리트먼트', 10)

data = pd.DataFrame(prod_data_total)
data.columns = ['title', 'spec_list', 'price', 'rate', 'opinion']
data.to_excel('./treatment_danawa_10page.xlsx', index = False)

  0%|          | 0/10 [00:00<?, ?it/s]

# Coupang

In [1]:
def coupang_get_search_page_url(keyword, page):
    return 'https://www.coupang.com/np/search?q={}&channel=user&component=&eventCategory=SRP&trcid=&traid=&sorter=scoreDesc&minPrice=&maxPrice=&priceRange=&filterType=&listSize=36&filter=&isPriceRange=false&brand=&offerCondition=&rating=0&page={}&rocketAll=false&searchIndexingToken=1=6&backgroundColor='.format(keyword, page)


In [19]:
# setting
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36")

driver = webdriver.Chrome(service = Service('./chromedriver.exe'))
driver.implicitly_wait(3)

keyword = '샴푸'
total_page = 1

prod_data_total = []

for page in tqdm(range(1, total_page+1)):

    # open chrome browser
    url = coupang_get_search_page_url(keyword, page)
    driver.get(url)
    time.sleep(3)

    # get html source of the page
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # get info
    prod_items = soup.select('ul#productList.search-product-list')

  0%|          | 0/1 [00:00<?, ?it/s]

# 2. Naver

In [2]:
# 예제 7-10 상품 정보 태그에서 원하는 정보를 추출하는 함수
def naver_get_prod_items(prod_items):
    prod_data = []
    for prod_item in prod_items:
        if 'product-pot' in prod_item['class']:
            continue  # 공백부분(‘li.prod_item.product-pot’) 제외하기
            print(1)
        try:
            # ① 상품명 가져오기
            title = prod_item.select('a.basicList_link__1MaTN')[0].text.strip()
            # ② 스펙 목록 가져오기
            spec_list = [x.text.strip() for x in prod_item.select('a.basicList_detail__27Krk')]
            # ③ 가격 정보 가져오기
            price = prod_item.select('span.price_num__2WUXn')[0].text.strip()
            # 상품평
            rate = prod_item.select('span.basicList_star__3NkBn')[0].text.split(' ')[-1]
            # 참여수
            opinion = prod_item.select('em.basicList_num__1yXM9')[0].text
            # site
            url = prod_item.select('a.thumbnail_thumb__3Agq6')[0]['href']
            
            prod_data.append([title, spec_list, price, rate, opinion, url])
        except:
            pass  # 진행시 에러가 발생할 경우(광고 상품 등) 넘어가기
    return prod_data

def naver_get_search_page_url(keyword, page):
    return 'https://search.shopping.naver.com/search/all?frm=NVSHATC&origQuery={}&pagingIndex={}&pagingSize=40&productSet=total&query={}&sort=rel&timestamp=&viewType=list'.format(keyword, page, keyword)

def naver_start_crawling(keyword, total_page):
    # setting
    driver = webdriver.Chrome(service = Service('./chromedriver.exe'))
    driver.implicitly_wait(3)

    prod_data_total = []

    for page in tqdm(range(1, total_page+1)):

        # open chrome browser
        url = naver_get_search_page_url(keyword, page)
        driver.get(url)
        time.sleep(1)
        
        # 최하단으로 스크롤링 하여 모든 데이터 표출
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(3)
        
        # get html source of the page
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # get info
        prod_items = soup.select('li.basicList_item__2XT81')
        prod_item_list = naver_get_prod_items(prod_items)

        prod_data_total = prod_data_total + prod_item_list
        
    return prod_data_total


import datetime
def doScrollDown(whileSeconds):
    global driver
    start = datetime.datetime.now()
    end = start + datetime.timedelta(seconds=whileSeconds)
    i = 0
    while True:
        driver.execute_script(f'window.scrollTo(0, {3000 + (i)*500});')
        i+=1
        time.sleep(0.5)
        if datetime.datetime.now() > end:
            break
            
            
def naver_image_crawling(urls):
    urls = data.url
    driver = webdriver.Chrome(service = Service('./chromedriver.exe'))
    driver.implicitly_wait(3)


    prod_data_total = []

    for url in tqdm(urls):

        # open chrome browser
        driver.get(url)

        # 하단으로 x만큼만 스크롤을 해야 이미지 표출
        doScrollDown(3)

        # get html source of the page
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        try:
            product_image = soup.select('div.image_thumb__20xyr > img')[0]['src']
        except:
            product_image = np.nan
        try:
            ocr_image = soup.select('p > img')[0]['src']
        except:
            ocr_image = np.nan

        # get image data
        prod_data_total = prod_data_total + [product_image, ocr_image]
        
        return prod_data_total

In [199]:
prod_data_total = naver_start_crawling('샴푸', 10)

data = pd.DataFrame(prod_data_total)
data.columns = ['title', 'spec_list', 'price', 'rate', 'opinion', 'url']
data.to_excel('./shampoo_naver_10page_new.xlsx', index = False)

  0%|          | 0/10 [00:00<?, ?it/s]

In [57]:
import datetime
def doScrollDown(whileSeconds):
    global driver
    start = datetime.datetime.now()
    end = start + datetime.timedelta(seconds=whileSeconds)
    i = 0
    while True:
        driver.execute_script(f'window.scrollTo(0, {1500 + (i)*500});')
        i+=1
        time.sleep(0.1)
        if datetime.datetime.now() > end:
            break

In [75]:
# naver_image_crawling
data = pd.read_excel('./shampoo_naver_10page_new.xlsx')
urls = data.url
prod_data_total = []

driver = webdriver.Chrome(service = Service('./chromedriver.exe'))
driver.implicitly_wait(3)

for url in tqdm(urls):

    # open chrome browser
    driver.get(url)

    # 하단으로 x만큼만 스크롤을 해야 이미지 표출
    doScrollDown(2)
    
    # get html source of the page
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    
    try:
        product_image = soup.select('div.image_thumb__20xyr > img')[0]['src']
    except:
        product_image = np.nan
    try:
        ocr_image = []
        for img in soup.select('div.imageSpecInfo_product_img__2buWL')[0].select('img'):
            src = img['src']
            if 'data' in src:
                continue
            else:
                ocr_image.append(src)
    except:
        ocr_image = np.nan
    
    # get image data
    prod_data_total = prod_data_total + [[product_image, ocr_image]]

  0%|          | 0/362 [00:00<?, ?it/s]

In [121]:
data[['image','ocr_image']] = prod_data_total

func = lambda x : x.replace(['[]'], np.nan, inplace=True)
data.apply(func)

data.to_excel('./shampoo_naver_10page_with_image.xlsx', index = False)

  return array(a, dtype, copy=False, order=order)
