# Shopping Mall Crawling
* Made by Cheonsol Lee
* Updated (20.08.11)

## Import
- selenium
- Chrome driver(chromedriver)를 설치해야 함

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

### 함수 정의부

In [52]:
 # 쇼핑몰 종류에 따라 다르게 크롤링
def _fetch_static_soup(url):
    """Download *url* with requests and return the parsed BeautifulSoup tree."""
    res = requests.get(url)
    return BeautifulSoup(res.content, 'html.parser')


def _fetch_rendered_soup(url, wait_class, chromedriver_path):
    """Render *url* in Chrome and return the parsed BeautifulSoup tree.

    The page source is captured only after an element with class
    *wait_class* becomes clickable. The browser is always closed afterwards,
    including when the wait times out.
    """
    driver = webdriver.Chrome(chromedriver_path)
    try:
        driver.get(url)
        # Explicit wait: block until the marker element has been loaded.
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, wait_class))
        )
        html = driver.page_source
    finally:
        # Without this every call would leave a Chrome process running.
        driver.quit()
    return BeautifulSoup(html, 'html.parser')


def _format_bol_price(raw_text):
    """Join the two lines of a Bol price block with a comma, e.g. '19,95'."""
    parts = raw_text.strip().split("\n")
    if len(parts) < 2:
        # Only one line of text: there is nothing to join.
        return parts[0]
    return parts[0] + ',' + parts[1].strip()


def _crawl_auction(keyword, chromedriver_path):
    """Scrape Auction search results (static page; the driver path is unused)."""
    soup = _fetch_static_soup('http://browse.auction.co.kr/search?keyword=' + keyword)
    items = soup.select("div.section--itemcard")
    return pd.DataFrame({
        'title': [item.select("div.area--itemcard_title span.text--title")[0].get_text()
                  for item in items],
        'price': [item.select("strong.text--price_seller")[0].get_text() for item in items],
        'seller': [item.select("a.link--shop span")[1].get_text() for item in items],
    })


def _crawl_danawa(keyword, chromedriver_path):
    """Scrape Danawa search results (page is rendered through Selenium)."""
    soup = _fetch_rendered_soup(
        "http://search.danawa.com/dsearch.php?query=" + keyword,
        'prod_name',
        chromedriver_path,
    )
    items = soup.select("div.main_prodlist li.prod_item")
    titles = [item.select('p.prod_name a')[0].text for item in items]
    return pd.DataFrame({
        'title': titles,
        'price': [item.select('p.price_sect a strong')[0].text for item in items],
        # The last space-separated token of the title is stored as the volume.
        'volume': [title.split(" ")[-1] for title in titles],
    })


def _crawl_amazon(keyword, chromedriver_path):
    """Scrape Amazon search results (page is rendered through Selenium)."""
    soup = _fetch_rendered_soup(
        "https://www.amazon.com/s?k=" + keyword,
        'nav-input',
        chromedriver_path,
    )
    # The last matched container is not a product, so it is dropped.
    items = soup.select("div.sg-col-inner div.sg-col-inner")[:-1]

    titles = [item.select('h2 span')[0].text for item in items]
    prices = []
    for item in items:
        price_tags = item.select("span.a-price span.a-offscreen")
        # Items without a price element keep the placeholder value 0.
        prices.append(price_tags[0].get_text() if price_tags else 0)
    return pd.DataFrame({'title': titles, 'price': prices})


def _crawl_bol(keyword, chromedriver_path):
    """Scrape Bol search results (static page; the driver path is unused)."""
    soup = _fetch_static_soup('https://www.bol.com/nl/s/?searchtext=' + keyword)
    items = soup.select("li.product-item--row.js_item_root")
    return pd.DataFrame({
        'title': [item.select("div.product-title--inline a")[0].get_text() for item in items],
        'price': [_format_bol_price(item.select("div.price-block__price")[0].text)
                  for item in items],
    })


# Maps every supported site_name to the helper that builds its DataFrame.
_CRAWLERS = {
    "Auction": _crawl_auction,
    "Danawa": _crawl_danawa,
    "Amazone": _crawl_amazon,
    "Bol": _crawl_bol,
}


def crawling(file_path, site_name, keyword, chromedriver_path):
    """Crawl one shopping mall for *keyword* and save the result as a CSV file.

    Args:
        file_path: Directory the CSV file is written to.
        site_name: One of 'Auction', 'Danawa', 'Amazone' or 'Bol'.
        keyword: Search term appended to the site's search URL.
        chromedriver_path: Path to the ChromeDriver executable; only used by
            the sites that are rendered through Selenium (Danawa, Amazone).

    Raises:
        ValueError: If *site_name* is not one of the supported sites.

    Side effects:
        Prints the first rows of the result table and writes
        ``<site_name>_<keyword>.csv`` (UTF-8 with BOM) into *file_path*.
    """
    # Fail fast with a clear message instead of an UnboundLocalError later on.
    try:
        crawler = _CRAWLERS[site_name]
    except KeyError:
        raise ValueError(
            "Unsupported site_name " + repr(site_name)
            + "; expected one of: " + ", ".join(_CRAWLERS)
        ) from None

    # os.path.join also works when file_path has no trailing separator.
    file_name = os.path.join(file_path, site_name + '_' + keyword + '.csv')

    print("쇼핑몰 : " + site_name + "!")
    df = crawler(keyword, chromedriver_path)
    df.index.name = 'index'

    print("-----------------------아래는 테이블 구조입니다.-----------------------")
    print(df.head())
    print("--------------------------------------------------------------------")

    df.to_csv(file_name, mode='w', encoding='utf-8-sig', index=True, header=True)
    print("저장되었습니다.파일을 확인해주세요.")

### 사용자정의 입력부
- file_path : 저장위치 디렉토리
- site_name : 'Auction', 'Danawa', 'Amazone', 'Bol' 중 하나를 입력(4개만 가능)
- keyword   : 'hemp', 'bean' 등 자유롭게 입력가능
- chromedriver_path : 크롬드라이버가 설치된 절대경로

In [53]:
# Output file name pattern: <site_name>_<keyword>.csv (e.g. Auction_hemp.csv)
file_path = 'C:/Users/KSE/JupyterProjects/knowledge_structure/'  # directory the CSV is written to (ends with '/')
site_name = 'Bol'  # selects which site crawling() scrapes
keyword = 'hemp'  # search term appended to the site's search URL

# Absolute path to the ChromeDriver executable (passed to webdriver.Chrome by crawling())
chromedriver_path = 'C:/Users/KSE/Desktop/crawling/chromedriver.exe'

### 실행부

In [54]:
# Crawl the configured site with the settings defined above and write the CSV file.
crawling(file_path, site_name, keyword, chromedriver_path)

쇼핑몰 : Bol!
-----------------------아래는 테이블 구조입니다.-----------------------
                                                   title  price
index                                                          
0            PuroCuro Hemp/CBD pleisters 64mg (32 stuks)  42,94
1          PuroCuro Hemp/CBD pleisters - 16mg (32 stuks)  17,95
2      Wiet Grinder Diamondgrind Luxe 4-delige Grinde...  19,95
3                      MediHemp CBD Olie raw - 5% - 30ml  72,50
4      Hennepvezelstrooisel bodembedekking knaagdier ...  12,62
--------------------------------------------------------------------
저장되었습니다.파일을 확인해주세요.


In [None]:
# class="product-item--row js_item_root "

In [2]:
# Search term used by the exploratory Bol request in the following cell.
keyword = 'hemp'

In [47]:
# Download the Bol search results page for `keyword` and parse the HTML.
res = requests.get('https://www.bol.com/nl/s/?searchtext=' + keyword)
soup = BeautifulSoup(res.content, 'html.parser')
# Every matched <li> is treated as one product entry by the cells that follow.
items = soup.select("li.product-item--row.js_item_root")
        

In [9]:
len(items)

24

In [40]:
import re

# Price text of the third matched item, stripped of surrounding whitespace.
tmp = items[2].select("div.price-block__price")[0].text.strip()
# Split the price text at the newline that separates its two parts.
tmp1 = re.split("\n",tmp)

In [41]:
tmp1

['19', '  95']

In [42]:
tmp1[1].strip()

'95'

In [45]:
# Join both price parts with a comma, trimming the padding of the second part.
ss = tmp1[0] + ',' + tmp1[1].strip()

In [50]:
# Build one comma-joined price string per matched item.
price_list = []
for item in items:
    # The price text holds two parts separated by a newline.
    tmp = re.split("\n", item.select("div.price-block__price")[0].text.strip())
    # The second part carries leading padding, so it is stripped before joining.
    price_item = tmp[0] + ',' + tmp[1].strip()
    price_list.append(price_item)
    
    

In [51]:
price_list

['42,94',
 '17,95',
 '19,95',
 '72,50',
 '12,62',
 '50,20',
 '32,94',
 '19,99',
 '13,45',
 '30,-',
 '151,50',
 '11,99',
 '19,95',
 '26,88',
 '5,-',
 '25,-',
 '13,45',
 '25,56',
 '16,97',
 '14,95',
 '18,30',
 '15,99',
 '20,89',
 '7,95']

In [44]:
ss

'19,95'

In [22]:
# Text of the first title link of every matched item.
title_list = [item.select("div.product-title--inline a")[0].get_text() for item in items]
# One comma-joined price string per matched item.
price_list = []
for item in items:
    # The price text holds two parts separated by a newline.
    tmp = re.split("\n", item.select("div.price-block__price")[0].text.strip())
    price_item = tmp[0] + ',' + tmp[1].strip()
    price_list.append(price_item)
# seller_list = [item.select("a.link--shop span")[1].get_text() for item in items]

In [21]:
title_list

['PuroCuro Hemp/CBD pleisters 64mg (32 stuks)',
 'PuroCuro Hemp/CBD pleisters - 16mg (32 stuks)',
 'Wiet Grinder Diamondgrind Luxe 4-delige Grinder - Aluminium - 40 MM - Crusher - Wietcrusher',
 'MediHemp CBD Olie raw - 5% - 30ml',
 'Hennepvezelstrooisel bodembedekking knaagdier 4x48 ltr',
 'Sesam atlas van de acupunctuur',
 'PuroCuro Hemp/CBD pleisters - 32 mg (32 stuks)',
 'WeightWorld Hennep Eiwitpoeder Hemp Proteïne - 500g',
 'PUUR.SHOP Biologisch mondkapje - maat M - Hemp/Cotton blend Chambray - Wasbaar/herbruikbaar/Verstelbaar met elastiek - Ademend - Trendy face mask - Mondmasker met perfecte pasvorm - Hennep / biokatoen Chambray',
 'High Tea Party |Elegant thee/koffie kopje met schotel roze marmer met gouden handvat en gouden hemp. High Tea Party in Wonderland. Alice in wonderland thema. Kopje en schotel set.Tea Coffee Cup & Saucer set',
 'MediHemp CBD olie raw - 10% - 30ml',
 'Hypoallergenic – Hypoallergene Hemp Seed Great Lashes Regenerist Mascara',
 'Hennepolie Huisdieren - 