### 차량 데이터(트림 level)의 수집
차량의 가격, 제원, 사양/옵션은 차종 level이 아니라 트림 level로 수집해야 한다.

라인업(엔진 종류)을 먼저 수집한 다음, 라인업별로

가격, 제원, 사양/옵션 데이터를 각각 수집하자

selenium, BeautifulSoup 사용

In [1]:
import pandas as pd
import os,time,tqdm

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait,Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys 
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, WebDriverException, TimeoutException

import requests
from bs4 import BeautifulSoup

In [3]:
# utilities
class Util:
    """Small helpers shared by the scraping functions."""

    @classmethod
    def make_url(cls,params):
        """Build the model-page URL from a mapping of query parameters.

        Parameters are appended in the mapping's iteration order. Entries
        whose value is ``None`` are left out. Values are formatted with
        ``str`` so that non-string values (for example an integer model id)
        do not raise ``TypeError``. Values are not URL-escaped.

        Args:
            params: mapping of query-parameter name to value (or ``None``).

        Returns:
            The base URL followed by one ``&name=value`` pair per non-``None``
            entry.
        """
        pieces = ['http://m.auto.danawa.com/auto/?Work=model']
        pieces.extend(
            '&{}={}'.format(name, value)
            for name, value in params.items()
            if value is not None
        )
        return ''.join(pieces)

In [14]:
# Price tab -> collect the lineups of the current model
def _open_lineup_popup():
    """Click the lineup selector on the loaded page and return the popup element.

    Raises:
        TimeoutException: if the ``buttonSelect`` button does not become
            clickable, or the ``popup_data`` element does not appear, within
            5 seconds each.
    """
    select_button = WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.ID, "buttonSelect")))
    select_button.click()
    return WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((By.ID, "popup_data")))


def _lineup_items(popup):
    """Return the ``<li>`` elements under the popup's ``trimInfo`` element."""
    return popup.find_element(By.CLASS_NAME, 'trimInfo').find_elements(By.TAG_NAME, 'li')


def get_lineup():
    """Collect ``(lineup code, lineup name)`` pairs for the current model.

    Sets ``params['Tab']`` to ``'price'``, loads that URL with the
    module-level ``driver`` and reads the entries of the lineup popup.

    Returns:
        A list of ``(lu_cd, lu_nm)`` tuples in popup order, or ``[]`` when the
        selector button / popup does not show up in time (no lineup at all).

    Raises:
        TimeoutException: if the popup cannot be reopened while iterating.
        NoSuchElementException: if the popup lacks the expected children.
    """
    params['Tab'] = 'price'
    url_price = Util.make_url(params)
    driver.get(url_price)

    try:
        popup = _open_lineup_popup()
    except TimeoutException:  # the model has no lineup selector at all
        return []

    # Only the count is kept from this first pass: the original author notes
    # that holding on to the elements led to stale-element errors.
    lu_len = len(_lineup_items(popup))

    marker = '&Lineup='
    lineups = []
    for lu_idx in range(lu_len):
        # Reload and reopen the popup for every entry so the element handles
        # are fresh (the loop reloads itself, so no extra load before it).
        driver.get(url_price)
        anchor = _lineup_items(_open_lineup_popup())[lu_idx].find_element(By.TAG_NAME, 'a')

        href = anchor.get_attribute('href') or ''
        lu_start = href.find(marker)
        if lu_start == -1:
            # Without the marker the slice below would cut an arbitrary piece
            # of the URL and report it as a lineup code; skip the entry.
            continue
        code_start = lu_start + len(marker)
        # The code is taken as the 5 characters after the marker
        # (assumes lineup codes are always 5 characters long; TODO confirm).
        lu_cd = href[code_start:code_start + 5]
        lu_nm = anchor.find_element(By.TAG_NAME, 'span').text.strip()
        lineups.append((lu_cd, lu_nm))
    return lineups

In [15]:
# Price tab
def get_price():
    """Scrape trim names and price texts from the price tab.

    Sets ``params['Tab']`` to ``'price'``, loads that URL with the
    module-level ``driver`` and waits up to 5 seconds for the element with
    id ``priceList``.

    Returns:
        A list of ``(title, price)`` tuples of stripped strings, one per
        ``<dt>`` that has both a ``title`` and a ``price`` child, in page
        order. Only the list is returned; callers take its length themselves.

    Raises:
        TimeoutException: if ``priceList`` does not appear in time.
    """
    params['Tab'] = 'price'
    driver.get(Util.make_url(params))

    waiter = WebDriverWait(driver, 5)
    price_list = waiter.until(
        EC.presence_of_element_located((By.ID, "priceList")))

    trim_prices = []
    for entry in price_list.find_elements(By.TAG_NAME, 'dt'):
        try:
            title = entry.find_element(By.CLASS_NAME, 'title').text.strip()
            price = entry.find_element(By.CLASS_NAME, 'price').text.strip()
        except NoSuchElementException:
            # Entries without a title/price child (the "common specification"
            # entry, per the original note) are not trims.
            continue
        trim_prices.append((title, price))

    return trim_prices


In [16]:
# Spec tab
def get_spec()->dict:
    """Scrape the specification table of the current model/lineup.

    Sets ``params['Tab']`` to ``'spec'``, loads that URL with the
    module-level ``driver`` and parses the page source with BeautifulSoup.

    Returns:
        A dict keyed by the stripped text of each row's first ``<td>``; the
        value is the list of stripped texts of that row's ``td.tdCenter``
        cells (the caller indexes these lists by trim position). When the
        page has no ``table.tableBlock`` the global ``spec_valid`` is set to
        ``False`` and ``None`` is returned.

    Raises:
        AttributeError: if a table row contains no ``<td>`` at all.
    """
    global spec_valid

    params['Tab'] = 'spec'
    driver.get(Util.make_url(params))

    soup = BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find('table', {'class': 'tableBlock'})
    if table is None:  # the spec tab does not exist for this lineup
        spec_valid = False
        return

    # A label that occurs in several rows keeps only its last row's values.
    return {
        row.find('td').text.strip(): [
            cell.text.strip()
            for cell in row.find_all('td', {'class': 'tdCenter'})
        ]
        for row in table.find_all('tr')
    }
    

In [17]:
# Equipment/options tab
def get_opt()->dict:
    """Scrape the equipment/options table of the current model/lineup.

    Sets ``params['Tab']`` to ``'item'``, loads that URL with the
    module-level ``driver`` and parses the page source with BeautifulSoup.

    Returns:
        A dict keyed by the stripped text of each row's first ``<td>``; the
        value is the list of stripped texts of that row's ``td.tdCenter``
        cells (the caller indexes these lists by trim position). When the
        page has no ``table.tableBlock`` the global ``opt_valid`` is set to
        ``False`` and ``None`` is returned.

    Raises:
        AttributeError: if a table row contains no ``<td>`` at all.
    """
    global opt_valid

    params['Tab'] = 'item'
    driver.get(Util.make_url(params))

    soup = BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find('table', {'class': 'tableBlock'})
    if table is None:  # the options tab does not exist for this lineup
        opt_valid = False
        return

    # NOTE(review): the key is whatever the first <td> of a row contains, and
    # a label that occurs in several rows keeps only its last row's values.
    # Confirm the first cell is always the item label and never a category
    # cell spanning several rows.
    return {
        row.find('td').text.strip(): [
            cell.text.strip()
            for cell in row.find_all('td', {'class': 'tdCenter'})
        ]
        for row in table.find_all('tr')
    }

메인

In [24]:
if __name__=='__main__':

    # Extract the model ids from the monthly sales data.
    # The file is read through a relative path instead of chdir-ing into
    # ./data and back, so a failed read cannot leave the process in the wrong
    # working directory. read_excel takes no `encoding` argument.
    monthly = pd.read_excel(os.path.join('data', 'monthly.xlsx'))
    model_ids = [str(el) for el in monthly.model.unique()]
    if '3341' not in model_ids:
        model_ids.append('3341')  # Tivoli Air, added by hand (kept as str like the rest)
    del monthly

    # Query parameters of the page URL; the scraping functions read and
    # update this module-level dict.
    params = {}
    params['Model'] = None
    # Not used anywhere in this block.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'}

    # Start a headless Chrome through the chromedriver next to the notebook.
    path = './chromedriver.exe'
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('disable-gpu')
    driver = webdriver.Chrome(path,options=options)

    # Schema of the result table: one row per (model, lineup, trim).
    columns = ('model', 'lu_cd', 'lu_nm', 'trim', 'trim_idx', 'price', 'specs', 'opt')
    car_detail = pd.DataFrame({col: [] for col in columns})

    # Ids of the models whose scrape raised, so they can be processed again.
    unhandled_e = []

    try:
        # NOTE(review): only the first five model ids are processed; this
        # looks like a debugging limit, confirm before a full run.
        for model_id in tqdm.tqdm(model_ids[:5]):
            # Frames of this model are committed only if the whole model
            # succeeds; otherwise retrying it later would duplicate the rows
            # of the lineups that had already been scraped.
            model_frames = []
            try:
                params['Model'] = str(model_id)
                # Drop the lineup left over from the previous model so it is
                # not sent along with the new model id.
                params.pop('Lineup', None)

                for lineup in get_lineup():
                    params['Lineup'] = str(lineup[0])

                    # The scraping functions flip these module-level flags to
                    # False when the corresponding tab does not exist.
                    spec_valid = True
                    opt_valid = True

                    trim_prices = get_price()  # (trim name, price text) pairs
                    spec_dic = get_spec()      # label -> per-trim values, or None
                    opt_dic = get_opt()        # label -> per-trim values, or None

                    rows = {col: [] for col in columns}
                    for trim_idx, (trim_nm, trim_price) in enumerate(trim_prices):
                        rows['model'].append(params['Model'])
                        rows['lu_cd'].append(params['Lineup'])
                        rows['lu_nm'].append(lineup[1])
                        rows['trim'].append(trim_nm)
                        rows['trim_idx'].append(trim_idx)
                        rows['price'].append(trim_price)
                        # Pick this trim's value out of every per-trim list.
                        rows['specs'].append(
                            {k: v[trim_idx] for k, v in spec_dic.items()} if spec_valid else {})
                        rows['opt'].append(
                            {k: v[trim_idx] for k, v in opt_dic.items()} if opt_valid else {})

                    model_frames.append(pd.DataFrame(rows))

            except Exception as e:  # remember the model id so it can be retried
                print(model_id, e)
                unhandled_e.append(model_id)
            else:
                if model_frames:
                    car_detail = pd.concat([car_detail] + model_frames, axis=0, sort=False)
    finally:
        # Always shut the browser down, even on an interrupt.
        driver.quit()


In [20]:
# Show the model ids whose scrape raised an exception.
unhandled_e

[]

In [21]:
# Price strings contain commas; remove them and store the column as int64.
car_detail['price'] = car_detail['price'].str.replace(',','').astype('int64')
# Store the trim position as int32.
car_detail['trim_idx'] = car_detail['trim_idx'].astype('int32')

In [22]:
# Preview the first three rows of the table.
car_detail.head(3)

Unnamed: 0,model,lu_cd,lu_nm,trim,trim_idx,price,specs,opt
0,3333,44018,가솔린 1.0,밴 M/T,0,9650000,"{'엔진': '카파 1.0 ECO Prime', '연료': '가솔린', '배기량 (...","{'외관': '벌브형', '전방 안개등': '벌브형', '주간 주행등': '벌브형'..."
1,3333,44018,가솔린 1.0,밴 A/T,1,10900000,"{'엔진': '카파 1.0 ECO Prime', '연료': '가솔린', '배기량 (...","{'외관': '벌브형', '전방 안개등': '벌브형', '주간 주행등': '벌브형'..."
2,3333,44018,가솔린 1.0,밴 고급형 M/T,2,9950000,"{'엔진': '카파 1.0 ECO Prime', '연료': '가솔린', '배기량 (...","{'외관': '벌브형', '전방 안개등': '벌브형', '주간 주행등': '벌브형'..."


In [36]:
# car_detail.reset_index(drop=True,inplace=True)

In [37]:
# Print the column dtypes and non-null counts of the table.
car_detail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6680 entries, 0 to 6679
Data columns (total 8 columns):
model       6680 non-null object
lu_cd       6680 non-null object
lu_nm       6680 non-null object
trim        6680 non-null object
trim_idx    6680 non-null int32
price       6680 non-null int64
specs       6680 non-null object
opt         6680 non-null object
dtypes: int32(1), int64(1), object(6)
memory usage: 391.5+ KB


In [38]:
# Write the raw table into ./data (this changes the working directory).
os.chdir('./data')
# The `encoding` keyword of DataFrame.to_excel was deprecated and then
# removed from pandas, so it is no longer passed.
car_detail.to_excel('car_detail_all_raw.xlsx')