# Crawling 크롤링

In [1]:
# 웹 사이트로부터 데이터를 얻어오는 클래스
import requests
import json

class Requester:
    def get_data(self, url, parameters = {}):
        if (url is None) or (url.strip() is ''):
            raise Exception('url should not be empty')
        
        response = requests.get(url.strip(), params = parameters)
        if response.status_code is not 200:
            raise Exception('Can not receive response')

        json_response = json.loads(response.text)
        return json_response

## '카테고리 목록'을 크롤링하는 클래스

In [2]:
# 홈페이지 주소: https://ohou.se/store/category?category=0_1_0_7
class CategoryCrawler:
    def __init__(self):
        self.categories = list()
        self.requester = Requester()
    
    def collect_all_sub_categories(self, sub_categories):
        if (sub_categories is None) or (len(sub_categories) <= 0):
            return []
        
        collected_sub_categories = list()
        for inner_sub_category in sub_categories:
            if (inner_sub_category['child'] is None) or (len(inner_sub_category['child']) <= 0):
                collected_sub_categories.append({
                    'category_name': inner_sub_category['title'].strip(),
                    'category_key': inner_sub_category['hash'].strip()
                })
                continue
            found_sub_categories = self.collect_all_sub_categories(inner_sub_category['child'])
            collected_sub_categories = collected_sub_categories + found_sub_categories
            
        return collected_sub_categories
    
    def collect(self):
        parameters = {'used': str(2), 'include_html': 'true'}
        url = 'https://ohou.se/productions/categories.json'
        response = self.requester.get_data(url, parameters)
        
        self.categories = list()
        for category in response['categories']:
            item = {
                'category_name': category['title'].strip(),
                'category_key': category['hash'].strip(),
                'sub_categories': [],
            }
            item['sub_categories'] = self.collect_all_sub_categories(category['child'])
            self.categories.append(item)

In [3]:
category_crawler = CategoryCrawler()
category_crawler.collect()

print('> Total main category count:', len(category_crawler.categories))
print('> "{}"\'s sub categories: {}'.format(
    category_crawler.categories[4]['category_name'],
    category_crawler.categories[4]['sub_categories']
))

> Total main category count: 12
> "수납/정리"'s sub categories: [{'category_name': '플라스틱서랍장', 'category_key': '11_0_0'}, {'category_name': '이동식정리함/트롤리', 'category_key': '11_0_2'}, {'category_name': '공간박스', 'category_key': '11_0_3'}, {'category_name': '리빙박스', 'category_key': '11_1_0'}, {'category_name': '수납정리함', 'category_key': '11_1_1'}, {'category_name': '기타수납/정리용품', 'category_key': '11_1_2'}, {'category_name': '수납바스켓', 'category_key': '11_2_0'}, {'category_name': '라탄바스켓', 'category_key': '11_2_1'}, {'category_name': '빨래바구니/보관함', 'category_key': '11_2_2'}, {'category_name': '소품트레이', 'category_key': '11_2_3'}, {'category_name': '스탠드행거', 'category_key': '11_3_0'}, {'category_name': '이동식행거', 'category_key': '11_3_1'}, {'category_name': '고정식행거', 'category_key': '11_3_2'}, {'category_name': '벽선반', 'category_key': '11_4_0'}, {'category_name': '스탠드선반', 'category_key': '11_4_1'}, {'category_name': '세탁기선반', 'category_key': '11_4_2'}, {'category_name': '옷걸이/바지걸이', 'category_key': '11_5_0'}, {'categ

## '카테고리별' 모든 '상품' 및 관련된 '리뷰텍스트 등'을 크롤링하는 클래스

In [14]:
class ProductCrawler:
    def __init__(self, categories = []):
        self.products = list()
        self.categories = categories
        self.requester = Requester()
        self.default_page_size = 1000 # 1페이지에 출력될 아이템 갯수

    def calculat_total_page(self, total_item_count):
        if (total_item_count is None) or (total_item_count <= 0):
            return 1
        total_page = (total_item_count // self.default_page_size)
        if (total_item_count % self.default_page_size) is not 0:
            total_page = total_page + 1
        return total_page
    
    def get_production_data_by_page(self, category_key, page):
        parameters = {
             'v': str(2), # 고정 queryParam 값. v인걸 보아 API 버전 정보인듯?
             'per': str(self.default_page_size), # 고정 queryParam 값. 페이에 출력될 아이템 갯수
             'category': category_key,
             'page': str(page)
        }
        url = 'https://ohou.se/store/category.json'
        response = self.requester.get_data(url, parameters)
        return response
    
    def collect_productions(self, productions = []):
        for production in productions:
            self.products.append({
                'id': production['id'],
                'name': production['name'],
                'review_count': production['review_count'], #  제품에 남겨진 리뷰 총 갯수
                'review_average': production['review_avg'],  # 제품에 남겨진 평점 평균
                'scrap_count': production['scrap_count'],  # 제품을 스크랩한 총 갯수
                'view_count': production['view_count'],  # 제품을 본 횟수
                #'review_text': '' # TODO
            })
        
    def collect(self):
        self.products = list()
        if (self.categories is None) or (len(self.categories) <= 0):
            return
        
        for category in self.categories:
            category_keys = [sub['category_key'] for sub in category['sub_categories']]
            for category_key in category_keys:
                print('> Start to collect productions. category_key:', category_key)
                current_page = 1
                response = self.get_production_data_by_page(category_key, current_page)
                self.collect_productions(response['productions'])
                
                total_page = self.calculat_total_page(response['item_count']) # item_count: 현재 카테고리의 모든 제품 갯수
                print('>> This category_key has total_page:', total_page)
                if current_page is total_page:
                    break
                else:
                    while (current_page <= total_page):
                        current_page = current_page + 1
                        print('>> Collect proudctions at page:', current_page)
                        response = self.get_production_data_by_page(category_key, current_page)
                        self.collect_productions(response['productions'])
                

In [15]:
product_crawler = ProductCrawler(categories = category_crawler.categories)
product_crawler.collect()

> Start to collect productions. category_key: 0_1_0_3
>> This category_key has total_page: 5
>> Collect proudctions at page: 2
>> Collect proudctions at page: 3
>> Collect proudctions at page: 4
>> Collect proudctions at page: 5
>> Collect proudctions at page: 6
> Start to collect productions. category_key: 0_1_0_4
>> This category_key has total_page: 1
> Start to collect productions. category_key: 1_0_20
>> This category_key has total_page: 4
>> Collect proudctions at page: 2
>> Collect proudctions at page: 3
>> Collect proudctions at page: 4
>> Collect proudctions at page: 5
> Start to collect productions. category_key: 1_0_1
>> This category_key has total_page: 4
>> Collect proudctions at page: 2
>> Collect proudctions at page: 3
>> Collect proudctions at page: 4
>> Collect proudctions at page: 5
> Start to collect productions. category_key: 1_0_0
>> This category_key has total_page: 2
>> Collect proudctions at page: 2
>> Collect proudctions at page: 3
> Start to collect productions

In [11]:
print('Total product count:', len(product_crawler.products))
print(product_crawler.products[0])

Total product count: 18472
{'type': 'Production', 'id': 267218, 'name': '[쿠폰할인] DK053 3인용 풀커버 그레이 발수 패브릭 소파 (스툴 기본포함)', 'review_avg': 4.64, 'review_count': 1879, 'scrap_count': 30426, 'view_count': 437807, 'used_card_count': 1715, 'selling_price': 264000, 'original_price': 335000, 'original_image_url': 'https://bucketplace-v2-development.s3.amazonaws.com/uploads/productions/158563416665392402.jpg', 'resized_image_url': 'https://image.ohou.se/image/central_crop/bucketplace-v2-development/uploads-productions-158563416665392402.jpg/640/640', 'status': 4, 'week_rank': 0, 'delivery_type': 2, 'user_id': 7631246, 'brand': {'id': 6801, 'name': '듀커소파'}, 'is_cheapest_price': False, 'is_special_price': True, 'is_hidden': False, 'is_delivery_date_specified': False, 'is_selling': True, 'is_sold_out': False, 'is_scrap': False, 'is_free_delivery': False, 'is_discontinued': False, 'is_overseas_purchase': False, 'is_buyable': True, 'is_consultable': False, 'is_remodel': False, 'is_discounted': False, '

## 'Product에 남겨진 리뷰텍스트'를 크롤링하는 클래스

In [None]:
# https://ohou.se/production_reviews.json?production_id=81625&page=2&order=best&photo_review_only=
class ProdcutReviewCrawler:

# `input/data.csv` 생성