# Crawling 크롤링 + 전처리

In [2]:
# 웹 사이트로부터 데이터를 얻어오는 클래스
import requests
import json

class Requester:
    """Fetch JSON documents from a web site with HTTP GET."""

    def get_data(self, url, parameters=None):
        """Send a GET request and return the decoded JSON body.

        Args:
            url: Target URL; surrounding whitespace is stripped before use.
            parameters: Optional mapping sent as the query string.

        Returns:
            The object produced by ``json.loads`` on the response body, or an
            empty dict when the body is empty.

        Raises:
            Exception: If ``url`` is ``None`` or blank.
        """
        # Compare by value: ``is ''`` tests object identity, which only works
        # by accident of string interning.
        if url is None or url.strip() == '':
            raise Exception('url should not be empty')

        # Build a fresh dict per call instead of sharing one mutable default.
        query = {} if parameters is None else parameters
        response = requests.get(url.strip(), params=query)
        if response.status_code != 200:
            # Best effort: report the failure but still try to decode the body.
            print('[ERROER]Can not receive response')

        # Covers both a missing and an empty body, so json.loads always
        # receives a document.
        response_text = response.text or '{}'
        return json.loads(response_text)

## '집들이 목록의 고유키'를 크롤링하는 클래스

In [3]:
# 홈페이지 주소: https://ohou.se/projects?writer=self
from time import sleep

class HouseWarmingCrawler:
    """Collect the ids of the house-warming projects listed on ohou.se.

    Listing page: https://ohou.se/projects?writer=self
    """

    def __init__(self):
        self.house_warming_item_ids = list()
        self.requester = Requester()
        self.default_page_size = 100  # number of items requested per page

    def get_house_warming_data_by_page(self, page):
        """Request one listing page and return the decoded JSON response.

        Args:
            page: Page number to request.
        """
        parameters = {
            'v': str(2),  # fixed query parameter; presumably the API version
            'writer': 'self',  # fixed query parameter; meaning unknown
            'per': str(self.default_page_size),  # items per page
            'page': str(page),
        }
        url = 'https://ohou.se/projects.json'
        return self.requester.get_data(url, parameters)

    def collect_house_warming_list(self, projects=None):
        """Append the ``'id'`` of every project to ``house_warming_item_ids``.

        Args:
            projects: Iterable of project dicts; ``None`` is treated as empty.
        """
        self.house_warming_item_ids.extend(
            project['id'] for project in (projects or ())
        )

    def collect(self):
        """Walk the listing pages from page 1 and gather every project id.

        The previous result is discarded first. Crawling stops when a page
        has no projects or when the response's ``'next'`` flag is not
        ``True``.
        """
        self.house_warming_item_ids = list()

        current_page = 1
        while True:
            print('> Start to collect house warming list. current_page:', current_page)
            response = self.get_house_warming_data_by_page(current_page)

            projects = response.get('projects')
            if not projects:
                # An empty page must end the crawl; otherwise the same page
                # would be requested forever without any delay.
                break

            self.collect_house_warming_list(projects)
            has_next_page = response.get('next')
            print(has_next_page)
            if has_next_page is not True:
                break

            current_page += 1
            sleep(3)  # pause between page requests

In [4]:
# Crawl the listing pages and gather the house-warming project ids.
house_warming_crawler = HouseWarmingCrawler()
house_warming_crawler.collect()

# Report how many ids were gathered.
print('> Total main house warming item count:', len(house_warming_crawler.house_warming_item_ids))

> Start to collect house warming list. current_page: 1
True
> Start to collect house warming list. current_page: 2
True
> Start to collect house warming list. current_page: 3
True
> Start to collect house warming list. current_page: 4
True
> Start to collect house warming list. current_page: 5
True
> Start to collect house warming list. current_page: 6
True
> Start to collect house warming list. current_page: 7
True
> Start to collect house warming list. current_page: 8
True
> Start to collect house warming list. current_page: 9
True
> Start to collect house warming list. current_page: 10
True
> Start to collect house warming list. current_page: 11
True
> Start to collect house warming list. current_page: 12
True
> Start to collect house warming list. current_page: 13
True
> Start to collect house warming list. current_page: 14
True
> Start to collect house warming list. current_page: 15
True
> Start to collect house warming list. current_page: 16
True
> Start to collect house warming 

## '집들이' 항목의 '상품' 및 관련된 '리뷰 텍스트 등'을 크롤링하는 클래스

In [5]:
from time import sleep

class HouseWarmingItemDetailCrawler:
    """Fetch the detail record of each house-warming project by id."""

    def __init__(self, item_ids=None):
        """Store the ids to crawl.

        Args:
            item_ids: Project ids to fetch; ``None`` means no ids.
        """
        self.house_warming_details = list()
        # A fresh list per instance avoids sharing one mutable default.
        self.item_ids = [] if item_ids is None else item_ids
        self.requester = Requester()

    def get_detail_data_by_item_id(self, item_id):
        """Request the detail JSON of one project and return it decoded."""
        parameters = {
            'v': str(4),  # fixed query parameter; presumably the API version
        }
        url = 'https://ohou.se/projects/' + str(item_id) + '/detail.json'
        return self.requester.get_data(url, parameters)

    def collect_house_warming_detail(self, project_detail=None):
        """Flatten one project record and append it to ``house_warming_details``.

        Args:
            project_detail: The project dict from the detail response. An
                empty or missing value is ignored.

        Raises:
            KeyError: If an expected field is absent from ``project_detail``.
        """
        if not project_detail:
            return

        products = project_detail['bucketplace_document_supplements']['products']
        purchased_product_ids = [product['id'] for product in products]

        self.house_warming_details.append({
            'id': project_detail['id'],

            'residence': project_detail['residence'],  # type of residence
            'area': project_detail['area'],  # floor area (pyeong)
            'region': project_detail['region'],  # region of the residence
            'expertise': project_detail['expertise'],  # interior post category

            # List-valued fields are flattened to comma-separated strings.
            'color_list': ','.join(project_detail['color_list']),
            'style_list': ','.join(project_detail['style_list']),
            'constructions': ','.join(project_detail['constructions']),

            'purchased_product_ids': purchased_product_ids,  # ids of purchased products

            'family_list': project_detail['family_list'],  # household members
            'like_count': project_detail['like_count'],  # likes on the post
            'reply_count': project_detail['reply_count'],  # replies on the post
            'scrap_count': project_detail['scrap_count'],  # times the post was scrapped
            'view_count': project_detail['view_count'],  # views of the post
            'share_count': project_detail['share_count'],  # times the post was shared
        })

    def collect(self):
        """Fetch and store the detail of every id in ``item_ids``.

        The previous result is discarded first. A response without a
        non-empty ``'project'`` entry is skipped so that one bad item does
        not abort the whole crawl.
        """
        self.house_warming_details = list()
        if not self.item_ids:
            return

        for item_id in self.item_ids:
            response = self.get_detail_data_by_item_id(item_id)
            print('Collect item detail item_id:', item_id)
            project = response.get('project')
            if project:
                self.collect_house_warming_detail(project)
            print('...')

In [None]:
# Fetch the detail record for every id gathered by house_warming_crawler.
house_warming_item_detail_crawler = HouseWarmingItemDetailCrawler(item_ids = house_warming_crawler.house_warming_item_ids)
house_warming_item_detail_crawler.collect()

Collect item detail item_id: 40578
...
Collect item detail item_id: 39868
...
Collect item detail item_id: 39270
...
Collect item detail item_id: 39218
...
Collect item detail item_id: 38991
...
Collect item detail item_id: 40580
...
Collect item detail item_id: 40055
...
Collect item detail item_id: 39404
...
Collect item detail item_id: 39302
...
Collect item detail item_id: 38844
...
Collect item detail item_id: 40576
...
Collect item detail item_id: 39858
...
Collect item detail item_id: 39211
...
Collect item detail item_id: 38993
...
Collect item detail item_id: 38718
...
Collect item detail item_id: 40577
...
Collect item detail item_id: 39779
...
Collect item detail item_id: 39714
...
Collect item detail item_id: 39337
...
Collect item detail item_id: 38539
...
Collect item detail item_id: 40579
...
Collect item detail item_id: 39744
...
Collect item detail item_id: 39041
...
Collect item detail item_id: 40402
...
Collect item detail item_id: 39831
...
Collect item detail item_

In [None]:
# Number of detail records collected (one entry per house-warming post, despite the "product" label).
print('Total product count:', len(house_warming_item_detail_crawler.house_warming_details))
# Show the first record as a sample; this raises IndexError when nothing was collected.
print(house_warming_item_detail_crawler.house_warming_details[0])

# `input/data-house-warming.csv` 생성

In [None]:
import os
import shutil

# Create an `input` folder under the current working directory.
input_dir_path = os.path.join(os.getcwd(), 'input')
# exist_ok keeps this cell safe to re-run when the folder already exists.
os.makedirs(input_dir_path, exist_ok=True)

In [None]:
# Load the collected house-warming details into a DataFrame and preview the first rows.
import pandas

df = pandas.DataFrame(house_warming_item_detail_crawler.house_warming_details)
df.head(3)

In [None]:
# Write the DataFrame as tab-separated UTF-16 text; missing values are written as empty strings.
# The path is relative to the current working directory.
df.to_csv("./input/data-house-warming.csv", sep='\t', na_rep='', encoding="UTF-16")

In [None]:
# Read the file back with the same separator and encoding to check it was written correctly.
pandas.read_csv('./input/data-house-warming.csv', sep='\t', encoding="UTF-16")