전체적인 작업 overview

매물 데이터를 활용해서, 
* 원하는 지역에 매물이 있는지 확인을 한다
* 시세를 대강 파악 한다
* 올라오지 않은 매물에 대한 정보 수집 할 때 비교해본다
* 추천 받을 때 주변과 비교해본다

In [93]:
import pandas as pd
import time
import re
import requests
from bs4 import BeautifulSoup

In [94]:
import time, sys
from IPython.display import clear_output

def update_progress(progress):
    bar_length = 20
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    if progress >= 1:
        progress = 1
    
    block = int(round(bar_length * progress))
    clear_output(wait = True)
    text = "Progress: [{0}] {1:.1f}%".format( "#" * block + "-" * (bar_length - block), progress * 100)
    print(text)


In [95]:
def pretty_html(url):
    
    response = requests.request('GET', url)
    html_doc = response.text

    soup = BeautifulSoup(html_doc, 'html.parser')
    result_text = soup.prettify()
    return soup, result_text


In [96]:
def explore_home(url):
    """

    :return:
    """

    soup, result_text = pretty_html(url)

    pagenation_html = soup.findAll("a", {"data-ci-pagination-page" : True})
    max_num = pagenation_html[len(pagenation_html)-1]['data-ci-pagination-page']

    return max_num

In [97]:
def extract_detail_nums(url):
    
    _, result_text = pretty_html(url)

    details = []
    for line in result_text.split('\n'):

        # extract detail
        match = re.search(r'\/CProperty\/detail\?num=(\d+)', line)
        if match:
            detail_url = match.group()
            detail_num = match.group(1)
            details.append(detail_num)

    return details
    

In [98]:
def extract_latlng_with_detail(html_doc):
    """

    :return:
    """

    lat_num = None
    lng_num = None
    for line in html_doc.split('\n'):
        match = re.search(r'LatLng\(([\d.]+), ([\d.]+)\)', line)
        if match:
            lat_num = match.group(1)
            lng_num = match.group(2)
            break
    return lat_num, lng_num


In [153]:
def extract_detail_info(soup):
    
    title_html = soup.findAll("td", class_="info_t_cont")
    if len(title_html) > 0:
        title = title_html[0].getText().strip()
    else:
        title = ''

    category_html = soup.findAll("span", class_="detail_cate")
    if len(category_html) > 0:
        category = category_html[0].getText().strip()
    else:
        category = ''

    cost_type_html = soup.findAll("span", class_=re.compile("type*"))
    if len(cost_type_html) > 0:
        cost_type = cost_type_html[0].getText().strip()
    else:
        cost_type = ''

    price_html = soup.findAll("strong", class_="red")
    
    if len(price_html) > 0:
        price = price_html[0].getText().strip()
    else:
        price = ''

    sup_size_html = soup.find(id="sup_area_m")
    sup_size = None
    if sup_size_html:
        sup_size = sup_size_html['value']

    dedi_size_html = soup.find(id="dedi_area_m")
    dedi_size = None
    if dedi_size_html:
        dedi_size = dedi_size_html['value']

    item = {
        'category': category,
        'cost_type': cost_type,
        'price': price,
        'sup_size': sup_size,
        'dedi_size': dedi_size,
        'title': title
    }
    
    return item

In [154]:
def explore_detail(detail_num):
    
    home_url = 'https://www.jejuall.com'
    detail_url = '/CProperty/detail?num='
    
    print(home_url+detail_url+detail_num)
    soup, result_text = pretty_html(home_url+detail_url+detail_num)
    
    item = extract_detail_info(soup)
    lat_num, lng_num = extract_latlng(result_text)
    item['lat_num'] = lat_num
    item['lng_num'] = lng_num
    item['detail_num'] = detail_num
    
    return item        
        
        

In [155]:
def execute(details):
    
    result = []
    
    number_of_elements = len(details)
    
    for idx, detail_num in enumerate(details):
        item = explore_detail(detail_num)
        result.append(item)
        
        print(idx, '/' ,number_of_elements)
        
        time.sleep(2)

    return result

In [102]:
explore_detail('1115710')

{'category': '상가점포',
 'cost_type': '임대',
 'price': '700/400',
 'sup_size': '43',
 'dedi_size': '43',
 'title': '★SR-374(이도동상가) 중소기업청부근 아담한1층 상가임대★',
 'lat_num': None,
 'lng_num': None}

In [103]:
home_url = 'https://www.jejuall.com/CProperty/'
max_num = explore_home(home_url)

In [104]:
max_num

'157'

In [105]:
all_dup_details = []

In [111]:
page_url = 'https://www.jejuall.com/CProperty/index/params/category/NO/sort/1/sido/NO/dong/NO/small_area/0/big_area/1000000/small_price/0/big_price/1000000/keyword/NO/page1/1/page/'
number_of_elements = int(max_num)+1

for i in range(1, number_of_elements):
    page_url = page_url + str(i)
    raw_details = extract_detail_nums(page_url)
    all_dup_details.extend(raw_details)
    update_progress(i / number_of_elements)
    print(i, min(raw_details), max(raw_details), len(raw_details), len(list(set(raw_details))))

Progress: [####################] 99.4%
157 0 0


In [113]:
print(len(all_dup_details))

5309


In [123]:
pd.Series(all_dup_details).value_counts()

1115786    32
1094057    28
1114736    28
819240     28
1114099    28
           ..
1115444     1
1111455     1
1115476     1
1111407     1
1115806     1
Length: 276, dtype: int64

In [118]:
all_details = list(set(all_dup_details))
sample = list(map(lambda x: int(x), all_details))

In [120]:
max(sample)

1115806

In [156]:
all_results = []
result = execute(all_details)
all_results.extend(result)

https://www.jejuall.com/CProperty/detail?num=1111441
0 / 276
https://www.jejuall.com/CProperty/detail?num=1115790
1 / 276
https://www.jejuall.com/CProperty/detail?num=955319
2 / 276
https://www.jejuall.com/CProperty/detail?num=1115457
3 / 276
https://www.jejuall.com/CProperty/detail?num=1111965
4 / 276
https://www.jejuall.com/CProperty/detail?num=1115789
5 / 276
https://www.jejuall.com/CProperty/detail?num=807374
6 / 276
https://www.jejuall.com/CProperty/detail?num=1099456
7 / 276
https://www.jejuall.com/CProperty/detail?num=1096048
8 / 276
https://www.jejuall.com/CProperty/detail?num=1115467
9 / 276
https://www.jejuall.com/CProperty/detail?num=812060
10 / 276
https://www.jejuall.com/CProperty/detail?num=1016291
11 / 276
https://www.jejuall.com/CProperty/detail?num=1005888
12 / 276
https://www.jejuall.com/CProperty/detail?num=1103218
13 / 276
https://www.jejuall.com/CProperty/detail?num=1099914
14 / 276
https://www.jejuall.com/CProperty/detail?num=1115804
15 / 276
https://www.jejuall.c

132 / 276
https://www.jejuall.com/CProperty/detail?num=1115451
133 / 276
https://www.jejuall.com/CProperty/detail?num=1106641
134 / 276
https://www.jejuall.com/CProperty/detail?num=1115779
135 / 276
https://www.jejuall.com/CProperty/detail?num=1085630
136 / 276
https://www.jejuall.com/CProperty/detail?num=1081845
137 / 276
https://www.jejuall.com/CProperty/detail?num=1115389
138 / 276
https://www.jejuall.com/CProperty/detail?num=1084743
139 / 276
https://www.jejuall.com/CProperty/detail?num=1107043
140 / 276
https://www.jejuall.com/CProperty/detail?num=1108479
141 / 276
https://www.jejuall.com/CProperty/detail?num=1071246
142 / 276
https://www.jejuall.com/CProperty/detail?num=1090935
143 / 276
https://www.jejuall.com/CProperty/detail?num=992658
144 / 276
https://www.jejuall.com/CProperty/detail?num=932853
145 / 276
https://www.jejuall.com/CProperty/detail?num=1111435
146 / 276
https://www.jejuall.com/CProperty/detail?num=1111414
147 / 276
https://www.jejuall.com/CProperty/detail?num=11

263 / 276
https://www.jejuall.com/CProperty/detail?num=1112174
264 / 276
https://www.jejuall.com/CProperty/detail?num=1112176
265 / 276
https://www.jejuall.com/CProperty/detail?num=1115470
266 / 276
https://www.jejuall.com/CProperty/detail?num=1081673
267 / 276
https://www.jejuall.com/CProperty/detail?num=1111423
268 / 276
https://www.jejuall.com/CProperty/detail?num=1070389
269 / 276
https://www.jejuall.com/CProperty/detail?num=1100961
270 / 276
https://www.jejuall.com/CProperty/detail?num=1115773
271 / 276
https://www.jejuall.com/CProperty/detail?num=1081650
272 / 276
https://www.jejuall.com/CProperty/detail?num=1115805
273 / 276
https://www.jejuall.com/CProperty/detail?num=943152
274 / 276
https://www.jejuall.com/CProperty/detail?num=1111447
275 / 276


In [139]:
tt = [item['detail_num'] for item in all_results]

In [157]:
df = pd.DataFrame(all_results)
df.to_csv('housing.csv')

In [143]:
check = list(set(tt))

In [144]:
no_dup = list(filter(lambda x: x['detail_num'] in check, all_results))

In [148]:
df = pd.DataFrame(no_dup)
df.drop_duplicates().to_csv('housing.csv')

In [147]:
df.drop_duplicates().shape

(160, 9)