-
Notifications
You must be signed in to change notification settings - Fork 0
/
transaction_crawler.py
161 lines (141 loc) · 5.89 KB
/
transaction_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import urllib.parse
from datetime import datetime
from logging import getLogger
from xml.etree import cElementTree as ElementTree
import config
import psycopg2
import requests
from query import (CREATE_CITY_TABLE, CREATE_SCHEMA, CREATE_TRANSACTION_TABLE,
GRANT_DB_TO_USER, INSERT_CITY_DATA, CREATE_CITY_TABLE, GENERATE_SERIAL_NO)
from utils import XmlDictConfig, XmlListConfig
class InitSetting:
    """Create the apartment schema/tables, seed city data, and crawl Korean
    apartment real-transaction records from the MOLIT open API into PostgreSQL.
    """

    def __init__(self):
        """Open a PostgreSQL connection using settings from ``config``."""
        self.conn = psycopg2.connect(
            host=config.HOST,
            port=config.PORT,
            database=config.DB,
            user=config.USER,
            password=config.PASSWORD,
        )

    def refresh_connection(self):
        """Replace the current connection with a fresh one.

        Closes the old connection first so that long crawls do not leak a
        server-side connection per refresh (the original re-ran ``__init__``
        without ever closing the previous connection).
        """
        try:
            self.conn.close()
        except Exception:
            pass  # best-effort: the old connection may already be broken
        self.__init__()

    def create_init_data(self):
        """Create the schema, city and transaction tables, and seed city rows."""
        with self.conn.cursor() as cursor:
            # DB/user names come from trusted config constants, not user
            # input, so %-interpolation into the GRANT statement is safe here.
            cursor.execute(GRANT_DB_TO_USER % (config.DB, config.USER))
            cursor.execute(CREATE_SCHEMA)
            cursor.execute(CREATE_CITY_TABLE)
            cursor.execute(INSERT_CITY_DATA)
            cursor.execute(CREATE_TRANSACTION_TABLE)
        self.conn.commit()

    def get_city_code(self):
        """Return every row of ``apartment.city`` as a list of tuples."""
        with self.conn.cursor() as cursor:
            cursor.execute("select * from apartment.city;")
            return cursor.fetchall()

    def get_query_date(self, start_year=2006):
        """Return 'YYYYMM' strings from January of *start_year* through the
        current month.

        The default of 2006 is the first year the transaction-price API
        covers; pass a later year to crawl a shorter range.
        """
        now = datetime.now()
        date_list = []
        for year in range(start_year, now.year + 1):
            # Stop at the current month in the final (current) year.
            last_month = now.month if year == now.year else 12
            for month in range(1, last_month + 1):
                date_list.append('%d%02d' % (year, month))
        return date_list

    def generate_serial_no(self):
        """Backfill serial_no for transaction rows that lack one."""
        with self.conn.cursor() as cursor:
            cursor.execute(GENERATE_SERIAL_NO)
        self.conn.commit()

    def _insert_transaction(self, record):
        """Insert one transaction row; column names come from *record* keys.

        Identifiers (table/column names) cannot be bound as psycopg2
        parameters, but they originate from this module's own literals, not
        external input. The values ARE bound as parameters.
        """
        with self.conn.cursor() as cur:
            placeholders = ', '.join(['%s'] * len(record))
            columns = ', '.join(record.keys())
            sql = "INSERT INTO %s ( %s ) VALUES ( %s )" % (
                'apartment.transaction', columns, placeholders)
            cur.execute(sql, list(record.values()))

    def runner(self, date_code, city_list):
        """Fetch one month of trades for every city and insert them.

        date_code: 'YYYYMM' period string.
        city_list: rows from get_city_code(); city[0] is the LAWD_CD code.
        """
        base_url = ('http://openapi.molit.go.kr/OpenAPI_ToolInstallPackage/'
                    'service/rest/RTMSOBJSvc/getRTMSDataSvcAptTradeDev')
        for city in city_list:
            city_code = str(city[0])
            print("START", date_code, city)
            query_params = '?' + urllib.parse.urlencode(
                {
                    'ServiceKey': config.SECRETKEY,
                    'pageNo': '1',
                    'numOfRows': '1000000',
                    'LAWD_CD': city_code,
                    'DEAL_YMD': date_code,
                }
            )
            # timeout so one stalled API call cannot hang the whole crawl
            response_body = requests.get(base_url + query_params, timeout=60)
            root = ElementTree.XML(response_body.text)
            xmldict = XmlDictConfig(root)
            # Fall back to {} (not ''): calling .get on a str would raise
            # AttributeError whenever a nesting level is missing.
            price_data = xmldict.get('body', {}).get('items', {}).get('item', [])
            # NOTE(review): a period with exactly one <item> is presumably
            # parsed into a single dict rather than a list — normalize so the
            # loop below always sees a list of dicts.
            if isinstance(price_data, dict):
                price_data = [price_data]
            for data in price_data:
                record = {
                    'price': int(data.get('거래금액', '').replace(' ', '').replace(',', '')),
                    'built_year': int(data.get('건축년도')),
                    'road_name': data.get('도로명'),
                    'road_building_main_code': data.get('도로명건물본번호코드'),
                    'road_building_sub_code': data.get('도로명건물부번호코드'),
                    'road_city_code': data.get('도로명시군구코드'),
                    'road_serial_code': data.get('도로명일련번호코드'),
                    'road_basement_code': data.get('도로명지상지하코드'),
                    'road_code': data.get('도로명코드'),
                    'region_name': data.get('법정동'),
                    'region_main_code': data.get('법정동본번코드'),
                    'region_sub_code': data.get('법정동부번코드'),
                    'region_city_code': data.get('법정동시군구코드'),
                    'region_city2_code': data.get('법정동읍면동코드'),
                    'region_zip_code': data.get('법정동지번코드'),
                    'apart_name': data.get('아파트'),
                    'year': data.get('년'),
                    'month': data.get('월'),
                    'day': data.get('일'),
                    'serial_no': data.get('일련번호'),
                    'size': float(data.get('전용면적')),
                    'zip_code': data.get('지번'),
                    'city_code': data.get('지역코드'),
                    'floor': int(data.get('층')),
                }
                self._insert_transaction(record)
            # Commit once per city instead of once per row: same end state,
            # far fewer round-trips.
            self.conn.commit()
if __name__ == '__main__':
    try:
        logger = getLogger('crawler logger')
        logger.warning('CRAWLER START')
        crawler = InitSetting()
        crawler.create_init_data()
        city_list = crawler.get_city_code()
        # One crawl pass per YYYYMM period since the API's first year,
        # refreshing the DB connection before each period.
        for date_code in crawler.get_query_date():
            crawler.refresh_connection()
            crawler.runner(date_code, city_list)
            logger.warning((date_code, 'done'))
        crawler.generate_serial_no()
    except AttributeError as e:
        logger.error('CRAWLER STOPPED -%s', e)
    else:
        logger.warning('CRAWLER FINISHED')