In [2]:
import requests
import json
import re
import sys
from bs4 import BeautifulSoup

soup = BeautifulSoup('html.parser')
soup

<html><body><p>html.parser</p></body></html>

In [3]:
# 경기도 31개 시/군 법정동 코드 (앞 5자리)
GYEONGGI_DISTRICTS = {
    "가평군": "41820",
    "고양시 덕양구": "41281",
    "고양시 일산동구": "41285",
    "고양시 일산서구": "41287",
    "과천시": "41290",
    "광명시": "41210",
    "광주시": "41610",
    "구리시": "41310",
    "군포시": "41410",
    "김포시": "41570",
    "남양주시": "41360",
    "동두천시": "41250",
    "부천시": "41190",
    "성남시 분당구": "41135",
    "성남시 수정구": "41131",
    "성남시 중원구": "41133",
    "수원시 권선구": "41115",
    "수원시 영통구": "41117",
    "수원시 장안구": "41111",
    "수원시 팔달구": "41113",
    "시흥시": "41390",
    "안산시 단원구": "41273",
    "안산시 상록구": "41271",
    "안성시": "41550",
    "안양시 동안구": "41173",
    "안양시 만안구": "41171",
    "양주시": "41630",
    "양평군": "41830",
    "여주시": "41670",
    "연천군": "41800",
    "오산시": "41370",
    "용인시 기흥구": "41463",
    "용인시 수지구": "41465",
    "용인시 처인구": "41461",
    "의왕시": "41430",
    "의정부시": "41150",
    "이천시": "41500",
    "파주시": "41480",
    "평택시": "41220",
    "포천시": "41650",
    "하남시": "41450",
    "화성시": "41590"
}

# # 인천광역시 10개 구/군 법정동 코드 (앞 5자리)
# INCHEON_DISTRICTS = {
#     "강화군": "28710",
#     "계양구": "28245",
#     "남동구": "28200",
#     "동구": "28140",
#     "미추홀구": "28177",  # (구)남구
#     "부평구": "28237",
#     "서구": "28260",
#     "연수구": "28185",
#     "옹진군": "28720",
#     "중구": "28110"
# }

In [39]:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

BASE_URL = "http://apis.data.go.kr/1613000/RTMSDataSvcAptTradeDev/getRTMSDataSvcAptTradeDev"
SERVICE_KEY = "YPxXvm/3jO5hggkbdJFWnhh8IC3VdMlkRvZHh1pkG8eZxXw12ymm4OAW10urfUVN++TzkurcrZ5Os3Gr9P8Kwg=="
URL = f"{BASE_URL}"

# 경기도 구별 데이터 저장할 딕셔너리
gg_real_estate_data = {}

# 크롤링할 기간 설정
start_year = 2025
start_month = 1
end_year = 2025
end_month = 2

# 서울시 각 구별 데이터 요청
# for 서울, 인천, 경기 
# for 연월

for year in range(start_year, end_year + 1):
    for month in range(1, 13):
        if year == start_year and month < start_month:
            continue
        if year == end_year and month > end_month:
            break

        deal_ymd = f"{year}{month:02}"  # 거래 연월 (YYYYMM 형식)
        print(f"⏳ {deal_ymd} 데이터 수집 시작")
    
    # 각 구별 데이터 요청       
    for gu, code in GYEONGGI_DISTRICTS.items():
        print(f"📌 {gu} ({code}) 데이터 요청 중...")

        params = {
            "LAWD_CD": code,
            "DEAL_YMD": deal_ymd,  # 거래 연월
            "serviceKey": SERVICE_KEY,
            "pageNo": 1,
            "numOfRows": 999
        }

        response = requests.get(URL, params=params)
        response.encoding = "utf-8"
        response.text
        
        # 응답 상태 코드 체크
        if response.status_code == 200:
            print(f"✅ {gu} 응답 상태: {response.status_code}")
            soup = BeautifulSoup(response.text, "xml")

            # 응답 XML 구조 확인
            print(f"📊 {gu} 응답 데이터 샘플:")
            print(soup.prettify()[:1000])  # 데이터 일부 출력해서 확인

            # 'item' 태그가 있는지 확인
            items = soup.find_all("item")
            if not items:
                print(f"⚠️ {gu}에 거래 데이터가 없습니다.")
            else:
                gg_real_estate_data[gu] = items
                print(f"✅ {gu} 데이터 수집 완료 ({len(items)}건)")

                # 데이터 추출 및 검증
                all_data = []  # 구별 데이터 리스트 초기화
                for item in items:
                    data = {
                        "구": gu,
                        "거래금액": item.find("거래금액").text.strip() if item.find("거래금액") else None,
                        "건축년도": item.find("건축년도").text.strip() if item.find("건축년도") else None,
                        "법정동": item.find("법정동").text.strip() if item.find("법정동") else None,
                        "아파트": item.find("아파트").text.strip() if item.find("아파트") else None,
                        "전용면적": item.find("전용면적").text.strip() if item.find("전용면적") else None,
                        "층": item.find("층").text.strip() if item.find("층") else None,
                        "거래일": f"{item.find('년').text}-{item.find('월').text}-{item.find('일').text}"
                        if item.find('년') and item.find('월') and item.find('일') else None
                    }

                    # 데이터 유효성 검사
                    if None not in data.values():  # 모든 값이 유효한 경우에만 리스트에 추가
                        all_data.append(data)

                # DataFrame 변환
                if all_data:
                    df = pd.DataFrame(all_data)
                    print(f"✅ {gu} DataFrame 생성 완료")
                    print(df.head())

        else:
            print(f"❌ {gu} 요청 실패 (Status Code: {response.status_code})")
        
        time.sleep(0.5)  # API 요청 간격 조절

print("📦 데이터 수집 완료")

⏳ 202501 데이터 수집 시작
⏳ 202502 데이터 수집 시작
📌 가평군 (41820) 데이터 요청 중...
✅ 가평군 응답 상태: 200
📊 가평군 응답 데이터 샘플:
<?xml version="1.0" encoding="utf-8"?>
<response>
 <header>
  <resultCode>
   000
  </resultCode>
  <resultMsg>
   OK
  </resultMsg>
 </header>
 <body>
  <items>
   <item>
    <aptDong>
    </aptDong>
    <aptNm>
     청평삼성쉐르빌
    </aptNm>
    <aptSeq>
     41820-334
    </aptSeq>
    <bonbun>
     0837
    </bonbun>
    <bubun>
     0000
    </bubun>
    <buildYear>
     2012
    </buildYear>
    <buyerGbn>
     개인
    </buyerGbn>
    <cdealDay>
    </cdealDay>
    <cdealType>
    </cdealType>
    <dealAmount>
     26,000
    </dealAmount>
    <dealDay>
     18
    </dealDay>
    <dealMonth>
     2
    </dealMonth>
    <dealYear>
     2025
    </dealYear>
    <dealingGbn>
     중개거래
    </dealingGbn>
    <estateAgentSggNm>
     경기 가평군
    </estateAgentSggNm>
    <excluUseAr>
     84.66
    </excluUseAr>
    <floor>
     10
    </floor>
    <jibun>
     837
    </jibun>
    <landCd>
     1
 

In [40]:
import pandas as pd
import xml.etree.ElementTree as ET

df_gg_real_estate_data = pd.DataFrame()

for gu, items in gg_real_estate_data.items():
    for item in items:
        # XML 데이터를 파싱하여 Element 객체로 변환
        xml_data = str(item)

        # XML 파싱
        root = ET.fromstring(xml_data)

        # 각 항목을 딕셔너리 형태로 추출하여 리스트에 저장
        data = {
            'aptDong': root.find('aptDong').text if root.find('aptDong') is not None else '',
            'aptNm': root.find('aptNm').text if root.find('aptNm') is not None else '',
            'aptSeq': root.find('aptSeq').text if root.find('aptSeq') is not None else '',
            'bonbun': root.find('bonbun').text if root.find('bonbun') is not None else '',
            'bubun': root.find('bubun').text if root.find('bubun') is not None else '',
            'buildYear': root.find('buildYear').text if root.find('buildYear') is not None else '',
            'buyerGbn': root.find('buyerGbn').text if root.find('buyerGbn') is not None else '',
            'cdealDay': root.find('cdealDay').text if root.find('cdealDay') is not None else '',
            'cdealType': root.find('cdealType').text if root.find('cdealType') is not None else '',
            'dealAmount': root.find('dealAmount').text if root.find('dealAmount') is not None else '',
            'dealDay': root.find('dealDay').text if root.find('dealDay') is not None else '',
            'dealMonth': root.find('dealMonth').text if root.find('dealMonth') is not None else '',
            'dealYear': root.find('dealYear').text if root.find('dealYear') is not None else '',
            'dealingGbn': root.find('dealingGbn').text if root.find('dealingGbn') is not None else '',
            'estateAgentSggNm': root.find('estateAgentSggNm').text if root.find('estateAgentSggNm') is not None else '',
            'excluUseAr': root.find('excluUseAr').text if root.find('excluUseAr') is not None else '',
            'floor': root.find('floor').text if root.find('floor') is not None else '',
            'jibun': root.find('jibun').text if root.find('jibun') is not None else '',
            'landCd': root.find('landCd').text if root.find('landCd') is not None else '',
            'landLeaseholdGbn': root.find('landLeaseholdGbn').text if root.find('landLeaseholdGbn') is not None else '',
            'rgstDate': root.find('rgstDate').text if root.find('rgstDate') is not None else '',
            'roadNm': root.find('roadNm').text if root.find('roadNm') is not None else '',
            'roadNmBonbun': root.find('roadNmBonbun').text if root.find('roadNmBonbun') is not None else '',
            'roadNmBubun': root.find('roadNmBubun').text if root.find('roadNmBubun') is not None else '',
            'roadNmCd': root.find('roadNmCd').text if root.find('roadNmCd') is not None else '',
            'roadNmSeq': root.find('roadNmSeq').text if root.find('roadNmSeq') is not None else '',
            'roadNmSggCd': root.find('roadNmSggCd').text if root.find('roadNmSggCd') is not None else '',
            'roadNmbCd': root.find('roadNmbCd').text if root.find('roadNmbCd') is not None else '',
            'sggCd': root.find('sggCd').text if root.find('sggCd') is not None else '',
            'slerGbn': root.find('slerGbn').text if root.find('slerGbn') is not None else '',
            'umdCd': root.find('umdCd').text if root.find('umdCd') is not None else '',
            'umdNm': root.find('umdNm').text if root.find('umdNm') is not None else ''
        }

        # 데이터프레임으로 변환
        df = pd.DataFrame([data])

        # 기존 데이터프레임에 합침
        df_gg_real_estate_data = pd.concat([df_gg_real_estate_data, df], ignore_index=True)

# 결과 출력
df_gg_real_estate_data


Unnamed: 0,aptDong,aptNm,aptSeq,bonbun,bubun,buildYear,buyerGbn,cdealDay,cdealType,dealAmount,...,roadNmBonbun,roadNmBubun,roadNmCd,roadNmSeq,roadNmSggCd,roadNmbCd,sggCd,slerGbn,umdCd,umdNm
0,,청평삼성쉐르빌,41820-334,0837,0000,2012,개인,,,26000,...,00054,00000,3216002,01,41820,0,41820,개인,32521,청평면 청평리
1,,가평센트럴파크더스카이,41820-508,0457,0005,2022,개인,,,34800,...,00011,00000,3216017,00,41820,0,41820,개인,25021,가평읍 읍내리
2,,이안지안스청평,41820-436,0657,0001,2018,개인,,,21000,...,00020,00000,3216005,00,41820,,41820,개인,32521,청평면 청평리
3,,에이원파란채,41820-275,0771,0001,2007,개인,,,16000,...,00223,00000,3000142,01,41820,0,41820,개인,25021,가평읍 읍내리
4,102,썬힐,41820-281,0402,0001,2008,개인,,,18000,...,00063,00000,3216052,01,41820,0,41820,개인,25022,가평읍 대곡리
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8698,,시범다은마을우남퍼스트빌,41590-966,0079,0000,2007,개인,,,69500,...,00171,00000,3210027,01,41590,0,41590,개인,12700,반송동
8699,,시범한빛마을삼부르네상스,41590-968,0083,0000,2007,개인,,,61700,...,00207,00000,3210169,01,41590,0,41590,개인,12700,반송동
8700,,한화 포레나 동탄호수,41590-2036,0921,0000,2020,개인,,,71800,...,00011,00000,4852363,00,41590,0,41590,개인,13600,장지동
8701,,신영통현대4차,41590-26,0870,0000,2002,개인,,,42700,...,00020,00000,4430968,01,41590,0,41590,개인,12000,반월동


In [41]:
real_estate_schema = {
    "sggCd": "법정동시군구코드",
    "umdCd": "법정동읍면동코드",
    "landCd": "법정동지번코드",
    "bonbun": "법정동본번코드",
    "bubun": "법정동부번코드",
    "roadNm": "도로명",
    "roadNmSggCd": "도로명시군구코드",
    "roadNmCd": "도로명코드",
    "roadNmSeq": "도로명일련번호코드",
    "roadNmbCd": "도로명지상지하코드",
    "roadNmBonbun": "도로명건물본번호코드",
    "roadNmBubun": "도로명건물부번호코드",
    "umdNm": "법정동",
    "aptNm": "아파트명",
    "jibun": "지번",
    "excluUseAr": "전용면적",
    "dealYear": "계약년도",
    "dealMonth": "계약월",
    "dealDay": "계약일",
    "dealAmount": "거래금액(만원)",
    "floor": "층",
    "buildYear": "건축년도",
    "aptSeq": "단지 일련번호",
    "cdealType": "해제여부",
    "cdealDay": "해제사유발생일",
    "dealingGbn": "거래유형(중개 및 직거래 여부)",
    "estateAgentSggNm": "중개사소재지(시군구 단위)",
    "rgstDate": "등기일자",
    "aptDong": "아파트 동명",
    "slerGbn": "거래주체정보_매도자(개인/법인/공공기관/기타)",
    "buyerGbn": "거래주체정보_매수자(개인/법인/공공기관/기타)",
    "landLeaseholdGbn": "토지임대부 아파트 여부"
}


In [42]:
df_gg_real_estate_data.rename(columns=real_estate_schema, inplace=True)
df_gg_real_estate_data.head()

Unnamed: 0,아파트 동명,아파트명,단지 일련번호,법정동본번코드,법정동부번코드,건축년도,거래주체정보_매수자(개인/법인/공공기관/기타),해제사유발생일,해제여부,거래금액(만원),...,도로명건물본번호코드,도로명건물부번호코드,도로명코드,도로명일련번호코드,도로명시군구코드,도로명지상지하코드,법정동시군구코드,거래주체정보_매도자(개인/법인/공공기관/기타),법정동읍면동코드,법정동
0,,청평삼성쉐르빌,41820-334,837,0,2012,개인,,,26000,...,54,0,3216002,1,41820,0.0,41820,개인,32521,청평면 청평리
1,,가평센트럴파크더스카이,41820-508,457,5,2022,개인,,,34800,...,11,0,3216017,0,41820,0.0,41820,개인,25021,가평읍 읍내리
2,,이안지안스청평,41820-436,657,1,2018,개인,,,21000,...,20,0,3216005,0,41820,,41820,개인,32521,청평면 청평리
3,,에이원파란채,41820-275,771,1,2007,개인,,,16000,...,223,0,3000142,1,41820,0.0,41820,개인,25021,가평읍 읍내리
4,102.0,썬힐,41820-281,402,1,2008,개인,,,18000,...,63,0,3216052,1,41820,0.0,41820,개인,25022,가평읍 대곡리


In [43]:
# 거래금액 칼럼 데이터타입 변환환
df_gg_real_estate_data['거래금액(만원)'] = df_gg_real_estate_data['거래금액(만원)'].str.replace(",", "").astype(int)

In [13]:
# 거래금액을 평균 값으로 그룹화
grouped_gyeonggi_2020 = df_gg_real_estate_data.groupby(["법정동"], as_index=False).agg({"거래금액(만원)": "mean"}).rename(columns={"거래금액(만원)": "거래금액_2020"}) 
grouped_gyeonggi_2020

Unnamed: 0,법정동,거래금액_2020
0,가남읍 신해리,9191.666667
1,가남읍 심석리,7900.000000
2,가능동,29177.272727
3,가사동,22255.555556
4,가수동,14325.000000
...,...,...
497,화도읍 창현리,27411.290323
498,화서동,47325.617284
499,화정동,37906.590909
500,회덕동,24025.000000


In [19]:
grouped_gyeonggi_2021 = df_gg_real_estate_data.groupby(["법정동"], as_index=False).agg({"거래금액(만원)": "mean"}).rename(columns={"거래금액(만원)": "거래금액_2021"}) 
grouped_gyeonggi_2021

Unnamed: 0,법정동,거래금액_2021
0,가남읍 신해리,8537.500000
1,가남읍 심석리,8733.333333
2,가능동,41960.000000
3,가사동,35000.000000
4,가수동,26987.500000
...,...,...
464,화도읍 창현리,30125.000000
465,화서동,56662.500000
466,화정동,54661.538462
467,회덕동,30750.000000


In [25]:
grouped_gyeonggi_2022 = df_gg_real_estate_data.groupby(["법정동"], as_index=False).agg({"거래금액(만원)": "mean"}).rename(columns={"거래금액(만원)": "거래금액_2022"}) 
grouped_gyeonggi_2022

Unnamed: 0,법정동,거래금액_2022
0,가남읍 신해리,11306.250000
1,가남읍 심석리,7556.500000
2,가남읍 태평리,9900.000000
3,가능동,22000.000000
4,가사동,30533.333333
...,...,...
427,화도읍 묵현리,29842.857143
428,화도읍 월산리,41000.000000
429,화도읍 창현리,21300.000000
430,화서동,41133.333333


In [32]:
grouped_gyeonggi_2023 = df_gg_real_estate_data.groupby(["법정동"], as_index=False).agg({"거래금액(만원)": "mean"}).rename(columns={"거래금액(만원)": "거래금액_2023"}) 
grouped_gyeonggi_2023

Unnamed: 0,법정동,거래금액_2023
0,가남읍 신해리,15425.000000
1,가남읍 심석리,8800.000000
2,가능동,29416.666667
3,가사동,37000.000000
4,가수동,25725.000000
...,...,...
480,화도읍 창현리,33087.500000
481,화서동,44100.000000
482,화정동,41470.833333
483,회덕동,26800.000000


In [38]:
grouped_gyeonggi_2024 = df_gg_real_estate_data.groupby(["법정동"], as_index=False).agg({"거래금액(만원)": "mean"}).rename(columns={"거래금액(만원)": "거래금액_2024"}) 
grouped_gyeonggi_2024

Unnamed: 0,법정동,거래금액_2024
0,가남읍 신해리,6400.000000
1,가능동,30773.333333
2,가사동,24950.000000
3,가수동,22566.666667
4,가재동,23500.000000
...,...,...
467,화도읍 묵현리,27400.000000
468,화도읍 창현리,24900.000000
469,화서동,49845.652174
470,화정동,40764.583333


In [44]:
grouped_gyeonggi_2025 = df_gg_real_estate_data.groupby(["법정동"], as_index=False).agg({"거래금액(만원)": "mean"}).rename(columns={"거래금액(만원)": "거래금액_2025"}) 
grouped_gyeonggi_2025

Unnamed: 0,법정동,거래금액_2025
0,가남읍 신해리,12164.285714
1,가남읍 심석리,7500.000000
2,가능동,33420.000000
3,가사동,26090.000000
4,가수동,27558.333333
...,...,...
492,화도읍 차산리,35071.428571
493,화도읍 창현리,28290.909091
494,화서동,49650.000000
495,화정동,44189.436620


In [45]:
gyeonggi = pd.merge(grouped_gyeonggi_2020, grouped_gyeonggi_2021, on='법정동', how='outer')
gyeonggi = pd.merge(gyeonggi, grouped_gyeonggi_2022, on='법정동', how='outer')  
gyeonggi = pd.merge(gyeonggi, grouped_gyeonggi_2023, on='법정동', how='outer')
gyeonggi = pd.merge(gyeonggi, grouped_gyeonggi_2024, on='법정동', how='outer')
gyeonggi = pd.merge(gyeonggi, grouped_gyeonggi_2025, on='법정동', how='outer')
gyeonggi

Unnamed: 0,법정동,거래금액_2020,거래금액_2021,거래금액_2022,거래금액_2023,거래금액_2024,거래금액_2025
0,가남읍 신해리,9191.666667,8537.500000,11306.250000,15425.000000,6400.000000,12164.285714
1,가남읍 심석리,7900.000000,8733.333333,7556.500000,8800.000000,,7500.000000
2,가남읍 태평리,,,9900.000000,,,
3,가능동,29177.272727,41960.000000,22000.000000,29416.666667,30773.333333,33420.000000
4,가사동,22255.555556,35000.000000,30533.333333,37000.000000,24950.000000,26090.000000
...,...,...,...,...,...,...,...
553,화도읍 창현리,27411.290323,30125.000000,21300.000000,33087.500000,24900.000000,28290.909091
554,화서동,47325.617284,56662.500000,41133.333333,44100.000000,49845.652174,49650.000000
555,화정동,37906.590909,54661.538462,44742.857143,41470.833333,40764.583333,44189.436620
556,회덕동,24025.000000,30750.000000,,26800.000000,,
