## 데이터 불러오기

In [1]:
import pandas as pd
#공원데이터 불러오기
park = pd.read_csv('./data/park.csv')
#어린이집 데이터 불러오기
daycare = pd.read_csv('./data/day_care_center.csv')
#아파트 데이터 불러오기
train = pd.read_csv('./data/train.csv')
#지하철 데이터 불러오기
subway = pd.read_csv('./data/Seoul_subway_stations.csv')

## API로 데이터 불러오기 및 데이터 전처리 - 공원

In [2]:
def combine_name_and_type(row):
    name = str(row['park_name']).replace(" ", "")
    ptype = str(row['park_type']).replace(" ", "")
    if ptype in name:
        return name  # 이미 포함되어 있으면 park_name 그대로
    else:
        return name + ptype  # 없으면 붙이기

park['park_name'] = park.apply(combine_name_and_type, axis=1)

In [3]:
# 공원 위치 찾기
import requests
import time

# 카카오 API 설정
KAKAO_API_KEY = "18e2e3c1a4ad76506029e86c48ce034d"
headers = {"Authorization": f"KakaoAK {KAKAO_API_KEY}"}

# 장소 검색 함수
def search_place(keyword):
    url = "https://dapi.kakao.com/v2/local/search/keyword.json"
    params = {"query": keyword}
    res = requests.get(url, headers=headers, params=params)
    if res.status_code == 200:
        for doc in res.json().get('documents', []):
            if "공원" in doc['place_name']:
                return float(doc['y']), float(doc['x'])
    return None, None

# 검색 키워드 생성 (park_name만 사용)
park['search_keyword'] = park['park_name']

# 좌표 리스트 초기화
lat_list = []
lng_list = []

# 좌표 검색 루프
for keyword in park['search_keyword']:
    lat, lng = search_place(keyword)  # ✅ 함수 이름 수정됨!
    lat_list.append(lat)
    lng_list.append(lng)
    time.sleep(0.1)  # 카카오 API 호출 제한 방지

# 결과 저장
park['lat'] = lat_list
park['lng'] = lng_list


In [4]:
import re

failed_parks = park[park['lat'].isna() | park['lng'].isna()]
failed_parks.to_csv("noserch.csv", index=False)

def simplify_name(name):
    # 공원 종류 제거
    name = re.sub(r"(어린이공원|근린공원|도시자연공원기타)", "공원", name)
    # 숫자 제거 (ex. 방아다리1 → 방아다리)
    name = re.sub(r'\d+', '', name)
    return name

failed_parks['search_keyword_fixed'] = failed_parks['search_keyword'].apply(simplify_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  failed_parks['search_keyword_fixed'] = failed_parks['search_keyword'].apply(simplify_name)


In [5]:
# API 키
API_KEY = "18e2e3c1a4ad76506029e86c48ce034d"
headers = {"Authorization": f"KakaoAK {API_KEY}"}

# 검색 함수
def get_coords(keyword):
    url = "https://dapi.kakao.com/v2/local/search/keyword.json"
    params = {"query": keyword}
    try:
        res = requests.get(url, headers=headers, params=params)
        result = res.json()
        if result['documents']:
            first = result['documents'][0]
            return pd.Series([first['y'], first['x'], '성공'])
        else:
            return pd.Series([None, None, '검색실패'])
    except:
        return pd.Series([None, None, '에러'])

# 적용
failed_parks[['lat', 'lng', 'search_result']] = failed_parks['search_keyword_fixed'].apply(get_coords)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  failed_parks[['lat', 'lng', 'search_result']] = failed_parks['search_keyword_fixed'].apply(get_coords)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  failed_parks[['lat', 'lng', 'search_result']] = failed_parks['search_keyword_fixed'].apply(get_coords)


In [6]:
failed_parks.isnull().sum()

city                             0
gu                               0
dong                             0
park_name                        0
park_type                        0
park_area                        0
park_exercise_facility         303
park_entertainment_facility    263
park_benefit_facility          302
park_cultural_facitiy          336
park_facility_other            301
park_open_year                 127
reference_date                   0
search_keyword                   0
lat                            187
lng                            187
search_keyword_fixed             0
search_result                    0
dtype: int64

In [7]:
recovered = failed_parks[failed_parks['lat'].notna() & failed_parks['lng'].notna()]
park.loc[recovered.index, 'lat'] = recovered['lat']
park.loc[recovered.index, 'lng'] = recovered['lng']

In [8]:
not_found = failed_parks[failed_parks['lat'].isna() | failed_parks['lng'].isna()]

In [9]:
park = park.drop(index=not_found.index)

In [10]:
park.isnull().sum()

city                              0
gu                                3
dong                              0
park_name                         0
park_type                         0
park_area                         0
park_exercise_facility          922
park_entertainment_facility     775
park_benefit_facility           942
park_cultural_facitiy          1119
park_facility_other            1025
park_open_year                  368
reference_date                    0
search_keyword                    0
lat                               0
lng                               0
dtype: int64

In [11]:
park = park.drop(columns='search_keyword', axis=True)
park['park_open_year'] = park['park_open_year'].astype('Int64')

In [12]:
#공원 부분에서 사용할 컬럼 지정
park = park[['city', 'gu', 'dong', 'park_name','park_area','lat','lng']]

In [52]:
park.isnull().sum()

city         0
gu           0
dong         0
park_name    0
park_area    0
lat          0
lng          0
dtype: int64

In [51]:
park = park[park['city'] == '서울특별시']

In [45]:
park[park['gu'].isnull()]

Unnamed: 0,city,gu,dong,park_name,park_area,lat,lng
565,부산광역시,,영주동,중앙공원근린공원,5020657.0,36.363272,127.427826
566,부산광역시,,초읍동,어린이대공원근린공원,4947290.0,37.558976,127.090932
567,부산광역시,,온천동,금강공원근린공원,3096579.0,35.219893,129.074864


In [46]:
park = park[~park['gu'].isnull()]

In [53]:
park.isnull().sum()

city         0
gu           0
dong         0
park_name    0
park_area    0
lat          0
lng          0
dtype: int64

In [54]:
park.to_csv('./data/df_park.csv', encoding='utf-8')

## API로 데이터 불러오기 및 데이터 전처리 - 어린이집

In [15]:
daycare['search_keyword'] = daycare['city'] + " " + daycare['gu'] + " " + daycare['day_care_name']

In [16]:
import requests
import pandas as pd
import time

API_KEY = "18e2e3c1a4ad76506029e86c48ce034d"
headers = {"Authorization": f"KakaoAK {API_KEY}"}

def get_coords(keyword):
    url = "https://dapi.kakao.com/v2/local/search/keyword.json"
    params = {"query": keyword}
    try:
        res = requests.get(url, headers=headers, params=params)
        result = res.json()
        if result['documents']:
            first = result['documents'][0]
            return pd.Series([first['y'], first['x']])
    except:
        pass
    return pd.Series([None, None])

# 적용
daycare[['lat', 'lng']] = daycare['search_keyword'].apply(get_coords)



In [17]:
# 이린이집 필요 컬럼만 선정하기
daycare  = daycare [['city', 'gu', 'day_care_name', 'is_commuting_vehicle','reference_date', 'lat', 'lng']]

In [18]:
daycare  = daycare [daycare ['city'] == '서울특별시']

In [19]:
daycare = daycare[~daycare['lat'].isnull()]

In [20]:
daycare['is_commuting_vehicle'] = daycare['is_commuting_vehicle'].fillna('N')

In [21]:
daycare.isnull().sum()

city                    0
gu                      0
day_care_name           0
is_commuting_vehicle    0
reference_date          0
lat                     0
lng                     0
dtype: int64

In [22]:
daycare.to_csv('./data/df_daycare.csv', index=False)

## API로 데이터 불러오기 및 데이터 전처리 - 아파트

In [23]:
from tqdm import tqdm
import requests
import time
import pandas as pd

# Kakao 주소 검색 API 설정
API_KEY = "18e2e3c1a4ad76506029e86c48ce034d"  # 카카오 REST API 키
headers = {"Authorization": f"KakaoAK {API_KEY}"}  # 요청 헤더

# 주소 기반 위경도 검색 함수
def get_coords_by_address(address):
    url = "https://dapi.kakao.com/v2/local/search/address.json"
    params = {"query": address}
    try:
        res = requests.get(url, headers=headers, params=params)
        result = res.json()
        if 'documents' in result and result['documents']:
            first = result['documents'][0]
            return float(first['y']), float(first['x'])  # 위도, 경도
    except Exception as e:
        print(f"[에러] {address} → {e}")
    return None, None

# 1. 서울특별시만 필터링
train = train[train['city'] == '서울특별시'].copy()

# 2. 지번 주소로 검색 키워드 생성 (예: "서울특별시 강남구 삼성동 123-45")
train['search_keyword'] = train['city'] + " " + train['dong'] + " " + train['jibun']

# 3. 중복 제거
unique_addrs = train[['search_keyword']].drop_duplicates().copy()

# 4. 지번 주소 기반 위경도 요청
lat_list, lng_list = [], []
for keyword in tqdm(unique_addrs['search_keyword']):
    lat, lng = get_coords_by_address(keyword)
    lat_list.append(lat)
    lng_list.append(lng)
    time.sleep(0.1)  # 과도한 호출 방지

# 5. 좌표 결과 병합
unique_addrs['lat'] = lat_list
unique_addrs['lng'] = lng_list
train = train.merge(unique_addrs, on='search_keyword', how='left')

# 6. 동일 단지(key) 기준으로 좌표 채우기
train['key'] = train['city'] + train['dong'] + train['apt']
lat_dict = train.dropna(subset=['lat'])[['key', 'lat']].drop_duplicates().set_index('key')['lat'].to_dict()
lng_dict = train.dropna(subset=['lng'])[['key', 'lng']].drop_duplicates().set_index('key')['lng'].to_dict()
train['lat'] = train.apply(lambda row: lat_dict.get(row['key'], row['lat']), axis=1)
train['lng'] = train.apply(lambda row: lng_dict.get(row['key'], row['lng']), axis=1)

# 7. 여전히 좌표 누락된 경우 → 아파트명 주소 기반 재시도
train_missing = train[train['lat'].isna()].copy()
train_missing['search_keyword'] = train_missing['city'] + " " + train_missing['dong'] + " " + train_missing['apt']

# 8. 아파트명 주소 기반 위경도 요청
lat_list, lng_list = [], []
for keyword in tqdm(train_missing['search_keyword']):
    lat, lng = get_coords_by_address(keyword)
    lat_list.append(lat)
    lng_list.append(lng)
    time.sleep(0.1)

train_missing['lat'] = lat_list
train_missing['lng'] = lng_list

# 9. 새로 찾은 좌표만 추출하여 병합
train_fixed = train_missing[train_missing['lat'].notna()].copy()
train_fixed['key'] = train_fixed['city'] + train_fixed['dong'] + train_fixed['apt']
lat_dict = train_fixed[['key', 'lat']].drop_duplicates().set_index('key')['lat'].to_dict()
lng_dict = train_fixed[['key', 'lng']].drop_duplicates().set_index('key')['lng'].to_dict()
train['lat'] = train.apply(lambda row: lat_dict.get(row['key'], row['lat']), axis=1)
train['lng'] = train.apply(lambda row: lng_dict.get(row['key'], row['lng']), axis=1)

# 10. 최종 결과 출력
print("최종 좌표 수집 완료")
print("좌표가 있는 행 수:", train['lat'].notna().sum())
print("여전히 누락된 좌표 수:", train['lat'].isna().sum())


100%|██████████| 8174/8174 [21:11<00:00,  6.43it/s]
100%|██████████| 17667/17667 [42:43<00:00,  6.89it/s]


최종 좌표 수집 완료
좌표가 있는 행 수: 724748
여전히 누락된 좌표 수: 17537


In [24]:
train.isnull().sum()

transaction_id                0
apartment_id                  0
city                          0
dong                          0
jibun                         0
apt                           0
addr_kr                       0
exclusive_use_area            0
year_of_completion            0
transaction_year_month        0
transaction_date              0
floor                         0
transaction_real_price        0
search_keyword                0
lat                       17537
lng                       17537
key                           0
dtype: int64

In [None]:
train = train[~train['lat'].isnull()]
# train.drop(columns=['Unnamed: 0'], inplace=True)

In [27]:
df_train = train.copy()

In [28]:
df_train.isnull().sum()

transaction_id            0
apartment_id              0
city                      0
dong                      0
jibun                     0
apt                       0
addr_kr                   0
exclusive_use_area        0
year_of_completion        0
transaction_year_month    0
transaction_date          0
floor                     0
transaction_real_price    0
search_keyword            0
lat                       0
lng                       0
key                       0
dtype: int64

## API로 데이터 불러오기 및 데이터 전처리 - 한강

In [29]:
# 한강공원 데이터 얻기 (한강 대처용)
API_KEY = "AIzaSyB-Cz1OT3LvYXdpglAiTNxpQIiW6fOGutU"
query = "한강공원"
location = "37.5665,126.9780"  # 서울 중심
radius = 15000  # 15km 반경

url = (
    f"https://maps.googleapis.com/maps/api/place/textsearch/json"
    f"?query={query}&location={location}&radius={radius}&key={API_KEY}"
)

res = requests.get(url)
data = res.json()

name = []
lat = []
lng = []

for place in data.get("results", []):
    name.append(place["name"])
    lat.append(place["geometry"]["location"]["lat"])
    lng.append(place["geometry"]["location"]["lng"])

# zip으로 묶어서 DataFrame 생성
river_park = pd.DataFrame(zip(name, lat, lng), columns=["name", "lat", "lng"])

# 확인
print(river_park.head())

                   name        lat         lng
0  Yeouido Hangang Park  37.526711  126.934711
1          Hangang Park  37.527001  127.019889
2    Banpo Hangang Park  37.510623  126.995963
3   Jamwon Hangang Park  37.520686  127.012272
4   Jamsil Hangang Park  37.517590  127.086724


In [30]:
# 한강공원 데이터 필요 컬럼만 사용
river_park = river_park[['name','lat', 'lng']]

In [31]:
river_park.isnull().sum()

name    0
lat     0
lng     0
dtype: int64

In [32]:
river_park.to_csv('./data/river_park.csv', encoding='utf-8')

## API로 데이터 불러오기 및 데이터 전처리 - 지하철

In [33]:
# 지하철 데이터 필요 컬럼만 사용
subway = subway[['line', 'name', 'lat', 'lng']]

In [34]:
subway.isnull().sum()

line    0
name    0
lat     0
lng     0
dtype: int64

## 거리계산으로 아파트와 주변 인프라 거리 구하기

In [35]:
import pandas as pd
from math import radians, sin, cos, sqrt, atan2
from tqdm import tqdm

# 거리 계산 함수
def haversine(lat1, lng1, lat2, lng2):
    R = 6371
    lat1, lng1, lat2, lng2 = map(radians, [lat1, lng1, lat2, lng2])
    dlat = lat2 - lat1
    dlng = lng2 - lng1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlng/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c  # km 단위

df_daycare = pd.read_csv('./data/df_daycare.csv')
df_park = pd.read_csv('./data/df_park.csv')
df_river = pd.read_csv('./data/river_park.csv')
df_subway = pd.read_csv('./data/Seoul_subway_stations.csv')

# 🔸 필요한 컬럼만 추출 (lat, lng 컬럼 이름은 실제 데이터 확인 후 맞춰줘야 해)
daycare_coords = df_daycare[['lat', 'lng']].dropna().values
park_coords = df_park[['lat', 'lng']].dropna().values
river_coords = df_river[['lat', 'lng']].dropna().values
subway_coords = df_subway[['lat', 'lng']].dropna().values

# 🔸 거리 계산 함수: 특정 시설까지 가장 가까운 거리 계산
def get_min_distance(lat, lng, facility_coords):
    return min([haversine(lat, lng, f_lat, f_lng) for f_lat, f_lng in facility_coords])

# 🔸 거리 계산 시작
dist_daycare = []
dist_park = []
dist_river = []
dist_subway = []

for _, row in tqdm(df_train.iterrows(), total=len(df_train)):
    if pd.notna(row['lat']) and pd.notna(row['lng']):
        # 기존에 추가했던 부분은 그대로 두고
        dist_daycare.append(get_min_distance(row['lat'], row['lng'], daycare_coords))
        dist_park.append(get_min_distance(row['lat'], row['lng'], park_coords))
        dist_river.append(get_min_distance(row['lat'], row['lng'], river_coords))
        dist_subway.append(get_min_distance(row['lat'], row['lng'], subway_coords))  # 👈 추가
    else:
        dist_daycare.append(None)
        dist_park.append(None)
        dist_river.append(None)
        dist_subway.append(None)

# 🔸 결과 컬럼 추가
df_train['dist_daycare'] = dist_daycare
df_train['dist_park'] = dist_park
df_train['dist_river_park'] = dist_river
df_train['dist_subway'] = dist_subway

# 🔸 확인
print(df_train[['apt', 'dist_daycare', 'dist_park', 'dist_river_park', 'dist_subway']].head())


100%|██████████| 724748/724748 [2:14:46<00:00, 89.62it/s]   


        apt  dist_daycare  dist_park  dist_river_park  dist_subway
0  신현(101동)      0.313544   0.913290         5.692753     1.002637
1    사직파크맨션      0.205796   0.758868         5.169633     0.374661
2    두레엘리시안      0.090520   0.784680         5.344415     0.432593
3     파크팰리스      0.028071   0.735842         5.077732     0.317168
4      킹스매너      0.123193   0.638931         5.005506     0.386442


In [36]:
df_train.isnull().sum()

transaction_id            0
apartment_id              0
city                      0
dong                      0
jibun                     0
apt                       0
addr_kr                   0
exclusive_use_area        0
year_of_completion        0
transaction_year_month    0
transaction_date          0
floor                     0
transaction_real_price    0
search_keyword            0
lat                       0
lng                       0
key                       0
dist_daycare              0
dist_park                 0
dist_river_park           0
dist_subway               0
dtype: int64

In [37]:
df_train = df_train[~df_train['lat'].isnull()]

In [40]:
from math import radians, cos, sin, asin, sqrt
tqdm.pandas() 
# 1. 데이터 불러오기

df_subway

# 2. 거리 계산 함수
def haversine(lon1, lat1, lon2, lat2):
    R = 6371  # km
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1)*cos(lat2)*sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    return R * c * 1000  # meters

# 3. 거리 범위 내 지하철역 개수 및 이름
def get_subways_in_range(lat, lng, df_subway, min_d=100, max_d=1000):
    names = []
    for _, row in df_subway.iterrows():
        dist = haversine(lng, lat, row['lng'], row['lat'])
        if min_d < dist <= max_d:
            names.append(row['name'])
    return len(names), ', '.join(names)

# 4. tqdm + apply로 진행률 보이게 적용
df_train[['num_subways_100_1000m', 'nearby_subways_100_1000m']] = df_train.progress_apply(
    lambda row: pd.Series(get_subways_in_range(row['lat'], row['lng'], df_subway)),
    axis=1
)

100%|██████████| 724748/724748 [3:12:01<00:00, 62.90it/s]   


In [42]:
df_train

Unnamed: 0,transaction_id,apartment_id,city,dong,jibun,apt,addr_kr,exclusive_use_area,year_of_completion,transaction_year_month,...,search_keyword,lat,lng,key,dist_daycare,dist_park,dist_river_park,dist_subway,num_subways_100_1000m,nearby_subways_100_1000m
0,0,7622,서울특별시,신교동,6-13,신현(101동),신교동 6-13 신현(101동),84.82,2002,200801,...,서울특별시 신교동 6-13,37.583583,126.967867,서울특별시신교동신현(101동),0.313544,0.913290,5.692753,1.002637,0,
1,1,5399,서울특별시,필운동,142,사직파크맨션,필운동 142 사직파크맨션,99.17,1973,200801,...,서울특별시 필운동 142,37.576334,126.969340,서울특별시필운동사직파크맨션,0.205796,0.758868,5.169633,0.374661,2,"경복궁, 광화문"
2,2,3578,서울특별시,필운동,174-1,두레엘리시안,필운동 174-1 두레엘리시안,84.74,2007,200801,...,서울특별시 필운동 174-1,37.578219,126.969724,서울특별시필운동두레엘리시안,0.090520,0.784680,5.344415,0.432593,1,경복궁
3,3,10957,서울특별시,내수동,95,파크팰리스,내수동 95 파크팰리스,146.39,2003,200801,...,서울특별시 내수동 95,37.574170,126.970544,서울특별시내수동파크팰리스,0.028071,0.735842,5.077732,0.317168,3,"경복궁, 서대문, 광화문"
4,4,10639,서울특별시,내수동,110-15,킹스매너,내수동 110-15 킹스매너,194.43,2004,200801,...,서울특별시 내수동 110-15,37.573280,126.970461,서울특별시내수동킹스매너,0.123193,0.638931,5.005506,0.386442,3,"경복궁, 서대문, 광화문"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742280,1196844,996,서울특별시,강일동,717,고덕리엔파크2단지,강일동 717 고덕리엔파크2단지,59.72,2011,201711,...,서울특별시 강일동 717,37.559597,127.173087,서울특별시강일동고덕리엔파크2단지,0.041076,0.112341,3.656264,0.340784,2,"상일동, 강일"
742281,1196846,682,서울특별시,강일동,674,강일리버파크3단지,강일동 674 강일리버파크3단지,59.87,2009,201711,...,서울특별시 강일동 674,37.566787,127.174602,서울특별시강일동강일리버파크3단지,0.074864,0.181848,3.443433,1.042478,0,
742282,1196847,682,서울특별시,강일동,674,강일리버파크3단지,강일동 674 강일리버파크3단지,84.74,2009,201711,...,서울특별시 강일동 674,37.566787,127.174602,서울특별시강일동강일리버파크3단지,0.074864,0.181848,3.443433,1.042478,0,
742283,1196848,680,서울특별시,강일동,665,강일리버파크1단지,강일동 665 강일리버파크1단지,84.74,2009,201711,...,서울특별시 강일동 665,37.567640,127.172116,서울특별시강일동강일리버파크1단지,0.105664,0.391109,3.259564,1.178679,0,


In [61]:
# 1. 관련 컬럼만 추출
corr_cols = ['dist_daycare', 'dist_park', 'dist_river_park','dist_subway','num_subways_100_1000m','transaction_real_price']
corr_df =  df_train[corr_cols].copy()

# 2. 상관계수 확인
correlations = corr_df.corr()['transaction_real_price'].drop('transaction_real_price').abs()
correlations.round(3)


dist_daycare             0.193
dist_park                0.170
dist_river_park          0.390
dist_subway              0.128
num_subways_100_1000m    0.175
Name: transaction_real_price, dtype: float64

In [62]:
# 1. 절댓값 기준 정규화
corr_values = {
    'dist_daycare': 0.193,
    'dist_park': 0.170,
    'dist_river_park': 0.390,
    'dist_subway':0.128,
    'num_subways_100_1000m':0.175
}

total = sum(corr_values.values())
weights = {k: round(v/total, 3) for k, v in corr_values.items()}
print(weights)


{'dist_daycare': 0.183, 'dist_park': 0.161, 'dist_river_park': 0.369, 'dist_subway': 0.121, 'num_subways_100_1000m': 0.166}


In [63]:
from sklearn.preprocessing import StandardScaler

# 1. 서비스 컬럼 지정
service_cols = ['dist_daycare', 'dist_park', 'dist_river_park', 'dist_subway', 'num_subways_100_1000m']

# 2. NaN 처리: 평균으로 채우기
df_train[service_cols] = df_train[service_cols].fillna(df_train[service_cols].mean())

# 3. 가중치 지정
weights = {
    'dist_daycare': 0.183,
    'dist_park': 0.161,
    'dist_river_park': 0.369,
    'dist_subway': 0.121,
    'num_subways_100_1000m': 0.166
}

# 4. 표준화
scaler = StandardScaler()
scaled = scaler.fit_transform(df_train[service_cols])

# 5. 인덱스를 맞춰서 DataFrame 생성
scaled_df = pd.DataFrame(scaled, columns=[col + '_w' for col in service_cols], index=df_train.index)

# 6. 가중치 적용하여 'infra' 점수 생성
df_train['infra'] = (
    weights['dist_daycare'] * scaled_df['dist_daycare_w'] +
    weights['dist_park'] * scaled_df['dist_park_w'] +
    weights['dist_river_park'] * scaled_df['dist_river_park_w'] +
    weights['dist_subway'] * scaled_df['dist_subway_w'] +
    weights['num_subways_100_1000m'] * scaled_df['num_subways_100_1000m_w']
)


In [64]:
df_train

Unnamed: 0,transaction_id,apartment_id,city,dong,jibun,apt,addr_kr,exclusive_use_area,year_of_completion,transaction_year_month,...,lng,key,dist_daycare,dist_park,dist_river_park,dist_subway,num_subways_100_1000m,nearby_subways_100_1000m,infra_weighted_score,infra
0,0,7622,서울특별시,신교동,6-13,신현(101동),신교동 6-13 신현(101동),84.82,2002,200801,...,126.967867,서울특별시신교동신현(101동),0.313544,0.913290,5.692753,1.002637,0,,1.932716,0.227058
1,1,5399,서울특별시,필운동,142,사직파크맨션,필운동 142 사직파크맨션,99.17,1973,200801,...,126.969340,서울특별시필운동사직파크맨션,0.205796,0.758868,5.169633,0.374661,2,"경복궁, 광화문",2.287412,-0.033112
2,2,3578,서울특별시,필운동,174-1,두레엘리시안,필운동 174-1 두레엘리시안,84.74,2007,200801,...,126.969724,서울특별시필운동두레엘리시안,0.090520,0.784680,5.344415,0.432593,1,경복궁,2.332973,-0.204013
3,3,10957,서울특별시,내수동,95,파크팰리스,내수동 95 파크팰리스,146.39,2003,200801,...,126.970544,서울특별시내수동파크팰리스,0.028071,0.735842,5.077732,0.317168,3,"경복궁, 서대문, 광화문",2.472525,-0.179179
4,4,10639,서울특별시,내수동,110-15,킹스매너,내수동 110-15 킹스매너,194.43,2004,200801,...,126.970461,서울특별시내수동킹스매너,0.123193,0.638931,5.005506,0.386442,3,"경복궁, 서대문, 광화문",2.388258,-0.084931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742280,1196844,996,서울특별시,강일동,717,고덕리엔파크2단지,강일동 717 고덕리엔파크2단지,59.72,2011,201711,...,127.173087,서울특별시강일동고덕리엔파크2단지,0.041076,0.112341,3.656264,0.340784,2,"상일동, 강일",2.820146,-0.578513
742281,1196846,682,서울특별시,강일동,674,강일리버파크3단지,강일동 674 강일리버파크3단지,59.87,2009,201711,...,127.174602,서울특별시강일동강일리버파크3단지,0.074864,0.181848,3.443433,1.042478,0,,2.491135,-0.479667
742282,1196847,682,서울특별시,강일동,674,강일리버파크3단지,강일동 674 강일리버파크3단지,84.74,2009,201711,...,127.174602,서울특별시강일동강일리버파크3단지,0.074864,0.181848,3.443433,1.042478,0,,2.491135,-0.479667
742283,1196848,680,서울특별시,강일동,665,강일리버파크1단지,강일동 665 강일리버파크1단지,84.74,2009,201711,...,127.172116,서울특별시강일동강일리버파크1단지,0.105664,0.391109,3.259564,1.178679,0,,2.317044,-0.358165


In [None]:

address_df = pd.read_csv("./data/address.csv")

dong_to_gu = dict(zip(address_df['dong'], address_df['gu']))

# 3. df에 gu 컬럼 생성 (dong 기준 매핑)
df_train['gu'] = df_train['dong'].map(dong_to_gu)


In [68]:
# 1. 컬럼 리스트 만들기
cols = df_train.columns.tolist()  # 전체 컬럼 순서
cols.insert(cols.index('city') + 1, cols.pop(cols.index('gu')))  # 'gu'를 'dong' 다음 위치로 이동

# 2. 순서를 바꾼 컬럼 리스트로 재정렬
df_train = df_train[cols]

In [None]:
df_train.drop(columns=['Unnamed: 0','search_keyword','key'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.drop(columns=['search_keyword','key'], inplace=True)


In [71]:
df_train.reset_index(drop=True, inplace=True )

In [72]:
df_train

Unnamed: 0,transaction_id,apartment_id,city,gu,dong,jibun,apt,addr_kr,exclusive_use_area,year_of_completion,...,lat,lng,dist_daycare,dist_park,dist_river_park,dist_subway,num_subways_100_1000m,nearby_subways_100_1000m,infra_weighted_score,infra
0,0,7622,서울특별시,종로구,신교동,6-13,신현(101동),신교동 6-13 신현(101동),84.82,2002,...,37.583583,126.967867,0.313544,0.913290,5.692753,1.002637,0,,1.932716,0.227058
1,1,5399,서울특별시,종로구,필운동,142,사직파크맨션,필운동 142 사직파크맨션,99.17,1973,...,37.576334,126.969340,0.205796,0.758868,5.169633,0.374661,2,"경복궁, 광화문",2.287412,-0.033112
2,2,3578,서울특별시,종로구,필운동,174-1,두레엘리시안,필운동 174-1 두레엘리시안,84.74,2007,...,37.578219,126.969724,0.090520,0.784680,5.344415,0.432593,1,경복궁,2.332973,-0.204013
3,3,10957,서울특별시,종로구,내수동,95,파크팰리스,내수동 95 파크팰리스,146.39,2003,...,37.574170,126.970544,0.028071,0.735842,5.077732,0.317168,3,"경복궁, 서대문, 광화문",2.472525,-0.179179
4,4,10639,서울특별시,종로구,내수동,110-15,킹스매너,내수동 110-15 킹스매너,194.43,2004,...,37.573280,126.970461,0.123193,0.638931,5.005506,0.386442,3,"경복궁, 서대문, 광화문",2.388258,-0.084931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724743,1196844,996,서울특별시,강동구,강일동,717,고덕리엔파크2단지,강일동 717 고덕리엔파크2단지,59.72,2011,...,37.559597,127.173087,0.041076,0.112341,3.656264,0.340784,2,"상일동, 강일",2.820146,-0.578513
724744,1196846,682,서울특별시,강동구,강일동,674,강일리버파크3단지,강일동 674 강일리버파크3단지,59.87,2009,...,37.566787,127.174602,0.074864,0.181848,3.443433,1.042478,0,,2.491135,-0.479667
724745,1196847,682,서울특별시,강동구,강일동,674,강일리버파크3단지,강일동 674 강일리버파크3단지,84.74,2009,...,37.566787,127.174602,0.074864,0.181848,3.443433,1.042478,0,,2.491135,-0.479667
724746,1196848,680,서울특별시,강동구,강일동,665,강일리버파크1단지,강일동 665 강일리버파크1단지,84.74,2009,...,37.567640,127.172116,0.105664,0.391109,3.259564,1.178679,0,,2.317044,-0.358165


In [75]:
df_train.fillna(0, inplace=True) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.fillna(0, inplace=True)


In [76]:
df_train.to_csv('./data/total_train.csv', encoding='utf-8', index=False)