# 0. Install Library

In [3]:
!pip install eli5==0.13.0

[0m

In [4]:
# 한글 폰트 사용을 위한 라이브러리입니다.
!apt-get install -y fonts-nanum

Reading package lists... Done
Building dependency tree       
Reading state information... Done
fonts-nanum is already the newest version (20180306-3).
0 upgraded, 0 newly installed, 0 to remove and 14 not upgraded.


In [103]:
pip install geopy

[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
pip install googlemaps

# 1. Library Import

In [54]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # ttf 파일이 저장되어 있는 경로
    name='NanumBarunGothic')                        # 이 폰트의 원하는 이름 설정
fm.fontManager.ttflist.insert(0, fe)              # Matplotlib에 폰트 추가
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # 폰트 설정
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

# 2. Data Loading

In [55]:
# 필요한 데이터를 load 하겠습니다. 경로는 환경에 맞게 지정해주면 됩니다.
train_path = '../data/train.csv'
test_path  = '../data/test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [4]:
# Train data와 Test data shape은 아래와 같습니다.
print('Train data shape : ', train.shape, 'Test data shape : ', test.shape)

Train data shape :  (1118822, 52) Test data shape :  (9272, 51)


In [5]:
# Train과 Test data를 살펴보겠습니다.
display(train.head(1))
display(test.head(1))      # 부동산 실거래가(=Target) column이 제외된 모습입니다.

Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,건축면적,주차대수,기타/의무/임대/임의=1/2/3/4,단지승인일,사용허가여부,관리비 업로드,좌표X,좌표Y,단지신청일,target
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712,8,3,1987,...,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0,124000


Unnamed: 0,시군구,번지,본번,부번,아파트명,전용면적(㎡),계약년월,계약일,층,건축년도,...,청소비관리형태,건축면적,주차대수,기타/의무/임대/임의=1/2/3/4,단지승인일,사용허가여부,관리비 업로드,좌표X,좌표Y,단지신청일
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,202307,26,5,1987,...,직영,4858.0,262.0,임의,2022-11-17 13:00:29.0,Y,N,127.05721,37.476763,2022-11-17 10:19:06.0


# 3. Check Data

In [108]:
train.columns

Index(['시군구', '번지', '본번', '부번', '아파트명', '전용면적(㎡)', '계약년월', '계약일', '층', '건축년도',
       '도로명', '해제사유발생일', '등기신청일자', '거래유형', '중개사소재지', 'k-단지분류(아파트,주상복합등등)',
       'k-전화번호', 'k-팩스번호', '단지소개기존clob', 'k-세대타입(분양형태)', 'k-관리방식', 'k-복도유형',
       'k-난방방식', 'k-전체동수', 'k-전체세대수', 'k-건설사(시공사)', 'k-시행사', 'k-사용검사일-사용승인일',
       'k-연면적', 'k-주거전용면적', 'k-관리비부과면적', 'k-전용면적별세대현황(60㎡이하)',
       'k-전용면적별세대현황(60㎡~85㎡이하)', 'k-85㎡~135㎡이하', 'k-135㎡초과', 'k-홈페이지',
       'k-등록일자', 'k-수정일자', '고용보험관리번호', '경비비관리형태', '세대전기계약방법', '청소비관리형태',
       '건축면적', '주차대수', '기타/의무/임대/임의=1/2/3/4', '단지승인일', '사용허가여부', '관리비 업로드',
       '좌표X', '좌표Y', '단지신청일', 'target'],
      dtype='object')

### 3-1. Variable Korean to English 

In [1]:
# Rename the raw Korean column headers of the training frame to English identifiers.
# 'target' is not part of the mapping, so it keeps its name.
train.rename(
    columns={
        # transaction / location
        '시군구': 'city',
        '번지': 'address',
        '본번': 'first_num',
        '부번': 'second_num',
        '아파트명': 'name',
        '전용면적(㎡)': 'area',
        '계약년월': 'contract_year',
        '계약일': 'contract_day',
        '층': 'floor',
        '건축년도': 'construction_year',
        '도로명': 'road_name',
        '해제사유발생일': 'cancellation_day',
        '등기신청일자': 'registration_day',
        '거래유형': 'transaction_type',
        '중개사소재지': 'estate_agent',
        # "k-" complex-level attributes
        'k-단지분류(아파트,주상복합등등)': 'classification_complex',
        'k-전화번호': 'phone_number',
        'k-팩스번호': 'fax_number',
        '단지소개기존clob': 'clob',
        'k-세대타입(분양형태)': 'sales_type',
        'k-관리방식': 'management_system',
        'k-복도유형': 'hallway_type',
        'k-난방방식': 'heating_system',
        'k-전체동수': 'total_unit',
        'k-전체세대수': 'total_households',
        'k-건설사(시공사)': 'construction_company',
        'k-시행사': 'developer',
        'k-사용검사일-사용승인일': 'inspection_approval_date',
        'k-연면적': 'total_area',
        'k-주거전용면적': 'residential_area',
        'k-관리비부과면적': 'management_fee_area',
        'k-전용면적별세대현황(60㎡이하)': 'household_status_by_area_60',
        'k-전용면적별세대현황(60㎡~85㎡이하)': 'household_status_by_area_85',
        'k-85㎡~135㎡이하': 'household_status_by_area_135',
        'k-135㎡초과': 'household_status_by_area_big',
        'k-홈페이지': 'homepage',
        'k-등록일자': 'registration_date',
        'k-수정일자': 'modification_date',
        # management / facilities
        '고용보험관리번호': 'employment_insurance_management_number',
        '경비비관리형태': 'guard_management_type',
        '세대전기계약방법': 'electricity_contract',
        '청소비관리형태': 'cleaning_fee_management_type',
        '건축면적': 'building_area',
        '주차대수': 'parking_space',
        '기타/의무/임대/임의=1/2/3/4': 'other1_obligation2_rent3_optional4',
        '단지승인일': 'unit_approval_date',
        '사용허가여부': 'permission_yes_no',
        '관리비 업로드': 'management_cost',
        # coordinates
        '좌표X': 'axisX',
        '좌표Y': 'axisY',
        '단지신청일': 'unit_application_date',
    },
    inplace=True,
)

NameError: name 'train' is not defined

In [57]:
# Rename the Korean column headers of the test frame to the same English identifiers
# that the training frame uses (the mapping below is identical to the one applied to `train`).
# The frame is modified in place.
test.rename(columns={'시군구':'city','번지':'address','본번':'first_num','부번':'second_num','아파트명':'name',
                      '전용면적(㎡)':'area','계약년월':'contract_year','계약일':'contract_day','층':'floor',
                      '건축년도':'construction_year','도로명':'road_name','해제사유발생일':'cancellation_day',
                      '등기신청일자':'registration_day','거래유형':'transaction_type','중개사소재지':'estate_agent',
                      'k-단지분류(아파트,주상복합등등)':'classification_complex','k-전화번호':'phone_number',
                      'k-팩스번호':'fax_number','단지소개기존clob':'clob','k-세대타입(분양형태)':'sales_type',
                      'k-관리방식':'management_system','k-복도유형':'hallway_type','k-난방방식':'heating_system',
                      'k-전체동수':'total_unit','k-전체세대수':'total_households',
                      'k-건설사(시공사)':'construction_company','k-시행사':'developer',
                      'k-사용검사일-사용승인일':'inspection_approval_date', 'k-연면적': 'total_area',
                      'k-주거전용면적':'residential_area','k-관리비부과면적':'management_fee_area',
                      'k-전용면적별세대현황(60㎡이하)':'household_status_by_area_60',
                      'k-전용면적별세대현황(60㎡~85㎡이하)':'household_status_by_area_85',
                      'k-85㎡~135㎡이하':'household_status_by_area_135', 'k-135㎡초과':'household_status_by_area_big',
                      'k-홈페이지':'homepage','k-등록일자':'registration_date', 'k-수정일자':'modification_date',
                      '고용보험관리번호':'employment_insurance_management_number','경비비관리형태':'guard_management_type',
                      '세대전기계약방법':'electricity_contract','청소비관리형태':'cleaning_fee_management_type',
                      '건축면적':'building_area','주차대수':'parking_space','기타/의무/임대/임의=1/2/3/4':'other1_obligation2_rent3_optional4',
                      '단지승인일':'unit_approval_date','사용허가여부':'permission_yes_no','관리비 업로드':'management_cost',
                      '좌표X':'axisX','좌표Y':'axisY','단지신청일':'unit_application_date'
                      },inplace=True)

# 4. Train EDA

In [111]:
# Per-column dtype and non-null count of the renamed training frame.
# `show_counts` is the replacement for the `null_counts` keyword, which pandas
# deprecated in 1.2 and removed in 2.0.
train.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1118822 entries, 0 to 1118821
Data columns (total 52 columns):
 #   Column                                  Non-Null Count    Dtype  
---  ------                                  --------------    -----  
 0   city                                    1118822 non-null  object 
 1   address                                 1118597 non-null  object 
 2   first_num                               1118747 non-null  float64
 3   second_num                              1118747 non-null  float64
 4   name                                    1116696 non-null  object 
 5   area                                    1118822 non-null  float64
 6   contract_year                           1118822 non-null  int64  
 7   contract_day                            1118822 non-null  int64  
 8   floor                                   1118822 non-null  int64  
 9   construction_year                       1118822 non-null  int64  
 10  road_name                     

In [114]:
train1 = train.copy()

In [83]:
# Per-column dtype and non-null count of the working copy.
# `show_counts` is the replacement for the `null_counts` keyword, which pandas
# deprecated in 1.2 and removed in 2.0.
train1.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1118821 entries, 0 to 1118821
Data columns (total 54 columns):
 #   Column                                  Non-Null Count    Dtype  
---  ------                                  --------------    -----  
 0   city                                    1118821 non-null  object 
 1   address                                 1118821 non-null  object 
 2   first_num                               1118821 non-null  float64
 3   second_num                              1118821 non-null  float64
 4   name                                    1116695 non-null  object 
 5   area                                    1118821 non-null  float64
 6   contract_year                           1118821 non-null  int64  
 7   contract_day                            1118821 non-null  int64  
 8   floor                                   1118821 non-null  int64  
 9   construction_year                       1118821 non-null  int64  
 10  road_name                     

In [133]:
train1.describe()

Unnamed: 0,first_num,second_num,area,contract_year,contract_day,floor,construction_year,cancellation_day,clob,total_unit,...,management_fee_area,household_status_by_area_60,household_status_by_area_85,household_status_by_area_135,household_status_by_area_big,building_area,parking_space,axisX,axisY,target
count,1118747.0,1118747.0,1118822.0,1118822.0,1118822.0,1118822.0,1118822.0,5983.0,68582.0,248192.0,...,249259.0,249214.0,249214.0,249214.0,327.0,249108.0,249108.0,249152.0,249152.0,1118822.0
mean,564.9108,5.978885,77.17475,201476.0,15.80656,8.871968,1998.755,20210570.0,541.529979,14.798346,...,120726.487549,477.912838,476.713439,167.52847,70.0,189507.0,1063.678778,126.995228,37.545785,57991.53
std,516.0642,46.68584,29.36423,418.7868,8.721166,5.982584,9.333908,10606.97,751.809853,17.693533,...,129020.27648,759.9094,727.553569,248.928143,0.0,1729027.0,1235.437604,0.091045,0.052483,46426.02
min,0.0,0.0,10.02,200701.0,1.0,-4.0,1961.0,20200220.0,1.0,1.0,...,0.0,0.0,0.0,0.0,70.0,0.0,0.0,126.798318,37.447843,350.0
25%,176.0,0.0,59.65,201110.0,8.0,4.0,1992.0,20200820.0,4.0,5.0,...,40735.0,48.0,95.0,0.0,70.0,0.0,315.0,126.913157,37.499201,30500.0
50%,470.0,0.0,81.88,201507.0,16.0,8.0,2000.0,20210300.0,174.0,10.0,...,78125.0,225.0,256.0,63.0,70.0,1710.55,683.0,127.014971,37.544936,44800.0
75%,781.0,1.0,84.96,201804.0,23.0,12.0,2005.0,20220210.0,725.0,17.0,...,159544.0,576.0,582.0,237.0,70.0,8414.21,1274.0,127.05959,37.577117,69800.0
max,4974.0,2837.0,424.32,202306.0,31.0,69.0,2023.0,20230930.0,2888.0,124.0,...,969877.0,4975.0,5132.0,1500.0,70.0,31596200.0,12096.0,127.179998,37.687725,1450000.0


### 4-1-1. VARIABLE

In [134]:
max_target_row = train1[train1['target'] == train1['target'].max()]
max_target_row

Unnamed: 0,city,address,first_num,second_num,name,area,contract_year,contract_day,floor,construction_year,...,building_area,parking_space,other1_obligation2_rent3_optional4,unit_approval_date,permission_yes_no,management_cost,axisX,axisY,unit_application_date,target
224567,서울특별시 강남구 청담동,129,129.0,0.0,PH129,273.96,202204,28,16,2020,...,,,,,,,,,,1450000


#### 4-1-1-1. address

In [10]:
train1[train1['address'].isna()]

Unnamed: 0,city,address,first_num,second_num,name,area,contract_year,contract_day,floor,construction_year,...,building_area,parking_space,other1_obligation2_rent3_optional4,unit_approval_date,permission_yes_no,management_cost,axisX,axisY,unit_application_date,target
56930,서울특별시 서초구 내곡동,,0.0,0.0,서초포레스타2단지,84.87,201710,16,6,2015,...,8252.0,1185.0,의무,2019-04-24 15:11:04.0,Y,N,127.062596,37.454703,2015-07-17 11:07:27.0,88900
56931,서울특별시 서초구 내곡동,,0.0,0.0,서초포레스타2단지,59.21,201710,19,11,2015,...,8252.0,1185.0,의무,2019-04-24 15:11:04.0,Y,N,127.062596,37.454703,2015-07-17 11:07:27.0,73000
56932,서울특별시 서초구 내곡동,,0.0,0.0,서초포레스타2단지,59.21,201710,21,6,2015,...,8252.0,1185.0,의무,2019-04-24 15:11:04.0,Y,N,127.062596,37.454703,2015-07-17 11:07:27.0,76300
56933,서울특별시 서초구 내곡동,,0.0,0.0,서초포레스타2단지,84.87,201710,21,19,2015,...,8252.0,1185.0,의무,2019-04-24 15:11:04.0,Y,N,127.062596,37.454703,2015-07-17 11:07:27.0,90000
56934,서울특별시 서초구 내곡동,,0.0,0.0,서초포레스타2단지,84.48,201710,24,19,2015,...,8252.0,1185.0,의무,2019-04-24 15:11:04.0,Y,N,127.062596,37.454703,2015-07-17 11:07:27.0,88000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
720188,서울특별시 서초구 신원동,,,,힐스테이트 서초 젠트리스,84.99,201504,17,9,2014,...,,,,,,,,,,79000
720189,서울특별시 서초구 신원동,,,,힐스테이트 서초 젠트리스,101.90,201505,1,6,2014,...,,,,,,,,,,95000
720190,서울특별시 서초구 신원동,,,,힐스테이트 서초 젠트리스,84.95,201506,16,6,2014,...,,,,,,,,,,87200
720191,서울특별시 서초구 신원동,,,,힐스테이트 서초 젠트리스,101.90,201506,26,8,2014,...,,,,,,,,,,94500


In [144]:
train1[train1['address'].isna()]['name'].unique()

array(['서초포레스타2단지', '힐스테이트 서초 젠트리스'], dtype=object)

'서초포레스타2단지', '힐스테이트 서초 젠트리스' 의 address가 비어있음.

In [115]:
# Fill the missing lot number of '서초포레스타2단지'.
# The value is written as a string: `address` is a text column (values such as '658-1') and is
# later used as a merge key against addresses read back from CSV, where an int 384 would never
# match the string '384'.
# NOTE(review): the lot number itself is not derivable from this notebook — confirm the source.
train1.loc[
    train1['address'].isna() & (train1['name'] == '서초포레스타2단지'),
    'address',
] = '384'

In [116]:
# Fill the missing lot number of '힐스테이트 서초 젠트리스'.
# The value is written as a string: `address` is a text column (values such as '658-1') and is
# later used as a merge key against addresses read back from CSV, where an int 557 would never
# match the string '557'.
# NOTE(review): the lot number itself is not derivable from this notebook — confirm the source.
train1.loc[
    train1['address'].isna() & (train1['name'] == '힐스테이트 서초 젠트리스'),
    'address',
] = '557'

In [117]:
# Rows of '힐스테이트 서초 젠트리스' that have no first_num (main lot number) get 557,
# the same lot number used for this complex's missing `address` values.
train1.loc[(train1['first_num'].isna()) & (train1['name'] == '힐스테이트 서초 젠트리스'), 'first_num'] = 557

#### 4-1-1-2. axisX / axisY

##### 4-1-1-2-1. Fill axisX / axisY Null 

X 좌표와 Y 좌표 위치가 Model 학습에 매우 중요할 것으로 예측되어  
결측치를 채우는 코드를 제작 및 실행  

In [96]:
train1[train1['axisX'].isna()]

Unnamed: 0,address,first_num,second_num,name,area,contract_year,contract_day,floor,construction_year,road_name,...,other1_obligation2_rent3_optional4,unit_approval_date,permission_yes_no,management_cost,axisX,axisY,unit_application_date,target,gu,dong
975,189,189.0,0.0,개포주공4단지,50.39,201801.0,25.0,1.0,1982.0,삼성로 14,...,,,,,,,,134000.0,강남구,개포동
976,189,189.0,0.0,개포주공4단지,50.39,201801.0,25.0,4.0,1982.0,삼성로 14,...,,,,,,,,158000.0,강남구,개포동
977,189,189.0,0.0,개포주공4단지,50.39,201801.0,25.0,1.0,1982.0,삼성로 14,...,,,,,,,,155000.0,강남구,개포동
978,189,189.0,0.0,개포주공4단지,42.55,201801.0,26.0,4.0,1982.0,삼성로 14,...,,,,,,,,132000.0,강남구,개포동
979,189,189.0,0.0,개포주공4단지,50.39,201802.0,10.0,1.0,1982.0,삼성로 14,...,,,,,,,,154000.0,강남구,개포동
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118203,274-77,274.0,77.0,한영,70.96,200709.0,6.0,4.0,2003.0,동일로144길 74,...,,,,,,,,17000.0,중랑구,중화동
1118204,11,11.0,0.0,현대휴앤미,100.17,200704.0,23.0,2.0,2007.0,봉화산로27가길 23,...,,,,,,,,21000.0,중랑구,중화동
1118205,11,11.0,0.0,현대휴앤미,95.94,200704.0,26.0,3.0,2007.0,봉화산로27가길 23,...,,,,,,,,27000.0,중랑구,중화동
1118206,11,11.0,0.0,현대휴앤미,100.20,200705.0,19.0,6.0,2007.0,봉화산로27가길 23,...,,,,,,,,24200.0,중랑구,중화동


In [97]:
train1[train1['axisY'].isna()]

Unnamed: 0,address,first_num,second_num,name,area,contract_year,contract_day,floor,construction_year,road_name,...,other1_obligation2_rent3_optional4,unit_approval_date,permission_yes_no,management_cost,axisX,axisY,unit_application_date,target,gu,dong
975,189,189.0,0.0,개포주공4단지,50.39,201801.0,25.0,1.0,1982.0,삼성로 14,...,,,,,,,,134000.0,강남구,개포동
976,189,189.0,0.0,개포주공4단지,50.39,201801.0,25.0,4.0,1982.0,삼성로 14,...,,,,,,,,158000.0,강남구,개포동
977,189,189.0,0.0,개포주공4단지,50.39,201801.0,25.0,1.0,1982.0,삼성로 14,...,,,,,,,,155000.0,강남구,개포동
978,189,189.0,0.0,개포주공4단지,42.55,201801.0,26.0,4.0,1982.0,삼성로 14,...,,,,,,,,132000.0,강남구,개포동
979,189,189.0,0.0,개포주공4단지,50.39,201802.0,10.0,1.0,1982.0,삼성로 14,...,,,,,,,,154000.0,강남구,개포동
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118203,274-77,274.0,77.0,한영,70.96,200709.0,6.0,4.0,2003.0,동일로144길 74,...,,,,,,,,17000.0,중랑구,중화동
1118204,11,11.0,0.0,현대휴앤미,100.17,200704.0,23.0,2.0,2007.0,봉화산로27가길 23,...,,,,,,,,21000.0,중랑구,중화동
1118205,11,11.0,0.0,현대휴앤미,95.94,200704.0,26.0,3.0,2007.0,봉화산로27가길 23,...,,,,,,,,27000.0,중랑구,중화동
1118206,11,11.0,0.0,현대휴앤미,100.20,200705.0,19.0,6.0,2007.0,봉화산로27가길 23,...,,,,,,,,24200.0,중랑구,중화동


In [17]:
# `gu` and `dong` are not columns of the loaded data and no cell above creates them.
# Derive them from `city` (formatted as '서울특별시 <gu> <dong>') when they are missing, so the
# cell also works after a kernel restart; existing `gu`/`dong` columns are left untouched.
if not {'gu', 'dong'}.issubset(train1.columns):
    city_parts = train1['city'].str.split(' ', n=2, expand=True)
    train1['gu'] = city_parts[1]
    train1['dong'] = city_parts[2]

# Road-name based full address string for every transaction.
train1['full_address'] = '서울특별시 ' + train1['gu'] + ' ' + train1['dong'] + ' ' + train1['road_name']
train1

Unnamed: 0,address,first_num,second_num,name,area,contract_year,contract_day,floor,construction_year,road_name,...,unit_approval_date,permission_yes_no,management_cost,axisX,axisY,unit_application_date,target,gu,dong,full_address
0,658-1,658.0,1.0,개포6차우성,79.97,201712.0,8.0,3.0,1987.0,언주로 3,...,2022-11-17 13:00:29.0,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,124000.0,강남구,개포동,서울특별시 강남구 개포동 언주로 3
1,658-1,658.0,1.0,개포6차우성,79.97,201712.0,22.0,4.0,1987.0,언주로 3,...,2022-11-17 13:00:29.0,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,123500.0,강남구,개포동,서울특별시 강남구 개포동 언주로 3
2,658-1,658.0,1.0,개포6차우성,54.98,201712.0,28.0,5.0,1987.0,언주로 3,...,2022-11-17 13:00:29.0,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,91500.0,강남구,개포동,서울특별시 강남구 개포동 언주로 3
3,658-1,658.0,1.0,개포6차우성,79.97,201801.0,3.0,4.0,1987.0,언주로 3,...,2022-11-17 13:00:29.0,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,130000.0,강남구,개포동,서울특별시 강남구 개포동 언주로 3
4,658-1,658.0,1.0,개포6차우성,79.97,201801.0,8.0,2.0,1987.0,언주로 3,...,2022-11-17 13:00:29.0,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,117000.0,강남구,개포동,서울특별시 강남구 개포동 언주로 3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118817,382,382.0,0.0,갈현현대,59.94,200707.0,12.0,11.0,1998.0,서오릉로21길 36,...,2013-06-04 16:18:51.0,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,20000.0,은평구,구산동,서울특별시 은평구 구산동 서오릉로21길 36
1118818,382,382.0,0.0,갈현현대,59.94,200708.0,25.0,10.0,1998.0,서오릉로21길 36,...,2013-06-04 16:18:51.0,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,20000.0,은평구,구산동,서울특별시 은평구 구산동 서오릉로21길 36
1118819,382,382.0,0.0,갈현현대,84.83,200708.0,31.0,20.0,1998.0,서오릉로21길 36,...,2013-06-04 16:18:51.0,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,28000.0,은평구,구산동,서울특별시 은평구 구산동 서오릉로21길 36
1118820,382,382.0,0.0,갈현현대,84.83,200709.0,15.0,8.0,1998.0,서오릉로21길 36,...,2013-06-04 16:18:51.0,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,29000.0,은평구,구산동,서울특별시 은평구 구산동 서오릉로21길 36


In [69]:
address = pd.DataFrame()

In [70]:
# Coordinates plus the road-name address of every transaction.
# `.copy()` makes `address` an independent frame, so adding a column below is an unambiguous
# write to `address` rather than to a slice of `train1` (the pattern pandas reports with
# SettingWithCopyWarning, which this notebook silences globally).
address = train1[['axisX', 'axisY']].copy()
address['full_address'] = '서울특별시 ' + train1['gu'] + ' ' + train1['dong'] + ' ' + train1['road_name']
address

Unnamed: 0,axisX,axisY,full_address
0,127.057210,37.476763,서울특별시 강남구 개포동 언주로 3
1,127.057210,37.476763,서울특별시 강남구 개포동 언주로 3
2,127.057210,37.476763,서울특별시 강남구 개포동 언주로 3
3,127.057210,37.476763,서울특별시 강남구 개포동 언주로 3
4,127.057210,37.476763,서울특별시 강남구 개포동 언주로 3
...,...,...,...
1118817,126.905638,37.612962,서울특별시 은평구 구산동 서오릉로21길 36
1118818,126.905638,37.612962,서울특별시 은평구 구산동 서오릉로21길 36
1118819,126.905638,37.612962,서울특별시 은평구 구산동 서오릉로21길 36
1118820,126.905638,37.612962,서울특별시 은평구 구산동 서오릉로21길 36


In [19]:
address = address[['full_address', 'axisX', 'axisY']].drop_duplicates().reset_index()

In [20]:
address

Unnamed: 0,index,full_address,axisX,axisY
0,0,서울특별시 강남구 개포동 언주로 3,127.057210,37.476763
1,12,서울특별시 강남구 개포동 개포로 307,127.055990,37.483894
2,25,서울특별시 강남구 개포동 개포로109길 69,127.076624,37.496296
3,38,서울특별시 강남구 개포동 개포로 310,127.058521,37.480002
4,44,서울특별시 강남구 개포동 선릉로 7,127.058521,37.480002
...,...,...,...,...
9336,1105412,서울특별시 서초구 서초동,,
9337,1105776,서울특별시 서초구 잠원동 신반포로 45,,
9338,1109728,서울특별시 송파구 송파동 송파대로48길,,
9339,1113986,서울특별시 용산구 한강로2가,,


In [21]:
address.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9341 entries, 0 to 9340
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         9341 non-null   int64  
 1   full_address  9341 non-null   object 
 2   axisX         842 non-null    float64
 3   axisY         842 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 292.0+ KB


#####  4-1-1-2-2. Using Geopy

Geopy Library를 사용해서 위도와 경도 결측치를 채움  

In [23]:
from geopy.geocoders import Nominatim
geo_local = Nominatim(user_agent='South Korea')

In [24]:
def geocoding(address):
    """Look up the coordinates of an address with the notebook-level `geo_local` geocoder.

    Parameters
    ----------
    address : str
        Address string passed to ``geo_local.geocode``.

    Returns
    -------
    list or None
        ``[longitude, latitude]`` when the geocoder returns a match; ``None`` when it
        returns no match or the lookup raises an error.
    """
    try:
        geo = geo_local.geocode(address)
        if geo is None:
            # The geocoder found nothing for this address.
            return None
        return [geo.longitude, geo.latitude]
    except Exception:
        # Best-effort lookup: any failure is reported as "no coordinates".
        # Catching `Exception` instead of using a bare `except` lets KeyboardInterrupt
        # propagate, so a long geocoding loop can still be interrupted.
        return None

In [25]:
# Geocode only the rows whose axisX is still missing.
# The labels of those rows are collected once up front instead of re-evaluating
# `address['axisX'].isna()` over the whole column on every iteration, and progress is
# shown with a tqdm bar rather than one printed line per row.
rows_missing_coords = address.index[address['axisX'].isna()]
for row_label in tqdm(rows_missing_coords, desc='geocoding'):
    coords = geocoding(address.loc[row_label, 'full_address'])
    if coords is not None:
        # A failed lookup leaves the row's coordinates as NaN.
        address.loc[row_label, ['axisX', 'axisY']] = coords

22번째 변환중입니다.
23번째 변환중입니다.
24번째 변환중입니다.
25번째 변환중입니다.
26번째 변환중입니다.
27번째 변환중입니다.
28번째 변환중입니다.
29번째 변환중입니다.
30번째 변환중입니다.
31번째 변환중입니다.
32번째 변환중입니다.
33번째 변환중입니다.
34번째 변환중입니다.
35번째 변환중입니다.
36번째 변환중입니다.
37번째 변환중입니다.
38번째 변환중입니다.
39번째 변환중입니다.
40번째 변환중입니다.
41번째 변환중입니다.
42번째 변환중입니다.
43번째 변환중입니다.
44번째 변환중입니다.
45번째 변환중입니다.
46번째 변환중입니다.
47번째 변환중입니다.
48번째 변환중입니다.
49번째 변환중입니다.
50번째 변환중입니다.
51번째 변환중입니다.
52번째 변환중입니다.
53번째 변환중입니다.
54번째 변환중입니다.
55번째 변환중입니다.
56번째 변환중입니다.
57번째 변환중입니다.
58번째 변환중입니다.
59번째 변환중입니다.
60번째 변환중입니다.
61번째 변환중입니다.
62번째 변환중입니다.
63번째 변환중입니다.
64번째 변환중입니다.
65번째 변환중입니다.
66번째 변환중입니다.
67번째 변환중입니다.
68번째 변환중입니다.
69번째 변환중입니다.
70번째 변환중입니다.
71번째 변환중입니다.
72번째 변환중입니다.
73번째 변환중입니다.
74번째 변환중입니다.
75번째 변환중입니다.
76번째 변환중입니다.
77번째 변환중입니다.
78번째 변환중입니다.
79번째 변환중입니다.
80번째 변환중입니다.
81번째 변환중입니다.
82번째 변환중입니다.
83번째 변환중입니다.
84번째 변환중입니다.
85번째 변환중입니다.
86번째 변환중입니다.
87번째 변환중입니다.
88번째 변환중입니다.
89번째 변환중입니다.
90번째 변환중입니다.
91번째 변환중입니다.
92번째 변환중입니다.
93번째 변환중입니다.
94번째 변환중입니다.
95번째 변환중입니다.
96번째 변환중입니다.
97번째 변환중입니다.
98번째 변환중입니다.

In [29]:
address[address['axisX'].isna()]

Unnamed: 0,index,full_address,axisX,axisY
26,1063,서울특별시 강남구 개포동 논현로2길 61-4,,
27,1064,서울특별시 강남구 개포동 개포로28길 28,,
28,1065,서울특별시 강남구 개포동 개포로109길 21,,
29,1118,서울특별시 강남구 개포동 논현로2길 36,,
30,1120,서울특별시 강남구 개포동 개포로109길 9,,
...,...,...,...,...
9329,1078130,서울특별시 강남구 도곡동 도곡로18길,,
9331,1086865,서울특별시 구로구 구로동 구로동로12길 33,,
9332,1086904,서울특별시 구로구 구로동 구로동로18길 30-10,,
9333,1086974,서울특별시 구로구 구로동 도림로12길 15-7,,


#####  4-1-1-2-3. Save file without missing axisX / axisY

In [None]:
# Persist the address/coordinate table.
# index=False keeps the row index out of the file (it would come back as an extra unnamed
# column on read), matching the other to_csv calls in this notebook.
address.to_csv('../data/train_address.csv', index=False)

#####  4-1-1-2-4. Prepare Merge Train and Full axisX / axisY file

In [122]:
train_addrpath = '../data/train_address.csv'
train_addr = pd.read_csv(train_addrpath)

In [96]:
# Keep one row per distinct (city, address, axisX, axisY) combination.
# drop=True discards the old row labels instead of adding them as an extra 'index' column,
# which would otherwise be carried into the saved CSV and into the merge with the training data.
# NOTE(review): 'city' and 'address' are created by the split cell further down, so on a
# top-to-bottom run this selection only works if the loaded CSV already contains them —
# confirm the intended cell order.
train_addr = train_addr[['city', 'address', 'axisX', 'axisY']].drop_duplicates().reset_index(drop=True)

In [102]:
train_addr

Unnamed: 0,axisX,axisY,address,city
0,127.057210,37.476763,658-1,서울특별시 강남구 개포동
1,127.055990,37.483894,652,서울특별시 강남구 개포동
2,127.076624,37.496296,12-2,서울특별시 강남구 개포동
3,127.058521,37.480002,141,서울특별시 강남구 개포동
4,127.068028,37.487802,187,서울특별시 강남구 개포동
...,...,...,...,...
8938,126.884548,37.494585,794-32,서울특별시 구로구 구로동
8939,126.884548,37.494585,807-39,서울특별시 구로구 구로동
8940,127.001400,37.503120,16-1,서울특별시 서초구 반포동
8941,127.017510,37.488180,1686-4,서울특별시 서초구 서초동


In [99]:
# Split 'full_address' into 'city' (first three space-separated tokens) and 'address'
# (fourth token), then drop the helper columns.
# The guard keeps the cell re-runnable: the save cell below overwrites the CSV loaded above
# with the already-split table, which no longer has a 'full_address' column.
if 'full_address' in train_addr.columns:
    parts = train_addr['full_address'].str.split(' ', expand=True)
    if parts.shape[1] != 4:
        # NOTE(review): addresses built from road_name can contain extra spaces
        # (e.g. '언주로 3'); this step expects '<si> <gu> <dong> <lot number>' — confirm how
        # the CSV is produced.
        raise ValueError(
            f"expected 'full_address' to split into 4 space-separated parts, got {parts.shape[1]}"
        )
    train_addr['address'] = parts[3]
    train_addr['city'] = parts[0] + ' ' + parts[1] + ' ' + parts[2]
    train_addr = train_addr.drop(columns=['full_address'])

train_addr

Unnamed: 0,axisX,axisY,address,city
0,127.057210,37.476763,658-1,서울특별시 강남구 개포동
1,127.055990,37.483894,652,서울특별시 강남구 개포동
2,127.076624,37.496296,12-2,서울특별시 강남구 개포동
3,127.058521,37.480002,141,서울특별시 강남구 개포동
4,127.068028,37.487802,187,서울특별시 강남구 개포동
...,...,...,...,...
8938,126.884548,37.494585,794-32,서울특별시 구로구 구로동
8939,126.884548,37.494585,807-39,서울특별시 구로구 구로동
8940,127.001400,37.503120,16-1,서울특별시 서초구 반포동
8941,127.017510,37.488180,1686-4,서울특별시 서초구 서초동


In [100]:
train_addr.to_csv('../data/train_address.csv', index=False)

#####  4-1-1-2-5. Merge Train and Full axisX / axisY file

In [123]:
train2 = train1.copy()

In [110]:
train2 = train2.drop(columns=['axisX','axisY'])

In [124]:
# Attach coordinates to every training transaction by (city, address).
# The lookup is first reduced to one row per key: a key listed twice would make the left
# join emit the matching transactions twice. validate='many_to_one' makes pandas raise if
# the lookup is ever not unique on the keys, and the assertion pins the row count.
merged_df = pd.merge(
    train2,
    train_addr.drop_duplicates(subset=['city', 'address']),
    on=['city', 'address'],
    how='left',
    validate='many_to_one',
)
assert len(merged_df) == len(train2), "merge changed the number of training rows"

In [125]:
merged_df

Unnamed: 0,city,address,first_num,second_num,name,area,contract_year,contract_day,floor,construction_year,...,permission_yes_no,management_cost,axisX_x,axisY_x,unit_application_date,target,gu,dong,axisX_y,axisY_y
0,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712.0,8.0,3.0,1987.0,...,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,124000.0,강남구,개포동,127.057210,37.476763
1,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201712.0,22.0,4.0,1987.0,...,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,123500.0,강남구,개포동,127.057210,37.476763
2,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,54.98,201712.0,28.0,5.0,1987.0,...,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,91500.0,강남구,개포동,127.057210,37.476763
3,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801.0,3.0,4.0,1987.0,...,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,130000.0,강남구,개포동,127.057210,37.476763
4,서울특별시 강남구 개포동,658-1,658.0,1.0,개포6차우성,79.97,201801.0,8.0,2.0,1987.0,...,Y,N,127.057210,37.476763,2022-11-17 10:19:06.0,117000.0,강남구,개포동,127.057210,37.476763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118816,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,59.94,200707.0,12.0,11.0,1998.0,...,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,20000.0,은평구,구산동,126.905638,37.612962
1118817,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,59.94,200708.0,25.0,10.0,1998.0,...,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,20000.0,은평구,구산동,126.905638,37.612962
1118818,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,84.83,200708.0,31.0,20.0,1998.0,...,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,28000.0,은평구,구산동,126.905638,37.612962
1118819,서울특별시 은평구 구산동,382,382.0,0.0,갈현현대,84.83,200709.0,15.0,8.0,1998.0,...,Y,N,126.905638,37.612962,2013-03-07 09:46:27.0,29000.0,은평구,구산동,126.905638,37.612962


In [126]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1118821 entries, 0 to 1118820
Data columns (total 56 columns):
 #   Column                                  Non-Null Count    Dtype  
---  ------                                  --------------    -----  
 0   city                                    1118821 non-null  object 
 1   address                                 1118821 non-null  object 
 2   first_num                               1118821 non-null  float64
 3   second_num                              1118821 non-null  float64
 4   name                                    1118821 non-null  object 
 5   area                                    1118821 non-null  float64
 6   contract_year                           1118821 non-null  float64
 7   contract_day                            1118821 non-null  float64
 8   floor                                   1118821 non-null  float64
 9   construction_year                       1118821 non-null  float64
 10  road_name                     

# 5. Train_latlon Save

In [None]:
merged_df.to_csv('../data/train_latlon.csv', index=False)

# 6. Test EDA

In [None]:
# 9272개
test.info(verbose=True)

In [33]:
test1 = test.copy()

### 6-1. VARIABLE  

##### 6-1-1. axisX / axisY

In [37]:
# `gu` and `dong` are not columns of the loaded test data and no cell above creates them.
# Derive them from `city` (formatted as '서울특별시 <gu> <dong>') when they are missing;
# existing `gu`/`dong` columns are left untouched.
if not {'gu', 'dong'}.issubset(test1.columns):
    city_parts = test1['city'].str.split(' ', n=2, expand=True)
    test1['gu'] = city_parts[1]
    test1['dong'] = city_parts[2]

# Coordinates plus the road-name address of every test transaction.
# `.copy()` makes `address` an independent frame instead of a slice of `test1`; the former
# empty-DataFrame placeholder was immediately overwritten and has been dropped.
address = test1[['axisX', 'axisY']].copy()
address['full_address'] = '서울특별시 ' + test1['gu'] + ' ' + test1['dong'] + ' ' + test1['road_name']
address

Unnamed: 0,axisX,axisY,full_address
0,127.057210,37.476763,서울특별시 강남구 개포동 언주로 3
1,127.056394,37.484892,서울특별시 강남구 개포동 개포로 311
2,127.055990,37.483894,서울특별시 강남구 개포동 개포로 307
3,127.055990,37.483894,서울특별시 강남구 개포동 개포로 307
4,127.055990,37.483894,서울특별시 강남구 개포동 개포로 307
...,...,...,...
9267,127.106720,37.618870,서울특별시 중랑구 신내동 신내역로1길 85
9268,127.106720,37.618870,서울특별시 중랑구 신내동 신내역로1길 85
9269,127.106720,37.618870,서울특별시 중랑구 신내동 신내역로1길 85
9270,127.106720,37.618870,서울특별시 중랑구 신내동 신내역로1길 85


In [38]:
address = address[['full_address', 'axisX', 'axisY']].drop_duplicates().reset_index()

In [39]:
# Write the de-duplicated test address table to the current working directory.
# NOTE(review): the next section reads '../data/test_address.csv', not this file, and the
# training flow saves under '../data/' — confirm which path is intended before changing it,
# since writing there would overwrite whatever file the next section currently loads.
address.to_csv('test_address.csv', index=False)

##### 6-1-2. Prepare Merge Test and Full axisX / axisY file

In [11]:
test_addrpath  = '../data/test_address.csv'
test_addr = pd.read_csv(test_addrpath)

In [18]:
test_addr

Unnamed: 0,axisX,axisY,full_address
0,127.057210,37.476763,서울특별시 강남구 개포동 658-1
1,127.070060,37.484210,서울특별시 강남구 개포동 651-1
2,127.056394,37.484892,서울특별시 강남구 개포동 651-1
3,127.055990,37.483894,서울특별시 강남구 개포동 652
4,127.068028,37.487802,서울특별시 강남구 개포동 187
...,...,...,...
2659,127.130827,37.479157,서울특별시 송파구 장지동 844
2660,127.128742,37.478220,서울특별시 송파구 장지동 843
2661,127.015931,37.580983,서울특별시 종로구 숭인동 02-01
2662,127.000071,37.560706,서울특별시 중구 묵정동 11-67


In [49]:
# Split 'full_address' into 'city' (first three space-separated tokens) and 'address'
# (fourth token), then drop the helper columns.
# The guard keeps the cell re-runnable: the save cell below overwrites the CSV loaded above
# with the already-split table, which no longer has a 'full_address' column.
if 'full_address' in test_addr.columns:
    parts = test_addr['full_address'].str.split(' ', expand=True)
    if parts.shape[1] != 4:
        # NOTE(review): addresses built from road_name can contain extra spaces; this step
        # expects '<si> <gu> <dong> <lot number>' — confirm how the CSV is produced.
        raise ValueError(
            f"expected 'full_address' to split into 4 space-separated parts, got {parts.shape[1]}"
        )
    test_addr['address'] = parts[3]
    test_addr['city'] = parts[0] + ' ' + parts[1] + ' ' + parts[2]
    test_addr = test_addr.drop(columns=['full_address'])

test_addr

Unnamed: 0,axisX,axisY,full_address,address,city
0,127.057210,37.476763,서울특별시 강남구 개포동 658-1,658-1,서울특별시 강남구 개포동
1,127.070060,37.484210,서울특별시 강남구 개포동 651-1,651-1,서울특별시 강남구 개포동
2,127.056394,37.484892,서울특별시 강남구 개포동 651-1,651-1,서울특별시 강남구 개포동
3,127.055990,37.483894,서울특별시 강남구 개포동 652,652,서울특별시 강남구 개포동
4,127.068028,37.487802,서울특별시 강남구 개포동 187,187,서울특별시 강남구 개포동
...,...,...,...,...,...
2659,127.130827,37.479157,서울특별시 송파구 장지동 844,844,서울특별시 송파구 장지동
2660,127.128742,37.478220,서울특별시 송파구 장지동 843,843,서울특별시 송파구 장지동
2661,127.015931,37.580983,서울특별시 종로구 숭인동 02-01,02-01,서울특별시 종로구 숭인동
2662,127.000071,37.560706,서울특별시 중구 묵정동 11-67,11-67,서울특별시 중구 묵정동


In [52]:
test_addr.to_csv('../data/test_address.csv', index=False)

In [None]:
test2 = test1.copy()
test2 = test2.drop(columns=['axisX','axisY'])

#####  6-1-3. Merge Test and Full axisX / axisY file

In [None]:
# Attach coordinates to every test transaction by (city, address).
# `test_addr` can list the same (city, address) with different coordinates (the table shown
# above has lot '651-1' twice), and a left join against such a lookup would emit those test
# rows twice. The lookup is therefore reduced to one row per key (first occurrence kept) and
# validate='many_to_one' makes pandas raise if that is ever not the case.
# NOTE(review): keeping the first coordinate pair is an arbitrary choice — confirm which
# coordinates are correct for duplicated lots.
merged_df = pd.merge(
    test2,
    test_addr.drop_duplicates(subset=['city', 'address']),
    on=['city', 'address'],
    how='left',
    validate='many_to_one',
)
assert len(merged_df) == len(test2), "merge changed the number of test rows"

In [None]:
merged_df.info()

# 7. Test_latlon Save

In [None]:
merged_df.to_csv('../data/test_latlon.csv', index=False)