# 1. Library Import

In [58]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
# Register the NanumGothic TTF with Matplotlib under the alias
# 'NanumBarunGothic' so Korean labels render in plots.
fe = fm.FontEntry(
    name='NanumBarunGothic',                                   # alias used to refer to this font
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf',  # location of the .ttf file
)
# Put the entry at the front of Matplotlib's font list.
fm.fontManager.ttflist.insert(0, fe)
# Make the registered font (at size 10) the default for every figure.
plt.rc('font', size=10, family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Display floats with two decimal places (change the format string for a different precision).
pd.options.display.float_format = '{:.2f}'.format  


# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

## Optuna tuning for XGB
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import optuna

## Optuna visualization
import plotly.express as px
import plotly.graph_objects as go
import plotly

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)


# 2. Data Loading

In [59]:
# Load the raw train/test tables. Adjust the two paths to match your environment.
train_path, test_path = '../data/train1.csv', '../data/test1.csv'
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [60]:
# Work on copies so the frames loaded from disk (train / test) stay untouched.
train1, test1 = train.copy(), test.copy()

# 3. Change variable names

In [61]:
# Rename the raw column names of train1 to the upper-case codes that the
# following cells refer to (e.g. 'ADRES', 'LOT_NO', 'AGNCY_LOCATION').
# Columns not listed here keep their original name.
train1.rename(
    {
        'city': 'ADRES',
        'address': 'LOT_NO',
        'first_num': 'MAIN_NO',
        'second_num': 'SUB_NO',
        'name': 'APT_NM',
        'area': 'EXCL_AREA_SQM',
        'contract_year': 'CONTR_YEAR_MONTH',
        'contract_day': 'CONTR_DAY',
        'floor': 'FLOOR',
        'construction_year': 'BUILD_YEAR',
        'road_name': 'ADRES_DORO',
        'cancellation_day': 'CANCEL_REASON_DATE',
        'registration_day': 'REGIST_APPL_DATE',
        'transaction_type': 'TRADE_TYPE',
        'estate_agent': 'AGNCY_LOCATION',
        'classification_complex': 'CODEAPTNM',
        'phone_number': 'TELNO',
        'fax_number': 'FXNUM',
        'clob': 'CN_APT',
        'sales_type': 'HSHLDR_TY',
        'management_system': 'GNRL_MANAGECT_MANAGE_STLE',
        'hallway_type': 'CRRDPR_TY',
        'heating_system': 'HEAT_MTHD',
        'total_unit': 'ALL_DONG_CO',
        'total_households': 'ALL_HSHLD_CO',
        'construction_company': 'CO_WO',
        'developer': 'CO_EX',
        'inspection_approval_date': 'USE_INSPCT_DE',
        'total_area': 'TOTAR',
        'residential_area': 'PRIVAREA',
        'management_fee_area': 'MANAGECT_LEVY_AR',
        'household_status_by_area_60': 'KAPTMPAREA60',
        'household_status_by_area_85': 'KAPTMPAREA85',
        'household_status_by_area_135': 'KAPTMPAREA135',
        'household_status_by_area_big': 'KAPTMPAREA136',
        'homepage': 'HMPG',
        'registration_date': 'API_INSERT_DATE',
        'modification_date': 'API_UPDATE_DATE',
        'employment_insurance_management_number': 'EMPLYMINSRNC_MANAGE_NO',
        'guard_management_type': 'EXPENSCTMANAGESTLE',
        'electricity_contract': 'HSHLD_ELCTY_CNTRCT_MTH',
        'cleaning_fee_management_type': 'CLN_CTMANAGESTLE',
        'building_area': 'BU_AR',
        'parking_space': 'CNT_PA',
        'other1_obligation2_rent3_optional4': 'GUBUN',
        'unit_approval_date': 'USE_CONFM_DE',
        'permission_yes_no': 'USE_TY',
        'management_cost': 'MANUAL_INPUT',
        'axisX': 'X_CODE',
        'axisY': 'Y_CODE',
        'unit_application_date': 'USE_RQSTDT',
    },
    axis='columns',
    inplace=True,
)

In [62]:
# Rename the raw column names of test1 to the upper-case codes that the
# following cells refer to (e.g. 'ADRES', 'LOT_NO', 'APT_NM').
# Columns not listed here keep their original name.
test1.rename(
    {
        'city': 'ADRES',
        'address': 'LOT_NO',
        'first_num': 'MAIN_NO',
        'second_num': 'SUB_NO',
        'name': 'APT_NM',
        'area': 'EXCL_AREA_SQM',
        'contract_year': 'CONTR_YEAR_MONTH',
        'contract_day': 'CONTR_DAY',
        'floor': 'FLOOR',
        'construction_year': 'BUILD_YEAR',
        'road_name': 'ADRES_DORO',
        'cancellation_day': 'CANCEL_REASON_DATE',
        'registration_day': 'REGIST_APPL_DATE',
        'transaction_type': 'TRADE_TYPE',
        'estate_agent': 'AGNCY_LOCATION',
        'classification_complex': 'CODEAPTNM',
        'phone_number': 'TELNO',
        'fax_number': 'FXNUM',
        'clob': 'CN_APT',
        'sales_type': 'HSHLDR_TY',
        'management_system': 'GNRL_MANAGECT_MANAGE_STLE',
        'hallway_type': 'CRRDPR_TY',
        'heating_system': 'HEAT_MTHD',
        'total_unit': 'ALL_DONG_CO',
        'total_households': 'ALL_HSHLD_CO',
        'construction_company': 'CO_WO',
        'developer': 'CO_EX',
        'inspection_approval_date': 'USE_INSPCT_DE',
        'total_area': 'TOTAR',
        'residential_area': 'PRIVAREA',
        'management_fee_area': 'MANAGECT_LEVY_AR',
        'household_status_by_area_60': 'KAPTMPAREA60',
        'household_status_by_area_85': 'KAPTMPAREA85',
        'household_status_by_area_135': 'KAPTMPAREA135',
        'household_status_by_area_big': 'KAPTMPAREA136',
        'homepage': 'HMPG',
        'registration_date': 'API_INSERT_DATE',
        'modification_date': 'API_UPDATE_DATE',
        'employment_insurance_management_number': 'EMPLYMINSRNC_MANAGE_NO',
        'guard_management_type': 'EXPENSCTMANAGESTLE',
        'electricity_contract': 'HSHLD_ELCTY_CNTRCT_MTH',
        'cleaning_fee_management_type': 'CLN_CTMANAGESTLE',
        'building_area': 'BU_AR',
        'parking_space': 'CNT_PA',
        'other1_obligation2_rent3_optional4': 'GUBUN',
        'unit_approval_date': 'USE_CONFM_DE',
        'permission_yes_no': 'USE_TY',
        'management_cost': 'MANUAL_INPUT',
        'axisX': 'X_CODE',
        'axisY': 'Y_CODE',
        'unit_application_date': 'USE_RQSTDT',
    },
    axis='columns',
    inplace=True,
)

# 4. Null Value Treatment

In [63]:
# Replace placeholder strings that disguise missing values ('-' or a single
# space) with real NaN so they are counted as missing.
#
# Each assignment goes through one .loc[mask, column] call. The previous
# chained form (df[col][mask] = value) writes to an intermediate object:
# pandas flags it with SettingWithCopyWarning, and with Copy-on-Write enabled
# it leaves train1 unchanged.
train1.loc[train1['AGNCY_LOCATION'] == '-', 'AGNCY_LOCATION'] = np.nan
train1.loc[train1['TRADE_TYPE'] == '-', 'TRADE_TYPE'] = np.nan
train1.loc[train1['REGIST_APPL_DATE'] == ' ', 'REGIST_APPL_DATE'] = np.nan
# NOTE(review): test1 is not cleaned here, so the same placeholders (if
# present) survive into the test data -- confirm whether that is intended.

In [64]:
# Build the space-joined helper columns on both frames:
#   FULL_ADRES = ADRES + ' ' + LOT_NO   (later used as the merge key)
#   dongAPT_NM = dong  + ' ' + APT_NM
for _frame in (train1, test1):
    _frame['FULL_ADRES'] = _frame['ADRES'] + ' ' + _frame['LOT_NO']
    _frame['dongAPT_NM'] = _frame['dong'] + ' ' + _frame['APT_NM']

# 5. Merge subway & bus data

In [77]:
# Load the per-address subway/bus feature table (path is relative to the notebook).
subway_bus = pd.read_csv('../data/subway_bus.csv')

In [66]:
# Summarise subway_bus (columns, non-null counts, dtypes) as loaded from disk.
subway_bus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8955 entries, 0 to 8954
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   지번주소      8955 non-null   object 
 1   유동승객수     8955 non-null   int64  
 2   역까지_거리    8955 non-null   float64
 3   좌표X뉴      8955 non-null   float64
 4   좌표Y뉴      8955 non-null   float64
 5   1차역세권     8955 non-null   int64  
 6   2차역세권     8955 non-null   int64  
 7   X좌표       8955 non-null   float64
 8   Y좌표       8955 non-null   float64
 9   정류장까지_거리  8955 non-null   float64
dtypes: float64(6), int64(3), object(1)
memory usage: 699.7+ KB


In [67]:
# Discard the four coordinate columns.
# Note: this cell is not idempotent -- re-running it without reloading
# subway_bus raises KeyError because the columns are already gone.
subway_bus = subway_bus.drop(columns=['좌표X뉴', '좌표Y뉴', 'X좌표', 'Y좌표'])

In [68]:
# Rename the Korean column names to English codes. '지번주소' becomes
# 'FULL_ADRES', the key used to merge this table into train1/test1.
subway_bus.rename(
    {
        '지번주소': 'FULL_ADRES',
        '유동승객수': 'FLOATING_POPULATION',
        '역까지_거리': 'SUBWAY_DIST',
        '1차역세권': '1STSUBAREA',
        '2차역세권': '2NDSUBAREA',
        '정류장까지_거리': 'BUS_DIST',
    },
    axis='columns',
    inplace=True,
)

In [69]:
# Summarise subway_bus again to confirm the remaining columns and their new names.
subway_bus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8955 entries, 0 to 8954
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   FULL_ADRES           8955 non-null   object 
 1   FLOATING_POPULATION  8955 non-null   int64  
 2   SUBWAY_DIST          8955 non-null   float64
 3   1STSUBAREA           8955 non-null   int64  
 4   2NDSUBAREA           8955 non-null   int64  
 5   BUS_DIST             8955 non-null   float64
dtypes: float64(2), int64(3), object(1)
memory usage: 419.9+ KB


In [70]:
# Attach the station-area flags, floating population and distance features.
# Per the original note: floating passengers = monthly average of boardings
# and alightings; distance to station = distance to the nearest station.
#
# validate='many_to_one' makes pandas raise MergeError if FULL_ADRES is not
# unique in subway_bus. Without it, a duplicated address would silently
# multiply rows in train2/test2 (and change the test row count).
train2 = pd.merge(train1, subway_bus, how='left', on='FULL_ADRES',
                  validate='many_to_one')
test2 = pd.merge(test1, subway_bus, how='left', on='FULL_ADRES',
                 validate='many_to_one')

In [71]:
# Remove the columns that are not carried forward. The same list is applied
# to both frames so train2 and test2 keep matching schemas.
train2, test2 = (
    frame.drop(columns=[
        'MAIN_NO', 'SUB_NO', 'REGIST_APPL_DATE', 'TELNO', 'FXNUM', 'CN_APT',
        'GNRL_MANAGECT_MANAGE_STLE', 'USE_INSPCT_DE',
        'MANAGECT_LEVY_AR', 'HMPG', 'API_INSERT_DATE', 'API_UPDATE_DATE',
        'EMPLYMINSRNC_MANAGE_NO', 'EXPENSCTMANAGESTLE', 'CLN_CTMANAGESTLE',
        'USE_CONFM_DE', 'MANUAL_INPUT', 'USE_TY',
    ])
    for frame in (train2, test2)
)

# 6. Save train2 & test2

In [72]:
# Persist the merged frames for the next stage; the index is not written.
train2.to_csv(path_or_buf='../data/train2.csv', index=False)
test2.to_csv(path_or_buf='../data/test2.csv', index=False)