# 1. Library Import

In [47]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # ttf 파일이 저장되어 있는 경로
    name='NanumBarunGothic')                        # 이 폰트의 원하는 이름 설정
fm.fontManager.ttflist.insert(0, fe)              # Matplotlib에 폰트 추가
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # 폰트 설정
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# 소수점 둘째자리까지 표시하도록 설정 (원하는 자릿수로 변경 가능)
pd.options.display.float_format = '{:.2f}'.format  


# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

## Optuna tunning for XGB
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import optuna

## Optuna visulization
import plotly.express as px
import plotly.graph_objects as go
import plotly

# 열의 개수를 출력할 때 모두 표시하도록 설정
pd.set_option('display.max_columns', None)


# 2. Data Loading

In [48]:
# 필요한 데이터를 load 하겠습니다. 경로는 환경에 맞게 지정해주면 됩니다.
train_path = '../data/train4.csv'
test_path  = '../data/test4.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
submission = pd.read_csv("../data/sample_submission.csv")

# 3. Model Input Condition

### 3-1. 2017년 이후 데이터 지정

In [49]:
train = train[train['CONTR']>=20170101]
test = test[test['CONTR']>=20170101]

### 3-2. GDP Feature 수정 / 1년전 GDP로 대체

In [50]:
train = train.drop(columns='REAL_GDP')
test = test.drop(columns='REAL_GDP')

train = train.drop(columns='NOMINAL_GDP')
test = test.drop(columns='NOMINAL_GDP')

In [51]:
gdp = pd.read_csv('../data/1year_gdp.csv',encoding='cp949')

In [52]:
# 'contract_year' 값을 정수로 변환
train['CONTR_YEAR_MONTH'] = train['CONTR_YEAR_MONTH'].astype(int)

contract_years = sorted([200612, 200703, 200706, 200709, 200712, 200803, 200806, 200809,
       200812, 200903, 200906, 200909, 200912, 201003, 201006, 201009,
       201012, 201103, 201106, 201109, 201112, 201203, 201206, 201209,
       201212, 201303, 201306, 201309, 201312, 201403, 201406, 201409,
       201412, 201503, 201506, 201509, 201512, 201603, 201606, 201609,
       201612, 201703, 201706, 201709, 201712, 201803, 201806, 201809,
       201812, 201903, 201906, 201909, 201912, 202003, 202006, 202009,
       202012, 202103, 202106, 202109, 202112, 202203, 202206, 202209,
       202212, 202303, 202306, 202309])

before = contract_years[0]
# 조건에 따라 'gdp' 값 대입
for contract_year in contract_years[1:]:
    condition = (train['CONTR_YEAR_MONTH'] > before) & (train['CONTR_YEAR_MONTH'] <= contract_year)
    gdp_values = gdp[gdp['CONTR_YEAR_MONTH'] == contract_year][['REAL_GDP','NOMINAL_GDP']].values
    train.loc[condition, ['REAL_GDP', 'NOMINAL_GDP']] = gdp_values
    before = contract_year
# 조건에 따라 'gdp' 값 대입
for contract_year in contract_years[1:]:
    condition = (test['CONTR_YEAR_MONTH'] > before) & (test['CONTR_YEAR_MONTH'] <= contract_year)
    gdp_values = gdp[gdp['CONTR_YEAR_MONTH'] == contract_year][['REAL_GDP','NOMINAL_GDP']].values
    test.loc[condition, ['REAL_GDP', 'NOMINAL_GDP']] = gdp_values
    before = contract_year

### 3-3. 이상치 제거 / 아파트별 평균가 최저 6111.11

In [53]:
train = train[train['target']>6111.11]

### 3-4. 금리 

In [54]:
goldwe = pd.read_excel('../data/한국은행_금리.xlsx')
goldwe["CONTR"] = goldwe["변경일자_년"].astype(str) + goldwe["변경일자_월일"].map(lambda x : x[:2] + x[4:6])
g01 = goldwe.sort_values('CONTR')
g01['날짜열_1년후'] = pd.to_datetime(g01['CONTR']) + pd.DateOffset(years=1)
g01['날짜열_11개월후'] = pd.to_datetime(g01['CONTR']) + pd.DateOffset(months=11)
g01['날짜열_13개월후'] = pd.to_datetime(g01['CONTR']) + pd.DateOffset(months=13)
g02 = g01[['날짜열_11개월후', '기준금리']]
g03 = g01[['날짜열_1년후', '기준금리']]
g04 = g01[['날짜열_13개월후', '기준금리']]

g02.columns = ['날짜열_11개월후', "11개월전_금리"]
g03.columns = ['날짜열_1년후', "1년전_금리"]
g04.columns = ['날짜열_13개월후', "13개월전_금리"]

g02['날짜열_11개월후_종료일'] = g02['날짜열_11개월후'].shift(-1, fill_value = "2027-01-01") - pd.DateOffset(days=1)
g03['날짜열_1년후_종료일'] = g03['날짜열_1년후'].shift(-1, fill_value = "2027-01-01") - pd.DateOffset(days=1)
g04['날짜열_13개월후_종료일'] = g04['날짜열_13개월후'].shift(-1, fill_value = "2027-01-01") - pd.DateOffset(days=1)

In [55]:
make_date = train["CONTR"].reset_index()
make_date_test = test["CONTR"].reset_index()

make_date['CONTR'] = make_date['CONTR'].astype('str')
make_date_test['CONTR'] = make_date_test['CONTR'].astype('str')

m02 = pd.merge(make_date, g02, how = 'cross')
m03 = pd.merge(make_date, g03, how = 'cross')
m03_test = pd.merge(make_date_test, g03, how = 'cross')
m04 = pd.merge(make_date, g04, how = 'cross')


In [56]:
m02['CONTR'] = pd.to_datetime(m02['CONTR'])
m03['CONTR'] = pd.to_datetime(m03['CONTR'])
m03_test['CONTR'] = pd.to_datetime(m03_test['CONTR'])
m04['CONTR'] = pd.to_datetime(m04['CONTR'])

In [57]:
m02['현재와시작일차이'] = (m02['CONTR'] - m02['날짜열_11개월후']).dt.days
m02['현재와종료일차이'] = (m02['날짜열_11개월후_종료일'] - m02['CONTR']).dt.days
m02['체크'] = m02['현재와시작일차이']*m02['현재와종료일차이']

m03['현재와시작일차이'] = (m03['CONTR'] - m03['날짜열_1년후']).dt.days
m03['현재와종료일차이'] = (m03['날짜열_1년후_종료일'] - m03['CONTR']).dt.days
m03['체크'] = m03['현재와시작일차이']*m03['현재와종료일차이']

m03_test['현재와시작일차이'] = (m03_test['CONTR'] - m03_test['날짜열_1년후']).dt.days
m03_test['현재와종료일차이'] = (m03_test['날짜열_1년후_종료일'] - m03_test['CONTR']).dt.days
m03_test['체크'] = m03_test['현재와시작일차이']*m03_test['현재와종료일차이']

m04['현재와시작일차이'] = (m04['CONTR'] - m04['날짜열_13개월후']).dt.days
m04['현재와종료일차이'] = (m04['날짜열_13개월후_종료일'] - m04['CONTR']).dt.days
m04['체크'] = m04['현재와시작일차이']*m04['현재와종료일차이']

m02[m02['체크'] >= 0]

m03[m03['체크'] >= 0]
m03_test[m03_test['체크'] >= 0]

m04[m04['체크'] >= 0]

m02.set_index('index', inplace=True)

m03.set_index('index', inplace=True)
m03_test.set_index('index', inplace=True)

m04.set_index('index', inplace=True)
# train3 = pd.concat([train2, m02['11개월전_금리'][m02['체크'] >= 0]], axis = 1)

train = pd.concat([train, m03['1년전_금리'][m03['체크'] >= 0]], axis = 1)
test = pd.concat([test, m03_test['1년전_금리'][m03_test['체크'] >= 0]], axis = 1)

# train3 = pd.concat([train2, m04['13개월전_금리'][m04['체크'] >= 0]], axis = 1)



In [58]:
train['INTEREST_RATE'] = train['1년전_금리']
test['INTEREST_RATE'] = test['1년전_금리']

train = train.drop(columns='1년전_금리')
test = test.drop(columns='1년전_금리')

### 3-5. 해제사유일 Label Encoding

In [59]:
# 'CANCEL_REASON_DATE' 열에서 null이 아닌 값을 1로 바꿈
train['CANCEL_REASON_DATE'] = train['CANCEL_REASON_DATE'].notnull().astype(int)
test['CANCEL_REASON_DATE'] = test['CANCEL_REASON_DATE'].notnull().astype(int)

### 3-6. 크롤링 자료 Concat

In [60]:
train = train.drop(columns='DONGAPT_NM')
train['DONGAPT_NM'] = train['DONG'] + ' ' + train['APT_NM']
test = test.drop(columns='DONGAPT_NM')
test['DONGAPT_NM'] = test['DONG'] + ' ' + test['APT_NM']

In [61]:
data1 = pd.read_csv('../data/소중한정보_에어리어.csv')
data2 = pd.read_csv('../data/소중한정보_인포.csv')
data3 = pd.read_csv('../data/소중한정보_두번째_에어리어.csv')
data4 = pd.read_csv('../data/소중한정보_두번째_인포.csv')

In [62]:
data1.rename(columns={'동아파트명':'DONGAPT_NM','전용면적':'EXCL_AREA_SQM',
                      '방개수':'BAND','욕실개수':'BATH',
                      '면적당세대수':'SPACEHOUSEHOLDS','현관구조식':'STRUCT',
                      '공급면적':'AREA2','전용율':'EXCLUSIVE'
                      },inplace=True)

In [63]:
data3.rename(columns={'동아파트명':'DONGAPT_NM','전용면적':'EXCL_AREA_SQM',
                      '방개수':'BAND','욕실개수':'BATH', 
                      '면적당세대수':'SPACEHOUSEHOLDS','현관구조식':'STRUCT',
                      '공급면적':'AREA2','지번주소':'LOT_NO','전용율':'EXCLUSIVE'
                      },inplace=True)

In [64]:
data2.rename(columns={'동아파트명':'DONGAPT_NM',
                      '세대':'HOUSEHOLDS','총동수':'ALLDONG',
                      '면적당세대수':'HOUSEHOLDS','저층기준':'UNDER_STRICT',
                      '최고층':'HIGH','총주차면적개수':'PARK','용적비율':'VOLUME'
                      },inplace=True)

In [65]:
data4.rename(columns={'동아파트명':'DONGAPT_NM','지번주소':'LOT_NO',
                      '세대':'HOUSEHOLDS','총동수':'ALLDONG',
                      '면적당세대수':'HOUSEHOLDS','저층기준':'UNDER_STRICT',
                      '최고층':'HIGH','총주차면적개수':'PARK','용적비율':'VOLUME'
                      },inplace=True)

In [66]:
data2 = data2.drop(columns=['건설사','세대당면적개수','건폐비율'])
data4 = data4.drop(columns=['건설사','세대당면적개수','건폐비율'])

In [67]:
# 'city'와 'address'를 기준으로 두 데이터프레임을 합치기
train = pd.merge(train, data1, on=['DONGAPT_NM', 'EXCL_AREA_SQM'], how='left')
test = pd.merge(test, data1, on=['DONGAPT_NM', 'EXCL_AREA_SQM'], how='left')

In [68]:
train = pd.merge(train, data2, on=['DONGAPT_NM'], how='left')
test = pd.merge(test, data2, on=['DONGAPT_NM'], how='left')

In [69]:
train = pd.merge(train, data3, on=['DONGAPT_NM','LOT_NO','EXCL_AREA_SQM'], how='left')
test = pd.merge(test, data3, on=['DONGAPT_NM','LOT_NO','EXCL_AREA_SQM'], how='left')

In [70]:
train = pd.merge(train, data4, on=['DONGAPT_NM','LOT_NO'], how='left')
test = pd.merge(test, data4, on=['DONGAPT_NM','LOT_NO'], how='left')

In [71]:
train = train.drop(columns=['SPACEHOUSEHOLDS_y','AREA2_y',
                            'EXCLUSIVE_y','BAND_y','BATH_y',
                            'STRUCT_y', 'HOUSEHOLDS_y','ALLDONG_y','PARK_y',
                            'UNDER_STRICT_y','HIGH_y','VOLUME_y'])
test = test.drop(columns=['SPACEHOUSEHOLDS_y','AREA2_y',
                            'EXCLUSIVE_y','BAND_y','BATH_y',
                            'STRUCT_y', 'HOUSEHOLDS_y','ALLDONG_y','PARK_y',
                            'UNDER_STRICT_y','HIGH_y','VOLUME_y'])


In [72]:
train['EXCL_AREA_SQM'] = train['EXCL_AREA_SQM']//3.3
test['EXCL_AREA_SQM'] = test['EXCL_AREA_SQM']//3.3

In [73]:
train['EXCL_AREA_SQM'] = train['EXCL_AREA_SQM'].astype('str')
test['EXCL_AREA_SQM'] = test['EXCL_AREA_SQM'].astype('str')

In [74]:
train['DONGAPTNM_SPACE'] = train['DONGAPT_NM'] + ' ' + train['EXCL_AREA_SQM']
test['DONGAPTNM_SPACE'] = test['DONGAPT_NM'] + ' ' + test['EXCL_AREA_SQM']

### 3-7. 아파트별 평수 평균 값에 따른 분류 feature 생성

In [75]:
targets = [15000,19000,21500,24000,26000,
            27500,28750,30000,31500,33000,
            34250,35500,36750,37750,39000,
            40000,41000,42250,43500,44500,
            45500,46750,48000,49250,50250,
            51500,53000,54500,56000,57500,
            59000,60250,62000,64000,66000,
            68000,70250,72500,75000,77500,
            80000,83000,87000,91000,95000,
            99500,105000,111000,117500,125000,
            132000,140000,150000,162500,177500,
            190000,200000,220000,240000,260000,
            280000,300000,320000,340000,360000,
            380000,400000,425000,450000,500000,
            550000,600000,700000,800000,900000,
            1000000,1100000,1200000
            ]

# 새로운 feature '아파트명 평균가' 생성
train['SPACEAPT_AVG'] = 0

for i in range(len(targets)):
    # 'APT_NM' 별 'target'의 평균 계산
    avg_target_by_apt = train.groupby('DONGAPTNM_SPACE')['target'].mean()

    # 평균이 400000을 초과하는 'APT_NM'에 대한 조건 필터링
    selected_apt_names = avg_target_by_apt[avg_target_by_apt > targets[i]].index
    
    # 조건을 만족하는 행들에 대해 '아파트명 평균가'에 8이라는 값을 할당
    train.loc[train['DONGAPTNM_SPACE'].isin(selected_apt_names), 'SPACEAPT_AVG'] = i
data = train[['SPACEAPT_AVG','DONGAPTNM_SPACE']].drop_duplicates()
test = pd.merge(test,data,how='left',on='DONGAPTNM_SPACE')

In [76]:
# KB 부동산 기준으로 채움. 
# 판매액 245000 
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='개포동 개포더샵트리에 32.0'), 'SPACEAPT_AVG'] = 59
# 판매액 105000 
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='대치동 대치타워 25.0'), 'SPACEAPT_AVG'] = 47
# 판매액 65000 
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='고덕동 고덕센트럴푸르지오 12.0'), 'SPACEAPT_AVG'] = 34
# 판매액 74300 
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='삼성동 한양립스 11.0'), 'SPACEAPT_AVG'] = 38
# 판매액 64000 
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='신사동 우찬 17.0'), 'SPACEAPT_AVG'] = 34
# 판매액 371000 
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='압구정동 한양6 33.0'), 'SPACEAPT_AVG'] = 65
# 판매액 52500 
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='역삼동 역삼센트럴2차아이파크 5.0'), 'SPACEAPT_AVG'] = 26
# 판매액 67500 
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='둔촌동 아인리베 24.0'), 'SPACEAPT_AVG'] = 35
# 판매액 91000 ,96000 / AVG 93500
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='상일동 고덕자이 15.0'), 'SPACEAPT_AVG'] = 44
# 판매액 62000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='성내동 다성이즈빌 15.0'), 'SPACEAPT_AVG'] = 33
# 판매액 39800
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='성내동 라움 13.0'), 'SPACEAPT_AVG'] = 14
# 판매액 63000, 63000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='성내동 비앤비 22.0'), 'SPACEAPT_AVG'] = 33
# 판매액 90000,92000 / AVG 91000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='천호동 힐데스하임천호 25.0'), 'SPACEAPT_AVG'] = 44
# 판매액 88000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='미아동 꿈의숲한신더휴 25.0'), 'SPACEAPT_AVG'] = 43
# 판매액 18000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='번동 아츠스테이수유점 4.0'), 'SPACEAPT_AVG'] = 1
# 판매액 18000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='번동 아츠스테이수유점 5.0'), 'SPACEAPT_AVG'] = 1
# 판매액 57000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='수유동 수유시그니티 14.0'), 'SPACEAPT_AVG'] = 29
# 판매액 32000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='내발산동 내츄럴4차 16.0'), 'SPACEAPT_AVG'] = 9
# 판매액 55000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='등촌동 강서 23.0'), 'SPACEAPT_AVG'] = 28
# 판매액 31000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='화곡동 화곡한강 20.0'), 'SPACEAPT_AVG'] = 8
# 판매액 60000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='남현동 네스코수피아 14.0'), 'SPACEAPT_AVG'] = 31
# 판매액 64000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='봉천동 에메랄드102동 25.0'), 'SPACEAPT_AVG'] = 34
# 판매액 36000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='신림동 삼성아트빌 19.0'), 'SPACEAPT_AVG'] = 12
# 판매액 30000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='구의동 강변 14.0'), 'SPACEAPT_AVG'] = 8
# 판매액 42500
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='구의동 광남캐스빌 17.0'), 'SPACEAPT_AVG'] = 18
# 판매액 45000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='자양동 그랜드 14.0'), 'SPACEAPT_AVG'] = 20
# 판매액 62700
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='공릉동 태릉해링턴플레이스 14.0'), 'SPACEAPT_AVG'] = 35
# 판매액 60000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='고척동 더헤리츠 16.0'), 'SPACEAPT_AVG'] = 31
# 판매액 23000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='구로동 함양주택 13.0'), 'SPACEAPT_AVG'] = 3
# 판매액 24000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='구로동 성삼하이츠 9.0'), 'SPACEAPT_AVG'] = 4
# 판매액 44000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='공릉동 서경하누리 18.0'), 'SPACEAPT_AVG'] = 19
# 판매액 60000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='공릉동 테라파크뷰 16.0'), 'SPACEAPT_AVG'] = 31
# 판매액 24000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='답십리동 현대썬앤빌청계 6.0'), 'SPACEAPT_AVG'] = 4
# 판매액 26000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='장안동 우정B 15.0'), 'SPACEAPT_AVG'] = 5
# 판매액 124000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='휘경동 휘경SK뷰 28.0'), 'SPACEAPT_AVG'] = 49
# 판매액 160000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='흑석동 흑석리버파크자이 25.0'), 'SPACEAPT_AVG'] = 53
# 판매액 58000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='망원동 마포월드컵제이스카이 17.0'), 'SPACEAPT_AVG'] = 30
# 판매액 75000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='신수동 신수동르끌레브 14.0'), 'SPACEAPT_AVG'] = 39
# 판매액 108000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='북아현동 힐스테이트신촌 16.0'), 'SPACEAPT_AVG'] = 47
# 판매액 433000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='방배동 방배그랑자이 40.0'), 'SPACEAPT_AVG'] = 68
# 판매액 194000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='일원동 디에이치포레센트 17.0'), 'SPACEAPT_AVG'] = 56
# 판매액 43000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='창천동 노블레스 17.0'), 'SPACEAPT_AVG'] = 18
# 판매액 53000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='홍은동 도양라비앙 30.0'), 'SPACEAPT_AVG'] = 27
# 판매액 56000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='홍은동 서강2차 45.0'), 'SPACEAPT_AVG'] = 29
# 판매액 46000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='홍제동 제일 17.0'), 'SPACEAPT_AVG'] = 21
# 판매액 470000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='반포동 래미안원베일리 30.0'), 'SPACEAPT_AVG'] = 69
# 판매액 187000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='반포동 상지리츠빌2차 45.0'), 'SPACEAPT_AVG'] = 55
# 판매액 167000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='방배동 모닝아트빌 25.0'), 'SPACEAPT_AVG'] = 54
# 판매액 85000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='방배동 방배 서리풀 그랑블 11.0'), 'SPACEAPT_AVG'] = 42
# 판매액 80000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='방배동 방배삼성홈타운 31.0'), 'SPACEAPT_AVG'] = 41
# 판매액 274800
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='방배동 하늘바람 73.0'), 'SPACEAPT_AVG'] = 60
# 판매액 242000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='서초동 레미안서초6차 51.0'), 'SPACEAPT_AVG'] = 59
# 판매액 117500
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='서초동 티에스프리우스 23.0'), 'SPACEAPT_AVG'] = 49
# 판매액 229000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='잠원동 르엘신반포파크애비뉴 17.0'), 'SPACEAPT_AVG'] = 58
# 판매액 375000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='성수동1가 아크로서울포레스트 28.0'), 'SPACEAPT_AVG'] = 65
# 판매액 83000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='송정동 히페리온리버팰리스 25.0'), 'SPACEAPT_AVG'] = 42
# 판매액 45000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='석촌동 베스트휴 17.0'), 'SPACEAPT_AVG'] = 20
# 판매액 58700
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='오금동 에스아이팰리스송파올림픽공원 8.0'), 'SPACEAPT_AVG'] = 30
# 판매액 104000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='당산동5가 당산센트럴아이파크 13.0'), 'SPACEAPT_AVG'] = 46
# 판매액 40000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='신월동 대영월드 25.0'), 'SPACEAPT_AVG'] = 16
# 판매액 69000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='신길동 길성그랑프리텔 22.0'), 'SPACEAPT_AVG'] = 36
# 판매액 72300
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='신길동 신풍두산위브 20.0'), 'SPACEAPT_AVG'] = 38
# 판매액 85000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='수색동 DMC롯데캐슬더퍼스트 30.0'), 'SPACEAPT_AVG'] = 42
# 판매액 30500
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='영등포동1가 여의도이튼브라운 9.0'), 'SPACEAPT_AVG'] = 8
# 판매액 240000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='이태원동 남산힐레지던스 61.0'), 'SPACEAPT_AVG'] = 59
# 판매액 173000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='한남동 대성이태리하우스 33.0'), 'SPACEAPT_AVG'] = 54
# 판매액 520000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='한남동 르가든더메인한남 71.0'), 'SPACEAPT_AVG'] = 70
# 판매액 110000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='효창동 효창아트빌2차 25.0'), 'SPACEAPT_AVG'] = 47
# 판매액 84000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='후암동 남산애지앙 27.0'), 'SPACEAPT_AVG'] = 42
# 판매액 38000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='갈현동 명성 24.0'), 'SPACEAPT_AVG'] = 14
# 판매액 41000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='갈현동 연서노벨 18.0'), 'SPACEAPT_AVG'] = 17
# 판매액 29520
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='갈현동 한아름 15.0'), 'SPACEAPT_AVG'] = 4
# 판매액 49000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='불광동 청송 31.0'), 'SPACEAPT_AVG'] = 23
# 판매액 49000, 48800 / AVG 48900
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='신사동 거성리젠시 13.0'), 'SPACEAPT_AVG'] = 23
# 판매액 43000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='역촌동 역촌월드 19.0'), 'SPACEAPT_AVG'] = 18
# 판매액 85000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='신당동 신당KCC스위첸 13.0'), 'SPACEAPT_AVG'] = 42
# 판매액 38700
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='인현동2가 세운푸르지오헤리시티 8.0'), 'SPACEAPT_AVG'] = 14
# 판매액 40000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='인현동2가 세운푸르지오헤리시티 7.0'), 'SPACEAPT_AVG'] = 16
# 판매액 310000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='홍파동 경희궁자이 41.0'), 'SPACEAPT_AVG'] = 62
# 판매액 62000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='예장동 예장동삼익 18.0'), 'SPACEAPT_AVG'] = 33
# 판매액 44900
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='입정동 힐스테이트세운센트럴1단지 7.0'), 'SPACEAPT_AVG'] = 20
# 판매액 101000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='입정동 힐스테이트세운센트럴2단지 17.0'), 'SPACEAPT_AVG'] = 46
# 판매액 81000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='묵동 고덕골든빌 28.0'), 'SPACEAPT_AVG'] = 41
# 판매액 105000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='대치동 대치타워 26.0'), 'SPACEAPT_AVG'] = 47
# 판매액 64000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='신사동 우찬 18.0'), 'SPACEAPT_AVG'] = 34
# 판매액 96000, 910000 / AVG 93500
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='상일동 고덕자이 16.0'), 'SPACEAPT_AVG'] = 44
# 판매액 28500
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='수유동 칸타빌수유팰리스 8.0'), 'SPACEAPT_AVG'] = 6
# 판매액 52000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='내발산동 강성레스빌 18.0'), 'SPACEAPT_AVG'] = 26
# 판매액 42500
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='구의동 광남캐스빌 18.0'), 'SPACEAPT_AVG'] = 18
# 판매액 62700
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='공릉동 태릉해링턴플레이스 15.0'), 'SPACEAPT_AVG'] = 33
# 판매액 23700
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='답십리동 현대썬앤빌청계 7.0'), 'SPACEAPT_AVG'] = 3
# 판매액 58000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='망원동 마포월드컵제이스카이 18.0'), 'SPACEAPT_AVG'] = 30
# 판매액 75000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='신수동 신수동르끌레브 15.0'), 'SPACEAPT_AVG'] = 39
# 판매액 194000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='일원동 디에이치포레센트 18.0'), 'SPACEAPT_AVG'] = 56
# 판매액 80000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='방배동 방배삼성홈타운 32.0'), 'SPACEAPT_AVG'] = 41
# 판매액 229000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='잠원동 르엘신반포파크애비뉴 18.0'), 'SPACEAPT_AVG'] = 58
# 판매액 95000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='목동 목동금호베스트빌 31.0'), 'SPACEAPT_AVG'] = 45
# 판매액 104000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='당산동5가 당산센트럴아이파크 14.0'), 'SPACEAPT_AVG'] = 46
# 판매액 25250
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='갈현동 한아름 16.0'), 'SPACEAPT_AVG'] = 4
# 판매액 48800
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='신사동 거성리젠시 14.0'), 'SPACEAPT_AVG'] = 23
# 판매액 148000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='평창동 벽산블루밍평창힐스 47.0'), 'SPACEAPT_AVG'] = 52
# 판매액 310000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='홍파동 경희궁자이 42.0'), 'SPACEAPT_AVG'] = 62
# 판매액 101000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['DONGAPTNM_SPACE']=='입정동 힐스테이트세운센트럴2단지 18.0'), 'SPACEAPT_AVG'] = 46

# 서울특별시 구로구 구로동 740-7 / 구로동로12길 49 / 18600
test.loc[(test['SPACEAPT_AVG'].isna())&(test['LOT_NO']=='740-7'), 'SPACEAPT_AVG'] = 1
# 서울특별시 구로구 구로동 743-27 / 구로동로22길 52-2 / 21500
test.loc[(test['SPACEAPT_AVG'].isna())&(test['LOT_NO']=='743-27'), 'SPACEAPT_AVG'] = 3
# 서울특별시 구로구 구로동 747-34 / 도림로3길 35-5 / 10800
test.loc[(test['SPACEAPT_AVG'].isna())&(test['LOT_NO']=='747-34'), 'SPACEAPT_AVG'] = 0
# 서울특별시 구로구 구로동 752-17 / 구로동로22길 76-6 / 18800
test.loc[(test['SPACEAPT_AVG'].isna())&(test['LOT_NO']=='752-17'), 'SPACEAPT_AVG'] = 1
# 서울특별시 구로구 구로동 780-86 / 도림로12길 11 / 15000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['LOT_NO']=='780-86'), 'SPACEAPT_AVG'] = 1
# 서울특별시 동대문구 장안동 404-13 / 천호대로77길 62 / 30700
test.loc[(test['SPACEAPT_AVG'].isna())&(test['LOT_NO']=='404-13'), 'SPACEAPT_AVG'] = 8
# 서울특별시 동작구 상도동 323-4 / 국사봉1길 18 / 85000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['LOT_NO']=='323-4'), 'SPACEAPT_AVG'] = 42
# 서울특별시 영등포구 대림동 1101-1 / 도림천로19길 12 / 11000
test.loc[(test['SPACEAPT_AVG'].isna())&(test['LOT_NO']=='1101-1'), 'SPACEAPT_AVG'] = 0
# 서울특별시 중구 신당동 432-904 / 동호로11마길 20-8 / 26500
test.loc[(test['SPACEAPT_AVG'].isna())&(test['LOT_NO']=='432-904'), 'SPACEAPT_AVG'] = 5


### 3-8. 2차역세권 및 1차역세권 수정 

In [None]:
train.loc[(train['2NDSUBAREA']>0), '1STSUBAREA'] = 1
test.loc[(test['2NDSUBAREA']>0), '1STSUBAREA'] = 1

### 3-9. Train에는 있지만 Test에 없는 아파트명 평균가 1억 이하 Drop

In [None]:
# 정규표현식 패턴 생성
excluded_apts = ['AirPalace', 'SRvill', '경동팰리스힐', '대길B', '썬앤빌', '재선주택', '코원']

# go
pattern = '|'.join(excluded_apts)

# '열이름' 열에서 패턴이 있는지 확인하여 새로운 열 추가
train['is_아파트'] = np.where(train['APT_NM'].str.contains(pattern), 1, 0)

train = train[train['is_아파트'] == 0]

train = train.drop(columns='is_아파트')

### 3-10. 데이터 형 변환

In [None]:
train['EXCL_AREA_SQM'] = train['EXCL_AREA_SQM'].astype('float64').astype('int64')
test['EXCL_AREA_SQM'] = test['EXCL_AREA_SQM'].astype('float64').astype('int64')

train['GU'] = train['GU'].astype('object')
test['GU'] = test['GU'].astype('object')

train['DONG'] = train['DONG'].astype('object')
test['DONG'] = test['DONG'].astype('object')

### 3-11. 영등포1가, 2가와 같은 자치동들을 영등포동으로 합침.

In [None]:
def make_dong(dong):
    import re
    dong_name = re.search(r'[^0-9]+', dong).group()
    if dong_name[-1] != "동" and dong_name[-1] != "로":
        return dong_name+"동"
    else:
        return dong_name

In [None]:
train['DONG'] = train['DONG'].apply(make_dong)
test['DONG'] = test['DONG'].apply(make_dong)

In [None]:
uint8_columns = train.select_dtypes(include='uint8').columns
train[uint8_columns] = train[uint8_columns].astype('int64')

uint8_columns = test.select_dtypes(include='uint8').columns
test[uint8_columns] = test[uint8_columns].astype('int64')

### 3-12. 자치 동별 평균가 순위대로 feature 생성 / DONG_RANK

In [None]:
train['GU'] = train['GU'].astype('category')
test['GU'] = test['GU'].astype('category')

train['DONG'] = train['DONG'].astype('category')
test['DONG'] = test['DONG'].astype('category')

In [None]:
dong_mean = train.groupby('DONG')['target'].mean()
dong_rank = dong_mean.rank(ascending=False)
train['DONG_RANK'] = train['DONG'].map(dong_rank)
test['DONG_RANK'] = test['DONG'].map(dong_rank)

In [None]:
train['GU'] = train['GU'].astype('str').astype('category')
test['GU'] = test['GU'].astype('str').astype('category')

train['DONG_RANK'] = train['DONG_RANK'].astype('int64')
test['DONG_RANK'] = test['DONG_RANK'].astype('int64')

train['target'] = train['target'].astype('int64')

### 3-13. 한강 중심 위도,경도 & 서울 중심 위도,경도 를 기준으로 아파트 별 하버사인 거리 구하고 feature 만들기

In [None]:
train2 = train[['X_CODE','Y_CODE','FULL_ADRES']]
test2 = test[['X_CODE','Y_CODE','FULL_ADRES']]

# 8943개
train2 = train2.drop_duplicates('FULL_ADRES')
test2 = test2.drop_duplicates('FULL_ADRES')

# 공통된 column을 기준으로 train2와 test2 합치기
data = pd.concat([train2, test2], ignore_index=True)

data = data.drop_duplicates('FULL_ADRES')

# 아파트-한강중심 하버사인 거리
lat1 = np.deg2rad(37.5104939)
lat2 = np.deg2rad(data['Y_CODE'])
lon1 = np.deg2rad(126.9817655)
lon2 = np.deg2rad(data['X_CODE'])
dlat = lat2 - lat1
dlon = lon1 - lon2
a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
c = 2 * np.arcsin(np.sqrt(a))
data['RIVER_CEN_DIST'] = c * 6371 * 1000

# 아파트-서울중심 하버사인 거리
lat1 = np.deg2rad(37.5518911)
lat2 = np.deg2rad(data['Y_CODE'])
lon1 = np.deg2rad(126.9917937)
lon2 = np.deg2rad(data['X_CODE'])
dlat = lat2 - lat1
dlon = lon1 - lon2
a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
c = 2 * np.arcsin(np.sqrt(a))
data['SEOUL_CEN_DIST'] = c * 6371 * 1000

data = data.drop(columns=['X_CODE','Y_CODE'])

train = pd.merge(train, data, how = 'left', on = 'FULL_ADRES')
test = pd.merge(test, data, how = 'left', on = 'FULL_ADRES')


### 3-14. 필요없는 feature drop

In [None]:
train = train.drop(columns=[
                            # 아파트 구분 
                            'ADRES', # 시군구
                            'LOT_NO', # 번지
                            'APT_NM', # 아파트명
                            # 'FULL_ADRES',
                            'ADRES_DORO', # 도로명
                            'DONGAPTNM_SPACE',
                            
                            # 계약일 구분
                            # CONTR_YEAR_MONTH 계약년월
                            'CONTR_DAY', # 계약일
                            
                            # 충별 구분
                            # FLOOR
                            
                            # 거래유형
                            # EDA 필요
                            'TRADE_TYPE',
                                                        
                            
                            # EDA 결론 : 의미 없음 
                            # 확정
                            # 건축년도
                            'BUILD_YEAR',
                            # 복도 유형
                            'CRRDPR_TY',
                            # 단지 신청일
                            'USE_RQSTDT',
                            
                            # 미확정
                            # 중개사소재지
                            'AGNCY_LOCATION',
                            # 단지분류(아파트,주상복합)
                            'CODEAPTNM',
                            # 세대 타입
                            'HSHLDR_TY',
                            # 난방 방식
                            'HEAT_MTHD',
                            # 전체 동수
                            'ALL_DONG_CO',
                            # 전체 세대수
                            'ALL_HSHLD_CO',
                            # 건설사
                            'CO_WO',
                            # 시행사
                            'CO_EX',
                            # 연면적
                            'TOTAR',
                            # 주거 전용 면적
                            'PRIVAREA',
                            # 전용면적별 세대현황
                            'KAPTMPAREA60', 'KAPTMPAREA85', 'KAPTMPAREA135', 'KAPTMPAREA136',
                            # 세대 전기 계약 방법
                            'HSHLD_ELCTY_CNTRCT_MTH',
                            # 건축 면적
                            'BU_AR',
                            # 주차 대수
                            'CNT_PA',
                            # 기타 / 의무 / 임대 / 임의 1 2 3 4
                            'GUBUN',
                            
                            #
                            # 'SUBWAY_DIST',
                            # 'FLOATING_POPULATION',
                            # '1STSUBAREA',
                            # '2NDSUBAREA',
                            # 'BUS_DIST',
                            'NOMINAL_GDP',
                            'REDEVELOP',
                            'CLASS_NUM',
                            'DONGAPT_NM',
                            # 'INTEREST_RATE',
                            
                            # 향후 조정 가능
                            'CANCEL_REASON_DATE',
                            'VOTERATE',
                            'FULL_ADRES',
                            '1STSUBAREA',
                            '2NDSUBAREA',
                            'RIVER_CEN_DIST',
                            'CONTR',
                            'DONG',
                            'EXCL_AREA_SQM',
                            'SCHOOL_DISTRICT'
                            ])
test = test.drop(columns=[
                            # 아파트 구분 
                            'ADRES', # 시군구
                            'LOT_NO', # 번지
                            'APT_NM', # 아파트명
                            # 'FULL_ADRES',
                            'ADRES_DORO', # 도로명
                            'DONGAPTNM_SPACE',
                            
                            # 계약일 구분
                            # CONTR_YEAR_MONTH 계약년월
                            'CONTR_DAY', # 계약일
                            
                            # 충별 구분
                            # FLOOR
                            
                            # 거래유형
                            # EDA 필요
                            'TRADE_TYPE',
                                                        
                            
                            # EDA 결론 : 의미 없음 
                            # 확정
                            # 건축년도
                            'BUILD_YEAR',
                            # 복도 유형
                            'CRRDPR_TY',
                            # 단지 신청일
                            'USE_RQSTDT',
                            
                            # 미확정
                            # 중개사소재지
                            'AGNCY_LOCATION',
                            # 단지분류(아파트,주상복합)
                            'CODEAPTNM',
                            # 세대 타입
                            'HSHLDR_TY',
                            # 난방 방식
                            'HEAT_MTHD',
                            # 전체 동수
                            'ALL_DONG_CO',
                            # 전체 세대수
                            'ALL_HSHLD_CO',
                            # 건설사
                            'CO_WO',
                            # 시행사
                            'CO_EX',
                            # 연면적
                            'TOTAR',
                            # 주거 전용 면적
                            'PRIVAREA',
                            # 전용면적별 세대현황
                            'KAPTMPAREA60', 'KAPTMPAREA85', 'KAPTMPAREA135', 'KAPTMPAREA136',
                            # 세대 전기 계약 방법
                            'HSHLD_ELCTY_CNTRCT_MTH',
                            # 건축 면적
                            'BU_AR',
                            # 주차 대수
                            'CNT_PA',
                            # 기타 / 의무 / 임대 / 임의 1 2 3 4
                            'GUBUN',
                            
                            #
                            # 'SUBWAY_DIST',
                            # 'FLOATING_POPULATION',
                            # '1STSUBAREA',
                            # '2NDSUBAREA',
                            # 'BUS_DIST',
                            'NOMINAL_GDP',
                            'REDEVELOP',
                            'CLASS_NUM',
                            'DONGAPT_NM',
                            # 'INTEREST_RATE',
                            
                            # 향후 조정 가능
                            'CANCEL_REASON_DATE',
                            'VOTERATE',
                            'FULL_ADRES',
                            '1STSUBAREA',
                            '2NDSUBAREA',
                            'RIVER_CEN_DIST',
                            'CONTR',
                            'DONG',
                            'EXCL_AREA_SQM',
                            'SCHOOL_DISTRICT'
                            ])

In [None]:
train1 = train.copy()
test1 = test.copy()

# Optuna

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import KFold


In [None]:
# 파생변수 제작으로 추가된 변수들이 존재하기에, 다시한번 연속형과 범주형 칼럼을 분리해주겠습니다.
continuous_columns_v2 = []
categorical_columns_v2 = []

for column in train1.columns:
    if pd.api.types.is_numeric_dtype(train1[column]):
        continuous_columns_v2.append(column)
    else:
        categorical_columns_v2.append(column)

print("연속형 변수:", continuous_columns_v2)
print("범주형 변수:", categorical_columns_v2)

연속형 변수: ['EXCL_AREA_SQM', 'CONTR_YEAR_MONTH', 'CONTR_DAY', 'FLOOR', 'BUILD_YEAR', 'CANCEL_REASON_DATE', 'ALL_DONG_CO', 'ALL_HSHLD_CO', 'TOTAR', 'PRIVAREA', 'KAPTMPAREA60', 'KAPTMPAREA85', 'KAPTMPAREA135', 'KAPTMPAREA136', 'BU_AR', 'CNT_PA', 'target', 'FLOATING_POPULATION', 'SUBWAY_DIST', '1STSUBAREA', '2NDSUBAREA', 'BUS_DIST', 'X_CODE', 'Y_CODE', 'BRIDGE_DIST', 'LEASE_RATE', 'CONTR', 'INTEREST_RATE', 'SCHOOL_DISTRICT', 'REDEVELOP', 'CLASS_NUM', 'VOTERATE', 'REAL_GDP']
범주형 변수: ['ADRES', 'LOT_NO', 'APT_NM', 'ADRES_DORO', 'TRADE_TYPE', 'AGNCY_LOCATION', 'CODEAPTNM', 'HSHLDR_TY', 'CRRDPR_TY', 'HEAT_MTHD', 'CO_WO', 'CO_EX', 'HSHLD_ELCTY_CNTRCT_MTH', 'GUBUN', 'USE_RQSTDT', 'GU', 'DONG', 'FULL_ADRES', 'NOMINAL_GDP', 'DONGAPT_NM']


In [None]:
for col in categorical_columns_v2:
    lbl = OrdinalEncoder()
    # train1[col] = lbl.fit_transform(train1[col])
    # test1[col] = lbl.transform(test1[col])
    train1[col] = lbl.fit_transform(train1[col].astype(str).values.reshape(-1, 1))
    test1[col] = lbl.transform(test1[col].astype(str).values.reshape(-1, 1))

ValueError: Found unknown categories ['240', '67-5', '344-6', '443-6', '142-16', '1-56', '253-89', '636-11', '517-39', '550-20', '717-23', '59-9'] in column 0 during transform

### NULL 처리 -> f.dropna(axis=0)

In [None]:
train1.dropna(axis=0)
test1.dropna(axis=0)

### NULL 처리 -> KNN

In [None]:
from sklearn.impute import KNNImputer

#임퓨터 선언(5개의 평균으로 계산하겠다)
imputer=KNNImputer(n_neighbors=5)

#임퓨터를 사용하여 filled_train으로 저장 이후 같은 임퓨터를 사용할때는 imputer.transform()으로 사용하면됨
filled_train=imputer.fit_transform(train1)
filled_test=imputer.transform(test1)

#사용하면 array값으로 나오기때문에 dataframe으로 바꿔주고 컬럼을가져옴
train1=pd.DataFrame(filled_train, columns=train1.columns)
test1=pd.DataFrame(filled_test, columns=test1.columns)

In [None]:
# Target과 독립변수들을 분리해줍니다.
y = train1['target']
X = train1.drop(['target'], axis=1)
X_test = test1.copy()

# Hold out split을 사용해 학습 데이터와 검증 데이터를 8:2 비율로 나누겠습니다.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1996)

# AutoML

In [42]:
# load modules
import numpy as np
import pandas as pd

# split
from sklearn.model_selection import train_test_split

# models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# metrics
from sklearn.metrics import mean_squared_log_error

# GridSearchCV
# 최적의 파라미터 값 찾아보기
from sklearn.model_selection import GridSearchCV

# K-FOLD
from sklearn.model_selection import StratifiedKFold

# Using AutoML
from supervised import AutoML

# One-hot-encoding
from sklearn.preprocessing import OneHotEncoder

# Target-Encoding
from category_encoders.target_encoder import TargetEncoder

import random
# Auto ML (mljar-supervised)
from supervised.automl import AutoML
import os

In [45]:
automl = AutoML(mode='Compete',
                    ml_task="regression",
                    algorithms=[
#                         "Baseline",
#                         "Linear",
                        # "Decision Tree",
                        # "Random Forest",
                        # "Extra Trees",
                        "LightGBM",
                        "Xgboost",
                        "CatBoost",
                        ],
                    eval_metric="rmse",
                    validation_strategy={
                        "validation_type": "kfold",
                        "k_folds": 5,
                        "shuffle": True,
                        "stratify": False,
                        "random_seed": int("C0FFEE", 16)
                        },
                    explain_level = 2,
                    stack_models = True,
                    random_state=int("CAFE", 16),
                    n_jobs = -1
                  )
# n_jobs: Union[int, NoneType] = None : 병렬로 실행할 경우, job 개수, -1이면 전체 processor 사용

In [46]:
automl.fit(X, y)

AutoML directory: AutoML_1
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['LightGBM', 'Xgboost', 'CatBoost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models




1_Default_LightGBM rmse 39993.395103 trained in 202.59 seconds




2_Default_Xgboost rmse 40192.372009 trained in 109.8 seconds




3_Default_CatBoost rmse 39936.900889 trained in 59.01 seconds
* Step not_so_random will try to check up to 27 models




13_LightGBM rmse 39992.358727 trained in 195.76 seconds




4_Xgboost rmse 40934.530108 trained in 256.66 seconds




22_CatBoost rmse 42204.199033 trained in 217.74 seconds




14_LightGBM rmse 40141.11028 trained in 1526.77 seconds
Skip golden_features because of the time limit.
Skip kmeans_features because of the time limit.
Not enough time to perform features selection. Skip
Time needed for features selection ~ 428.0 seconds
Please increase total_time_limit to at least (4337 seconds) to have features selection
Skip insert_random_feature because no parameters were generated.
Skip features_selection because no parameters were generated.
* Step hill_climbing_1 will try to check up to 11 models




23_CatBoost rmse 39936.981271 trained in 69.79 seconds
* Step hill_climbing_2 will try to check up to 14 models




24_CatBoost rmse 39937.195171 trained in 109.43 seconds




25_CatBoost rmse 39936.794466 trained in 33.99 seconds




26_CatBoost rmse 39937.123093 trained in 123.07 seconds
* Step boost_on_errors will try to check up to 1 model




25_CatBoost_BoostOnErrors rmse 39939.350731 trained in 33.54 seconds
* Step ensemble will try to check up to 1 model
Ensemble rmse 39936.76844 trained in 0.56 seconds
* Step stack will try to check up to 11 models




25_CatBoost_Stacked rmse 39946.505666 trained in 15.08 seconds




13_LightGBM_Stacked rmse 39924.320087 trained in 34.06 seconds




2_Default_Xgboost_Stacked rmse 39947.624063 trained in 32.47 seconds




3_Default_CatBoost_Stacked rmse 39948.260412 trained in 24.88 seconds




1_Default_LightGBM_Stacked rmse 39928.52528 trained in 52.95 seconds




4_Xgboost_Stacked rmse 39979.487798 trained in 33.42 seconds




23_CatBoost_Stacked rmse 39947.171584 trained in 26.6 seconds




14_LightGBM_Stacked rmse 39935.07906 trained in 54.31 seconds




26_CatBoost_Stacked rmse 39944.436896 trained in 51.88 seconds




24_CatBoost_Stacked rmse 39942.734436 trained in 41.75 seconds




22_CatBoost_Stacked rmse 41446.496084 trained in 32.44 seconds
* Step ensemble_stacked will try to check up to 1 model
Ensemble_Stacked rmse 39921.973416 trained in 2.02 seconds
AutoML fit time: 3386.12 seconds
AutoML best model: Ensemble_Stacked


In [47]:
pred = automl.predict(X_test)

In [48]:
submission['target'] = pred
submission = pd.DataFrame(pred.astype(int),columns=['target'])
submission.to_csv('sub.csv',index=False)

# Optuna

In [32]:
# def rmse_cv(model, X = X, y = y):   
#     return np.sqrt(-cross_val_score(model, X, y, scoring = 'neg_mean_squared_error', cv = kf)).mean()

In [666]:
def objective(trial):
    params = {
    'n_estimators': trial.suggest_int('n_estimators', 50,750),
    'max_depth': trial.suggest_int('max_depth', 3, 50),
    'min_child_weight': trial.suggest_float('min_child_weight', 2,50),
    'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.2, log = True),
    'subsample': trial.suggest_float('subsample', 0.2, 1),
    'gamma': trial.suggest_float('gamma', 1e-4, 1.0),
    'colsample_bytree': trial.suggest_float('colsample_bytree',0.2,1),
    'colsample_bylevel': trial.suggest_float('colsample_bylevel',0.2,1),
    'colsample_bynode': trial.suggest_float('colsample_bynode', 0.2, 1)
    }
    
    # KFold : 회귀모델 / StratifiedKFold : 분류모델
    # kf = KFold(n_splits = 5, shuffle = True, random_state = 1996) 
    
    xgbmodel_optuna = XGBRegressor(**params, random_state = 1996, tree_method = 'gpu_hist')
    # xgbmodel_optuna = XGBClassifier(**params, random_state = 1996)
    
    # cv = -cross_val_score(xgbmodel_optuna, X, y, scoring = 'neg_mean_squared_error', cv = 5).mean()
    cv = np.sqrt(-cross_val_score(xgbmodel_optuna, X, y, scoring = 'neg_mean_squared_error', cv = KFold(n_splits=5))).mean()

    return cv

In [667]:
%%time
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

[I 2024-01-24 02:50:04,947] A new study created in memory with name: no-name-04330bde-7e41-4a79-bc7f-5d46c3696191
[I 2024-01-24 02:50:15,265] Trial 0 finished with value: 42129.452871095215 and parameters: {'n_estimators': 643, 'max_depth': 8, 'min_child_weight': 25.17251817747789, 'learning_rate': 0.08925117077296603, 'subsample': 0.9974298522632519, 'gamma': 0.4890935678498466, 'colsample_bytree': 0.3187239235525898, 'colsample_bylevel': 0.9386462350072076, 'colsample_bynode': 0.5057463725074234}. Best is trial 0 with value: 42129.452871095215.
[I 2024-01-24 02:50:31,368] Trial 1 finished with value: 43557.68784543331 and parameters: {'n_estimators': 545, 'max_depth': 39, 'min_child_weight': 49.97548642146382, 'learning_rate': 0.002954662504836596, 'subsample': 0.7143656772144622, 'gamma': 0.6574821369401869, 'colsample_bytree': 0.32729882995073895, 'colsample_bylevel': 0.7802292464562464, 'colsample_bynode': 0.7550668557415812}. Best is trial 0 with value: 42129.452871095215.
[I 202

In [246]:
# optuna가 시도했던 모든 실험 관련 데이터
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bylevel,params_colsample_bynode,params_colsample_bytree,params_gamma,params_learning_rate,params_max_depth,params_min_child_weight,params_n_estimators,params_subsample,state
0,0,31469.37,2024-01-24 01:16:54.692847,2024-01-24 01:17:57.595675,0 days 00:01:02.902828,0.77,0.73,0.75,0.14,0.13,27,10.28,663,0.90,COMPLETE
1,1,57000.99,2024-01-24 01:17:57.596420,2024-01-24 01:18:00.535913,0 days 00:00:02.939493,0.66,0.55,0.33,0.33,0.00,5,23.05,403,0.39,COMPLETE
2,2,52800.15,2024-01-24 01:18:00.536657,2024-01-24 01:18:02.088061,0 days 00:00:01.551404,0.24,0.49,0.79,0.03,0.00,8,2.55,162,0.42,COMPLETE
3,3,37816.38,2024-01-24 01:18:02.088870,2024-01-24 01:18:08.127701,0 days 00:00:06.038831,0.44,0.28,0.88,0.58,0.00,30,24.22,683,0.71,COMPLETE
4,4,45249.07,2024-01-24 01:18:08.128774,2024-01-24 01:18:12.777291,0 days 00:00:04.648517,0.78,0.24,0.33,0.77,0.00,37,39.68,610,0.66,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,31464.66,2024-01-24 01:54:47.637852,2024-01-24 01:55:14.517485,0 days 00:00:26.879633,0.89,0.68,0.62,0.09,0.05,43,4.09,553,0.93,COMPLETE
96,96,31478.49,2024-01-24 01:55:14.518585,2024-01-24 01:55:28.908295,0 days 00:00:14.389710,0.88,0.68,0.44,0.17,0.05,39,4.53,502,0.92,COMPLETE
97,97,31470.56,2024-01-24 01:55:28.909178,2024-01-24 01:56:15.546742,0 days 00:00:46.637564,1.00,0.70,0.62,0.13,0.03,43,7.53,683,0.88,COMPLETE
98,98,31503.40,2024-01-24 01:56:15.547827,2024-01-24 01:56:39.381710,0 days 00:00:23.833883,0.79,0.62,0.65,0.10,0.05,50,11.02,566,0.29,COMPLETE


In [247]:
# APT_AVG -> INT
# Best trial: score 42086.636990635845, 
# params {'n_estimators': 535, 'max_depth': 50, 'min_child_weight': 8.308263936568466, 'learning_rate': 0.19197345167546542, 'subsample': 0.220274354462544, 'gamma': 0.9543048581305651, 'colsample_bytree': 0.8864539840448289, 'colsample_bylevel': 0.23449090220318408, 'colsample_bynode': 0.8964996378998393}
# learning_rate 0.7

# APT_AVG -> Onehot
# Best trial: score 31445.105286431597, 
# params {'n_estimators': 375, 'max_depth': 43, 'min_child_weight': 3.9491734749656957, 'learning_rate': 0.028582252117302465, 'subsample': 0.3583955512035071, 'gamma': 0.2330615765664894, 'colsample_bytree': 0.47849566217633194, 'colsample_bylevel': 0.9150093920245795, 'colsample_bynode': 0.8446544300945146}

print('Best trial: score {}, \nparams {}'.format(study.best_trial.value,study.best_trial.params))

Best trial: score 31445.105286431597, 
params {'n_estimators': 375, 'max_depth': 43, 'min_child_weight': 3.9491734749656957, 'learning_rate': 0.028582252117302465, 'subsample': 0.3583955512035071, 'gamma': 0.2330615765664894, 'colsample_bytree': 0.47849566217633194, 'colsample_bylevel': 0.9150093920245795, 'colsample_bynode': 0.8446544300945146}


In [254]:
# Hyperparameter Importances를 통해서 parameter를 고정시켜라.
# 그리고 나머지 것들을 진행시켜라.
optuna.visualization.plot_param_importances(study)

In [255]:
optuna.visualization.plot_optimization_history(study)

In [256]:
# model finalization
# optuna는 hyperparameter를 조정.

best_params = study.best_params
best_model = XGBRegressor(**best_params, 
                        random_state=1996, 
                        tree_method = 'gpu_hist',
                       )

pred = best_model.fit(
                X,
                y,
                eval_set = [(X_train, y_train), (X_valid, y_valid)],
            eval_metric = "rmse",
            early_stopping_rounds=100,
            verbose=False
               ).predict(X_test)

submission = pd.DataFrame(pred.astype(int),columns=['target'])
submission.to_csv('sub.csv',index=False)
# print(f'RMSE test: {np.sqrt(metrics.mean_squared_error(y_valid, pred))}')

Val ACC
