In [2]:
# visualization
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
fe = fm.FontEntry(
    fname=r'/usr/share/fonts/truetype/nanum/NanumGothic.ttf', # ttf 파일이 저장되어 있는 경로
    name='NanumBarunGothic')                        # 이 폰트의 원하는 이름 설정
fm.fontManager.ttflist.insert(0, fe)              # Matplotlib에 폰트 추가
plt.rcParams.update({'font.size': 10, 'font.family': 'NanumBarunGothic'}) # 폰트 설정
plt.rc('font', family='NanumBarunGothic')
import seaborn as sns

# utils
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import warnings;warnings.filterwarnings('ignore')

# Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import metrics

import eli5
from eli5.sklearn import PermutationImportance

In [5]:
# 필요한 데이터를 load 하겠습니다. 경로는 환경에 맞게 지정해주면 됩니다.
train_path = '/data/ephemeral/home/Upstage_ML_Competition01/data/train.csv'
test_path  = '/data/ephemeral/home/Upstage_ML_Competition01/data/test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

## Data Preprocessing

In [6]:
# train/test 구분을 위한 칼럼을 하나 만들어 줍니다.
train['is_test'] = 0
test['is_test'] = 1
concat = pd.concat([train, test])     # 하나의 데이터로 만들어줍니다.

In [7]:
concat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1128094 entries, 0 to 9271
Data columns (total 53 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   시군구                     1128094 non-null  object 
 1   번지                      1127867 non-null  object 
 2   본번                      1128019 non-null  float64
 3   부번                      1128019 non-null  float64
 4   아파트명                    1125958 non-null  object 
 5   전용면적(㎡)                 1128094 non-null  float64
 6   계약년월                    1128094 non-null  int64  
 7   계약일                     1128094 non-null  int64  
 8   층                       1128094 non-null  int64  
 9   건축년도                    1128094 non-null  int64  
 10  도로명                     1128094 non-null  object 
 11  해제사유발생일                 6195 non-null     float64
 12  등기신청일자                  1128094 non-null  object 
 13  거래유형                    1128094 non-null  object 
 14  중개사소재

In [8]:
# 위 처럼 아무 의미도 갖지 않는 칼럼은 결측치와 같은 역할을 하므로, np.nan으로 채워 결측치로 인식되도록 합니다.
concat['등기신청일자'] = concat['등기신청일자'].replace(' ', np.nan)
concat['거래유형'] = concat['거래유형'].replace('-', np.nan)
concat['중개사소재지'] = concat['중개사소재지'].replace('-', np.nan)

In [9]:
# 시군구, 년월 등 분할할 수 있는 변수들은 세부사항 고려를 용이하게 하기 위해 모두 분할해 주겠습니다.
concat['구'] = concat['시군구'].map(lambda x : x.split()[1])
concat['동'] = concat['시군구'].map(lambda x : x.split()[2])
del concat['시군구']

concat['계약년'] = concat['계약년월'].astype('str').map(lambda x : x[:4])
concat['계약월'] = concat['계약년월'].astype('str').map(lambda x : x[4:])
del concat['계약년월']

In [10]:
concat.columns

Index(['번지', '본번', '부번', '아파트명', '전용면적(㎡)', '계약일', '층', '건축년도', '도로명',
       '해제사유발생일', '등기신청일자', '거래유형', '중개사소재지', 'k-단지분류(아파트,주상복합등등)', 'k-전화번호',
       'k-팩스번호', '단지소개기존clob', 'k-세대타입(분양형태)', 'k-관리방식', 'k-복도유형', 'k-난방방식',
       'k-전체동수', 'k-전체세대수', 'k-건설사(시공사)', 'k-시행사', 'k-사용검사일-사용승인일', 'k-연면적',
       'k-주거전용면적', 'k-관리비부과면적', 'k-전용면적별세대현황(60㎡이하)', 'k-전용면적별세대현황(60㎡~85㎡이하)',
       'k-85㎡~135㎡이하', 'k-135㎡초과', 'k-홈페이지', 'k-등록일자', 'k-수정일자', '고용보험관리번호',
       '경비비관리형태', '세대전기계약방법', '청소비관리형태', '건축면적', '주차대수', '기타/의무/임대/임의=1/2/3/4',
       '단지승인일', '사용허가여부', '관리비 업로드', '좌표X', '좌표Y', '단지신청일', 'target',
       'is_test', '구', '동', '계약년', '계약월'],
      dtype='object')

In [11]:
concat = concat.rename(columns={'전용면적(㎡)':'전용면적',
                                'k-단지분류(아파트,주상복합등등)' : '단지분류',
                                'k-전화번호' : '전화번호', 
                                'k-팩스번호' : '팩스번호',
                                'k-세대타입(분양형태)' : '분양형태',
                                'k-관리방식' : '관리방식',
                                'k-복도유형' : '복도유형',
                                'k-난방방식' : '난방방식',
                                'k-전체동수' : '전체동수',
                                'k-전체세대수' : '전체세대수',
                                'k-건설사(시공사)' : '시공사',
                                'k-시행사' : '시행사', 
                                'k-사용검사일-사용승인일' : '사용검사일-사용승인일',
                                'k-연면적' : '연면적',
                                'k-주거전용면적' : '주거전용면적',
                                'k-관리비부과면적' : '관리비부과면적',
                                'k-전용면적별세대현황(60㎡이하)' : '전용면적별세대현황(60㎡이하)',
                                'k-전용면적별세대현황(60㎡~85㎡이하)' : '전용면적별세대현황(60㎡~85㎡이하)',
                                'k-85㎡~135㎡이하' : '전용면적별세대현황(85㎡~135㎡이하)',
                                'k-135㎡초과' : '전용면적별세대현황(135㎡초과)',
                                'k-홈페이지' : '홈페이지',
                                'k-등록일자' : '등록일자',
                                'k-수정일자' : '수정일자',
                                '기타/의무/임대/임의=1/2/3/4' : '권리유형',
                                'target' : '집값(target)',
                                'is_test' : '테스트셋여부'
                                })

In [12]:
concat.columns

Index(['번지', '본번', '부번', '아파트명', '전용면적', '계약일', '층', '건축년도', '도로명', '해제사유발생일',
       '등기신청일자', '거래유형', '중개사소재지', '단지분류', '전화번호', '팩스번호', '단지소개기존clob',
       '분양형태', '관리방식', '복도유형', '난방방식', '전체동수', '전체세대수', '시공사', '시행사',
       '사용검사일-사용승인일', '연면적', '주거전용면적', '관리비부과면적', '전용면적별세대현황(60㎡이하)',
       '전용면적별세대현황(60㎡~85㎡이하)', '전용면적별세대현황(85㎡~135㎡이하)', '전용면적별세대현황(135㎡초과)',
       '홈페이지', '등록일자', '수정일자', '고용보험관리번호', '경비비관리형태', '세대전기계약방법', '청소비관리형태',
       '건축면적', '주차대수', '권리유형', '단지승인일', '사용허가여부', '관리비 업로드', '좌표X', '좌표Y',
       '단지신청일', '집값(target)', '테스트셋여부', '구', '동', '계약년', '계약월'],
      dtype='object')

In [13]:
using_list = ['구', '동','계약년', '계약월', '아파트명', '전용면적', '층', '건축년도',
       '도로명', '거래유형', '단지분류', '분양형태', '전체동수', '전체세대수', '시공사',
       '시행사', '연면적', '주거전용면적', '관리비부과면적', '전용면적별세대현황(60㎡이하)',
       '전용면적별세대현황(60㎡~85㎡이하)', '전용면적별세대현황(85㎡~135㎡이하)', '전용면적별세대현황(135㎡초과)',
       '건축면적', '주차대수', '권리유형', '단지신청일', '집값(target)', '테스트셋여부']

In [14]:
using_concat = concat[using_list]
using_concat

Unnamed: 0,구,동,계약년,계약월,아파트명,전용면적,층,건축년도,도로명,거래유형,...,전용면적별세대현황(60㎡이하),전용면적별세대현황(60㎡~85㎡이하),전용면적별세대현황(85㎡~135㎡이하),전용면적별세대현황(135㎡초과),건축면적,주차대수,권리유형,단지신청일,집값(target),테스트셋여부
0,강남구,개포동,2017,12,개포6차우성,79.97,3,1987,언주로 3,,...,20.0,250.0,0.0,,4858.0,262.0,임의,2022-11-17 10:19:06.0,124000.0,0
1,강남구,개포동,2017,12,개포6차우성,79.97,4,1987,언주로 3,,...,20.0,250.0,0.0,,4858.0,262.0,임의,2022-11-17 10:19:06.0,123500.0,0
2,강남구,개포동,2017,12,개포6차우성,54.98,5,1987,언주로 3,,...,20.0,250.0,0.0,,4858.0,262.0,임의,2022-11-17 10:19:06.0,91500.0,0
3,강남구,개포동,2018,01,개포6차우성,79.97,4,1987,언주로 3,,...,20.0,250.0,0.0,,4858.0,262.0,임의,2022-11-17 10:19:06.0,130000.0,0
4,강남구,개포동,2018,01,개포6차우성,79.97,2,1987,언주로 3,,...,20.0,250.0,0.0,,4858.0,262.0,임의,2022-11-17 10:19:06.0,117000.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9267,중랑구,신내동,2023,07,신내우디안1단지,84.65,13,2014,신내역로1길 85,직거래,...,808.0,504.0,90.0,,14171.0,1568.0,의무,2014-09-01 13:05:03.0,,1
9268,중랑구,신내동,2023,07,신내우디안1단지,84.62,12,2014,신내역로1길 85,중개거래,...,808.0,504.0,90.0,,14171.0,1568.0,의무,2014-09-01 13:05:03.0,,1
9269,중랑구,신내동,2023,08,신내우디안1단지,101.65,12,2014,신내역로1길 85,중개거래,...,808.0,504.0,90.0,,14171.0,1568.0,의무,2014-09-01 13:05:03.0,,1
9270,중랑구,신내동,2023,09,신내우디안1단지,84.94,18,2014,신내역로1길 85,중개거래,...,808.0,504.0,90.0,,14171.0,1568.0,의무,2014-09-01 13:05:03.0,,1


In [15]:
cat_features = using_concat.select_dtypes(include=['object']).columns.to_list()
print("== Categorical ==")
print(cat_features)
num_features = using_concat.select_dtypes(exclude='object').columns.to_list()
print("== Numerical ==")
print(num_features)

== Categorical ==
['구', '동', '계약년', '계약월', '아파트명', '도로명', '거래유형', '단지분류', '분양형태', '시공사', '시행사', '권리유형', '단지신청일']
== Numerical ==
['전용면적', '층', '건축년도', '전체동수', '전체세대수', '연면적', '주거전용면적', '관리비부과면적', '전용면적별세대현황(60㎡이하)', '전용면적별세대현황(60㎡~85㎡이하)', '전용면적별세대현황(85㎡~135㎡이하)', '전용면적별세대현황(135㎡초과)', '건축면적', '주차대수', '집값(target)', '테스트셋여부']


In [16]:
def floor_to_categorical(floor) :
    if floor > 30:
        return "고층"
    elif floor >= 15 and floor <= 30:
        return "로열층"
    elif floor >= 0 and floor < 15:
        return "저층"
    elif floor < 0:
        return "지하"
    else :
        return "NaN"

In [17]:
# 건축년도 분포는 아래와 같습니다. 특히 2005년이 Q3에 해당합니다.
# 2009년 이후에 지어진 건물은 10%정도 되는 것을 확인할 수 있습니다.
using_concat['건축년도'].describe(percentiles = [0.1, 0.25, 0.5, 0.75, 0.8, 0.9])

count    1.128094e+06
mean     1.998791e+03
std      9.358540e+00
min      1.961000e+03
10%      1.986000e+03
25%      1.992000e+03
50%      2.000000e+03
75%      2.005000e+03
80%      2.006000e+03
90%      2.010000e+03
max      2.023000e+03
Name: 건축년도, dtype: float64

In [18]:
def building_state(building_year):
    if building_year >= 2009:
        return "신축"
    elif building_year < 2009 and building_year >= 2000:
        return "보통"
    elif building_year < 2000:
        return "구식"
    else :
        return "NaN"


In [19]:
using_concat["층높이"] = using_concat["층"].apply(floor_to_categorical)

In [20]:
using_concat["신축여부"] = using_concat["건축년도"].apply(floor_to_categorical)

In [21]:
using_concat["계약년"] = using_concat["계약년"].astype(str)
using_concat["계약월"] = using_concat["계약월"].astype(str)

In [22]:
all = list(using_concat['구'].unique())
gangnam = ['강서구', '영등포구', '동작구', '서초구', '강남구', '송파구', '강동구']
gangbuk = [x for x in all if x not in gangnam]

assert len(all) == len(gangnam) + len(gangbuk)       # 알맞게 분리되었는지 체크합니다.

In [23]:
# 강남의 여부를 체크합니다.
is_gangnam = []
for x in using_concat['구'].tolist() :
  if x in gangnam :
    is_gangnam.append(1)
  else :
    is_gangnam.append(0)

# 파생변수를 하나 만릅니다.
using_concat['강남여부'] = is_gangnam

In [24]:
# 이상치 제거 방법에는 IQR을 이용하겠습니다.
def remove_outliers_iqr(dt, column_list):
    df = dt.query('테스트셋여부 == 0')       # train data 내에 있는 이상치만 제거하도록 하겠습니다.
    df_test = dt.query('테스트셋여부 == 1')
    new_df = pd.DataFrame()

    for col in column_list:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        new_df = pd.concat([new_df, df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]], ignore_index=True)

    result = pd.concat([new_df, df_test])   # test data와 다시 합쳐주겠습니다.
    return result

In [25]:
using_concat = remove_outliers_iqr(using_concat, ['전용면적', '층', '건축년도', '전체동수', '전체세대수', '연면적', '주거전용면적', '관리비부과면적', '전용면적별세대현황(60㎡이하)', '전용면적별세대현황(60㎡~85㎡이하)', '전용면적별세대현황(85㎡~135㎡이하)', '전용면적별세대현황(135㎡초과)', '건축면적', '주차대수'])

In [26]:
cat_features = using_concat.select_dtypes(include=['object']).columns.to_list()
print("== Categorical ==")
print(cat_features)
num_features = using_concat.select_dtypes(exclude='object').columns.to_list()
print("== Numerical ==")
print(num_features)

== Categorical ==
['구', '동', '계약년', '계약월', '아파트명', '도로명', '거래유형', '단지분류', '분양형태', '시공사', '시행사', '권리유형', '단지신청일', '층높이', '신축여부']
== Numerical ==
['전용면적', '층', '건축년도', '전체동수', '전체세대수', '연면적', '주거전용면적', '관리비부과면적', '전용면적별세대현황(60㎡이하)', '전용면적별세대현황(60㎡~85㎡이하)', '전용면적별세대현황(85㎡~135㎡이하)', '전용면적별세대현황(135㎡초과)', '건축면적', '주차대수', '집값(target)', '테스트셋여부', '강남여부']


In [27]:
for cat in using_concat.select_dtypes(include=["object"]).columns.to_list():
        using_concat[cat] = using_concat[cat].fillna("NaN")
        using_concat[cat] = using_concat[cat].astype(str)
for num in using_concat.select_dtypes(exclude=["object"]).columns.to_list():
        using_concat[num] = using_concat[num].fillna(np.nan)

## Variance

1. 시군구
2. 번지
3. 본번 : 도로명 주소에서 00-00 에서 앞부분
4. 부번 : 도로명 주소에서 00-00 에서 뒷부분
5. 아파트명 
6. 전용면적 : 각 세대가 독립적으로 사용하는 전용 공간
7. 계약년월
8. 계약일
9. 층
10. 건축년도
11. 도로명 : 도로명 길번호
12. 해제사유발생일 : 부동산 거래에서 계약이나 조건이 해제되거나 종료되는 날짜를 나타냅니다. 이 날짜는 일반적으로 계약 조건에 따라 정해지며, 거래의 어떤 이유로 인해 계약이 종료되거나 해제되는 경우에 발생
13. 등기신청일자 : 부동산 거래와 관련하여 해당 부동산의 소유자 또는 권리자가 등기청에 소유권 이전이나 다른 부동산 권리에 대한 등기를 신청한 날짜를 의미
14. 거래유형 : 중개거래, 직거래 (대부분 Nan값)
15. 중개사소재지 : 공인중개사 소재지 (특이한 점으로 제주, 경기 등 서울이 아닌 곳도 존재)
16. k-단지분류(아파트,주상복합등등) : ['아파트', '주상복합', nan, '연립주택', '도시형 생활주택(아파트)', '도시형 생활주택(주상복합)']
17. k-전화번호
18. k-팩스번호
19. 단지소개기존clob : ??
20. k-세대타입(분양형태) : ['분양', '기타', nan, '임대'], 분양은 사실상 아파트 매매
21. k-관리방식 :['자치관리', '위탁관리', '직영', nan] 자치관리는 입주자들이 직접 관리자 선출, 위탁관리는 관리업체를 통해 관리, 직영은... 뭔데???
22. k-복도유형 : ['계단식', '혼합식', '복도식', '타워형', '기타', nan]
23. k-난방방식 : ['개별난방', '지역난방', '기타', nan, '중앙난방'] 지역난방: 대규모 열생산시설에서 만들어진 열을 지역에 공급
24. k-전체동수
25. k-전체세대수
26. k-건설사(시공사) : 실제로 건설을 하는 회사(건설 의뢰를 맡은 회사)
27. k-시행사 : 건설을 주관하는 회사(건설 의뢰를 맡긴 회사)
28. k-사용검사일-사용승인일
29. k-전용면적별세대현황(60㎡~85㎡이하)
30. k-85㎡~135㎡이하
31. k-135㎡초과
32. k-홈페이지
33. 건축면적 : 대지에서 건축물이 차지하고 있는 면적을 말한다.
34. 주차대수
35. 기타/의무/임대/임의=1/2/3/4 : 이게 뭔데?  
GPT 산 답변  
기타 (Other): "기타"는 주로 다른 명시된 범주에 해당하지 않는 다양한 부동산 유형이나 거래 유형을 나타냅니다. 예를 들어, 특정 분류에 정확하게 포함되지 않는 특별한 부동산 유형을 지칭할 때 사용될 수 있습니다.  
의무 (Encumbrance): "의무"는 부동산에 대한 부담, 제약, 혹은 제약사항을 나타냅니다. 예를 들어, 저당권, 전세권, 가압류, 귀속조서 등이 부동산에 부과되는 제약사항으로 간주될 수 있습니다.  
임대 (Lease): "임대"는 부동산을 임대하거나 빌릴 때 사용되는 용어입니다. 부동산 소유자는 일정 기간 동안 임차인에게 부동산 사용을 허용하고, 그 대가로 임차인은 임대료를 지불합니다.  
임의 (Arbitrary): "임의"는 주로 부동산에 대한 임의의 권리나 조건을 나타내는데 사용됩니다. 이 용어는 특별한 상황에 따라 다르게 해석될 수 있습니다. 일반적으로 부동산 거래나 계약에서 자유롭게 조정 가능한 부분을 나타낼 때 사용될 수 있습니다.  
36. 단지승인일
37. 사용허가여부
38. 관리비 업로드
39. 좌표X
40. 좌표Y
41. 단지신청일


집 값에 직접적 영향이 갈 Variable : 시군구, 아파트명, 전용면적, 계약년월, 계약일, 층, 건축년도, 도로명, 거래유형, k-단지분류, k-세대타입, k-전체동수, k-전체세대수, k-건설사(시공사), k-시행사, k-전용면적별세대현황(60㎡~85㎡이하), k-85㎡~135㎡이하, k-135㎡초과, 건축면적, 주차대수

## Modeling

In [28]:
from catboost import CatBoostRegressor
import optuna

In [29]:
dt_train = using_concat.query('테스트셋여부==0')
dt_test = using_concat.query('테스트셋여부==1')

In [30]:
del dt_train["테스트셋여부"]
del dt_test["테스트셋여부"]

In [31]:
assert dt_train.shape[1] == dt_test.shape[1]

In [32]:
y = dt_train['집값(target)']
X = dt_train.drop(['집값(target)'], axis=1)

In [33]:
cat_features = X.select_dtypes(include=["object"]).columns.to_list()

In [34]:
def object(trial, data=X, target=y):
    train_X, test_X, train_y, test_y = train_test_split(data, target, test_size=0.20,random_state=42)
    param = {
        'iterations' : trial.suggest_int('iterations', 5000, 10000),
        'learning_rate' : trial.suggest_float('learning_rate', 0.001, 0.01),
        'depth' : trial.suggest_int('depth', 10, 25),
        'l2_leaf_reg' : trial.suggest_float('l2_leaf_reg', 0.5, 50),
        'border_count' : trial.suggest_categorical('border_count',[254, 32, 128]),
        'max_leaves' : trial.suggest_categorical('max_leaves', [63, 127, 255, 512, 1024, 2048]),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 5, 50),
        # 'rsm' : trial.suggest_float('rsm', 0.5, 0.8),
        'early_stopping_rounds' : trial.suggest_categorical('early_stopping_rounds', [10, 20]),
        'grow_policy' : 'Lossguide',
        'loss_function' : 'RMSE',
        'cat_features' : cat_features,
        'task_type' : 'GPU',
        'random_state' : 42
    }
    
    model = CatBoostRegressor(**param)
    cb_model = model.fit(train_X, train_y, verbose=False)


    score = np.sqrt(mean_squared_error(cb_model.predict(test_X), test_y, squared=False))

    return score

In [34]:
study = optuna.create_study(direction='minimize')
study.optimize(object, n_trials = 30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-01-17 09:05:02,307] A new study created in memory with name: no-name-f3003009-2dbd-47b7-9760-944ec10c4eba
[I 2024-01-17 09:13:56,315] Trial 0 finished with value: 4642.263449247117 and parameters: {'iterations': 8354, 'learning_rate': 0.009428279661287068, 'depth': 24, 'l2_leaf_reg': 36.90320741960081, 'border_count': 254, 'max_leaves': 255, 'min_data_in_leaf': 30, 'early_stopping_rounds': 20}. Best is trial 0 with value: 4642.263449247117.
[I 2024-01-17 09:17:40,352] Trial 1 finished with value: 6712.995338728123 and parameters: {'iterations': 8179, 'learning_rate': 0.007927929026075019, 'depth': 15, 'l2_leaf_reg': 19.144401693886866, 'border_count': 32, 'max_leaves': 63, 'min_data_in_leaf': 47, 'early_stopping_rounds': 20}. Best is trial 0 with value: 4642.263449247117.
[I 2024-01-17 09:23:13,035] Trial 2 finished with value: 6286.387632430162 and parameters: {'iterations': 8357, 'learning_rate': 0.0063499556280361994, 'depth': 17, 'l2_leaf_reg': 40.078884613694775, 'border_c

In [45]:
## TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit

def time_series_cv(model) :
    tscv = TimeSeriesSplit(n_splits=10)
    rmse_list = []
    model_name = model.__class__.__name__
    for _, (train_index, test_index) in tqdm(enumerate(tscv.split(X), start = 1), desc=f'{model_name} Cross Validations...', total=10):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clf = model.fit(X_train, y_train)
        pred = clf.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, pred))
        rmse_list.append(rmse)

    return model_name, rmse_list


In [46]:
def print_rmse_score(model):
    model_name, score = time_series_cv(model)
    for i, r in enumerate(score, start=1):
        print(f"{i} FOLDS: {model_name} RMSE:{r}")
    print(f"\n{model_name} mean RMSE: {np.mean(score)}")
    print("="*40)
    return model_name, np.mean(score)

In [49]:
params = {'iterations': 9958,
           'learning_rate': 0.00985608972818923,
           'depth': 20,
           'l2_leaf_reg': 29.062285600871373,
           'border_count': 254,
           'max_leaves': 1024,
           'min_data_in_leaf': 7,
           'early_stopping_rounds': 10,
           'grow_policy' : 'Lossguide',
           'loss_function' : 'RMSE',
           'cat_features' : cat_features,
           'task_type' : 'GPU',
           'random_state' : 42
           }

In [50]:
model = CatBoostRegressor(**params)

model_name, mean_score = print_rmse_score(model)

CatBoostRegressor Cross Validations...:   0%|          | 0/10 [00:00<?, ?it/s]

0:	learn: 42409.4467303	total: 180ms	remaining: 29m 51s
1:	learn: 42051.7781154	total: 355ms	remaining: 29m 28s
2:	learn: 41698.2195666	total: 520ms	remaining: 28m 44s
3:	learn: 41349.2956895	total: 681ms	remaining: 28m 15s
4:	learn: 41003.3675969	total: 844ms	remaining: 28m
5:	learn: 40661.6549963	total: 1.01s	remaining: 27m 50s
6:	learn: 40322.5094757	total: 1.17s	remaining: 27m 41s
7:	learn: 39987.1937595	total: 1.33s	remaining: 27m 34s
8:	learn: 39655.3176838	total: 1.49s	remaining: 27m 29s
9:	learn: 39325.9517715	total: 1.65s	remaining: 27m 25s
10:	learn: 39001.2080426	total: 1.82s	remaining: 27m 22s
11:	learn: 38679.7830904	total: 1.98s	remaining: 27m 20s
12:	learn: 38362.1498323	total: 2.14s	remaining: 27m 18s
13:	learn: 38047.2582585	total: 2.31s	remaining: 27m 17s
14:	learn: 37735.2632376	total: 2.47s	remaining: 27m 15s
15:	learn: 37424.4095631	total: 2.63s	remaining: 27m 13s
16:	learn: 37117.6261946	total: 2.79s	remaining: 27m 11s
17:	learn: 36815.5542280	total: 2.95s	remaini

CatBoostRegressor Cross Validations...:  10%|█         | 1/10 [27:49<4:10:22, 1669.16s/it]

0:	learn: 35855.8729278	total: 189ms	remaining: 31m 19s
1:	learn: 35547.4771404	total: 368ms	remaining: 30m 30s
2:	learn: 35242.4201935	total: 543ms	remaining: 30m
3:	learn: 34939.5807446	total: 716ms	remaining: 29m 42s
4:	learn: 34640.5793919	total: 892ms	remaining: 29m 35s
5:	learn: 34343.5112490	total: 1.07s	remaining: 29m 31s
6:	learn: 34048.9164635	total: 1.24s	remaining: 29m 28s
7:	learn: 33758.0558535	total: 1.42s	remaining: 29m 26s
8:	learn: 33471.1815233	total: 1.6s	remaining: 29m 24s
9:	learn: 33185.8615551	total: 1.77s	remaining: 29m 22s
10:	learn: 32903.9114763	total: 1.95s	remaining: 29m 22s
11:	learn: 32626.1351319	total: 2.12s	remaining: 29m 19s
12:	learn: 32348.4996085	total: 2.3s	remaining: 29m 17s
13:	learn: 32074.0564227	total: 2.47s	remaining: 29m 15s
14:	learn: 31803.9573148	total: 2.65s	remaining: 29m 14s
15:	learn: 31537.4811079	total: 2.82s	remaining: 29m 12s
16:	learn: 31273.1522563	total: 3s	remaining: 29m 11s
17:	learn: 31010.7215854	total: 3.17s	remaining: 2

CatBoostRegressor Cross Validations...:  20%|██        | 2/10 [56:29<3:46:36, 1699.52s/it]

0:	learn: 43157.2414297	total: 198ms	remaining: 32m 56s
1:	learn: 42786.3571602	total: 390ms	remaining: 32m 19s
2:	learn: 42419.2834752	total: 576ms	remaining: 31m 50s
3:	learn: 42056.5264893	total: 761ms	remaining: 31m 34s
4:	learn: 41697.0373203	total: 946ms	remaining: 31m 22s
5:	learn: 41342.1734268	total: 1.13s	remaining: 31m 17s
6:	learn: 40989.4649750	total: 1.31s	remaining: 31m 7s
7:	learn: 40640.4365680	total: 1.5s	remaining: 31m 4s
8:	learn: 40295.4497160	total: 1.68s	remaining: 30m 59s
9:	learn: 39954.2789066	total: 1.86s	remaining: 30m 54s
10:	learn: 39616.7031220	total: 2.05s	remaining: 30m 50s
11:	learn: 39282.0328931	total: 2.23s	remaining: 30m 45s
12:	learn: 38950.5818064	total: 2.41s	remaining: 30m 42s
13:	learn: 38623.2144932	total: 2.59s	remaining: 30m 39s
14:	learn: 38299.0265346	total: 2.77s	remaining: 30m 37s
15:	learn: 37978.0329031	total: 2.95s	remaining: 30m 35s
16:	learn: 37660.2435689	total: 3.13s	remaining: 30m 33s
17:	learn: 37345.8292416	total: 3.32s	remain

CatBoostRegressor Cross Validations...:  30%|███       | 3/10 [1:25:33<3:20:36, 1719.53s/it]

0:	learn: 40951.5959974	total: 207ms	remaining: 34m 20s
1:	learn: 40598.7139717	total: 417ms	remaining: 34m 33s
2:	learn: 40249.2191076	total: 620ms	remaining: 34m 16s
3:	learn: 39902.9407851	total: 817ms	remaining: 33m 52s
4:	learn: 39559.0519433	total: 1.01s	remaining: 33m 30s
5:	learn: 39218.8413853	total: 1.2s	remaining: 33m 18s
6:	learn: 38883.3689175	total: 1.4s	remaining: 33m 7s
7:	learn: 38551.4904303	total: 1.6s	remaining: 33m 12s
8:	learn: 38223.0771553	total: 1.8s	remaining: 33m 7s
9:	learn: 37897.9668498	total: 1.99s	remaining: 33m 3s
10:	learn: 37576.5315453	total: 2.19s	remaining: 32m 57s
11:	learn: 37257.7274222	total: 2.38s	remaining: 32m 55s
12:	learn: 36942.6490438	total: 2.58s	remaining: 32m 51s
13:	learn: 36630.9139479	total: 2.78s	remaining: 32m 52s
14:	learn: 36321.9811084	total: 2.98s	remaining: 32m 52s
15:	learn: 36016.9643968	total: 3.17s	remaining: 32m 51s
16:	learn: 35712.8161497	total: 3.37s	remaining: 32m 48s
17:	learn: 35412.1842447	total: 3.57s	remaining:

CatBoostRegressor Cross Validations...:  40%|████      | 4/10 [1:56:03<2:56:19, 1763.17s/it]

0:	learn: 44190.2401221	total: 221ms	remaining: 36m 40s
1:	learn: 43807.2506779	total: 433ms	remaining: 35m 53s
2:	learn: 43427.9301594	total: 643ms	remaining: 35m 32s
3:	learn: 43052.7457407	total: 849ms	remaining: 35m 12s
4:	learn: 42681.1766672	total: 1.05s	remaining: 34m 51s
5:	learn: 42313.6898227	total: 1.25s	remaining: 34m 41s
6:	learn: 41949.4895285	total: 1.46s	remaining: 34m 31s
7:	learn: 41589.2599584	total: 1.67s	remaining: 34m 35s
8:	learn: 41232.8243012	total: 1.88s	remaining: 34m 39s
9:	learn: 40880.5170143	total: 2.09s	remaining: 34m 38s
10:	learn: 40531.9450536	total: 2.29s	remaining: 34m 30s
11:	learn: 40186.6953318	total: 2.49s	remaining: 34m 24s
12:	learn: 39845.0902082	total: 2.7s	remaining: 34m 24s
13:	learn: 39506.8866958	total: 2.91s	remaining: 34m 25s
14:	learn: 39171.6548184	total: 3.12s	remaining: 34m 25s
15:	learn: 38840.4727725	total: 3.33s	remaining: 34m 26s
16:	learn: 38511.1647769	total: 3.53s	remaining: 34m 25s
17:	learn: 38186.0972109	total: 3.74s	rema

CatBoostRegressor Cross Validations...:  50%|█████     | 5/10 [2:27:14<2:30:10, 1802.03s/it]

0:	learn: 43045.6262989	total: 221ms	remaining: 36m 36s
1:	learn: 42671.4334951	total: 429ms	remaining: 35m 34s
2:	learn: 42300.8529412	total: 635ms	remaining: 35m 8s
3:	learn: 41934.5821800	total: 844ms	remaining: 34m 59s
4:	learn: 41572.0152263	total: 1.06s	remaining: 35m 17s
5:	learn: 41213.2111592	total: 1.28s	remaining: 35m 23s
6:	learn: 40857.8746979	total: 1.49s	remaining: 35m 17s
7:	learn: 40504.9800686	total: 1.7s	remaining: 35m 12s
8:	learn: 40156.8804559	total: 1.91s	remaining: 35m 10s
9:	learn: 39812.6096535	total: 2.12s	remaining: 35m 4s
10:	learn: 39471.9156609	total: 2.33s	remaining: 35m 2s
11:	learn: 39134.5570372	total: 2.54s	remaining: 35m 3s
12:	learn: 38800.3857331	total: 2.74s	remaining: 34m 58s
13:	learn: 38469.5147316	total: 2.96s	remaining: 35m 1s
14:	learn: 38142.5673273	total: 3.17s	remaining: 35m 2s
15:	learn: 37818.7637994	total: 3.39s	remaining: 35m 5s
16:	learn: 37498.7027101	total: 3.6s	remaining: 35m 8s
17:	learn: 37181.6531789	total: 3.81s	remaining: 35

CatBoostRegressor Cross Validations...:  60%|██████    | 6/10 [2:57:56<2:01:02, 1815.54s/it]

0:	learn: 43679.3182935	total: 225ms	remaining: 37m 16s
1:	learn: 43296.3977856	total: 433ms	remaining: 35m 57s
2:	learn: 42917.9302856	total: 645ms	remaining: 35m 41s
3:	learn: 42542.4041067	total: 861ms	remaining: 35m 43s
4:	learn: 42171.1450282	total: 1.08s	remaining: 35m 44s
5:	learn: 41803.1356775	total: 1.29s	remaining: 35m 42s
6:	learn: 41439.1714229	total: 1.5s	remaining: 35m 40s
7:	learn: 41078.7242327	total: 1.72s	remaining: 35m 38s
8:	learn: 40722.1558773	total: 1.93s	remaining: 35m 37s
9:	learn: 40368.5670841	total: 2.15s	remaining: 35m 36s
10:	learn: 40018.8883597	total: 2.36s	remaining: 35m 35s
11:	learn: 39673.1312701	total: 2.57s	remaining: 35m 33s
12:	learn: 39330.9631412	total: 2.79s	remaining: 35m 34s
13:	learn: 38992.1086380	total: 3.01s	remaining: 35m 35s
14:	learn: 38656.8512299	total: 3.22s	remaining: 35m 34s
15:	learn: 38324.3664541	total: 3.44s	remaining: 35m 34s
16:	learn: 37996.0474303	total: 3.65s	remaining: 35m 33s
17:	learn: 37671.2616865	total: 3.86s	rema

CatBoostRegressor Cross Validations...:  70%|███████   | 7/10 [3:28:49<1:31:23, 1827.92s/it]

0:	learn: 44646.4072995	total: 237ms	remaining: 39m 23s
1:	learn: 44252.7438243	total: 461ms	remaining: 38m 14s
2:	learn: 43862.8933197	total: 684ms	remaining: 37m 49s
3:	learn: 43476.7078576	total: 908ms	remaining: 37m 39s
4:	learn: 43095.0456725	total: 1.13s	remaining: 37m 33s
5:	learn: 42717.1605472	total: 1.35s	remaining: 37m 25s
6:	learn: 42343.7574075	total: 1.58s	remaining: 37m 21s
7:	learn: 41973.5319651	total: 1.8s	remaining: 37m 18s
8:	learn: 41606.4253994	total: 2.02s	remaining: 37m 15s
9:	learn: 41243.9475924	total: 2.24s	remaining: 37m 12s
10:	learn: 40885.4598898	total: 2.47s	remaining: 37m 11s
11:	learn: 40530.0015236	total: 2.69s	remaining: 37m 8s
12:	learn: 40177.9322290	total: 2.91s	remaining: 37m 5s
13:	learn: 39830.0688208	total: 3.13s	remaining: 37m 4s
14:	learn: 39485.4494607	total: 3.35s	remaining: 37m
15:	learn: 39144.5551111	total: 3.57s	remaining: 36m 59s
16:	learn: 38806.8952737	total: 3.79s	remaining: 36m 56s
17:	learn: 38472.8899774	total: 4.01s	remaining: 

CatBoostRegressor Cross Validations...:  80%|████████  | 8/10 [4:00:06<1:01:27, 1843.61s/it]

0:	learn: 45417.0436306	total: 240ms	remaining: 39m 46s
1:	learn: 45013.8203168	total: 468ms	remaining: 38m 49s
2:	learn: 44615.0211092	total: 696ms	remaining: 38m 29s
3:	learn: 44219.8005752	total: 925ms	remaining: 38m 22s
4:	learn: 43828.7605861	total: 1.16s	remaining: 38m 20s
5:	learn: 43441.9773469	total: 1.38s	remaining: 38m 17s
6:	learn: 43059.2283299	total: 1.61s	remaining: 38m 15s
7:	learn: 42680.2140678	total: 1.84s	remaining: 38m 12s
8:	learn: 42305.7172499	total: 2.07s	remaining: 38m 8s
9:	learn: 41934.3386554	total: 2.3s	remaining: 38m 5s
10:	learn: 41566.8619829	total: 2.52s	remaining: 38m 3s
11:	learn: 41201.7555327	total: 2.75s	remaining: 38m 2s
12:	learn: 40839.9183149	total: 2.98s	remaining: 38m 2s
13:	learn: 40483.6485388	total: 3.21s	remaining: 37m 59s
14:	learn: 40129.8349625	total: 3.44s	remaining: 37m 58s
15:	learn: 39780.6502786	total: 3.66s	remaining: 37m 57s
16:	learn: 39435.1906065	total: 3.89s	remaining: 37m 56s
17:	learn: 39093.2164670	total: 4.12s	remaining

CatBoostRegressor Cross Validations...:  90%|█████████ | 9/10 [4:31:48<31:01, 1861.89s/it]  

0:	learn: 46006.6344367	total: 252ms	remaining: 41m 46s
1:	learn: 45596.6851972	total: 486ms	remaining: 40m 18s
2:	learn: 45189.5489052	total: 721ms	remaining: 39m 51s
3:	learn: 44786.9060531	total: 953ms	remaining: 39m 31s
4:	learn: 44388.1751709	total: 1.18s	remaining: 39m 18s
5:	learn: 43994.7788877	total: 1.42s	remaining: 39m 9s
6:	learn: 43605.6432601	total: 1.65s	remaining: 39m 2s
7:	learn: 43219.1501865	total: 1.87s	remaining: 38m 50s
8:	learn: 42837.6829521	total: 2.1s	remaining: 38m 47s
9:	learn: 42459.8505569	total: 2.34s	remaining: 38m 45s
10:	learn: 42086.6433610	total: 2.56s	remaining: 38m 39s
11:	learn: 41715.7212787	total: 2.79s	remaining: 38m 37s
12:	learn: 41348.5013420	total: 3.03s	remaining: 38m 37s
13:	learn: 40986.0202317	total: 3.26s	remaining: 38m 36s
14:	learn: 40626.4485434	total: 3.49s	remaining: 38m 36s
15:	learn: 40272.0396122	total: 3.72s	remaining: 38m 34s
16:	learn: 39919.9260454	total: 3.96s	remaining: 38m 34s
17:	learn: 39572.0205027	total: 4.19s	remain

CatBoostRegressor Cross Validations...: 100%|██████████| 10/10 [5:03:55<00:00, 1823.53s/it]


1 FOLDS: CatBoostRegressor RMSE:360037445.0942484
2 FOLDS: CatBoostRegressor RMSE:327027666.82025695
3 FOLDS: CatBoostRegressor RMSE:34979545.61961954
4 FOLDS: CatBoostRegressor RMSE:26117742.627095766
5 FOLDS: CatBoostRegressor RMSE:14305075.64457352
6 FOLDS: CatBoostRegressor RMSE:13471551.719254022
7 FOLDS: CatBoostRegressor RMSE:12259201.904694853
8 FOLDS: CatBoostRegressor RMSE:11246721.285559416
9 FOLDS: CatBoostRegressor RMSE:9889533.353598425
10 FOLDS: CatBoostRegressor RMSE:7627818.522942163

CatBoostRegressor mean RMSE: 81696230.25918432


In [34]:
with open('saved_cb_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [35]:
X_test = dt_test.drop(['집값(target)'], axis=1)

# Test dataset에 대한 inference를 진행합니다.
real_test_pred = model.predict(X_test)

In [36]:
real_test_pred

array([152345.00887046, 273668.2913573 , 343992.57393928, ...,
        80973.96244658,  75777.24545648,  76537.69876542])

In [37]:
# 앞서 예측한 예측값들을 저장합니다.
preds_df = pd.DataFrame(real_test_pred.astype(int), columns=["target"])
preds_df.to_csv('output.csv', index=False)