In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 글꼴깨짐 방지
import matplotlib
import matplotlib.font_manager as fm

import warnings

# 경고 메시지를 무시하고 숨기거나
warnings.filterwarnings(action='ignore')

fm.get_fontconfig_fonts()
font_location = 'C:/Windows/Fonts/malgun.ttf' # For Windows
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)

In [2]:
df = pd.read_csv('train.csv')

In [3]:
def preprocessing(df):
    # 오류 단지코드가 존재하는 행들을  사전에 제거
    df_error =  ['C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']
    #df_error =  ['C2335', 'C1327']
    df = df[~df['단지코드'].isin(df_error)].reset_index(drop=True)
    df.rename(columns = {'도보 10분거리 내 지하철역 수(환승노선 수 반영)':'지하철','도보 10분거리 내 버스정류장 수':'버스'},inplace=True)
    df.drop(columns=['임대보증금','임대료','자격유형','임대건물구분'],axis = 1,inplace=True)
    지역_비율 = (df.groupby(['지역'])['총세대수'].count())/(df.groupby(['지역'])['총세대수'].count().sum())*100
    지역_비율=지역_비율.reset_index(name='지역_비율')
    공급유형_비율 = (df.groupby(['공급유형'])['총세대수'].count())/(df.groupby(['공급유형'])['총세대수'].count().sum())*100
    공급유형_비율=공급유형_비율.reset_index(name='공급유형_비율')
    df = pd.merge(df,지역_비율, on='지역')
    df = pd.merge(df,공급유형_비율, on='공급유형')
    df.drop(columns=['지역','공급유형','단지코드'],axis = 1,inplace=True)
    df=df.dropna(axis=0)
    df = df[['총세대수', '전용면적', '전용면적별세대수', '공가수', '지하철', '버스', '단지내주차면수', '공급유형_비율',
       '지역_비율', '등록차량수']]
    return df

In [4]:
df = preprocessing(df)

In [5]:
#Splitting the data into train and test split
from sklearn.model_selection import train_test_split

X = df.iloc[:, :-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score

from time import time

In [7]:
# test cycle
from catboost import CatBoostRegressor

model = CatBoostRegressor()

model = model.fit(X_train, y_train)

y_pred = model.predict(X_test)

Learning rate set to 0.0462
0:	learn: 411.3940370	total: 150ms	remaining: 2m 29s
1:	learn: 398.6229952	total: 151ms	remaining: 1m 15s
2:	learn: 385.9485745	total: 152ms	remaining: 50.7s
3:	learn: 374.2145419	total: 154ms	remaining: 38.3s
4:	learn: 362.5644627	total: 155ms	remaining: 30.8s
5:	learn: 351.6038597	total: 156ms	remaining: 25.9s
6:	learn: 341.3099915	total: 158ms	remaining: 22.4s
7:	learn: 330.8436447	total: 159ms	remaining: 19.7s
8:	learn: 321.0210445	total: 160ms	remaining: 17.6s
9:	learn: 311.6974552	total: 161ms	remaining: 16s
10:	learn: 302.4510286	total: 163ms	remaining: 14.6s
11:	learn: 294.1031836	total: 164ms	remaining: 13.5s
12:	learn: 285.9348168	total: 165ms	remaining: 12.5s
13:	learn: 277.6958269	total: 166ms	remaining: 11.7s
14:	learn: 270.3802012	total: 168ms	remaining: 11s
15:	learn: 263.5270581	total: 169ms	remaining: 10.4s
16:	learn: 256.8357906	total: 170ms	remaining: 9.83s
17:	learn: 250.5517068	total: 171ms	remaining: 9.34s
18:	learn: 244.2140671	total: 

220:	learn: 78.5576090	total: 1.23s	remaining: 4.32s
221:	learn: 78.3389899	total: 1.23s	remaining: 4.3s
222:	learn: 78.2136463	total: 1.23s	remaining: 4.28s
223:	learn: 78.0856808	total: 1.23s	remaining: 4.26s
224:	learn: 77.8874814	total: 1.23s	remaining: 4.25s
225:	learn: 77.5934558	total: 1.23s	remaining: 4.23s
226:	learn: 77.2340552	total: 1.24s	remaining: 4.21s
227:	learn: 76.9881187	total: 1.24s	remaining: 4.19s
228:	learn: 76.8325833	total: 1.24s	remaining: 4.17s
229:	learn: 76.6437053	total: 1.24s	remaining: 4.15s
230:	learn: 76.4687014	total: 1.24s	remaining: 4.13s
231:	learn: 76.2919558	total: 1.24s	remaining: 4.11s
232:	learn: 76.1271721	total: 1.24s	remaining: 4.09s
233:	learn: 76.0697629	total: 1.25s	remaining: 4.08s
234:	learn: 75.9292522	total: 1.25s	remaining: 4.06s
235:	learn: 75.8015160	total: 1.25s	remaining: 4.04s
236:	learn: 75.6335819	total: 1.25s	remaining: 4.02s
237:	learn: 75.5539555	total: 1.25s	remaining: 4s
238:	learn: 75.3537952	total: 1.25s	remaining: 3.9

495:	learn: 46.0061446	total: 1.6s	remaining: 1.63s
496:	learn: 45.9407168	total: 1.6s	remaining: 1.62s
497:	learn: 45.7637383	total: 1.6s	remaining: 1.61s
498:	learn: 45.7064299	total: 1.6s	remaining: 1.61s
499:	learn: 45.6184294	total: 1.6s	remaining: 1.6s
500:	learn: 45.5642754	total: 1.61s	remaining: 1.6s
501:	learn: 45.5047348	total: 1.61s	remaining: 1.59s
502:	learn: 45.3837834	total: 1.61s	remaining: 1.59s
503:	learn: 45.3348268	total: 1.61s	remaining: 1.59s
504:	learn: 45.1994233	total: 1.61s	remaining: 1.58s
505:	learn: 45.1350846	total: 1.61s	remaining: 1.58s
506:	learn: 45.1208827	total: 1.61s	remaining: 1.57s
507:	learn: 45.0745187	total: 1.62s	remaining: 1.56s
508:	learn: 45.0156744	total: 1.62s	remaining: 1.56s
509:	learn: 44.9390411	total: 1.62s	remaining: 1.55s
510:	learn: 44.8251563	total: 1.62s	remaining: 1.55s
511:	learn: 44.7910205	total: 1.62s	remaining: 1.55s
512:	learn: 44.7376232	total: 1.62s	remaining: 1.54s
513:	learn: 44.5885928	total: 1.62s	remaining: 1.54s


776:	learn: 31.9641923	total: 1.97s	remaining: 566ms
777:	learn: 31.9213640	total: 1.98s	remaining: 564ms
778:	learn: 31.9176564	total: 1.98s	remaining: 562ms
779:	learn: 31.8659921	total: 2s	remaining: 563ms
780:	learn: 31.8226413	total: 2.01s	remaining: 564ms
781:	learn: 31.7683357	total: 2.03s	remaining: 565ms
782:	learn: 31.7333590	total: 2.04s	remaining: 566ms
783:	learn: 31.6896580	total: 2.06s	remaining: 567ms
784:	learn: 31.6704516	total: 2.07s	remaining: 568ms
785:	learn: 31.6631141	total: 2.09s	remaining: 569ms
786:	learn: 31.6336992	total: 2.1s	remaining: 569ms
787:	learn: 31.6125346	total: 2.12s	remaining: 570ms
788:	learn: 31.5899124	total: 2.14s	remaining: 571ms
789:	learn: 31.4953551	total: 2.15s	remaining: 572ms
790:	learn: 31.4830477	total: 2.17s	remaining: 572ms
791:	learn: 31.4551043	total: 2.18s	remaining: 573ms
792:	learn: 31.4465036	total: 2.2s	remaining: 574ms
793:	learn: 31.4055715	total: 2.21s	remaining: 574ms
794:	learn: 31.3891906	total: 2.23s	remaining: 574m

In [8]:
ev_ = explained_variance_score(y_test, y_pred)
print("\tExplained variance:", ev_)
mae_ = mean_absolute_error(y_test, y_pred)
print("\tMean absolute error:", mae_)
r2_ = r2_score(y_test, y_pred)
print("\tR2 score:", r2_)
print()

	Explained variance: 0.9878099295078537
	Mean absolute error: 29.8928190870309
	R2 score: 0.9877734170745662



In [9]:
import joblib

joblib.dump(model, 'Catboost_GridSearchCV_model.pkl')

['Catboost_GridSearchCV_model.pkl']

In [10]:
# GridSearchCV parameters
parameters = {
    # tree의 깊이, default: 6
    'max_depth': [4, 6, 8, 10], 
    
    # 생성 가능한 최대 tree 갯수, default: 1000
    'n_estimators': [1000, 1500, 2000],
    
    # 오차 함수에서 조정하는 step의 크기, default: 자동 설정(0.03)
    'learning_rate': [0.01, 0.03, 0.1, None],
    
    # tree에 랜덤하게 부여되는 가중치, defaul: 1
    'random_strength': [1, 1.2, 1.5, 2, 4]
}


In [11]:
new_model = CatBoostRegressor(
   learning_rate=0.01, max_depth=6, n_estimators=1000, random_strength=2
)
new_model = new_model.fit(X_train, y_train)
y_pred = new_model.predict(X_test)

0:	learn: 422.2582674	total: 2.29ms	remaining: 2.29s
1:	learn: 419.6344325	total: 4.05ms	remaining: 2.02s
2:	learn: 417.1486737	total: 5.46ms	remaining: 1.81s
3:	learn: 414.3405325	total: 6.89ms	remaining: 1.71s
4:	learn: 411.8295994	total: 8.29ms	remaining: 1.65s
5:	learn: 409.0693087	total: 9.76ms	remaining: 1.62s
6:	learn: 406.1936662	total: 11.1ms	remaining: 1.57s
7:	learn: 403.4299096	total: 12.5ms	remaining: 1.55s
8:	learn: 400.7377064	total: 14ms	remaining: 1.54s
9:	learn: 398.0937484	total: 15.7ms	remaining: 1.55s
10:	learn: 395.4505755	total: 17.3ms	remaining: 1.55s
11:	learn: 392.8877627	total: 18.7ms	remaining: 1.54s
12:	learn: 390.4689759	total: 20ms	remaining: 1.52s
13:	learn: 387.7751224	total: 21.3ms	remaining: 1.5s
14:	learn: 385.2468747	total: 22.6ms	remaining: 1.49s
15:	learn: 382.9009108	total: 24ms	remaining: 1.47s
16:	learn: 380.3432102	total: 25.4ms	remaining: 1.47s
17:	learn: 378.0154954	total: 26.8ms	remaining: 1.46s
18:	learn: 375.7656530	total: 28ms	remaining:

188:	learn: 173.0450214	total: 345ms	remaining: 1.48s
189:	learn: 172.6062489	total: 365ms	remaining: 1.56s
190:	learn: 171.9620150	total: 386ms	remaining: 1.64s
191:	learn: 171.4530444	total: 409ms	remaining: 1.72s
192:	learn: 171.0350842	total: 427ms	remaining: 1.79s
193:	learn: 170.5678641	total: 445ms	remaining: 1.85s
194:	learn: 170.0651000	total: 464ms	remaining: 1.91s
195:	learn: 169.6358869	total: 481ms	remaining: 1.97s
196:	learn: 169.1372726	total: 499ms	remaining: 2.03s
197:	learn: 168.7366322	total: 516ms	remaining: 2.09s
198:	learn: 168.2746431	total: 533ms	remaining: 2.14s
199:	learn: 167.8981403	total: 551ms	remaining: 2.21s
200:	learn: 167.5394424	total: 571ms	remaining: 2.27s
201:	learn: 167.0937232	total: 590ms	remaining: 2.33s
202:	learn: 166.6369125	total: 594ms	remaining: 2.33s
203:	learn: 166.0720995	total: 596ms	remaining: 2.33s
204:	learn: 165.5744672	total: 598ms	remaining: 2.32s
205:	learn: 165.1703192	total: 600ms	remaining: 2.31s
206:	learn: 164.7896293	tota

355:	learn: 129.1468628	total: 1.39s	remaining: 2.51s
356:	learn: 129.0228652	total: 1.39s	remaining: 2.5s
357:	learn: 128.8087143	total: 1.39s	remaining: 2.5s
358:	learn: 128.6008037	total: 1.39s	remaining: 2.49s
359:	learn: 128.3725939	total: 1.4s	remaining: 2.48s
360:	learn: 128.2382793	total: 1.4s	remaining: 2.47s
361:	learn: 128.1441420	total: 1.4s	remaining: 2.46s
362:	learn: 127.9913013	total: 1.4s	remaining: 2.46s
363:	learn: 127.8597047	total: 1.4s	remaining: 2.45s
364:	learn: 127.6704420	total: 1.4s	remaining: 2.44s
365:	learn: 127.5432116	total: 1.4s	remaining: 2.43s
366:	learn: 127.4190703	total: 1.41s	remaining: 2.42s
367:	learn: 127.2990916	total: 1.41s	remaining: 2.42s
368:	learn: 127.1700495	total: 1.41s	remaining: 2.41s
369:	learn: 127.0518150	total: 1.41s	remaining: 2.4s
370:	learn: 126.9126842	total: 1.41s	remaining: 2.39s
371:	learn: 126.7753604	total: 1.41s	remaining: 2.38s
372:	learn: 126.6532021	total: 1.41s	remaining: 2.38s
373:	learn: 126.5592778	total: 1.42s	r

596:	learn: 107.0171091	total: 1.72s	remaining: 1.16s
597:	learn: 106.9422738	total: 1.72s	remaining: 1.16s
598:	learn: 106.8667098	total: 1.72s	remaining: 1.15s
599:	learn: 106.8170933	total: 1.72s	remaining: 1.15s
600:	learn: 106.7639909	total: 1.72s	remaining: 1.14s
601:	learn: 106.7099745	total: 1.72s	remaining: 1.14s
602:	learn: 106.6836933	total: 1.73s	remaining: 1.14s
603:	learn: 106.6529044	total: 1.73s	remaining: 1.13s
604:	learn: 106.5885376	total: 1.73s	remaining: 1.13s
605:	learn: 106.5206276	total: 1.73s	remaining: 1.12s
606:	learn: 106.4791693	total: 1.73s	remaining: 1.12s
607:	learn: 106.4150921	total: 1.73s	remaining: 1.12s
608:	learn: 106.3703322	total: 1.73s	remaining: 1.11s
609:	learn: 106.2980170	total: 1.74s	remaining: 1.11s
610:	learn: 106.2419671	total: 1.74s	remaining: 1.11s
611:	learn: 106.1428947	total: 1.74s	remaining: 1.1s
612:	learn: 106.0767739	total: 1.74s	remaining: 1.1s
613:	learn: 105.9707212	total: 1.74s	remaining: 1.09s
614:	learn: 105.9352874	total:

760:	learn: 97.8263154	total: 1.95s	remaining: 611ms
761:	learn: 97.7748617	total: 1.95s	remaining: 608ms
762:	learn: 97.7291085	total: 1.95s	remaining: 605ms
763:	learn: 97.7026519	total: 1.95s	remaining: 602ms
764:	learn: 97.6724390	total: 1.95s	remaining: 600ms
765:	learn: 97.6197449	total: 1.95s	remaining: 597ms
766:	learn: 97.5850194	total: 1.95s	remaining: 594ms
767:	learn: 97.4912969	total: 1.96s	remaining: 591ms
768:	learn: 97.4320975	total: 1.96s	remaining: 588ms
769:	learn: 97.3899274	total: 1.96s	remaining: 585ms
770:	learn: 97.3415126	total: 1.96s	remaining: 582ms
771:	learn: 97.2823405	total: 1.96s	remaining: 579ms
772:	learn: 97.2243541	total: 1.96s	remaining: 576ms
773:	learn: 97.1816555	total: 1.96s	remaining: 574ms
774:	learn: 97.1624131	total: 1.97s	remaining: 571ms
775:	learn: 97.1045542	total: 1.97s	remaining: 568ms
776:	learn: 97.0435148	total: 1.97s	remaining: 565ms
777:	learn: 96.9986033	total: 1.97s	remaining: 562ms
778:	learn: 96.9517874	total: 1.97s	remaining:

935:	learn: 88.2597894	total: 2.5s	remaining: 171ms
936:	learn: 88.2235964	total: 2.52s	remaining: 169ms
937:	learn: 88.1816694	total: 2.53s	remaining: 167ms
938:	learn: 88.1234511	total: 2.55s	remaining: 166ms
939:	learn: 88.0760122	total: 2.57s	remaining: 164ms
940:	learn: 88.0286172	total: 2.59s	remaining: 162ms
941:	learn: 87.9927633	total: 2.6s	remaining: 160ms
942:	learn: 87.9226189	total: 2.62s	remaining: 158ms
943:	learn: 87.8718647	total: 2.64s	remaining: 156ms
944:	learn: 87.8265946	total: 2.65s	remaining: 154ms
945:	learn: 87.7580395	total: 2.67s	remaining: 152ms
946:	learn: 87.6708320	total: 2.69s	remaining: 150ms
947:	learn: 87.6220250	total: 2.71s	remaining: 148ms
948:	learn: 87.5762282	total: 2.73s	remaining: 147ms
949:	learn: 87.5245346	total: 2.75s	remaining: 145ms
950:	learn: 87.4819130	total: 2.77s	remaining: 143ms
951:	learn: 87.4097793	total: 2.78s	remaining: 140ms
952:	learn: 87.3483285	total: 2.8s	remaining: 138ms
953:	learn: 87.2887024	total: 2.81s	remaining: 13

In [12]:
print("\tExplained variance:", explained_variance_score(y_test, y_pred))
print("\tMean absolute error:", mean_absolute_error(y_test, y_pred))
print("\tR2 score:", r2_score(y_test, y_pred))
print()

	Explained variance: 0.9405171045220978
	Mean absolute error: 70.72223195764994
	R2 score: 0.9404723370934964

