In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 글꼴깨짐 방지
import matplotlib
import matplotlib.font_manager as fm

import warnings

# 경고 메시지를 무시하고 숨기거나
warnings.filterwarnings(action='ignore')

fm.get_fontconfig_fonts()
font_location = 'C:/Windows/Fonts/malgun.ttf' # For Windows
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)

In [2]:
df = pd.read_csv('train.csv')

In [3]:
def preprocessing(df):
    # 오류 단지코드가 존재하는 행들을  사전에 제거
    df_error =  ['C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']
    #df_error =  ['C2335', 'C1327']
    df = df[~df['단지코드'].isin(df_error)].reset_index(drop=True)
    df.rename(columns = {'도보 10분거리 내 지하철역 수(환승노선 수 반영)':'지하철','도보 10분거리 내 버스정류장 수':'버스'},inplace=True)
    df.drop(columns=['임대보증금','임대료','자격유형','임대건물구분'],axis = 1,inplace=True)
    지역_비율 = (df.groupby(['지역'])['총세대수'].count())/(df.groupby(['지역'])['총세대수'].count().sum())*100
    지역_비율=지역_비율.reset_index(name='지역_비율')
    공급유형_비율 = (df.groupby(['공급유형'])['총세대수'].count())/(df.groupby(['공급유형'])['총세대수'].count().sum())*100
    공급유형_비율=공급유형_비율.reset_index(name='공급유형_비율')
    df = pd.merge(df,지역_비율, on='지역')
    df = pd.merge(df,공급유형_비율, on='공급유형')
    df.drop(columns=['지역','공급유형','단지코드'],axis = 1,inplace=True)
    df=df.dropna(axis=0)
    df = df[['총세대수', '전용면적', '전용면적별세대수', '공가수', '지하철', '버스', '단지내주차면수', '공급유형_비율',
       '지역_비율', '등록차량수']]
    return df

In [4]:
df = preprocessing(df)

In [5]:
#Splitting the data into train and test split
from sklearn.model_selection import train_test_split
X = df.iloc[:, 1:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score

from time import time

In [7]:
# test cycle
from catboost import CatBoostRegressor

model = CatBoostRegressor()

model = model.fit(X_train, y_train)

y_pred = model.predict(X_test)

Learning rate set to 0.0462
0:	learn: 411.9835942	total: 145ms	remaining: 2m 24s
1:	learn: 399.2638885	total: 148ms	remaining: 1m 13s
2:	learn: 386.9914956	total: 150ms	remaining: 49.9s
3:	learn: 375.1169183	total: 152ms	remaining: 37.9s
4:	learn: 363.4540440	total: 154ms	remaining: 30.7s
5:	learn: 352.3173797	total: 157ms	remaining: 26.1s
6:	learn: 342.1581667	total: 160ms	remaining: 22.6s
7:	learn: 332.6173845	total: 162ms	remaining: 20.1s
8:	learn: 323.3894917	total: 164ms	remaining: 18.1s
9:	learn: 314.5482799	total: 166ms	remaining: 16.5s
10:	learn: 305.4289867	total: 169ms	remaining: 15.2s
11:	learn: 296.7823077	total: 171ms	remaining: 14.1s
12:	learn: 289.6188880	total: 173ms	remaining: 13.2s
13:	learn: 281.5116338	total: 176ms	remaining: 12.4s
14:	learn: 274.0491608	total: 178ms	remaining: 11.7s
15:	learn: 267.8042729	total: 180ms	remaining: 11.1s
16:	learn: 262.2963716	total: 182ms	remaining: 10.5s
17:	learn: 256.0377489	total: 184ms	remaining: 10.1s
18:	learn: 249.8287312	tot

190:	learn: 91.3940560	total: 568ms	remaining: 2.4s
191:	learn: 91.3601064	total: 570ms	remaining: 2.4s
192:	learn: 91.2203424	total: 572ms	remaining: 2.39s
193:	learn: 91.1210458	total: 574ms	remaining: 2.38s
194:	learn: 90.9224056	total: 576ms	remaining: 2.38s
195:	learn: 90.8516343	total: 578ms	remaining: 2.37s
196:	learn: 90.6358955	total: 580ms	remaining: 2.37s
197:	learn: 90.3956097	total: 583ms	remaining: 2.36s
198:	learn: 90.1737504	total: 585ms	remaining: 2.35s
199:	learn: 90.0531552	total: 587ms	remaining: 2.35s
200:	learn: 89.8491632	total: 589ms	remaining: 2.34s
201:	learn: 89.7299707	total: 591ms	remaining: 2.34s
202:	learn: 89.6108538	total: 594ms	remaining: 2.33s
203:	learn: 89.4271861	total: 596ms	remaining: 2.33s
204:	learn: 89.2275706	total: 598ms	remaining: 2.32s
205:	learn: 89.1168587	total: 600ms	remaining: 2.31s
206:	learn: 89.0207581	total: 602ms	remaining: 2.31s
207:	learn: 88.9378373	total: 604ms	remaining: 2.3s
208:	learn: 88.7763718	total: 606ms	remaining: 2.

377:	learn: 66.5105887	total: 942ms	remaining: 1.55s
378:	learn: 66.3522764	total: 945ms	remaining: 1.55s
379:	learn: 66.2057655	total: 947ms	remaining: 1.54s
380:	learn: 66.1050698	total: 949ms	remaining: 1.54s
381:	learn: 65.9970642	total: 951ms	remaining: 1.54s
382:	learn: 65.7840268	total: 952ms	remaining: 1.53s
383:	learn: 65.6766540	total: 954ms	remaining: 1.53s
384:	learn: 65.5724413	total: 956ms	remaining: 1.53s
385:	learn: 65.4384975	total: 959ms	remaining: 1.52s
386:	learn: 65.3606011	total: 961ms	remaining: 1.52s
387:	learn: 65.2842863	total: 963ms	remaining: 1.52s
388:	learn: 65.1441530	total: 965ms	remaining: 1.51s
389:	learn: 65.0320143	total: 967ms	remaining: 1.51s
390:	learn: 65.0173613	total: 969ms	remaining: 1.51s
391:	learn: 64.9142747	total: 971ms	remaining: 1.51s
392:	learn: 64.7923525	total: 973ms	remaining: 1.5s
393:	learn: 64.6953073	total: 976ms	remaining: 1.5s
394:	learn: 64.5912334	total: 978ms	remaining: 1.5s
395:	learn: 64.4893912	total: 980ms	remaining: 1.

570:	learn: 52.6236488	total: 1.33s	remaining: 997ms
571:	learn: 52.5403200	total: 1.33s	remaining: 994ms
572:	learn: 52.4584067	total: 1.33s	remaining: 992ms
573:	learn: 52.4193126	total: 1.33s	remaining: 990ms
574:	learn: 52.3201060	total: 1.33s	remaining: 987ms
575:	learn: 52.2605937	total: 1.34s	remaining: 984ms
576:	learn: 52.2301789	total: 1.34s	remaining: 982ms
577:	learn: 52.1454758	total: 1.34s	remaining: 980ms
578:	learn: 52.0393063	total: 1.34s	remaining: 977ms
579:	learn: 51.9726081	total: 1.34s	remaining: 974ms
580:	learn: 51.9190415	total: 1.35s	remaining: 972ms
581:	learn: 51.8825018	total: 1.35s	remaining: 969ms
582:	learn: 51.8476716	total: 1.35s	remaining: 967ms
583:	learn: 51.7706912	total: 1.35s	remaining: 964ms
584:	learn: 51.7241766	total: 1.35s	remaining: 961ms
585:	learn: 51.7043144	total: 1.36s	remaining: 959ms
586:	learn: 51.6873822	total: 1.36s	remaining: 956ms
587:	learn: 51.6682134	total: 1.36s	remaining: 953ms
588:	learn: 51.5962159	total: 1.36s	remaining:

759:	learn: 42.8863264	total: 1.7s	remaining: 536ms
760:	learn: 42.8488811	total: 1.7s	remaining: 534ms
761:	learn: 42.7783712	total: 1.7s	remaining: 532ms
762:	learn: 42.7344203	total: 1.7s	remaining: 529ms
763:	learn: 42.7231447	total: 1.71s	remaining: 527ms
764:	learn: 42.6593309	total: 1.71s	remaining: 525ms
765:	learn: 42.6433833	total: 1.71s	remaining: 522ms
766:	learn: 42.6309585	total: 1.71s	remaining: 520ms
767:	learn: 42.5846408	total: 1.71s	remaining: 518ms
768:	learn: 42.5174262	total: 1.72s	remaining: 516ms
769:	learn: 42.4998350	total: 1.72s	remaining: 513ms
770:	learn: 42.4910053	total: 1.72s	remaining: 511ms
771:	learn: 42.4547304	total: 1.72s	remaining: 509ms
772:	learn: 42.3905137	total: 1.72s	remaining: 506ms
773:	learn: 42.3498454	total: 1.73s	remaining: 504ms
774:	learn: 42.2589543	total: 1.73s	remaining: 502ms
775:	learn: 42.2073643	total: 1.73s	remaining: 499ms
776:	learn: 42.1677675	total: 1.73s	remaining: 497ms
777:	learn: 42.0903272	total: 1.73s	remaining: 495

955:	learn: 35.0711717	total: 2.08s	remaining: 95.9ms
956:	learn: 35.0433008	total: 2.08s	remaining: 93.7ms
957:	learn: 35.0259289	total: 2.09s	remaining: 91.5ms
958:	learn: 35.0006577	total: 2.09s	remaining: 89.3ms
959:	learn: 34.9416964	total: 2.09s	remaining: 87.1ms
960:	learn: 34.9025046	total: 2.09s	remaining: 84.9ms
961:	learn: 34.8880781	total: 2.1s	remaining: 82.8ms
962:	learn: 34.8276049	total: 2.1s	remaining: 80.6ms
963:	learn: 34.7848316	total: 2.1s	remaining: 78.4ms
964:	learn: 34.7545355	total: 2.1s	remaining: 76.2ms
965:	learn: 34.6639396	total: 2.1s	remaining: 74ms
966:	learn: 34.6287169	total: 2.1s	remaining: 71.8ms
967:	learn: 34.5248412	total: 2.11s	remaining: 69.7ms
968:	learn: 34.4951464	total: 2.11s	remaining: 67.5ms
969:	learn: 34.4730515	total: 2.11s	remaining: 65.3ms
970:	learn: 34.4262915	total: 2.11s	remaining: 63.1ms
971:	learn: 34.3936339	total: 2.12s	remaining: 60.9ms
972:	learn: 34.3507958	total: 2.12s	remaining: 58.7ms
973:	learn: 34.3234176	total: 2.12s	

In [8]:
ev_ = explained_variance_score(y_test, y_pred)
print("\tExplained variance:", ev_)
mae_ = mean_absolute_error(y_test, y_pred)
print("\tMean absolute error:", mae_)
r2_ = r2_score(y_test, y_pred)
print("\tR2 score:", r2_)
print()

	Explained variance: 0.9814145542462404
	Mean absolute error: 36.84444338132358
	R2 score: 0.9813679019618019



In [9]:
import joblib

joblib.dump(model, 'Catboost_GridSearchCV_model.pkl')

['Catboost_GridSearchCV_model.pkl']

In [10]:
# GridSearchCV parameters
parameters = {
    # tree의 깊이, default: 6
    'max_depth': [4, 6, 8, 10], 
    
    # 생성 가능한 최대 tree 갯수, default: 1000
    'n_estimators': [1000, 1500, 2000],
    
    # 오차 함수에서 조정하는 step의 크기, default: 자동 설정(0.03)
    'learning_rate': [0.01, 0.03, 0.1, None],
    
    # tree에 랜덤하게 부여되는 가중치, defaul: 1
    'random_strength': [1, 1.2, 1.5, 2, 4]
}

In [11]:
from sklearn.model_selection import GridSearchCV

model = CatBoostRegressor()
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

grid = GridSearchCV(model, parameters, n_jobs=6, verbose=10)
model = grid.fit(X,y)

Learning rate set to 0.0462
0:	learn: 411.9835942	total: 2.75ms	remaining: 2.75s
1:	learn: 399.2638885	total: 5.23ms	remaining: 2.61s
2:	learn: 386.9914956	total: 7.94ms	remaining: 2.64s
3:	learn: 375.1169183	total: 10.2ms	remaining: 2.53s
4:	learn: 363.4540440	total: 12.3ms	remaining: 2.45s
5:	learn: 352.3173797	total: 14.5ms	remaining: 2.4s
6:	learn: 342.1581667	total: 16.9ms	remaining: 2.4s
7:	learn: 332.6173845	total: 19.6ms	remaining: 2.43s
8:	learn: 323.3894917	total: 22ms	remaining: 2.42s
9:	learn: 314.5482799	total: 24.3ms	remaining: 2.41s
10:	learn: 305.4289867	total: 26.7ms	remaining: 2.4s
11:	learn: 296.7823077	total: 29.1ms	remaining: 2.39s
12:	learn: 289.6188880	total: 31.5ms	remaining: 2.39s
13:	learn: 281.5116338	total: 34ms	remaining: 2.4s
14:	learn: 274.0491608	total: 36.3ms	remaining: 2.38s
15:	learn: 267.8042729	total: 38.7ms	remaining: 2.38s
16:	learn: 262.2963716	total: 40.9ms	remaining: 2.37s
17:	learn: 256.0377489	total: 43.2ms	remaining: 2.36s
18:	learn: 249.828

162:	learn: 95.9676339	total: 358ms	remaining: 1.84s
163:	learn: 95.8638938	total: 360ms	remaining: 1.83s
164:	learn: 95.4468934	total: 362ms	remaining: 1.83s
165:	learn: 95.3420387	total: 363ms	remaining: 1.82s
166:	learn: 95.1290070	total: 365ms	remaining: 1.82s
167:	learn: 95.0183151	total: 367ms	remaining: 1.82s
168:	learn: 94.8324981	total: 369ms	remaining: 1.81s
169:	learn: 94.7248298	total: 371ms	remaining: 1.81s
170:	learn: 94.5637551	total: 373ms	remaining: 1.81s
171:	learn: 94.4243934	total: 375ms	remaining: 1.81s
172:	learn: 94.2753135	total: 378ms	remaining: 1.8s
173:	learn: 94.1697875	total: 379ms	remaining: 1.8s
174:	learn: 93.9497356	total: 381ms	remaining: 1.8s
175:	learn: 93.5806094	total: 383ms	remaining: 1.79s
176:	learn: 93.5107812	total: 385ms	remaining: 1.79s
177:	learn: 93.3949014	total: 387ms	remaining: 1.79s
178:	learn: 93.2283208	total: 389ms	remaining: 1.78s
179:	learn: 93.0848439	total: 391ms	remaining: 1.78s
180:	learn: 93.0169542	total: 392ms	remaining: 1.

333:	learn: 70.9881893	total: 687ms	remaining: 1.37s
334:	learn: 70.8259105	total: 689ms	remaining: 1.37s
335:	learn: 70.7781732	total: 691ms	remaining: 1.36s
336:	learn: 70.7541960	total: 693ms	remaining: 1.36s
337:	learn: 70.7421966	total: 695ms	remaining: 1.36s
338:	learn: 70.6442756	total: 697ms	remaining: 1.36s
339:	learn: 70.4590743	total: 699ms	remaining: 1.35s
340:	learn: 70.4067605	total: 701ms	remaining: 1.35s
341:	learn: 70.3284894	total: 703ms	remaining: 1.35s
342:	learn: 70.1757092	total: 705ms	remaining: 1.35s
343:	learn: 69.9149513	total: 707ms	remaining: 1.35s
344:	learn: 69.7779279	total: 709ms	remaining: 1.34s
345:	learn: 69.6667080	total: 711ms	remaining: 1.34s
346:	learn: 69.6108046	total: 712ms	remaining: 1.34s
347:	learn: 69.5181460	total: 714ms	remaining: 1.34s
348:	learn: 69.4562041	total: 716ms	remaining: 1.34s
349:	learn: 69.3990579	total: 718ms	remaining: 1.33s
350:	learn: 69.2874501	total: 720ms	remaining: 1.33s
351:	learn: 69.2278087	total: 722ms	remaining:

522:	learn: 55.3830589	total: 1.06s	remaining: 968ms
523:	learn: 55.2911767	total: 1.06s	remaining: 966ms
524:	learn: 55.2397712	total: 1.06s	remaining: 964ms
525:	learn: 55.1987310	total: 1.07s	remaining: 962ms
526:	learn: 55.1361662	total: 1.07s	remaining: 960ms
527:	learn: 55.1260146	total: 1.07s	remaining: 958ms
528:	learn: 55.0446554	total: 1.07s	remaining: 956ms
529:	learn: 54.9243530	total: 1.07s	remaining: 954ms
530:	learn: 54.8703431	total: 1.08s	remaining: 952ms
531:	learn: 54.8272700	total: 1.08s	remaining: 949ms
532:	learn: 54.8019747	total: 1.08s	remaining: 947ms
533:	learn: 54.7248471	total: 1.08s	remaining: 945ms
534:	learn: 54.6390769	total: 1.08s	remaining: 943ms
535:	learn: 54.6063278	total: 1.09s	remaining: 941ms
536:	learn: 54.5086507	total: 1.09s	remaining: 938ms
537:	learn: 54.4792352	total: 1.09s	remaining: 936ms
538:	learn: 54.3218064	total: 1.09s	remaining: 934ms
539:	learn: 54.2887807	total: 1.09s	remaining: 932ms
540:	learn: 54.2795514	total: 1.1s	remaining: 

717:	learn: 44.6957912	total: 1.44s	remaining: 568ms
718:	learn: 44.6261086	total: 1.45s	remaining: 566ms
719:	learn: 44.5555145	total: 1.45s	remaining: 564ms
720:	learn: 44.5214351	total: 1.45s	remaining: 562ms
721:	learn: 44.4845039	total: 1.45s	remaining: 559ms
722:	learn: 44.4261581	total: 1.45s	remaining: 557ms
723:	learn: 44.3474043	total: 1.46s	remaining: 555ms
724:	learn: 44.3131736	total: 1.46s	remaining: 553ms
725:	learn: 44.2897177	total: 1.46s	remaining: 551ms
726:	learn: 44.2561738	total: 1.46s	remaining: 549ms
727:	learn: 44.2280178	total: 1.46s	remaining: 547ms
728:	learn: 44.2140263	total: 1.47s	remaining: 545ms
729:	learn: 44.1870269	total: 1.47s	remaining: 543ms
730:	learn: 44.1730874	total: 1.47s	remaining: 541ms
731:	learn: 44.1309723	total: 1.47s	remaining: 539ms
732:	learn: 44.1125676	total: 1.47s	remaining: 537ms
733:	learn: 44.0555920	total: 1.48s	remaining: 535ms
734:	learn: 44.0239527	total: 1.48s	remaining: 533ms
735:	learn: 43.9894361	total: 1.48s	remaining:

910:	learn: 36.7581195	total: 1.82s	remaining: 177ms
911:	learn: 36.7267996	total: 1.82s	remaining: 176ms
912:	learn: 36.6076813	total: 1.82s	remaining: 174ms
913:	learn: 36.5581078	total: 1.82s	remaining: 172ms
914:	learn: 36.5262828	total: 1.82s	remaining: 170ms
915:	learn: 36.5195524	total: 1.83s	remaining: 168ms
916:	learn: 36.4890087	total: 1.83s	remaining: 166ms
917:	learn: 36.4702309	total: 1.83s	remaining: 164ms
918:	learn: 36.4547018	total: 1.83s	remaining: 162ms
919:	learn: 36.3784211	total: 1.83s	remaining: 160ms
920:	learn: 36.3451611	total: 1.84s	remaining: 158ms
921:	learn: 36.2941205	total: 1.84s	remaining: 156ms
922:	learn: 36.2913606	total: 1.84s	remaining: 153ms
923:	learn: 36.2824282	total: 1.84s	remaining: 151ms
924:	learn: 36.2360684	total: 1.84s	remaining: 149ms
925:	learn: 36.1736130	total: 1.84s	remaining: 147ms
926:	learn: 36.1465822	total: 1.85s	remaining: 145ms
927:	learn: 36.1277510	total: 1.85s	remaining: 143ms
928:	learn: 36.0002921	total: 1.85s	remaining:

KeyboardInterrupt: 

In [None]:
model

In [None]:
params = model.best_params_
params

In [None]:
model.best_estimator_

In [None]:
new_model = CatBoostRegressor(
   learning_rate=0.01, max_depth=6, n_estimators=1000, random_strength=2
)
new_model = new_model.fit(X_train, y_train)
y_pred = new_model.predict(X_test)

In [None]:
print("\tExplained variance:", explained_variance_score(y_test, y_pred))
print("\tMean absolute error:", mean_absolute_error(y_test, y_pred))
print("\tR2 score:", r2_score(y_test, y_pred))
print()

In [None]:
scores_df = pd.DataFrame(grid.cv_results_)
scores_df

In [None]:
import joblib

joblib.dump(model, 'Catboost_GridSearchCV_model.pkl')