In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 글꼴깨짐 방지
import matplotlib
import matplotlib.font_manager as fm

import warnings

# 경고 메시지를 무시하고 숨기거나
warnings.filterwarnings(action='ignore')

fm.get_fontconfig_fonts()
font_location = 'C:/Windows/Fonts/malgun.ttf' # For Windows
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)

In [2]:
df = pd.read_csv('train.csv')

In [3]:
def preprocessing(df):
    # 오류 단지코드가 존재하는 행들을  사전에 제거
    df_error =  ['C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']
    #df_error =  ['C2335', 'C1327']
    df = df[~df['단지코드'].isin(df_error)].reset_index(drop=True)
    df.rename(columns = {'도보 10분거리 내 지하철역 수(환승노선 수 반영)':'지하철','도보 10분거리 내 버스정류장 수':'버스'},inplace=True)
    df.drop(columns=['임대보증금','임대료','자격유형','임대건물구분'],axis = 1,inplace=True)
    지역_비율 = (df.groupby(['지역'])['총세대수'].count())/(df.groupby(['지역'])['총세대수'].count().sum())*100
    지역_비율=지역_비율.reset_index(name='지역_비율')
    공급유형_비율 = (df.groupby(['공급유형'])['총세대수'].count())/(df.groupby(['공급유형'])['총세대수'].count().sum())*100
    공급유형_비율=공급유형_비율.reset_index(name='공급유형_비율')
    df = pd.merge(df,지역_비율, on='지역')
    df = pd.merge(df,공급유형_비율, on='공급유형')
    df.drop(columns=['지역','공급유형','단지코드'],axis = 1,inplace=True)
    df=df.dropna(axis=0)
    df = df[['총세대수', '전용면적', '전용면적별세대수', '공가수', '지하철', '버스', '단지내주차면수', '공급유형_비율',
       '지역_비율', '등록차량수']]
    return df

In [4]:
df = preprocessing(df)

In [5]:
#Splitting the data into train and test split
from sklearn.model_selection import train_test_split

X = df.iloc[:, :-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score

from time import time

In [7]:
# test cycle
from catboost import CatBoostRegressor

model = CatBoostRegressor()

model = model.fit(X_train, y_train)

y_pred = model.predict(X_test)

Learning rate set to 0.0462
0:	learn: 411.3940370	total: 138ms	remaining: 2m 17s
1:	learn: 398.6229952	total: 139ms	remaining: 1m 9s
2:	learn: 385.9485745	total: 141ms	remaining: 46.7s
3:	learn: 374.2145419	total: 142ms	remaining: 35.3s
4:	learn: 362.5644627	total: 143ms	remaining: 28.5s
5:	learn: 351.6038597	total: 145ms	remaining: 24s
6:	learn: 341.3099915	total: 146ms	remaining: 20.7s
7:	learn: 330.8436447	total: 147ms	remaining: 18.3s
8:	learn: 321.0210445	total: 148ms	remaining: 16.3s
9:	learn: 311.6974552	total: 150ms	remaining: 14.8s
10:	learn: 302.4510286	total: 151ms	remaining: 13.6s
11:	learn: 294.1031836	total: 152ms	remaining: 12.5s
12:	learn: 285.9348168	total: 154ms	remaining: 11.7s
13:	learn: 277.6958269	total: 155ms	remaining: 10.9s
14:	learn: 270.3802012	total: 156ms	remaining: 10.2s
15:	learn: 263.5270581	total: 157ms	remaining: 9.67s
16:	learn: 256.8357906	total: 158ms	remaining: 9.16s
17:	learn: 250.5517068	total: 160ms	remaining: 8.71s
18:	learn: 244.2140671	total:

162:	learn: 90.7975279	total: 1.66s	remaining: 8.54s
163:	learn: 90.2727937	total: 1.71s	remaining: 8.69s
164:	learn: 89.9476174	total: 1.72s	remaining: 8.73s
165:	learn: 89.6657435	total: 1.74s	remaining: 8.76s
166:	learn: 89.4359616	total: 1.76s	remaining: 8.79s
167:	learn: 89.0583602	total: 1.78s	remaining: 8.81s
168:	learn: 88.7322450	total: 1.8s	remaining: 8.83s
169:	learn: 88.5340160	total: 1.81s	remaining: 8.85s
170:	learn: 88.4292670	total: 1.83s	remaining: 8.86s
171:	learn: 88.2054397	total: 1.84s	remaining: 8.87s
172:	learn: 87.9804908	total: 1.86s	remaining: 8.88s
173:	learn: 87.6875483	total: 1.88s	remaining: 8.91s
174:	learn: 87.3590240	total: 1.89s	remaining: 8.91s
175:	learn: 87.1271943	total: 1.91s	remaining: 8.92s
176:	learn: 86.8931955	total: 1.92s	remaining: 8.93s
177:	learn: 86.6041154	total: 1.94s	remaining: 8.94s
178:	learn: 86.4530993	total: 1.95s	remaining: 8.94s
179:	learn: 86.0888756	total: 1.96s	remaining: 8.94s
180:	learn: 85.8355197	total: 1.98s	remaining: 

319:	learn: 63.0308452	total: 2.23s	remaining: 4.75s
320:	learn: 62.9532285	total: 2.23s	remaining: 4.73s
321:	learn: 62.8513216	total: 2.24s	remaining: 4.71s
322:	learn: 62.7913666	total: 2.24s	remaining: 4.69s
323:	learn: 62.6643250	total: 2.24s	remaining: 4.67s
324:	learn: 62.4934634	total: 2.24s	remaining: 4.65s
325:	learn: 62.4507338	total: 2.24s	remaining: 4.63s
326:	learn: 62.3882145	total: 2.24s	remaining: 4.62s
327:	learn: 62.2928306	total: 2.24s	remaining: 4.6s
328:	learn: 62.1938186	total: 2.25s	remaining: 4.58s
329:	learn: 62.1455360	total: 2.25s	remaining: 4.56s
330:	learn: 61.9949602	total: 2.25s	remaining: 4.54s
331:	learn: 61.9196629	total: 2.25s	remaining: 4.53s
332:	learn: 61.8303751	total: 2.25s	remaining: 4.51s
333:	learn: 61.7485753	total: 2.25s	remaining: 4.49s
334:	learn: 61.6947382	total: 2.25s	remaining: 4.47s
335:	learn: 61.6164408	total: 2.25s	remaining: 4.46s
336:	learn: 61.4105356	total: 2.26s	remaining: 4.44s
337:	learn: 61.3770158	total: 2.26s	remaining: 

612:	learn: 39.4220455	total: 2.62s	remaining: 1.65s
613:	learn: 39.3627929	total: 2.62s	remaining: 1.65s
614:	learn: 39.2994514	total: 2.62s	remaining: 1.64s
615:	learn: 39.2581372	total: 2.62s	remaining: 1.64s
616:	learn: 39.2115974	total: 2.63s	remaining: 1.63s
617:	learn: 39.1813162	total: 2.63s	remaining: 1.62s
618:	learn: 39.1387918	total: 2.63s	remaining: 1.62s
619:	learn: 39.1217703	total: 2.63s	remaining: 1.61s
620:	learn: 39.0298680	total: 2.63s	remaining: 1.6s
621:	learn: 38.9812107	total: 2.63s	remaining: 1.6s
622:	learn: 38.9664745	total: 2.63s	remaining: 1.59s
623:	learn: 38.8681616	total: 2.63s	remaining: 1.59s
624:	learn: 38.8286317	total: 2.64s	remaining: 1.58s
625:	learn: 38.7796473	total: 2.64s	remaining: 1.57s
626:	learn: 38.7167326	total: 2.64s	remaining: 1.57s
627:	learn: 38.6610967	total: 2.64s	remaining: 1.56s
628:	learn: 38.5970566	total: 2.64s	remaining: 1.56s
629:	learn: 38.5628275	total: 2.64s	remaining: 1.55s
630:	learn: 38.5395727	total: 2.64s	remaining: 1

893:	learn: 28.3125864	total: 3s	remaining: 356ms
894:	learn: 28.2642459	total: 3.02s	remaining: 354ms
895:	learn: 28.2376653	total: 3.03s	remaining: 352ms
896:	learn: 28.2047361	total: 3.04s	remaining: 350ms
897:	learn: 28.2034444	total: 3.06s	remaining: 347ms
898:	learn: 28.1896154	total: 3.07s	remaining: 345ms
899:	learn: 28.1187264	total: 3.08s	remaining: 343ms
900:	learn: 28.0680050	total: 3.1s	remaining: 341ms
901:	learn: 28.0570734	total: 3.11s	remaining: 338ms
902:	learn: 28.0213422	total: 3.13s	remaining: 336ms
903:	learn: 27.9970931	total: 3.14s	remaining: 333ms
904:	learn: 27.9679349	total: 3.15s	remaining: 331ms
905:	learn: 27.9398673	total: 3.17s	remaining: 329ms
906:	learn: 27.9183473	total: 3.18s	remaining: 326ms
907:	learn: 27.9168791	total: 3.19s	remaining: 324ms
908:	learn: 27.8957703	total: 3.21s	remaining: 321ms
909:	learn: 27.8412012	total: 3.22s	remaining: 319ms
910:	learn: 27.8316843	total: 3.23s	remaining: 316ms
911:	learn: 27.7976527	total: 3.25s	remaining: 314

In [8]:
ev_ = explained_variance_score(y_test, y_pred)
print("\tExplained variance:", ev_)
mae_ = mean_absolute_error(y_test, y_pred)
print("\tMean absolute error:", mae_)
r2_ = r2_score(y_test, y_pred)
print("\tR2 score:", r2_)
print()

	Explained variance: 0.9878099295078537
	Mean absolute error: 29.8928190870309
	R2 score: 0.9877734170745662



In [9]:
import joblib

joblib.dump(model, 'Catboost_GridSearchCV_model.pkl')

['Catboost_GridSearchCV_model.pkl']

In [10]:
# GridSearchCV parameters
parameters = {
    # tree의 깊이, default: 6
    'max_depth': [4, 6, 8, 10], 
    
    # 생성 가능한 최대 tree 갯수, default: 1000
    'n_estimators': [1000, 1500, 2000],
    
    # 오차 함수에서 조정하는 step의 크기, default: 자동 설정(0.03)
    'learning_rate': [0.01, 0.03, 0.1, None],
    
    # tree에 랜덤하게 부여되는 가중치, defaul: 1
    'random_strength': [1, 1.2, 1.5, 2, 4]
}


In [11]:
new_model = CatBoostRegressor(
   learning_rate=0.01, max_depth=6, n_estimators=1000, random_strength=2
)
new_model = new_model.fit(X_train, y_train)
y_pred = new_model.predict(X_test)

0:	learn: 422.2582674	total: 1.8ms	remaining: 1.8s
1:	learn: 419.6344325	total: 3.39ms	remaining: 1.69s
2:	learn: 417.1486737	total: 4.85ms	remaining: 1.61s
3:	learn: 414.3405325	total: 6.08ms	remaining: 1.51s
4:	learn: 411.8295994	total: 7.66ms	remaining: 1.52s
5:	learn: 409.0693087	total: 9.2ms	remaining: 1.52s
6:	learn: 406.1936662	total: 10.9ms	remaining: 1.54s
7:	learn: 403.4299096	total: 12.5ms	remaining: 1.55s
8:	learn: 400.7377064	total: 13.8ms	remaining: 1.51s
9:	learn: 398.0937484	total: 15ms	remaining: 1.49s
10:	learn: 395.4505755	total: 16.3ms	remaining: 1.47s
11:	learn: 392.8877627	total: 17.8ms	remaining: 1.46s
12:	learn: 390.4689759	total: 19.1ms	remaining: 1.45s
13:	learn: 387.7751224	total: 20.9ms	remaining: 1.47s
14:	learn: 385.2468747	total: 23.1ms	remaining: 1.51s
15:	learn: 382.9009108	total: 24.4ms	remaining: 1.5s
16:	learn: 380.3432102	total: 25.9ms	remaining: 1.5s
17:	learn: 378.0154954	total: 27.3ms	remaining: 1.49s
18:	learn: 375.7656530	total: 28.8ms	remainin

166:	learn: 184.6005327	total: 250ms	remaining: 1.25s
167:	learn: 183.9074970	total: 252ms	remaining: 1.25s
168:	learn: 183.3338669	total: 253ms	remaining: 1.24s
169:	learn: 182.9465795	total: 254ms	remaining: 1.24s
170:	learn: 182.3749673	total: 256ms	remaining: 1.24s
171:	learn: 181.8885225	total: 258ms	remaining: 1.24s
172:	learn: 181.2992957	total: 259ms	remaining: 1.24s
173:	learn: 180.8229228	total: 260ms	remaining: 1.24s
174:	learn: 180.2519444	total: 262ms	remaining: 1.23s
175:	learn: 179.5256532	total: 263ms	remaining: 1.23s
176:	learn: 179.1141508	total: 265ms	remaining: 1.23s
177:	learn: 178.5669070	total: 266ms	remaining: 1.23s
178:	learn: 178.0484794	total: 268ms	remaining: 1.23s
179:	learn: 177.5511762	total: 269ms	remaining: 1.22s
180:	learn: 177.0590007	total: 270ms	remaining: 1.22s
181:	learn: 176.4597576	total: 272ms	remaining: 1.22s
182:	learn: 175.9651582	total: 273ms	remaining: 1.22s
183:	learn: 175.3565441	total: 274ms	remaining: 1.22s
184:	learn: 174.9051106	tota

343:	learn: 130.9799177	total: 782ms	remaining: 1.49s
344:	learn: 130.8411325	total: 783ms	remaining: 1.49s
345:	learn: 130.7237350	total: 785ms	remaining: 1.48s
346:	learn: 130.6290557	total: 786ms	remaining: 1.48s
347:	learn: 130.3618107	total: 787ms	remaining: 1.47s
348:	learn: 130.2547431	total: 789ms	remaining: 1.47s
349:	learn: 130.0813487	total: 790ms	remaining: 1.47s
350:	learn: 129.8843379	total: 791ms	remaining: 1.46s
351:	learn: 129.7450568	total: 793ms	remaining: 1.46s
352:	learn: 129.5636127	total: 794ms	remaining: 1.45s
353:	learn: 129.4632234	total: 795ms	remaining: 1.45s
354:	learn: 129.3391673	total: 796ms	remaining: 1.45s
355:	learn: 129.1468628	total: 798ms	remaining: 1.44s
356:	learn: 129.0228652	total: 799ms	remaining: 1.44s
357:	learn: 128.8087143	total: 800ms	remaining: 1.43s
358:	learn: 128.6008037	total: 801ms	remaining: 1.43s
359:	learn: 128.3725939	total: 803ms	remaining: 1.43s
360:	learn: 128.2382793	total: 804ms	remaining: 1.42s
361:	learn: 128.1441420	tota

525:	learn: 111.8482703	total: 1.02s	remaining: 920ms
526:	learn: 111.7783790	total: 1.02s	remaining: 918ms
527:	learn: 111.6795638	total: 1.02s	remaining: 915ms
528:	learn: 111.5790981	total: 1.02s	remaining: 913ms
529:	learn: 111.5493373	total: 1.03s	remaining: 910ms
530:	learn: 111.5211685	total: 1.03s	remaining: 908ms
531:	learn: 111.4514004	total: 1.03s	remaining: 906ms
532:	learn: 111.3848763	total: 1.03s	remaining: 903ms
533:	learn: 111.3233587	total: 1.03s	remaining: 900ms
534:	learn: 111.1971524	total: 1.03s	remaining: 898ms
535:	learn: 111.1105943	total: 1.03s	remaining: 896ms
536:	learn: 111.0090812	total: 1.03s	remaining: 893ms
537:	learn: 110.9513684	total: 1.04s	remaining: 891ms
538:	learn: 110.8866210	total: 1.04s	remaining: 888ms
539:	learn: 110.8242872	total: 1.04s	remaining: 886ms
540:	learn: 110.7560837	total: 1.04s	remaining: 884ms
541:	learn: 110.6676248	total: 1.04s	remaining: 881ms
542:	learn: 110.6323739	total: 1.04s	remaining: 879ms
543:	learn: 110.5618580	tota

685:	learn: 101.9498121	total: 1.23s	remaining: 565ms
686:	learn: 101.8663552	total: 1.24s	remaining: 563ms
687:	learn: 101.7792388	total: 1.24s	remaining: 561ms
688:	learn: 101.7315144	total: 1.24s	remaining: 559ms
689:	learn: 101.7027515	total: 1.24s	remaining: 557ms
690:	learn: 101.6399807	total: 1.24s	remaining: 555ms
691:	learn: 101.5936091	total: 1.24s	remaining: 553ms
692:	learn: 101.5438287	total: 1.24s	remaining: 551ms
693:	learn: 101.4789353	total: 1.25s	remaining: 549ms
694:	learn: 101.4002481	total: 1.25s	remaining: 547ms
695:	learn: 101.3587479	total: 1.25s	remaining: 545ms
696:	learn: 101.3003261	total: 1.25s	remaining: 543ms
697:	learn: 101.2218851	total: 1.25s	remaining: 541ms
698:	learn: 101.1863501	total: 1.25s	remaining: 539ms
699:	learn: 101.1213858	total: 1.25s	remaining: 537ms
700:	learn: 101.0908189	total: 1.25s	remaining: 535ms
701:	learn: 101.0494773	total: 1.26s	remaining: 533ms
702:	learn: 101.0029952	total: 1.26s	remaining: 531ms
703:	learn: 100.9324520	tota

863:	learn: 91.9353550	total: 1.75s	remaining: 276ms
864:	learn: 91.8995789	total: 1.75s	remaining: 273ms
865:	learn: 91.8418287	total: 1.75s	remaining: 271ms
866:	learn: 91.8073253	total: 1.75s	remaining: 269ms
867:	learn: 91.7741705	total: 1.76s	remaining: 267ms
868:	learn: 91.7423955	total: 1.76s	remaining: 265ms
869:	learn: 91.6765125	total: 1.76s	remaining: 263ms
870:	learn: 91.5985438	total: 1.76s	remaining: 261ms
871:	learn: 91.5287243	total: 1.76s	remaining: 259ms
872:	learn: 91.4576112	total: 1.76s	remaining: 256ms
873:	learn: 91.4136797	total: 1.76s	remaining: 254ms
874:	learn: 91.3807828	total: 1.76s	remaining: 252ms
875:	learn: 91.3251306	total: 1.77s	remaining: 250ms
876:	learn: 91.2396594	total: 1.77s	remaining: 248ms
877:	learn: 91.1997065	total: 1.77s	remaining: 246ms
878:	learn: 91.1318156	total: 1.77s	remaining: 244ms
879:	learn: 91.0679182	total: 1.77s	remaining: 242ms
880:	learn: 91.0237880	total: 1.77s	remaining: 240ms
881:	learn: 90.9895972	total: 1.77s	remaining:

In [12]:
print("\tExplained variance:", explained_variance_score(y_test, y_pred))
print("\tMean absolute error:", mean_absolute_error(y_test, y_pred))
print("\tR2 score:", r2_score(y_test, y_pred))
print()

	Explained variance: 0.9405171045220978
	Mean absolute error: 70.72223195764994
	R2 score: 0.9404723370934964

