In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 글꼴깨짐 방지
import matplotlib
import matplotlib.font_manager as fm

import warnings

# 경고 메시지를 무시하고 숨기거나
warnings.filterwarnings(action='ignore')

fm.get_fontconfig_fonts()
font_location = 'C:/Windows/Fonts/malgun.ttf' # For Windows
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)

In [2]:
df = pd.read_csv('train.csv')

In [3]:
def preprocessing(df):
    # 오류 단지코드가 존재하는 행들을  사전에 제거
    df_error =  ['C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']
    #df_error =  ['C2335', 'C1327']
    df = df[~df['단지코드'].isin(df_error)].reset_index(drop=True)
    df.rename(columns = {'도보 10분거리 내 지하철역 수(환승노선 수 반영)':'지하철','도보 10분거리 내 버스정류장 수':'버스'},inplace=True)
    df.drop(columns=['임대보증금','임대료','자격유형','임대건물구분'],axis = 1,inplace=True)
    지역_비율 = (df.groupby(['지역'])['총세대수'].count())/(df.groupby(['지역'])['총세대수'].count().sum())*100
    지역_비율=지역_비율.reset_index(name='지역_비율')
    공급유형_비율 = (df.groupby(['공급유형'])['총세대수'].count())/(df.groupby(['공급유형'])['총세대수'].count().sum())*100
    공급유형_비율=공급유형_비율.reset_index(name='공급유형_비율')
    df = pd.merge(df,지역_비율, on='지역')
    df = pd.merge(df,공급유형_비율, on='공급유형')
    df.drop(columns=['지역','공급유형','단지코드'],axis = 1,inplace=True)
    df=df.dropna(axis=0)
    df = df[['총세대수', '전용면적', '전용면적별세대수', '공가수', '지하철', '버스', '단지내주차면수', '공급유형_비율',
       '지역_비율', '등록차량수']]
    return df

In [4]:
df = preprocessing(df)

In [9]:
#Splitting the data into train and test split
from sklearn.model_selection import train_test_split
X = df.iloc[:, 1:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score

from time import time

In [20]:
# test cycle
from catboost import CatBoostRegressor

model = CatBoostRegressor()
start = time()
model = model.fit(X_train, y_train)
train_time = time() - start
start = time()
y_pred = model.predict(X_test)
predict_time = time() - start

Learning rate set to 0.0462
0:	learn: 411.9835942	total: 15ms	remaining: 15s
1:	learn: 399.2638885	total: 31.1ms	remaining: 15.5s
2:	learn: 386.9914956	total: 44.8ms	remaining: 14.9s
3:	learn: 375.1169183	total: 58ms	remaining: 14.4s
4:	learn: 363.4540440	total: 70.9ms	remaining: 14.1s
5:	learn: 352.3173797	total: 85ms	remaining: 14.1s
6:	learn: 342.1581667	total: 98.5ms	remaining: 14s
7:	learn: 332.6173845	total: 112ms	remaining: 13.9s
8:	learn: 323.3894917	total: 126ms	remaining: 13.9s
9:	learn: 314.5482799	total: 140ms	remaining: 13.8s
10:	learn: 305.4289867	total: 154ms	remaining: 13.8s
11:	learn: 296.7823077	total: 170ms	remaining: 14s
12:	learn: 289.6188880	total: 185ms	remaining: 14.1s
13:	learn: 281.5116338	total: 200ms	remaining: 14.1s
14:	learn: 274.0491608	total: 213ms	remaining: 14s
15:	learn: 267.8042729	total: 226ms	remaining: 13.9s
16:	learn: 262.2963716	total: 240ms	remaining: 13.9s
17:	learn: 256.0377489	total: 254ms	remaining: 13.8s
18:	learn: 249.8287312	total: 266ms

167:	learn: 95.0183151	total: 1.17s	remaining: 5.8s
168:	learn: 94.8324981	total: 1.19s	remaining: 5.83s
169:	learn: 94.7248298	total: 1.2s	remaining: 5.86s
170:	learn: 94.5637551	total: 1.21s	remaining: 5.88s
171:	learn: 94.4243934	total: 1.23s	remaining: 5.9s
172:	learn: 94.2753135	total: 1.24s	remaining: 5.93s
173:	learn: 94.1697875	total: 1.25s	remaining: 5.95s
174:	learn: 93.9497356	total: 1.27s	remaining: 5.97s
175:	learn: 93.5806094	total: 1.28s	remaining: 5.99s
176:	learn: 93.5107812	total: 1.29s	remaining: 6.01s
177:	learn: 93.3949014	total: 1.3s	remaining: 6.03s
178:	learn: 93.2283208	total: 1.32s	remaining: 6.05s
179:	learn: 93.0848439	total: 1.33s	remaining: 6.07s
180:	learn: 93.0169542	total: 1.35s	remaining: 6.09s
181:	learn: 92.9429345	total: 1.36s	remaining: 6.1s
182:	learn: 92.7260447	total: 1.36s	remaining: 6.08s
183:	learn: 92.4873507	total: 1.36s	remaining: 6.06s
184:	learn: 92.2439107	total: 1.37s	remaining: 6.03s
185:	learn: 92.1278331	total: 1.37s	remaining: 6s
1

348:	learn: 69.4562041	total: 2.15s	remaining: 4.01s
349:	learn: 69.3990579	total: 2.15s	remaining: 4s
350:	learn: 69.2874501	total: 2.15s	remaining: 3.98s
351:	learn: 69.2278087	total: 2.15s	remaining: 3.97s
352:	learn: 69.0776187	total: 2.16s	remaining: 3.95s
353:	learn: 69.0156264	total: 2.16s	remaining: 3.94s
354:	learn: 69.0084336	total: 2.16s	remaining: 3.92s
355:	learn: 68.8642704	total: 2.16s	remaining: 3.91s
356:	learn: 68.8029537	total: 2.16s	remaining: 3.89s
357:	learn: 68.6232029	total: 2.16s	remaining: 3.88s
358:	learn: 68.5412126	total: 2.16s	remaining: 3.86s
359:	learn: 68.4258613	total: 2.16s	remaining: 3.85s
360:	learn: 68.2748798	total: 2.17s	remaining: 3.83s
361:	learn: 68.1519135	total: 2.17s	remaining: 3.82s
362:	learn: 68.0646404	total: 2.17s	remaining: 3.81s
363:	learn: 67.9560663	total: 2.17s	remaining: 3.79s
364:	learn: 67.8922442	total: 2.17s	remaining: 3.78s
365:	learn: 67.6512213	total: 2.17s	remaining: 3.76s
366:	learn: 67.5980562	total: 2.17s	remaining: 3.

510:	learn: 56.1082747	total: 2.35s	remaining: 2.25s
511:	learn: 56.0782236	total: 2.35s	remaining: 2.24s
512:	learn: 56.0563397	total: 2.36s	remaining: 2.24s
513:	learn: 55.9606057	total: 2.37s	remaining: 2.24s
514:	learn: 55.8989318	total: 2.39s	remaining: 2.25s
515:	learn: 55.8384931	total: 2.4s	remaining: 2.25s
516:	learn: 55.8054151	total: 2.42s	remaining: 2.26s
517:	learn: 55.7513132	total: 2.43s	remaining: 2.26s
518:	learn: 55.7112675	total: 2.44s	remaining: 2.26s
519:	learn: 55.5734761	total: 2.45s	remaining: 2.27s
520:	learn: 55.5136823	total: 2.47s	remaining: 2.27s
521:	learn: 55.4190171	total: 2.48s	remaining: 2.27s
522:	learn: 55.3830589	total: 2.49s	remaining: 2.27s
523:	learn: 55.2911767	total: 2.51s	remaining: 2.28s
524:	learn: 55.2397712	total: 2.52s	remaining: 2.28s
525:	learn: 55.1987310	total: 2.53s	remaining: 2.28s
526:	learn: 55.1361662	total: 2.55s	remaining: 2.29s
527:	learn: 55.1260146	total: 2.56s	remaining: 2.29s
528:	learn: 55.0446554	total: 2.57s	remaining: 

763:	learn: 42.7231447	total: 2.93s	remaining: 904ms
764:	learn: 42.6593309	total: 2.93s	remaining: 901ms
765:	learn: 42.6433833	total: 2.94s	remaining: 899ms
766:	learn: 42.6309585	total: 2.96s	remaining: 898ms
767:	learn: 42.5846408	total: 2.97s	remaining: 898ms
768:	learn: 42.5174262	total: 2.98s	remaining: 897ms
769:	learn: 42.4998350	total: 3s	remaining: 895ms
770:	learn: 42.4910053	total: 3.01s	remaining: 894ms
771:	learn: 42.4547304	total: 3.02s	remaining: 892ms
772:	learn: 42.3905137	total: 3.03s	remaining: 891ms
773:	learn: 42.3498454	total: 3.05s	remaining: 890ms
774:	learn: 42.2589543	total: 3.06s	remaining: 888ms
775:	learn: 42.2073643	total: 3.07s	remaining: 887ms
776:	learn: 42.1677675	total: 3.08s	remaining: 885ms
777:	learn: 42.0903272	total: 3.1s	remaining: 884ms
778:	learn: 42.0129768	total: 3.11s	remaining: 882ms
779:	learn: 41.9896125	total: 3.12s	remaining: 880ms
780:	learn: 41.9571981	total: 3.13s	remaining: 879ms
781:	learn: 41.8660517	total: 3.15s	remaining: 877

930:	learn: 35.8706971	total: 3.9s	remaining: 289ms
931:	learn: 35.8333015	total: 3.92s	remaining: 286ms
932:	learn: 35.8054354	total: 3.93s	remaining: 282ms
933:	learn: 35.7603972	total: 3.94s	remaining: 279ms
934:	learn: 35.7034993	total: 3.95s	remaining: 275ms
935:	learn: 35.6805851	total: 3.97s	remaining: 271ms
936:	learn: 35.6681855	total: 3.98s	remaining: 267ms
937:	learn: 35.6469547	total: 3.99s	remaining: 264ms
938:	learn: 35.6267041	total: 4s	remaining: 260ms
939:	learn: 35.6039481	total: 4.01s	remaining: 256ms
940:	learn: 35.6016335	total: 4.02s	remaining: 252ms
941:	learn: 35.5363861	total: 4.04s	remaining: 248ms
942:	learn: 35.4654193	total: 4.04s	remaining: 244ms
943:	learn: 35.4488156	total: 4.04s	remaining: 240ms
944:	learn: 35.4246263	total: 4.04s	remaining: 235ms
945:	learn: 35.4055531	total: 4.05s	remaining: 231ms
946:	learn: 35.3827462	total: 4.05s	remaining: 227ms
947:	learn: 35.3524105	total: 4.05s	remaining: 222ms
948:	learn: 35.3269150	total: 4.05s	remaining: 218

In [21]:
print("\tTraining time: %0.3fs" % train_time)
print("\tPrediction time: %0.3fs" % predict_time)
print("\tExplained variance:", explained_variance_score(y_test, y_pred))
print("\tMean absolute error:", mean_absolute_error(y_test, y_pred))
print("\tR2 score:", r2_score(y_test, y_pred))
print()

	Training time: 4.515s
	Prediction time: 0.008s
	Explained variance: 0.9814145542462404
	Mean absolute error: 36.84444338132358
	R2 score: 0.9813679019618019



In [None]:
# GridSearchCV parameters
parameters = {
    
}

In [None]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(model, parameters, n_jobs=6, verbose=10)
model = grid.fit(X,y)

print(model.best_params_,'\n')
print(model.best_estimator_,'\n')

In [None]:
model

In [None]:
scores_df = pd.DataFrame(grid.cv_results_)
scores_df

In [None]:
grid.best_score_

In [None]:
import joblib

joblib.dump(gs_model, 'Catboost_GridSearchCV_model.pkl')