In [247]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 글꼴깨짐 방지
import matplotlib
import matplotlib.font_manager as fm

import warnings

# 경고 메시지를 무시하고 숨기거나
warnings.filterwarnings(action='ignore')

fm.get_fontconfig_fonts()
font_location = 'C:/Windows/Fonts/malgun.ttf' # For Windows
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)

In [248]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
0,C2483,900,아파트,경상북도,국민임대,39.72,134,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
1,C2483,900,아파트,경상북도,국민임대,39.72,15,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
2,C2483,900,아파트,경상북도,국민임대,51.93,385,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
3,C2483,900,아파트,경상북도,국민임대,51.93,15,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
4,C2483,900,아파트,경상북도,국민임대,51.93,41,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0


In [249]:
def preprocessing(df):
    # 오류 단지코드가 존재하는 행들을  사전에 제거
    df_error =  ['C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']
    #df_error =  ['C2335', 'C1327']
    df = df[~df['단지코드'].isin(df_error)].reset_index(drop=True)
    df.rename(columns = {'도보 10분거리 내 지하철역 수(환승노선 수 반영)':'지하철','도보 10분거리 내 버스정류장 수':'버스'},inplace=True)
    df.drop(columns=['임대보증금','임대료','자격유형','임대건물구분'],axis = 1,inplace=True)
    지역_비율 = (df.groupby(['지역'])['총세대수'].count())/(df.groupby(['지역'])['총세대수'].count().sum())*100
    지역_비율=지역_비율.reset_index(name='지역_비율')
    공급유형_비율 = (df.groupby(['공급유형'])['총세대수'].count())/(df.groupby(['공급유형'])['총세대수'].count().sum())*100
    공급유형_비율=공급유형_비율.reset_index(name='공급유형_비율')
    df = pd.merge(df,지역_비율, on='지역')
    df = pd.merge(df,공급유형_비율, on='공급유형')
    df.drop(columns=['지역','공급유형','단지코드'],axis = 1,inplace=True)
    df=df.dropna(axis=0)
    df = df[['총세대수', '전용면적', '전용면적별세대수', '공가수', '지하철', '버스', '단지내주차면수', '공급유형_비율',
       '지역_비율', '등록차량수']]
    return df

In [250]:
df = preprocessing(df)

In [251]:
!pip install XGBoost

Defaulting to user installation because normal site-packages is not writeable


In [252]:
#Splitting the data into train and test split
from sklearn.model_selection import train_test_split
X = df.iloc[:, 1:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [253]:
from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score

from time import time

from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [254]:
regressors = [
    KNeighborsRegressor(),
    GradientBoostingRegressor(),
    KNeighborsRegressor(),
    ExtraTreesRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    XGBRegressor(),
    LinearRegression(),
    Lasso(),
    Ridge()
]

In [255]:
head = 10
for model in regressors[:head]:
    start = time()
    model.fit(X_train, y_train)
    train_time = time() - start
    start = time()
    y_pred = model.predict(X_test)
    predict_time = time()-start    
    print(model)
    print("\tTraining time: %0.3fs" % train_time)
    print("\tPrediction time: %0.3fs" % predict_time)
    print("\tExplained variance:", explained_variance_score(y_test, y_pred))
    print("\tMean absolute error:", mean_absolute_error(y_test, y_pred))
    print("\tR2 score:", r2_score(y_test, y_pred))
    print()

KNeighborsRegressor()
	Training time: 0.004s
	Prediction time: 0.003s
	Explained variance: 0.8260783220523491
	Mean absolute error: 108.38919925512104
	R2 score: 0.8260188657053464

GradientBoostingRegressor()
	Training time: 0.178s
	Prediction time: 0.002s
	Explained variance: 0.9244758101356931
	Mean absolute error: 80.11029879321003
	R2 score: 0.9244607786132508

KNeighborsRegressor()
	Training time: 0.004s
	Prediction time: 0.004s
	Explained variance: 0.8260783220523491
	Mean absolute error: 108.38919925512104
	R2 score: 0.8260188657053464

ExtraTreesRegressor()
	Training time: 0.212s
	Prediction time: 0.013s
	Explained variance: 0.9974473958993291
	Mean absolute error: 8.144413407821228
	R2 score: 0.9974460199845662

RandomForestRegressor()
	Training time: 0.408s
	Prediction time: 0.013s
	Explained variance: 0.985648326318817
	Mean absolute error: 21.816480446927375
	R2 score: 0.9856460828028577

DecisionTreeRegressor()
	Training time: 0.007s
	Prediction time: 0.001s
	Explained va

In [256]:
parameters = {'n_estimators':[100],
              'learning_rate' : [0.1],
              'max_depth':[3],#트리의 최대 깊이
              'gamma' : [0],#분할을 수행하는데 필요한 최소 손실 감소를 지정, loss function에 따라 조정해야함
              'colsample_bytree':[0.9],#각 트리마다의 feature 샘플링 비율
              'eval_metric': ['mae'],
              'scale_pos_weight':[0.5],
              'objective':['reg:linear'],
              'subsample': [0.5],#각 트리마다의 관측 데이터 샘플링 비율, 값을 적게 주면 오버피팅, 더 작게 주면 언더 피팅
              'min_child_weight': [22],
              'seed': [1337],
              'eta': [0.02],
              
             }

In [257]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(XGBRegressor(),parameters,verbose=10)
model = grid.fit(X,y)
print(model.best_params_,'\n')
print(model.best_estimator_,'\n')

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START colsample_bytree=0.9, eta=0.02, eval_metric=mae, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=22, n_estimators=100, objective=reg:linear, scale_pos_weight=0.5, seed=1337, subsample=0.5
[CV 1/5; 1/1] END colsample_bytree=0.9, eta=0.02, eval_metric=mae, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=22, n_estimators=100, objective=reg:linear, scale_pos_weight=0.5, seed=1337, subsample=0.5;, score=0.403 total time=   0.0s
[CV 2/5; 1/1] START colsample_bytree=0.9, eta=0.02, eval_metric=mae, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=22, n_estimators=100, objective=reg:linear, scale_pos_weight=0.5, seed=1337, subsample=0.5
[CV 2/5; 1/1] END colsample_bytree=0.9, eta=0.02, eval_metric=mae, gamma=0, learning_rate=0.1, max_depth=3, min_child_weight=22, n_estimators=100, objective=reg:linear, scale_pos_weight=0.5, seed=1337, subsample=0.5;, score=0.844 total time=   0.0s
[CV 3/5

In [258]:
model

GridSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None, gpu_id=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_bi...
                                    monotone_constraints=None, n_estimators=100,
                                    n_jobs=None, num_parallel_tree=None,
                                    predictor=None, random_state=None, ...),
             param_grid={'colsample_bytree': [0.9], 'eta': [0.02],
        

In [259]:
scores_df = pd.DataFrame(grid.cv_results_)
scores_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_eta,param_eval_metric,param_gamma,param_learning_rate,param_max_depth,...,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.055348,0.00572,0.002611,0.000495,0.9,0.02,mae,0,0.1,3,...,0.5,"{'colsample_bytree': 0.9, 'eta': 0.02, 'eval_m...",0.403446,0.844446,0.709554,0.804589,0.731556,0.698718,0.155452,1


In [260]:
grid.best_score_

0.6987182360417263

In [261]:
import joblib

joblib.dump(model, 'XGBRegressor.pkl')

['XGBRegressor.pkl']

In [262]:
XGBRegressor = joblib.load('XGBRegressor.pkl')
pred = XGBRegressor.predict(X)
pred

array([475.4874 , 503.50873, 491.30405, ..., 684.4114 , 720.46814,
       802.7541 ], dtype=float32)