In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 글꼴깨짐 방지
import matplotlib
import matplotlib.font_manager as fm

import warnings

# 경고 메시지를 무시하고 숨기거나
warnings.filterwarnings(action='ignore')

fm.get_fontconfig_fonts()
font_location = 'C:/Windows/Fonts/malgun.ttf' # For Windows
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)
font_name = fm.FontProperties(fname=font_location).get_name()
matplotlib.rc('font', family=font_name)

In [2]:
df = pd.read_csv('train.csv')

In [3]:
def preprocessing(df):
    # 오류 단지코드가 존재하는 행들을  사전에 제거
    df_error =  ['C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']
    #df_error =  ['C2335', 'C1327']
    df = df[~df['단지코드'].isin(df_error)].reset_index(drop=True)
    df.rename(columns = {'도보 10분거리 내 지하철역 수(환승노선 수 반영)':'지하철','도보 10분거리 내 버스정류장 수':'버스'},inplace=True)
    df.drop(columns=['임대보증금','임대료','자격유형','임대건물구분'],axis = 1,inplace=True)
    지역_비율 = (df.groupby(['지역'])['총세대수'].count())/(df.groupby(['지역'])['총세대수'].count().sum())*100
    지역_비율=지역_비율.reset_index(name='지역_비율')
    공급유형_비율 = (df.groupby(['공급유형'])['총세대수'].count())/(df.groupby(['공급유형'])['총세대수'].count().sum())*100
    공급유형_비율=공급유형_비율.reset_index(name='공급유형_비율')
    df = pd.merge(df,지역_비율, on='지역')
    df = pd.merge(df,공급유형_비율, on='공급유형')
    df.drop(columns=['지역','공급유형','단지코드'],axis = 1,inplace=True)
    df=df.dropna(axis=0)
    df = df[['총세대수', '전용면적', '전용면적별세대수', '공가수', '지하철', '버스', '단지내주차면수', '공급유형_비율',
       '지역_비율', '등록차량수']]
    return df

In [4]:
df = preprocessing(df)

In [5]:
#Splitting the data into train and test split
from sklearn.model_selection import train_test_split
X = df.iloc[:, 1:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score

from time import time

from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [7]:
regressors = [
    KNeighborsRegressor(),
    GradientBoostingRegressor(),
    KNeighborsRegressor(),
    ExtraTreesRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    XGBRegressor(),
    LinearRegression(),
    Lasso(),
    Ridge()
]

In [8]:
head = 10
for model in regressors[:head]:
    start = time()
    model.fit(X_train, y_train)
    train_time = time() - start
    start = time()
    y_pred = model.predict(X_test)
    predict_time = time()-start    
    print(model)
    print("\tTraining time: %0.3fs" % train_time)
    print("\tPrediction time: %0.3fs" % predict_time)
    print("\tExplained variance:", explained_variance_score(y_test, y_pred))
    print("\tMean absolute error:", mean_absolute_error(y_test, y_pred))
    print("\tR2 score:", r2_score(y_test, y_pred))
    print()

KNeighborsRegressor()
	Training time: 0.003s
	Prediction time: 0.002s
	Explained variance: 0.8260783220523491
	Mean absolute error: 108.38919925512104
	R2 score: 0.8260188657053464

GradientBoostingRegressor()
	Training time: 0.120s
	Prediction time: 0.001s
	Explained variance: 0.924364697996993
	Mean absolute error: 80.24586319771741
	R2 score: 0.9243483477911816

KNeighborsRegressor()
	Training time: 0.001s
	Prediction time: 0.002s
	Explained variance: 0.8260783220523491
	Mean absolute error: 108.38919925512104
	R2 score: 0.8260188657053464

ExtraTreesRegressor()
	Training time: 0.142s
	Prediction time: 0.008s
	Explained variance: 0.9977296516170802
	Mean absolute error: 7.691359404096836
	R2 score: 0.9977227892993479

RandomForestRegressor()
	Training time: 0.522s
	Prediction time: 0.010s
	Explained variance: 0.9835115414637607
	Mean absolute error: 22.944320297951585
	R2 score: 0.9835011153681112

DecisionTreeRegressor()
	Training time: 0.014s
	Prediction time: 0.001s
	Explained va

In [9]:
parameters = { 'loss' : ['ls', 'lad', 'huber', 'quantile'],
                  'learning_rate' : (0.05,0.25,0.50,1),
                  'criterion' : ['mae'],
                  'max_features' : ['auto', 'sqrt', 'log2']
                 }

In [11]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(GradientBoostingRegressor(),parameters, n_jobs=-1, verbose=10)
model = grid.fit(X,y)
print(model.best_params_,'\n')
print(model.best_estimator_,'\n')

Fitting 5 folds for each of 48 candidates, totalling 240 fits
{'criterion': 'mae', 'learning_rate': 0.25, 'loss': 'huber', 'max_features': 'sqrt'} 

GradientBoostingRegressor(criterion='mae', learning_rate=0.25, loss='huber',
                          max_features='sqrt') 



In [12]:
model

GridSearchCV(estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid={'criterion': ['mae'],
                         'learning_rate': (0.05, 0.25, 0.5, 1),
                         'loss': ['ls', 'lad', 'huber', 'quantile'],
                         'max_features': ['auto', 'sqrt', 'log2']},
             verbose=10)

In [13]:
scores_df = pd.DataFrame(grid.cv_results_)
scores_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_learning_rate,param_loss,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,12.860881,0.615922,0.0022,0.0004002097,mae,0.05,ls,auto,"{'criterion': 'mae', 'learning_rate': 0.05, 'l...",0.147497,0.830828,0.697457,0.785221,0.728245,0.63785,0.249451,9
1,4.741872,0.182703,0.002,3.234067e-07,mae,0.05,ls,sqrt,"{'criterion': 'mae', 'learning_rate': 0.05, 'l...",0.512974,0.666298,0.643087,0.737372,0.523174,0.616581,0.086283,15
2,6.032202,0.188027,0.001852,0.0008348661,mae,0.05,ls,log2,"{'criterion': 'mae', 'learning_rate': 0.05, 'l...",0.336413,0.783504,0.656489,0.767597,0.628132,0.634427,0.160808,10
3,9.80169,0.519026,0.002062,0.0006852419,mae,0.05,lad,auto,"{'criterion': 'mae', 'learning_rate': 0.05, 'l...",0.610708,0.654268,0.662513,0.702255,0.630767,0.652102,0.030975,4
4,3.511001,0.075894,0.002019,3.729019e-05,mae,0.05,lad,sqrt,"{'criterion': 'mae', 'learning_rate': 0.05, 'l...",0.572824,0.516433,0.672175,0.614036,0.402242,0.555542,0.092037,23
5,4.71621,0.144636,0.002501,0.001002026,mae,0.05,lad,log2,"{'criterion': 'mae', 'learning_rate': 0.05, 'l...",0.578246,0.600167,0.657725,0.673384,0.435673,0.589039,0.084372,20
6,13.39413,0.677414,0.00207,0.001326393,mae,0.05,huber,auto,"{'criterion': 'mae', 'learning_rate': 0.05, 'l...",0.249517,0.726433,0.696761,0.773209,0.695655,0.628315,0.191483,13
7,4.993728,0.319486,0.0022,0.001469256,mae,0.05,huber,sqrt,"{'criterion': 'mae', 'learning_rate': 0.05, 'l...",0.580947,0.633461,0.630913,0.680855,0.615731,0.628381,0.032236,12
8,6.23296,0.240274,0.0028,0.0009794913,mae,0.05,huber,log2,"{'criterion': 'mae', 'learning_rate': 0.05, 'l...",0.466345,0.681353,0.666653,0.754997,0.629162,0.639702,0.095831,8
9,10.505282,0.841654,0.0018,0.0003996134,mae,0.05,quantile,auto,"{'criterion': 'mae', 'learning_rate': 0.05, 'l...",-0.495611,-0.028097,-0.612724,-4.186576,0.552519,-0.954098,1.667515,48


In [14]:
grid.best_score_

0.6913042837631791

In [15]:
import joblib

joblib.dump(model, 'GradientBoostingRegressor.pkl')

['GradientBoostingRegressor.pkl']

In [17]:
GradientBoostingRegressor = joblib.load('GradientBoostingRegressor.pkl')
pred = GradientBoostingRegressor.predict(X)
pred

array([574.34639834, 533.09886905, 522.20146648, ..., 588.01608572,
       634.9109916 , 713.43272134])