In [97]:
from pycaret.regression import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for korean plotting
plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['axes.unicode_minus'] = False

# Pandas option
pd.set_option('display.max_columns', None)

In [109]:
# data load
df = pd.read_csv('assets_preprocessed.csv')
df['아파트_단지']=df['아파트_단지'].fillna(0)
df= df.drop('geometry',axis=1)

In [111]:
reg = setup(
    data = df, target='거래금액', session_id=123, 
    transformation=True, normalize=True, numeric_imputation='mode', categorical_imputation='mode',
    remove_multicollinearity=True, multicollinearity_threshold=0.95, 
    ordinal_features={'살인':[1,2,3,4,5], '강도':[1,2,3,4,5], '강간':[1,2,3,4,5], '절도':[1,2,3,4,5], '폭력':[1,2,3,4,5], '방화':[1,2,3,4,5],
    '마약':[1,2,3,4,5], '약취':[1,2,3,4,5], '도박':[1,2,3,4,5]}, categorical_features=['1층유무','용도_판매','용도_제2종근린생활'],
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,거래금액
2,Target type,Regression
3,Data shape,"(5233, 34)"
4,Train data shape,"(3663, 34)"
5,Test data shape,"(1570, 34)"
6,Ordinal features,12
7,Numeric features,33
8,Categorical features,3
9,Preprocess,True


In [112]:
compare_models(errors='raise')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
et,Extra Trees Regressor,14973.6339,1371325468.5129,36252.48,0.9183,0.5186,0.4939,0.221
lightgbm,Light Gradient Boosting Machine,16517.2671,1543306444.9376,37502.7391,0.91,0.5577,0.5118,0.042
rf,Random Forest Regressor,16806.9379,1602698455.2981,39091.2765,0.905,0.5424,0.5412,0.48
gbr,Gradient Boosting Regressor,20173.6168,1563526592.1573,38909.2775,0.904,0.6564,0.7048,0.21
dt,Decision Tree Regressor,20614.6876,2796535657.1529,51886.869,0.8233,0.6635,0.5382,0.022
knn,K Neighbors Regressor,24433.493,2872011801.6,52576.9727,0.8232,0.7079,0.7793,0.015
ada,AdaBoost Regressor,52004.3571,4153760161.8968,63889.8895,0.7466,1.3991,4.4509,0.069
br,Bayesian Ridge,54871.3469,11086963181.3318,104403.685,0.3404,1.262,3.1226,0.014
llar,Lasso Least Angle Regression,55396.3887,11086820860.7672,104421.2123,0.3398,1.2805,3.1809,0.013
lar,Least Angle Regression,55505.9884,11087525755.2579,104430.5033,0.3395,1.2723,3.1863,0.013


In [113]:
lgbm = create_model('lightgbm')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,14399.0038,651292771.4775,25520.4383,0.9569,0.64,0.4609
1,16221.8195,928566357.762,30472.3868,0.9396,0.5919,0.6129
2,16886.7798,1536125325.2583,39193.4347,0.9117,0.5687,0.4954
3,15252.2057,844525859.8798,29060.7271,0.9454,0.4664,0.4219
4,17406.9218,1079791024.3657,32860.1738,0.8943,0.5701,0.58
5,15010.6815,1054104918.0477,32466.982,0.8868,0.5407,0.5283
6,16456.6507,1330611451.382,36477.5472,0.9312,0.5706,0.5233
7,21103.3778,4524503737.4691,67264.4314,0.7985,0.5707,0.577
8,17084.2253,2437942940.7566,49375.5298,0.8746,0.5092,0.4801
9,15351.0049,1045600062.9777,32335.7397,0.9606,0.5483,0.4381


In [None]:
tuned_lgbm = tune_model(lgbm)

In [116]:
plot_model(tuned_lgbm, plot = 'parameter')

Unnamed: 0,Parameters
boosting_type,gbdt
class_weight,
colsample_bytree,1.0
importance_type,split
learning_rate,0.1
max_depth,-1
min_child_samples,20
min_child_weight,0.001
min_split_gain,0.0
n_estimators,100


In [115]:
plot_model(tuned_lgbm, plot = 'error', save=True)

'Prediction Error.png'

In [117]:
# FeatureImportancePlot
plot_model(tuned_lgbm, plot='feature_all', save=True)

'Feature Importance (All).png'

In [127]:
# AUC plot
plot_model(tuned_lgbm, plot='learning', save=True)

'Learning Curve.png'

In [123]:
# Feature Importance as df
featureimportance = pd.DataFrame([tuned_lgbm.feature_name_, tuned_lgbm.feature_importances_]).T
featureimportance.sort_values(by=1, ascending=False)
featureimportance = featureimportance.reset_index(drop=True)
featureimportance.columns = ['Col','featureimportance']
featureimportance.to_csv('featureimportance.csv', index=False)
featureimportance

Unnamed: 0,Col,featureimportance
0,전용면적,720
1,1층유무,94
2,연식,350
3,용도_제2종근린생활,37
4,용도_판매,8
5,살인,30
6,강도,37
7,강간,20
8,절도,20
9,폭력,18


In [131]:
et = create_model('et')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,12533.0661,783084680.7347,27983.6502,0.9482,0.4735,0.4114
1,15287.4465,920102063.2969,30333.1842,0.9401,0.5677,0.5694
2,15502.581,1348748194.4779,36725.3073,0.9225,0.4891,0.4366
3,14042.2724,1114706430.6823,33387.2196,0.928,0.44,0.3665
4,15551.0933,902591210.9755,30043.1558,0.9117,0.5519,0.5854
5,14735.483,1089047559.1335,33000.7206,0.8831,0.5405,0.4747
6,14401.0411,1382267325.3502,37178.8559,0.9285,0.5106,0.4594
7,17412.0943,3065626824.3293,55368.103,0.8634,0.5796,0.6135
8,15174.7153,1834374058.4305,42829.5933,0.9056,0.5503,0.5806
9,15096.5463,1272706337.7187,35675.01,0.952,0.4833,0.4417


In [132]:
tuned_et = tune_model(et)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,18988.4305,1125866313.3859,33553.9314,0.9256,0.6333,0.7462
1,20062.1571,1142488570.7953,33800.7185,0.9256,0.6819,0.8543
2,21664.2562,1738915036.3967,41700.3002,0.9,0.656,0.8087
3,19324.7249,1136038089.1346,33705.1641,0.9266,0.5838,0.6438
4,19133.7251,903192359.412,30053.1589,0.9116,0.633,0.7588
5,19465.7471,1298293143.0116,36031.8351,0.8606,0.6756,0.8219
6,20692.6595,1682215858.7934,41014.8249,0.913,0.6739,0.7998
7,23872.9488,3149044285.7156,56116.346,0.8597,0.701,0.8875
8,21650.9481,2664194193.1282,51615.8328,0.8629,0.6732,0.8694
9,21490.0326,1802749348.0613,42458.7959,0.9321,0.6341,0.7411


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [135]:
tuned_et.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 123,
 'verbose': 0,
 'warm_start': False}