In [23]:
from pycaret.regression import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for korean plotting
plt.rcParams['font.family'] = 'AppleGothic'
plt.rcParams['axes.unicode_minus'] = False

# Pandas option
pd.set_option('display.max_columns', None)

In [24]:
# data load
df = pd.read_csv('df_preprocessed.csv')

# Endog, Exog, add Constatnt
X = df[df.columns[~df.columns.str.contains('생활인구')]]
y = df['총_생활인구']
df = pd.concat([X, y], axis=1)
df = df.drop(['전체범죄', '매출건수'], axis=1)
df = df.dropna(axis=0, subset=['총_생활인구'])

In [25]:
commercial_type = df[df.columns[df.columns.str.contains('상권타입')]]
commercial_type.columns = ['골목상권','관광특구','발달상권','전통시장']
commercial_type = pd.Series(commercial_type.columns[np.where(commercial_type!=0)[1]])

df = df.drop(df.columns[df.columns.str.contains('상권타입')], axis=1)
df['상권타입'] = pd.Categorical(commercial_type, categories=commercial_type.unique(), ordered=False)
# df.info()

In [26]:
reg = setup(
    data = df, target='총_생활인구', session_id=123, 
    transformation=False, numeric_imputation='mode', categorical_imputation='mode',
    remove_multicollinearity=True, multicollinearity_threshold=0.95, 
    ordinal_features={'살인':[1,2,3,4,5], '강도':[1,2,3,4,5], '강간':[1,2,3,4,5], '절도':[1,2,3,4,5], '폭력':[1,2,3,4,5], '방화':[1,2,3,4,5],
    '마약':[1,2,3,4,5], '약취':[1,2,3,4,5], '도박':[1,2,3,4,5]}, categorical_features=['상권타입']
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,총_생활인구
2,Target type,Regression
3,Data shape,"(1668, 24)"
4,Train data shape,"(1167, 24)"
5,Test data shape,"(501, 24)"
6,Ordinal features,9
7,Numeric features,19
8,Categorical features,1
9,Rows with missing values,11.2%


In [27]:
compare_models(errors='raise')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.382,0.2678,0.5133,0.7064,0.2605,8.3381,0.083
rf,Random Forest Regressor,0.3874,0.2761,0.5208,0.6975,0.2602,2.9242,0.153
et,Extra Trees Regressor,0.3878,0.2813,0.5255,0.6918,0.2581,5.5776,0.108
lightgbm,Light Gradient Boosting Machine,0.3898,0.2814,0.5268,0.6902,0.2607,2.7153,0.029
huber,Huber Regressor,0.3971,0.2942,0.5371,0.6785,0.2663,6.7125,0.018
br,Bayesian Ridge,0.4002,0.2936,0.5375,0.6778,0.2653,9.2682,0.015
ridge,Ridge Regression,0.4005,0.2942,0.5381,0.6771,0.2654,9.2342,0.012
lr,Linear Regression,0.4006,0.2943,0.5382,0.677,0.2655,9.2213,0.648
lar,Least Angle Regression,0.4009,0.2947,0.5386,0.6765,0.2657,9.2234,0.012
ada,AdaBoost Regressor,0.4345,0.3271,0.5684,0.6389,0.2811,8.3119,0.05


In [28]:
gbr = create_model('gbr', fold=5)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3646,0.2402,0.4901,0.7243,0.2485,1.4955
1,0.3785,0.2779,0.5272,0.678,0.2542,2.5531
2,0.3625,0.2165,0.4652,0.7513,0.2405,31.1488
3,0.3956,0.2695,0.5192,0.7053,0.2669,1.6667
4,0.4056,0.3251,0.5702,0.6801,0.2912,2.362
Mean,0.3813,0.2658,0.5144,0.7078,0.2603,7.8452
Std,0.0169,0.0368,0.0355,0.0277,0.0177,11.6586


In [29]:
tuned_gbr = tune_model(gbr)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3469,0.2047,0.4524,0.7192,0.2381,1.5748
1,0.3951,0.2902,0.5387,0.7049,0.266,1.7205
2,0.3809,0.2333,0.483,0.7008,0.2436,2.7609
3,0.4154,0.3619,0.6016,0.6123,0.268,1.8848
4,0.3686,0.2305,0.4801,0.7053,0.2223,20.2698
5,0.3689,0.262,0.5119,0.7243,0.2766,1.4031
6,0.4244,0.2909,0.5394,0.6795,0.2643,1.1822
7,0.4326,0.3408,0.5838,0.6351,0.2807,1.7873
8,0.38,0.2446,0.4945,0.7022,0.2703,1.796
9,0.4579,0.4025,0.6344,0.6637,0.3001,3.1246


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [30]:
plot_model(tuned_gbr, plot = 'parameter')

Unnamed: 0,Parameters
alpha,0.9
ccp_alpha,0.0
criterion,friedman_mse
init,
learning_rate,0.1
loss,squared_error
max_depth,3
max_features,
max_leaf_nodes,
min_impurity_decrease,0.0


In [31]:
plot_model(tuned_gbr, plot = 'error', save=True)

'Prediction Error.png'

In [32]:
# FeatureImportancePlot
plot_model(tuned_gbr, plot='feature_all', save=True)

'Feature Importance (All).png'

In [33]:
# Feature Importance as df
featureimportance = pd.DataFrame([tuned_gbr.feature_names_in_, tuned_gbr.feature_importances_]).T
featureimportance.sort_values(by=1, ascending=False)
featureimportance = featureimportance.reset_index(drop=True)
featureimportance.to_csv('featureimportance.csv', index=False)
featureimportance

Unnamed: 0,0,1
0,Bus,0.024823
1,Subway,0.02002
2,유흥업소,0.021155
3,살인,0.000674
4,강도,0.000883
5,강간,0.000992
6,절도,0.004839
7,폭력,0.006507
8,방화,0.004243
9,마약,0.001761
