In [1]:
import pandas as pd
import numpy as np
import openpyxl

data = pd.read_excel('data/분석데이터.xlsx', engine='openpyxl')
data.head()

Unnamed: 0,년/월,시도,산업_대분류,구인인원,구직건수,취업건수,사업체 수,실업급여 수급자 수,각 지역별 인구,GDP(실질),금리,CLI 지수,소비자 심리지수
0,201801,서울,"농업, 임업 및 어업",0,0,0,254,0,9851767,1812.0,1.5,100.373,110.7
1,201801,서울,광업,0,0,0,25,0,9851767,1812.0,1.5,100.373,110.7
2,201801,서울,제조업,33,0,1,32866,0,9851767,1812.0,1.5,100.373,110.7
3,201801,서울,"전기, 가스, 증기 및 공기조절 공급업",1,0,0,96,0,9851767,1812.0,1.5,100.373,110.7
4,201801,서울,"수도, 하수 및 폐기물 처리, 원료 재생업",0,0,0,396,0,9851767,1812.0,1.5,100.373,110.7


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23460 entries, 0 to 23459
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   년/월         23460 non-null  int64  
 1   시도          23460 non-null  object 
 2   산업_대분류      23460 non-null  object 
 3   구인인원        23460 non-null  int64  
 4   구직건수        23460 non-null  int64  
 5   취업건수        23460 non-null  int64  
 6   사업체 수       23460 non-null  int64  
 7   실업급여 수급자 수  23460 non-null  int64  
 8   각 지역별 인구    23460 non-null  int64  
 9   GDP(실질)     23460 non-null  float64
 10  금리          23460 non-null  float64
 11  CLI 지수      23460 non-null  float64
 12  소비자 심리지수    23460 non-null  float64
dtypes: float64(4), int64(7), object(2)
memory usage: 2.3+ MB


In [3]:
data.columns=['date', 'city', 'industry', 'job_offer', 'job_search', 'employment',
              'no_company', 'unemployment', 'population', 'GDP', 'i_rate', 'CLI', 'CFI']

In [4]:
data['year'] = data['date'].astype('str').str[:4].astype('int')
data['month'] = data['date'].astype('str').str[4:].astype('int')
data.drop('date', axis=1, inplace=True)

In [5]:
print(len(data) * 0.8)

18768.0


In [6]:
train_data = data.iloc[:18768, :] # 2018 ~ 2021년 데이터
test_data = data.iloc[18768:, :] # 2022년 데이터
print(len(train_data), len(test_data))

18768 4692


In [7]:
# 독립변수 / 종속변수 분리
train_x = train_data.drop(['job_offer'], axis=1)
train_y = train_data['job_offer']

test_x = test_data.drop(['job_offer'], axis=1)
test_y = test_data['job_offer']

In [8]:
# 데이터 전처리
from sklearn.preprocessing import LabelEncoder

object_features = ['city', 'industry']

for feature in object_features:
    le = LabelEncoder()
    le = le.fit(train_x[feature])
    train_x[feature] = le.transform(train_x[feature])
    
    for label in np.unique(test_x[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test_x[feature]= le.transform(test_x[feature])

In [9]:
train_x.head()

Unnamed: 0,city,industry,job_search,employment,no_company,unemployment,population,GDP,i_rate,CLI,CFI,year,month
0,8,7,0,0,254,0,9851767,1812.0,1.5,100.373,110.7,2018,1
1,8,3,0,0,25,0,9851767,1812.0,1.5,100.373,110.7,2018,1
2,8,20,0,1,32866,0,9851767,1812.0,1.5,100.373,110.7,2018,1
3,8,17,0,0,96,0,9851767,1812.0,1.5,100.373,110.7,2018,1
4,8,13,0,0,396,0,9851767,1812.0,1.5,100.373,110.7,2018,1


In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# MinMaxScaler를 이용한 수치 데이터 스케일링
scale_columns = ['job_search', 'employment', 'no_company', 'unemployment', 'population', 'GDP', 'i_rate', 'CLI', 'CFI']
scaler = MinMaxScaler()
scaler.fit(train_x[scale_columns])
train_x[scale_columns] = scaler.transform(train_x[scale_columns])
test_x[scale_columns] = scaler.transform(test_x[scale_columns])

train_x.head()

Unnamed: 0,city,industry,job_search,employment,no_company,unemployment,population,GDP,i_rate,CLI,CFI,year,month
0,8,7,0.0,0.0,0.001845,0.0,0.720403,0.0,0.8,0.416524,1.0,2018,1
1,8,3,0.0,0.0,0.000182,0.0,0.720403,0.0,0.8,0.416524,1.0,2018,1
2,8,20,0.0,0.001037,0.238704,0.0,0.720403,0.0,0.8,0.416524,1.0,2018,1
3,8,17,0.0,0.0,0.000697,0.0,0.720403,0.0,0.8,0.416524,1.0,2018,1
4,8,13,0.0,0.0,0.002876,0.0,0.720403,0.0,0.8,0.416524,1.0,2018,1


하이퍼파라미터 조절하지 않은 기본 모델들로 학습

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(train_x, train_y)
pred = model_rf.predict(test_x)

score = mean_squared_error(test_y, pred)
print(score)

470.4975163469736


In [13]:
from sklearn.ensemble import ExtraTreesRegressor

model_et = ExtraTreesRegressor(random_state=42)
model_et.fit(train_x, train_y)
pred = model_et.predict(test_x)

score = mean_squared_error(test_y, pred)
print(score)

204.18973418584824


In [14]:
from lightgbm import LGBMRegressor

model_lgb = LGBMRegressor(random_state=42, force_col_wise=True)
model_lgb.fit(train_x, train_y)
pred = model_lgb.predict(test_x)

score = mean_squared_error(test_y, pred)
print(score)

[LightGBM] [Info] Total Bins 1193
[LightGBM] [Info] Number of data points in the train set: 18768, number of used features: 13
[LightGBM] [Info] Start training from score 3.869512
197.1161967119241


In [15]:
from xgboost import XGBRegressor

model_xgb = XGBRegressor(random_state=42)
model_xgb.fit(train_x, train_y)
pred = model_xgb.predict(test_x)

score = mean_squared_error(test_y, pred)
print(score)

760.2326610347827


In [16]:
from catboost import CatBoostRegressor

model_cat = CatBoostRegressor(random_seed=42)
model_cat.fit(train_x, train_y)
pred = model_cat.predict(test_x)

score = mean_squared_error(test_y, pred)
print(score)

Learning rate set to 0.06507
0:	learn: 26.4526326	total: 49.6ms	remaining: 49.5s
1:	learn: 25.2941883	total: 52.3ms	remaining: 26.1s
2:	learn: 24.2379175	total: 54.8ms	remaining: 18.2s
3:	learn: 23.2150875	total: 57.4ms	remaining: 14.3s
4:	learn: 22.2965561	total: 59.9ms	remaining: 11.9s
5:	learn: 21.4517840	total: 62.4ms	remaining: 10.3s
6:	learn: 20.6867138	total: 64.9ms	remaining: 9.21s
7:	learn: 19.8753022	total: 67.5ms	remaining: 8.37s
8:	learn: 19.0731312	total: 70.2ms	remaining: 7.73s
9:	learn: 18.3635850	total: 72.7ms	remaining: 7.2s
10:	learn: 17.6651900	total: 75.1ms	remaining: 6.75s
11:	learn: 17.0807305	total: 77.6ms	remaining: 6.39s
12:	learn: 16.4875027	total: 80ms	remaining: 6.07s
13:	learn: 15.9082843	total: 82.6ms	remaining: 5.82s
14:	learn: 15.3333313	total: 85.1ms	remaining: 5.59s
15:	learn: 14.8942913	total: 87.5ms	remaining: 5.38s
16:	learn: 14.3840496	total: 89.9ms	remaining: 5.2s
17:	learn: 13.9849827	total: 92.4ms	remaining: 5.04s
18:	learn: 13.4716638	total: 94

In [18]:
from pycaret.regression import *
setup_rgs = setup(data=train_data, target='job_offer', test_data=test_data, session_id=42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,job_offer
2,Target type,Regression
3,Original data shape,"(23460, 14)"
4,Transformed data shape,"(23460, 52)"
5,Transformed train set shape,"(18768, 52)"
6,Transformed test set shape,"(4692, 52)"
7,Numeric features,11
8,Categorical features,2
9,Preprocess,True


In [None]:
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [None]:
top5 = compare_models(n_select=5, sort='MSE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,1.5531,60.6267,7.4803,0.9076,0.4511,0.6967,1.921
et,Extra Trees Regressor,1.5358,66.254,7.7606,0.8942,0.4363,0.7517,1.512
xgboost,Extreme Gradient Boosting,1.6024,69.4975,8.0546,0.8892,0.4456,0.737,0.698
rf,Random Forest Regressor,1.578,70.5891,8.0565,0.8841,0.442,0.7482,2.199
gbr,Gradient Boosting Regressor,1.8581,72.6009,8.13,0.89,0.5298,0.8802,0.533
dt,Decision Tree Regressor,1.9215,90.6043,9.0605,0.8687,0.5248,1.0002,0.062
ada,AdaBoost Regressor,4.3162,97.2991,9.5426,0.8542,1.2238,1.4752,0.311
lr,Linear Regression,4.3311,294.5791,16.8243,0.5245,1.034,1.8333,0.961
ridge,Ridge Regression,4.3303,295.8166,16.8619,0.5221,1.0364,1.812,0.043
br,Bayesian Ridge,4.3186,296.5624,16.8813,0.5206,1.0358,1.7982,0.07


In [None]:
top5

[<catboost.core.CatBoostRegressor at 0x1702f9460b0>,
 ExtraTreesRegressor(n_jobs=-1, random_state=42),
 XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=-1, num_parallel_tree=None,
              predictor=None, random_state=42, ...),
 RandomForestRegressor(n_jobs=-1, random_state=42),
 GradientBoostingRegressor(random_state=42)]

In [None]:
tuned_top5 = [tune_model(i) for i in top5]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.5994,29.3707,5.4195,0.8917,0.5688,1.0573
1,1.8563,178.0784,13.3446,0.7618,0.5059,0.7544
2,1.7179,36.2725,6.0227,0.9171,0.5961,0.9073
3,1.7249,30.2817,5.5029,0.9499,0.5773,1.1006
4,1.8245,69.324,8.3261,0.8479,0.5704,0.9601
5,1.6687,19.9514,4.4667,0.9446,0.5921,1.2181
6,2.1939,85.6576,9.2551,0.8755,0.6186,0.933
7,2.4531,131.8737,11.4836,0.8823,0.5951,0.8205
8,2.3082,61.9384,7.8701,0.959,0.6134,0.5706
9,2.3529,99.1746,9.9586,0.9281,0.6403,0.8841


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.0486,80.1866,8.9547,0.7043,0.6303,0.8779
1,2.2489,155.4114,12.4664,0.7921,0.6419,0.7997
2,1.8958,57.2324,7.5652,0.8692,0.6175,0.7038
3,1.9378,41.0085,6.4038,0.9321,0.6135,0.9058
4,2.011,73.0106,8.5446,0.8398,0.6168,0.8235
5,1.9136,52.3436,7.2349,0.8546,0.6069,0.8864
6,2.1365,55.5796,7.4552,0.9192,0.6342,0.7279
7,2.7713,141.1775,11.8818,0.874,0.642,0.6966
8,2.3866,54.4313,7.3778,0.964,0.6121,0.5558
9,2.2492,84.1605,9.1739,0.939,0.5913,0.5973


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.5594,32.3201,5.6851,0.8808,0.5244,0.7876
1,1.7164,153.329,12.3826,0.7949,0.4645,0.5662
2,1.4647,31.8726,5.6456,0.9271,0.5217,0.5539
3,1.5071,35.3614,5.9465,0.9415,0.5011,0.6722
4,1.8927,86.464,9.2986,0.8102,0.5617,0.8517
5,1.4538,21.8373,4.673,0.9393,0.5294,0.6926
6,1.9227,47.4912,6.8914,0.931,0.5648,0.6462
7,2.3812,145.3336,12.0554,0.8702,0.5518,0.6684
8,2.1858,54.466,7.3801,0.9639,0.5847,0.5886
9,2.1783,67.6426,8.2245,0.951,0.5819,0.7768


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.8422,30.3013,5.5047,0.8883,0.6651,0.9256
1,2.1324,211.2141,14.5332,0.7175,0.552,0.6057
2,1.7342,49.196,7.014,0.8875,0.5723,0.6267
3,1.8978,66.805,8.1734,0.8894,0.579,0.7461
4,1.836,68.8996,8.3006,0.8488,0.6069,0.6692
5,1.548,19.1297,4.3737,0.9469,0.5843,0.82
6,2.2798,66.7711,8.1714,0.903,0.668,0.8053
7,2.7502,146.1551,12.0895,0.8695,0.6412,0.7431
8,2.407,78.9815,8.8872,0.9477,0.6134,0.5492
9,2.5117,111.9153,10.579,0.9189,0.6895,0.6619


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.2261,14.6541,3.8281,0.946,0.4443,0.7703
1,1.5234,141.213,11.8833,0.8111,0.4221,0.6907
2,1.3636,37.1182,6.0925,0.9152,0.4637,0.6951
3,1.2827,31.1183,5.5784,0.9485,0.417,0.6869
4,1.6001,72.4124,8.5095,0.8411,0.4737,0.7583
5,1.2914,21.062,4.5893,0.9415,0.4808,0.8082
6,1.6978,48.1089,6.9361,0.9301,0.4915,0.7305
7,2.1369,116.6708,10.8014,0.8958,0.5213,0.6939
8,1.8742,34.3221,5.8585,0.9773,0.5074,0.6077
9,1.7356,47.1667,6.8678,0.9658,0.4915,0.7652


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
tuned_top5

[<catboost.core.CatBoostRegressor at 0x1702f8895d0>,
 ExtraTreesRegressor(n_jobs=-1, random_state=42),
 XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.5, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=8, max_leaves=None,
              min_child_weight=4, missing=nan, monotone_constraints=None,
              n_estimators=130, n_jobs=-1, num_parallel_tree=None,
              predictor=None, random_state=42, ...),
 RandomForestRegressor(n_jobs=-1, random_state=42),
 GradientBoostingRegressor(learning_rate=0.05, max_depth=6, max_features='sqrt',
                  

In [22]:
# 혼합 모델 생성
blender_5 = blend_models(tuned_top5)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.2659,28.5695,5.345,0.8947,0.4229,0.7377
1,1.4562,143.8322,11.993,0.8076,0.384,0.6015
2,1.3378,46.1412,6.7927,0.8945,0.4098,0.6248
3,1.2424,28.5255,5.3409,0.9528,0.4124,0.6223
4,1.5012,73.954,8.5997,0.8377,0.4219,0.6764
5,1.1281,15.2578,3.9061,0.9576,0.4292,0.6781
6,1.6531,46.1013,6.7898,0.933,0.4648,0.7089
7,2.0442,114.11,10.6822,0.8981,0.4801,0.6545
8,1.6613,28.9405,5.3796,0.9808,0.4443,0.5264
9,1.7376,48.9609,6.9972,0.9645,0.4645,0.7202


In [23]:
final_model = finalize_model(blender_5)
prediction = predict_model(final_model, data=test_x)

In [24]:
prediction

Unnamed: 0,city,industry,job_search,employment,no_company,unemployment,population,GDP,i_rate,CLI,CFI,year,month,prediction_label
18768,8,7,0.000000,0.000000,0.001692,0.000000,0.694365,1.469541,0.4,0.517779,0.839572,2022,1,-0.164270
18769,8,3,0.000000,0.000000,0.000167,0.000000,0.694365,1.469541,0.4,0.517779,0.839572,2022,1,-0.164270
18770,8,20,0.000000,0.000000,0.229633,0.034174,0.694365,1.469541,0.4,0.517779,0.839572,2022,1,-0.164270
18771,8,17,0.000000,0.000000,0.000835,0.000000,0.694365,1.469541,0.4,0.517779,0.839572,2022,1,-0.164270
18772,8,13,0.000000,0.000000,0.003472,0.000000,0.694365,1.469541,0.4,0.517779,0.839572,2022,1,-0.164270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23455,14,22,0.000000,0.000000,0.010466,0.000000,0.029736,1.469541,2.2,0.040242,0.451872,2022,12,-0.114073
23456,14,0,0.000000,0.000000,0.000000,0.000000,0.029736,1.469541,2.2,0.040242,0.451872,2022,12,-0.114073
23457,14,5,0.000000,0.000000,0.000007,0.000000,0.029736,1.469541,2.2,0.040242,0.451872,2022,12,-0.114073
23458,14,11,0.000000,0.016598,0.000000,0.000000,0.029736,1.469541,2.2,0.040242,0.451872,2022,12,-0.114073


In [31]:
real_pred = pd.DataFrame({'real':test_y, 'pred':prediction['prediction_label']})
real_pred['pred'] = real_pred['pred'].astype('int')
real_pred

Unnamed: 0,real,pred
18768,0,0
18769,0,0
18770,32,0
18771,0,0
18772,0,0
...,...,...
23455,0,0
23456,0,0
23457,0,0
23458,0,0


In [34]:
real_pred.loc[real_pred['real'] != real_pred['pred']]['pred'].value_counts()

0    1197
Name: pred, dtype: int64

예측을 모두 0으로 해버림

In [35]:
data.head()

Unnamed: 0,city,industry,job_offer,job_search,employment,no_company,unemployment,population,GDP,i_rate,CLI,CFI,year,month
0,서울,"농업, 임업 및 어업",0,0,0,254,0,9851767,1812.0,1.5,100.373,110.7,2018,1
1,서울,광업,0,0,0,25,0,9851767,1812.0,1.5,100.373,110.7,2018,1
2,서울,제조업,33,0,1,32866,0,9851767,1812.0,1.5,100.373,110.7,2018,1
3,서울,"전기, 가스, 증기 및 공기조절 공급업",1,0,0,96,0,9851767,1812.0,1.5,100.373,110.7,2018,1
4,서울,"수도, 하수 및 폐기물 처리, 원료 재생업",0,0,0,396,0,9851767,1812.0,1.5,100.373,110.7,2018,1


In [26]:
data2 = data.copy()
# 구직건수 제외
data2.drop('job_search', axis=1, inplace=True)
# 산업군이 분류불능 및 해당없음 제외
data2 = data2.loc[(data2['industry'] != '분류불능') & (data2['industry'] != '해당없음')]
data2

Unnamed: 0,city,industry,job_offer,employment,no_company,unemployment,population,GDP,i_rate,CLI,CFI,year,month
0,서울,"농업, 임업 및 어업",0,0,254,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
1,서울,광업,0,0,25,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
2,서울,제조업,33,1,32866,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
3,서울,"전기, 가스, 증기 및 공기조절 공급업",1,0,96,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
4,서울,"수도, 하수 및 폐기물 처리, 원료 재생업",0,0,396,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23453,제주,보건업 및 사회복지 서비스업,0,0,2066,0,678159,1968.8,3.25,98.75089,90.2,2022,12
23454,제주,"예술, 스포츠 및 여가 관련 서비스업",0,0,0,0,678159,1968.8,3.25,98.75089,90.2,2022,12
23455,제주,"협회 및 단체, 수리 및 기타 개인 서비스업",0,0,1441,0,678159,1968.8,3.25,98.75089,90.2,2022,12
23456,제주,가구 내 고용활동 및 달리 분류되지 않은 자가소비 생산활동,0,0,0,0,678159,1968.8,3.25,98.75089,90.2,2022,12


In [27]:
data2.groupby('industry')[['job_offer',	'employment']].sum().sort_values('job_offer', ascending=False)

Unnamed: 0_level_0,job_offer,employment
industry,Unnamed: 1_level_1,Unnamed: 2_level_1
정보통신업,59161,2479
제조업,17192,421
도매 및 소매업,7929,240
"전문, 과학 및 기술 서비스업",6491,218
"사업시설 관리, 사업 지원 및 임대 서비스업",2249,71
건설업,2152,38
운수 및 창고업,977,33
"협회 및 단체, 수리 및 기타 개인 서비스업",638,30
보건업 및 사회복지 서비스업,513,23
금융 및 보험업,455,17


In [62]:
data2.groupby('industry')[['job_offer',	'employment']].mean().sort_values('job_offer', ascending=False)

Unnamed: 0_level_0,job_offer,employment
industry,Unnamed: 1_level_1,Unnamed: 2_level_1
정보통신업,58.00098,2.430392
제조업,16.854902,0.412745
도매 및 소매업,7.773529,0.235294
"전문, 과학 및 기술 서비스업",6.363725,0.213725
"사업시설 관리, 사업 지원 및 임대 서비스업",2.204902,0.069608
건설업,2.109804,0.037255
운수 및 창고업,0.957843,0.032353
"협회 및 단체, 수리 및 기타 개인 서비스업",0.62549,0.029412
보건업 및 사회복지 서비스업,0.502941,0.022549
금융 및 보험업,0.446078,0.016667


In [28]:
# 산업군 예술, 스포츠 및 여가 관련 서비스업, 교육서비스업 제외
data2 = data2.loc[(data2['industry'] != '예술, 스포츠 및 여가 관련 서비스업') & (data2['industry'] != '교육서비스업')]
data2.reset_index(drop=True, inplace=True)
data2

Unnamed: 0,city,industry,job_offer,employment,no_company,unemployment,population,GDP,i_rate,CLI,CFI,year,month
0,서울,"농업, 임업 및 어업",0,0,254,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
1,서울,광업,0,0,25,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
2,서울,제조업,33,1,32866,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
3,서울,"전기, 가스, 증기 및 공기조절 공급업",1,0,96,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
4,서울,"수도, 하수 및 폐기물 처리, 원료 재생업",0,0,396,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19375,제주,"공공행정, 국방 및 사회보장 행정",0,0,126,0,678159,1968.8,3.25,98.75089,90.2,2022,12
19376,제주,보건업 및 사회복지 서비스업,0,0,2066,0,678159,1968.8,3.25,98.75089,90.2,2022,12
19377,제주,"협회 및 단체, 수리 및 기타 개인 서비스업",0,0,1441,0,678159,1968.8,3.25,98.75089,90.2,2022,12
19378,제주,가구 내 고용활동 및 달리 분류되지 않은 자가소비 생산활동,0,0,0,0,678159,1968.8,3.25,98.75089,90.2,2022,12


In [29]:
print(len(data2) * 0.8)

15504.0


In [33]:
train_data = data2.iloc[:15504, :] # 2018 ~ 2021년 데이터
test_data = data2.iloc[15504:, :] # 2022년 데이터
print(len(train_data), len(test_data))

15504 3876


In [34]:
from pycaret.regression import *
setup_rgs = setup(data=train_data, target='job_offer', test_data=test_data, session_id=42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,job_offer
2,Target type,Regression
3,Original data shape,"(19380, 13)"
4,Transformed data shape,"(19380, 47)"
5,Transformed train set shape,"(15504, 47)"
6,Transformed test set shape,"(3876, 47)"
7,Numeric features,10
8,Categorical features,2
9,Preprocess,True


In [35]:
top5 = compare_models(n_select=5, sort='MSE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,1.857,79.1712,8.322,0.9055,0.4849,0.7055,2.229
et,Extra Trees Regressor,1.8955,87.2684,8.9628,0.8798,0.4832,0.7654,2.148
rf,Random Forest Regressor,1.9236,95.4587,9.2017,0.8776,0.4805,0.7545,3.252
gbr,Gradient Boosting Regressor,2.168,103.131,9.6066,0.8637,0.5568,0.8586,0.84
xgboost,Extreme Gradient Boosting,2.0736,114.2279,10.1389,0.8656,0.496,0.7743,0.183
ada,AdaBoost Regressor,5.4489,152.2973,11.6737,0.8,1.2659,1.7247,0.262
dt,Decision Tree Regressor,2.636,240.8655,14.7628,0.6931,0.5849,1.0501,0.106
lr,Linear Regression,4.3859,267.605,15.8851,0.6798,0.9833,1.7106,0.284
ridge,Ridge Regression,4.3992,268.8611,15.927,0.6778,0.9862,1.72,0.06
br,Bayesian Ridge,4.368,269.4303,15.9435,0.6771,0.978,1.6992,0.073


In [37]:
tuned_top5 = [tune_model(i) for i in top5]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.275,112.8784,10.6244,0.6539,0.5717,1.0837
1,2.4023,238.6217,15.4474,0.7353,0.5718,0.8585
2,1.9465,55.7964,7.4697,0.8941,0.5784,0.8292
3,2.0395,41.3953,6.4339,0.9431,0.5914,1.0696
4,2.1333,88.685,9.4173,0.8386,0.5965,0.9649
5,2.0342,52.1553,7.2219,0.8798,0.5829,1.0122
6,2.3997,96.328,9.8147,0.8838,0.6057,0.8424
7,2.981,193.0319,13.8936,0.857,0.6332,0.828
8,3.2675,151.7503,12.3187,0.9166,0.7065,0.6758
9,2.9306,187.1929,13.6818,0.8875,0.6698,0.8272


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.4451,52.526,7.2475,0.8389,0.7693,0.7959
1,2.9115,243.1858,15.5944,0.7302,0.7757,0.7176
2,2.351,49.7135,7.0508,0.9057,0.7614,0.625
3,2.4725,44.9638,6.7055,0.9382,0.7484,0.7945
4,2.6204,94.5414,9.7232,0.828,0.7751,0.7014
5,2.3685,59.2461,7.6971,0.8635,0.7605,0.7697
6,2.8214,87.5301,9.3558,0.8944,0.7867,0.6522
7,3.5358,173.5311,13.1731,0.8715,0.7782,0.6019
8,3.2168,73.4734,8.5717,0.9596,0.7616,0.5111
9,2.9522,106.6082,10.3251,0.9359,0.7369,0.5119


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.965,21.0352,4.5864,0.9355,0.6747,0.8685
1,2.4562,252.4892,15.8899,0.7199,0.6133,0.6023
2,1.9481,60.1179,7.7536,0.8859,0.6014,0.6081
3,2.0607,50.043,7.0741,0.9312,0.6025,0.7838
4,2.2064,94.9784,9.7457,0.8272,0.6176,0.7021
5,1.9069,30.2359,5.4987,0.9303,0.655,0.8009
6,2.4876,63.2031,7.95,0.9238,0.6934,0.7857
7,3.0499,121.0772,11.0035,0.9103,0.6829,0.7431
8,2.8467,88.487,9.4068,0.9514,0.6495,0.5519
9,2.7575,133.9914,11.5755,0.9194,0.6696,0.7468


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.6443,31.8226,5.6412,0.9024,0.4897,0.833
1,1.8886,169.7645,13.0294,0.8117,0.4646,0.6659
2,1.5635,47.4694,6.8898,0.9099,0.4794,0.6833
3,1.5573,42.4461,6.5151,0.9417,0.4631,0.7399
4,1.9383,92.584,9.6221,0.8315,0.4956,0.7835
5,1.6592,33.0358,5.7477,0.9239,0.5377,0.7987
6,2.1683,69.9406,8.3631,0.9157,0.5499,0.8199
7,2.63,138.9933,11.7895,0.897,0.5652,0.7671
8,2.2344,52.2551,7.2288,0.9713,0.542,0.5837
9,2.1811,66.4794,8.1535,0.96,0.5293,0.7887


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.6497,30.7705,5.5471,0.9056,0.5008,0.7907
1,1.8582,185.1872,13.6084,0.7945,0.4621,0.5672
2,1.8037,55.3289,7.4383,0.895,0.5075,0.71
3,1.6919,45.1007,6.7157,0.938,0.4771,0.6311
4,1.8874,101.1211,10.0559,0.816,0.4662,0.6572
5,1.756,41.0142,6.4042,0.9055,0.5757,0.7293
6,2.216,73.3906,8.5668,0.9115,0.5611,0.8154
7,2.6465,191.112,13.8243,0.8584,0.54,0.7115
8,2.2203,40.3864,6.355,0.9778,0.5324,0.6367
9,2.5123,95.9535,9.7956,0.9423,0.5878,0.8779


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [45]:
tuned_top5

[<catboost.core.CatBoostRegressor at 0x7f5666d7d3a0>,
 ExtraTreesRegressor(n_jobs=-1, random_state=42),
 RandomForestRegressor(bootstrap=False, max_depth=11, max_features='log2',
                       min_impurity_decrease=0.0001, min_samples_leaf=3,
                       min_samples_split=7, n_estimators=30, n_jobs=-1,
                       random_state=42),
 GradientBoostingRegressor(learning_rate=0.05, max_depth=6, max_features='sqrt',
                           min_impurity_decrease=0.3, min_samples_leaf=4,
                           min_samples_split=10, n_estimators=270,
                           random_state=42, subsample=0.7),
 XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
      

In [38]:
# 혼합 모델 생성
blender_5 = blend_models(tuned_top5)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.5364,26.3562,5.1338,0.9192,0.4917,0.7461
1,1.8611,197.5029,14.0536,0.7809,0.4537,0.5922
2,1.5581,45.6113,6.7536,0.9135,0.4661,0.6317
3,1.5841,39.7741,6.3067,0.9453,0.4582,0.6396
4,1.8431,91.1614,9.5478,0.8341,0.4669,0.6587
5,1.4539,25.7762,5.077,0.9406,0.4887,0.6785
6,2.0651,60.2603,7.7628,0.9273,0.5354,0.7514
7,2.429,127.3199,11.2836,0.9057,0.5303,0.6692
8,1.9286,32.4273,5.6945,0.9822,0.4908,0.5275
9,2.1951,71.0887,8.4314,0.9573,0.5193,0.7226


In [39]:
final_model = finalize_model(blender_5)
prediction = predict_model(final_model, data=test_x)

In [40]:
real_pred = pd.DataFrame({'real':test_y, 'pred':prediction['prediction_label']})
real_pred['pred'] = real_pred['pred'].astype('int')
real_pred

Unnamed: 0,real,pred
18768,0,0
18769,0,0
18770,32,0
18771,0,0
18772,0,0
...,...,...
23455,0,0
23456,0,0
23457,0,0
23458,0,0


In [41]:
real_pred.loc[real_pred['real'] != real_pred['pred']]['pred'].value_counts()

0    1197
5      24
Name: pred, dtype: int64

여전히 0으로 예측이 다수

In [50]:
data3 = data.copy()
# 구직건수 제외
data3.drop('job_search', axis=1, inplace=True)
# 산업군이 분류불능 및 해당없음 제외
data3 = data3.loc[(data3['industry'] != '분류불능') & (data3['industry'] != '해당없음')]
data3

Unnamed: 0,city,industry,job_offer,employment,no_company,unemployment,population,GDP,i_rate,CLI,CFI,year,month
0,서울,"농업, 임업 및 어업",0,0,254,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
1,서울,광업,0,0,25,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
2,서울,제조업,33,1,32866,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
3,서울,"전기, 가스, 증기 및 공기조절 공급업",1,0,96,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
4,서울,"수도, 하수 및 폐기물 처리, 원료 재생업",0,0,396,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23453,제주,보건업 및 사회복지 서비스업,0,0,2066,0,678159,1968.8,3.25,98.75089,90.2,2022,12
23454,제주,"예술, 스포츠 및 여가 관련 서비스업",0,0,0,0,678159,1968.8,3.25,98.75089,90.2,2022,12
23455,제주,"협회 및 단체, 수리 및 기타 개인 서비스업",0,0,1441,0,678159,1968.8,3.25,98.75089,90.2,2022,12
23456,제주,가구 내 고용활동 및 달리 분류되지 않은 자가소비 생산활동,0,0,0,0,678159,1968.8,3.25,98.75089,90.2,2022,12


In [51]:
# 상위 15개 업종 선정
top_industry_15 = list(data3.groupby('industry')[['job_offer',	'employment']].sum().sort_values('job_offer', ascending=False)[:15].index)
data3 = data3.loc[data3['industry'].isin(top_industry_15)]
data3

Unnamed: 0,city,industry,job_offer,employment,no_company,unemployment,population,GDP,i_rate,CLI,CFI,year,month
0,서울,"농업, 임업 및 어업",0,0,254,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
2,서울,제조업,33,1,32866,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
3,서울,"전기, 가스, 증기 및 공기조절 공급업",1,0,96,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
5,서울,건설업,6,0,53694,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
6,서울,도매 및 소매업,38,0,125584,0,9851767,1812.0,1.50,100.37300,110.7,2018,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23449,제주,"전문, 과학 및 기술 서비스업",1,0,1561,3,678159,1968.8,3.25,98.75089,90.2,2022,12
23450,제주,"사업시설 관리, 사업 지원 및 임대 서비스업",0,0,1448,8,678159,1968.8,3.25,98.75089,90.2,2022,12
23451,제주,"공공행정, 국방 및 사회보장 행정",0,0,126,0,678159,1968.8,3.25,98.75089,90.2,2022,12
23453,제주,보건업 및 사회복지 서비스업,0,0,2066,0,678159,1968.8,3.25,98.75089,90.2,2022,12


In [53]:
print(len(data3) * 0.8)

12240.0


In [54]:
train_data = data3.iloc[:12240, :] # 2018 ~ 2021년 데이터
test_data = data3.iloc[12240:, :] # 2022년 데이터
print(len(train_data), len(test_data))

12240 3060


In [58]:
from pycaret.regression import *
setup_rgs = setup(data=train_data, target='job_offer', test_data=test_data, session_id=42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,job_offer
2,Target type,Regression
3,Original data shape,"(15300, 13)"
4,Transformed data shape,"(15300, 43)"
5,Transformed train set shape,"(12240, 43)"
6,Transformed test set shape,"(3060, 43)"
7,Numeric features,10
8,Categorical features,2
9,Preprocess,True


In [59]:
top5 = compare_models(n_select=5, sort='MSE', turbo=False)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,2.3427,101.3636,9.3527,0.9035,0.5438,0.7065,2.553
et,Extra Trees Regressor,2.3927,112.2153,10.1594,0.8772,0.5391,0.7566,1.861
rf,Random Forest Regressor,2.4495,120.4723,10.3901,0.878,0.5425,0.7509,2.937
xgboost,Extreme Gradient Boosting,2.5289,126.2616,10.375,0.8887,0.554,0.7731,0.218
gbr,Gradient Boosting Regressor,2.7114,128.8353,10.7511,0.8698,0.6208,0.854,0.84
ada,AdaBoost Regressor,6.4461,172.4615,12.8517,0.8053,1.452,1.9701,0.268
lr,Linear Regression,5.3299,334.9839,17.7769,0.6806,1.0402,1.8248,0.35
ridge,Ridge Regression,5.3489,337.1831,17.8421,0.6779,1.0419,1.8409,0.072
br,Bayesian Ridge,5.3087,337.9506,17.8611,0.6772,1.0325,1.8167,0.077
ard,Automatic Relevance Determination,5.1212,346.0936,18.0827,0.6692,0.9563,1.7272,0.107


In [60]:
tuned_top5 = [tune_model(i) for i in top5]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.5523,69.5638,8.3405,0.83,0.6651,1.1089
1,2.9149,296.4248,17.217,0.739,0.6222,0.8737
2,2.4863,80.331,8.9628,0.8789,0.6642,0.9266
3,2.4371,44.8055,6.6937,0.9511,0.6295,1.0741
4,2.6991,126.5076,11.2476,0.8171,0.6467,0.959
5,2.6507,77.6952,8.8145,0.8576,0.6709,1.1507
6,3.1594,114.2623,10.6894,0.8903,0.6857,0.943
7,3.6256,223.7057,14.9568,0.8682,0.685,0.7958
8,4.1498,217.4597,14.7465,0.9049,0.7941,0.6957
9,3.7853,286.7241,16.9329,0.8631,0.7696,0.8744


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.4643,30.989,5.5668,0.9243,0.7757,0.9035
1,3.0958,285.007,16.8821,0.7491,0.7182,0.7396
2,2.3415,53.946,7.3448,0.9187,0.7004,0.6423
3,2.646,78.0575,8.835,0.9148,0.7068,0.8212
4,2.8124,116.61,10.7986,0.8314,0.7219,0.7615
5,2.7693,98.5477,9.9271,0.8194,0.765,0.8894
6,3.1493,97.1038,9.8541,0.9068,0.7836,0.7735
7,3.7311,201.1633,14.1832,0.8815,0.7304,0.695
8,3.8024,167.3781,12.9375,0.9268,0.7207,0.5205
9,3.4031,186.9286,13.6722,0.9108,0.7327,0.6666


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.4885,44.1043,6.6411,0.8922,0.7086,0.8658
1,2.9758,299.2607,17.2992,0.7365,0.6705,0.5825
2,2.3915,81.0128,9.0007,0.8779,0.6641,0.6373
3,2.4282,60.7852,7.7965,0.9336,0.6522,0.7358
4,2.7115,112.251,10.5949,0.8377,0.7134,0.7353
5,2.2344,39.6583,6.2975,0.9273,0.6898,0.7778
6,2.949,85.1011,9.225,0.9183,0.7105,0.7216
7,3.5657,161.3763,12.7034,0.9049,0.7041,0.7978
8,3.3738,130.6903,11.432,0.9429,0.684,0.5458
9,3.4581,170.221,13.0469,0.9187,0.7556,0.8571


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.1839,39.1402,6.2562,0.9044,0.5763,0.8841
1,2.2428,205.4034,14.3319,0.8192,0.5206,0.6034
2,2.3757,105.2541,10.2593,0.8413,0.5627,0.7084
3,2.2517,65.9821,8.1229,0.928,0.5615,0.7092
4,2.5538,150.4328,12.2651,0.7825,0.5412,0.6879
5,1.9618,23.7785,4.8763,0.9564,0.6296,0.7643
6,2.9333,121.8373,11.038,0.8831,0.6131,0.7556
7,3.4363,241.5474,15.5418,0.8577,0.6223,0.7082
8,2.904,58.9149,7.6756,0.9742,0.5961,0.689
9,3.2673,93.7213,9.681,0.9553,0.6411,0.8705


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.0863,46.3782,6.8102,0.8867,0.5488,0.9154
1,2.3287,246.5885,15.7031,0.7829,0.5273,0.6558
2,2.0131,70.1797,8.3773,0.8942,0.5577,0.7617
3,1.9187,47.8379,6.9165,0.9478,0.4996,0.7028
4,2.341,124.1192,11.1409,0.8205,0.5293,0.7314
5,1.8875,29.1686,5.4008,0.9465,0.5778,0.8057
6,2.7328,82.9841,9.1096,0.9204,0.6145,0.829
7,3.1072,164.222,12.8149,0.9032,0.616,0.6979
8,2.5444,46.4586,6.8161,0.9797,0.589,0.5892
9,2.9637,100.7939,10.0396,0.9519,0.6629,0.8463


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [61]:
tuned_top5

[<catboost.core.CatBoostRegressor at 0x7f566d000cd0>,
 ExtraTreesRegressor(max_depth=11, max_features='log2',
                     min_impurity_decrease=0.0001, min_samples_leaf=3,
                     min_samples_split=7, n_estimators=30, n_jobs=-1,
                     random_state=42),
 RandomForestRegressor(bootstrap=False, max_depth=11, max_features='log2',
                       min_impurity_decrease=0.0001, min_samples_leaf=3,
                       min_samples_split=7, n_estimators=30, n_jobs=-1,
                       random_state=42),
 XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.3, max_bin=None,
              max_cat_thr

In [63]:
# 혼합 모델 생성
blender_5 = blend_models(tuned_top5)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.991,29.4077,5.4229,0.9281,0.5861,0.792
1,2.4758,258.9698,16.0925,0.772,0.5605,0.5982
2,2.0562,53.4635,7.3119,0.9194,0.5714,0.6366
3,2.0364,48.7949,6.9853,0.9467,0.5526,0.666
4,2.4371,117.7923,10.8532,0.8297,0.5706,0.6649
5,1.88,33.6933,5.8046,0.9382,0.5705,0.7055
6,2.679,77.301,8.7921,0.9258,0.6274,0.7075
7,3.1932,178.1402,13.3469,0.895,0.6247,0.6503
8,2.8101,69.6919,8.3482,0.9695,0.5961,0.5303
9,2.9578,105.4929,10.271,0.9496,0.6464,0.7535


In [64]:
final_model = finalize_model(blender_5)
prediction = predict_model(final_model, data=test_x)

In [65]:
real_pred = pd.DataFrame({'real':test_y, 'pred':prediction['prediction_label']})
real_pred['pred'] = real_pred['pred'].astype('int')
real_pred

Unnamed: 0,real,pred
18768,0,0
18769,0,0
18770,32,0
18771,0,0
18772,0,0
...,...,...
23455,0,0
23456,0,0
23457,0,0
23458,0,0


In [None]:
real_pred.loc[real_pred['real'] != real_pred['pred']]['pred'].value_counts()

In [14]:
data4 = data.copy()

# 상위 5개 업종 선정
top_industry_5 = list(data4.groupby('industry')[['job_offer', 'employment']].sum().sort_values('job_offer', ascending=False)[:5].index)
data4 = data4.loc[data4['industry'].isin(top_industry_5)]
data4

print(len(data4) * 0.8)

4080.0


In [15]:
# 종속변수: job_offer
train_data = data4.iloc[:4080, :] # 2018 ~ 2021년 데이터
test_data = data4.iloc[4080:, :] # 2022년 데이터
print(len(train_data), len(test_data))

4080 1020


In [17]:
# 데이터 전처리
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

object_features = ['city', 'industry']

for feature in object_features:
    le = LabelEncoder()
    le = le.fit(train_data[feature])
    train_data[feature] = le.transform(train_data[feature])
    
    for label in np.unique(test_data[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test_data[feature] = le.transform(test_data[feature])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[feature] = le.transform(train_data[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[feature] = le.transform(test_data[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[feature] = le.transform(train_data[feature])
A value is trying to be set on a copy o

In [18]:
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler를 이용한 수치 데이터 스케일링
scale_columns = ['job_search', 'employment', 'no_company', 'unemployment', 'population', 'GDP', 'i_rate', 'CLI', 'CFI']
scaler = MinMaxScaler()
scaler.fit(train_data[scale_columns])
train_data[scale_columns] = scaler.transform(train_data[scale_columns])
test_data[scale_columns] = scaler.transform(test_data[scale_columns])

train_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data[scale_columns] = scaler.transform(train_data[scale_columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data[scale_columns] = scaler.transform(test_data[scale_columns])


Unnamed: 0,city,industry,job_offer,job_search,employment,no_company,unemployment,population,GDP,i_rate,CLI,CFI,year,month
2,8,4,33,0.0,0.032258,0.238046,0.0,0.720403,0.0,0.8,0.416524,1.0,2018,1
6,8,0,38,0.0,0.0,0.912035,0.0,0.720403,0.0,0.8,0.416524,1.0,2018,1
9,8,3,303,0.0,0.290323,0.195215,0.0,0.720403,0.0,0.8,0.416524,1.0,2018,1
12,8,2,50,0.0,0.064516,0.298497,0.0,0.720403,0.0,0.8,0.416524,1.0,2018,1
13,8,1,14,0.0,0.032258,0.13041,0.0,0.720403,0.0,0.8,0.416524,1.0,2018,1


In [19]:
from pycaret.regression import *
setup_rgs = setup(data=train_data, target='job_offer', test_data=test_data, session_id=42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,job_offer
2,Target type,Regression
3,Original data shape,"(5100, 14)"
4,Transformed data shape,"(5100, 14)"
5,Transformed train set shape,"(4080, 14)"
6,Transformed test set shape,"(1020, 14)"
7,Numeric features,13
8,Preprocess,True
9,Imputation type,simple


In [20]:
top5 = compare_models(n_select=5, sort='MSE', turbo=False)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,5.7097,344.9948,17.3304,0.8792,0.6687,0.7076,1.757
et,Extra Trees Regressor,6.0714,384.3943,18.5349,0.8486,0.6734,0.766,0.424
lightgbm,Light Gradient Boosting Machine,6.2046,393.4678,18.6539,0.8628,0.6697,0.6866,34.397
gbr,Gradient Boosting Regressor,6.5456,402.4654,18.8625,0.858,0.7604,0.835,0.229
rf,Random Forest Regressor,6.1059,409.865,19.1518,0.8513,0.6728,0.7614,0.745
xgboost,Extreme Gradient Boosting,6.5113,475.6428,20.565,0.8393,0.6929,0.7682,0.091
ada,AdaBoost Regressor,14.5779,541.1663,22.8013,0.7856,1.6989,3.8782,0.124
knn,K Neighbors Regressor,8.6021,929.5486,28.2488,0.7388,0.799,0.8918,0.02
ard,Automatic Relevance Determination,11.0824,1016.0041,30.9339,0.6556,1.0365,1.4862,0.026
br,Bayesian Ridge,11.342,1016.2315,30.9506,0.6547,1.0732,1.59,0.014


In [21]:
tuned_top5 = [tune_model(i) for i in top5]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,8.9637,791.8455,28.1397,0.3015,0.8533,1.7033
1,7.463,1115.7433,33.4027,0.6565,0.7661,0.9215
2,6.1631,278.3855,16.6849,0.8519,0.7479,0.818
3,5.6647,158.3163,12.5824,0.9389,0.7352,0.8771
4,6.825,395.6526,19.891,0.7985,0.8169,0.9426
5,5.3268,137.8156,11.7395,0.9105,0.7609,1.0123
6,6.8628,352.7487,18.7816,0.8786,0.7643,0.849
7,7.8257,595.3851,24.4005,0.8754,0.7519,0.7445
8,9.2371,756.4542,27.5037,0.8826,0.7367,0.5833
9,8.4756,671.8443,25.92,0.8877,0.8173,0.9466


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,4.7411,73.6527,8.5821,0.935,0.7812,0.84
1,7.3782,1139.9407,33.763,0.649,0.8175,0.8439
2,5.5338,213.8995,14.6253,0.8862,0.8201,0.9152
3,5.4347,152.5239,12.3501,0.9411,0.7767,0.8511
4,6.5277,318.15,17.8368,0.838,0.9183,1.019
5,5.7243,161.3143,12.701,0.8952,0.8162,1.0096
6,6.4834,216.7356,14.7219,0.9254,0.8347,0.9829
7,8.5052,602.2053,24.5399,0.8739,0.8884,0.9069
8,7.0916,207.8884,14.4183,0.9677,0.8159,0.7561
9,8.2178,484.9802,22.0223,0.9189,0.8783,1.0412


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,6.0204,264.6071,16.2667,0.7666,0.7273,1.0203
1,7.119,1067.0556,32.6658,0.6714,0.754,0.8746
2,6.0196,262.5045,16.202,0.8603,0.7935,0.8964
3,5.2201,138.9825,11.7891,0.9463,0.7593,0.8882
4,6.71,342.8199,18.5154,0.8254,0.8122,0.8825
5,5.5766,180.9833,13.453,0.8824,0.7774,0.9212
6,7.0874,305.7445,17.4856,0.8948,0.8767,0.991
7,8.922,701.3408,26.4828,0.8532,0.8247,0.8157
8,7.3495,162.5169,12.7482,0.9748,0.8549,0.8568
9,8.6971,538.7693,23.2114,0.9099,0.7932,0.9302


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5.6482,172.6228,13.1386,0.8477,0.7239,1.0347
1,5.9444,845.9134,29.0846,0.7395,0.6971,0.664
2,5.2696,284.2257,16.859,0.8488,0.6894,0.7531
3,4.5084,160.9867,12.6881,0.9378,0.612,0.6715
4,6.0479,362.1359,19.0299,0.8156,0.6832,0.6995
5,3.6751,56.8494,7.5399,0.9631,0.5801,0.6913
6,5.9224,199.0011,14.1068,0.9315,0.7403,0.7469
7,7.1738,529.1358,23.003,0.8892,0.7124,0.7143
8,6.7899,229.8383,15.1604,0.9643,0.6523,0.5284
9,7.7152,357.1029,18.8972,0.9403,0.7839,0.8824


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5.5776,195.4086,13.9789,0.8276,0.7632,1.0966
1,6.1913,780.3683,27.9351,0.7597,0.6841,0.6687
2,5.2184,296.1016,17.2076,0.8424,0.6683,0.6652
3,4.601,132.7981,11.5238,0.9487,0.6259,0.6674
4,6.2164,385.6314,19.6375,0.8036,0.7446,0.7338
5,4.5129,94.4083,9.7164,0.9387,0.7034,0.821
6,5.9434,173.6626,13.1781,0.9402,0.7533,0.7913
7,7.5509,469.993,21.6793,0.9016,0.7142,0.6748
8,7.0712,240.077,15.4944,0.9627,0.6763,0.5521
9,7.8448,444.4601,21.0822,0.9257,0.7337,0.795


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [25]:
tuned_top5

[<catboost.core.CatBoostRegressor at 0x7fed9b48a400>,
 ExtraTreesRegressor(max_depth=11, max_features='log2',
                     min_impurity_decrease=0.0001, min_samples_leaf=3,
                     min_samples_split=7, n_estimators=30, n_jobs=-1,
                     random_state=42),
 LGBMRegressor(n_jobs=-1, random_state=42),
 GradientBoostingRegressor(learning_rate=0.05, max_depth=6, max_features='sqrt',
                           min_impurity_decrease=0.3, min_samples_leaf=4,
                           min_samples_split=10, n_estimators=270,
                           random_state=42, subsample=0.7),
 RandomForestRegressor(bootstrap=False, max_depth=11, max_features='log2',
                       min_impurity_decrease=0.0001, min_samples_leaf=3,
                       min_samples_split=7, n_estimators=30, n_jobs=-1,
                       random_state=42)]

In [22]:
# 혼합 모델 생성
blender_5 = blend_models(tuned_top5)

final_model = finalize_model(blender_5)
prediction = predict_model(final_model, data=test_x)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,4.8627,119.8643,10.9483,0.8943,0.6916,0.8865
1,6.1697,941.8384,30.6894,0.71,0.6828,0.6538
2,5.1306,234.3223,15.3076,0.8753,0.6906,0.7172
3,4.3922,117.9327,10.8597,0.9545,0.6217,0.6492
4,6.1247,359.0758,18.9493,0.8171,0.72,0.7247
5,3.8516,68.0651,8.2502,0.9558,0.6094,0.6903
6,5.9891,202.341,14.2247,0.9304,0.7408,0.7669
7,7.2325,494.8789,22.2459,0.8964,0.746,0.7023
8,6.0632,136.7771,11.6952,0.9788,0.6934,0.5587
9,7.443,401.9809,20.0495,0.9328,0.743,0.7835


In [23]:
real_pred = pd.DataFrame({'real':test_y, 'pred':prediction['prediction_label']})
real_pred['pred'] = real_pred['pred'].astype('int')
real_pred

Unnamed: 0,real,pred
18768,0,29
18769,0,44
18770,32,32
18771,0,29
18772,0,29
...,...,...
23455,0,3
23456,0,1
23457,0,3
23458,0,3


In [24]:
real_pred.loc[real_pred['real'] != real_pred['pred']]['pred'].value_counts()

7      484
8      455
10     351
6      323
9      322
      ... 
168      1
133      1
130      1
124      1
92       1
Name: pred, Length: 156, dtype: int64