In [38]:
from os.path import join
import numpy as np
import pandas as pd
from feature.dataset import make_dataset
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from category_encoders import *
from sklearn.model_selection import StratifiedKFold
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
import lightgbm as lgb


In [24]:
train = pd.read_parquet('./jeju_data/train_48_T40.parquet')
test = pd.read_parquet('./jeju_data/test_48_T40.parquet')

In [25]:
X = train.drop(["target"], axis=1)
y = train["target"] 

In [55]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4701217 entries, 0 to 4701216
Data columns (total 48 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   day_of_week            int32  
 1   base_hour              int64  
 2   lane_count             int64  
 3   road_rating            int64  
 4   road_name              int32  
 5   multi_linked           int64  
 6   connect_code           int64  
 7   maximum_speed_limit    float64
 8   weight_restricted      float64
 9   road_type              int64  
 10  start_node_name        int32  
 11  start_latitude         float64
 12  start_longitude        float64
 13  start_turn_restricted  int32  
 14  end_node_name          int32  
 15  end_latitude           float64
 16  end_longitude          float64
 17  end_turn_restricted    int32  
 18  sin_time               float64
 19  cos_time               float64
 20  group_time             int32  
 21  month                  int64  
 22  week              

In [56]:
X.columns

Index(['day_of_week', 'base_hour', 'lane_count', 'road_rating', 'road_name',
       'multi_linked', 'connect_code', 'maximum_speed_limit',
       'weight_restricted', 'road_type', 'start_node_name', 'start_latitude',
       'start_longitude', 'start_turn_restricted', 'end_node_name',
       'end_latitude', 'end_longitude', 'end_turn_restricted', 'sin_time',
       'cos_time', 'group_time', 'month', 'week', 'post_holiday',
       'pre_holiday', 'holiday', 'season', 'vacation', 'distance', 'jeju_dist',
       'seogwi_dist', 'hanra_dist', 'sungsan_dist', 'joongmoon_dist',
       'node_TF', 'diff', 'road_name_set', 'section_speed_time',
       'start_speed_time', 'end_speed_time', 'section_speed', 'start_speed',
       'end_speed', 'end_cartesian', 'tour_count', 'turn_restricted', 'rest',
       'location_cluster'],
      dtype='object')

In [64]:
cat_features=['day_of_week', 'base_hour', 'lane_count', 'road_rating', 'road_name',
       'multi_linked', 'connect_code', 'weight_restricted', 'road_type', 'start_node_name',
       'start_turn_restricted', 'end_node_name', 'end_turn_restricted', 'group_time', 'month',
       'week', 'post_holiday', 'pre_holiday', 'holiday', 'season', 'vacation',
       'node_TF', 'road_name_set', 'end_cartesian', 'turn_restricted', 'rest',
       'location_cluster']

for f in cat_features:
    X[f] = X[f].astype('str').astype('category')
    test[f] = test[f].astype('str').astype('category')

In [66]:
import pandas as pd
sample_submission = pd.read_csv('./jeju_data/sample_submission.csv')

params = {'learning_rate': 0.034851454397775004, 'n_estimators': 3365, 'max_depth': 16, 'random_strength': 1, 'l2_leaf_reg': 2.498331420490028e-05}


from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

folds = []

for train_idx, val_idx in skf.split(X, y):
    folds.append((train_idx, val_idx))

CAT_model= {}

for f in range(5):
      print(f'===================================={f+1}============================================')
      train_idx, val_idx = folds[f]
      
      x_train, x_val, y_train, y_val = X.iloc[train_idx], X.iloc[val_idx], y.iloc[train_idx], y.iloc[val_idx]
      
      CAT = CatBoostRegressor(**params, verbose=100, task_type="GPU")
      CAT.fit(x_train, y_train,
              cat_features=cat_features,
              early_stopping_rounds=100)
      
      y_pred = CAT.predict(x_val)
      mae = mean_absolute_error(y_val, y_pred)
      print(f"{f + 1} Fold MAE = {mae}")
      CAT_model[f] = CAT
      print(f'================================================================================\n\n')
              

for fold in range(5):
    sample_submission['target'] += CAT_model[fold].predict(test)/5    
sample_submission.to_csv("./catcat.csv", index=False)

The least populated class in y has only 1 members, which is less than n_splits=5.


0:	learn: 15.4820461	total: 1.27s	remaining: 1h 11m 15s
100:	learn: 5.0600090	total: 1m 45s	remaining: 56m 45s
200:	learn: 4.6840913	total: 3m 28s	remaining: 54m 35s
300:	learn: 4.5261142	total: 5m 10s	remaining: 52m 37s
400:	learn: 4.4244796	total: 6m 51s	remaining: 50m 44s
500:	learn: 4.3369056	total: 8m 34s	remaining: 48m 59s
600:	learn: 4.2650735	total: 10m 18s	remaining: 47m 24s
700:	learn: 4.1939312	total: 12m 1s	remaining: 45m 43s
800:	learn: 4.1311246	total: 13m 44s	remaining: 43m 59s
900:	learn: 4.0736065	total: 15m 26s	remaining: 42m 13s
1000:	learn: 4.0225795	total: 17m 8s	remaining: 40m 28s
1100:	learn: 3.9712809	total: 18m 52s	remaining: 38m 48s
1200:	learn: 3.9222985	total: 20m 35s	remaining: 37m 5s
1300:	learn: 3.8756449	total: 22m 20s	remaining: 35m 26s
1400:	learn: 3.8326334	total: 24m 3s	remaining: 33m 43s
1500:	learn: 3.7894692	total: 25m 45s	remaining: 31m 58s
1600:	learn: 3.7511213	total: 27m 28s	remaining: 30m 16s
1700:	learn: 3.7144983	total: 29m 11s	remaining: 2

In [68]:
df_imp = pd.DataFrame({'imp':CAT.feature_importances_}, index = CAT.feature_names_)
df_imp = df_imp[df_imp.imp > 0].sort_values('imp').copy()
df_imp

Unnamed: 0,imp
road_type,0.009411
connect_code,0.009731
node_TF,0.029395
end_turn_restricted,0.030714
start_turn_restricted,0.041257
multi_linked,0.043873
diff,0.139198
maximum_speed_limit,0.302735
turn_restricted,0.465392
post_holiday,0.518181
