In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from dataset_cluster_xgb1 import make_dataset
from os.path import join
import pandas as pd

# Directory (relative to the notebook's working directory) that holds the
# parquet feature tables and the sample submission. Kept in one place so all
# paths are built the same way.
DATA_DIR = 'jeju_data'

train_path = join(DATA_DIR, 'train_new.parquet')
test_path = join(DATA_DIR, 'test_new.parquet')

# Submission template; its 'target' column is filled in by the prediction cells.
sample_submission = pd.read_csv(join(DATA_DIR, 'sample_submission.csv'))

# make_dataset is a project helper (dataset_cluster_xgb1); it is unpacked here
# into training features, training target and test features.
x_train, y_train, test = make_dataset(train_path, test_path)

In [None]:
# Work on copies so the objects returned by make_dataset stay untouched.
X, y = x_train.copy(), y_train.copy()

In [None]:
# Preview the first five rows of the feature table (head() defaults to n=5).
X.head()

Unnamed: 0,day_of_week,lane_count,road_rating,multi_linked,connect_code,maximum_speed_limit,weight_restricted,height_restricted,road_type,start_latitude,start_longitude,start_turn_restricted,end_latitude,end_longitude,end_turn_restricted,distance,week,time,sin_time,cos_time,month,post_holiday,pre_holiday,holiday,season,vacation,turn_restricted,location_cluster,rest
0,1,1,106,0,0,60.0,32400.0,0.0,3,33.427747,126.662612,0,33.427749,126.662335,0,0.025711,0.0,2,-0.965926,-0.258819,6,0.0,0.0,0.0,3,0,0,2,0.0
1,1,2,103,0,0,60.0,0.0,0.0,0,33.50073,126.529107,1,33.504811,126.52624,0,0.525891,0.0,3,-0.707107,0.707107,7,0.0,0.0,0.0,3,1,1,1,0.0
2,4,2,103,0,0,80.0,0.0,0.0,0,33.279145,126.368598,0,33.280072,126.362147,0,0.608399,1.0,1,0.965926,-0.258819,10,1.0,1.0,0.0,0,0,0,3,1.0
3,0,2,107,0,0,50.0,0.0,0.0,0,33.246081,126.567204,0,33.245565,126.566228,0,0.107352,0.0,2,-0.258819,-0.965926,3,0.0,0.0,0.0,2,0,0,0,0.0
4,6,2,103,0,0,80.0,0.0,0.0,0,33.462214,126.326551,0,33.462677,126.330152,0,0.337949,0.0,1,0.866025,-0.5,10,0.0,1.0,0.0,0,0,0,3,1.0


In [None]:
# List the feature columns of X (the copy of the training features that the
# models in the following cells are fitted on).
X.columns

Index(['day_of_week', 'lane_count', 'road_rating', 'multi_linked',
       'connect_code', 'maximum_speed_limit', 'weight_restricted',
       'height_restricted', 'road_type', 'start_latitude', 'start_longitude',
       'start_turn_restricted', 'end_latitude', 'end_longitude',
       'end_turn_restricted', 'distance', 'week', 'time', 'sin_time',
       'cos_time', 'month', 'post_holiday', 'pre_holiday', 'holiday', 'season',
       'vacation', 'turn_restricted', 'location_cluster', 'rest'],
      dtype='object')

In [None]:
import pandas as pd

# XGBoost hyper-parameters for the single full-data model.
# NOTE(review): these look like the result of a hyper-parameter search; the
# provenance is not recorded in this notebook — confirm.
params = {
    'n_estimators': 4519,
    'max_depth': 13,
    'min_child_weight': 22,
    'gamma': 3,
    'learning_rate': 0.016,
    'colsample_bytree': 0.7242144738048932,
    'lambda': 0.007923992135983554,
    'alpha': 0.13203244419302776,
    'subsample': 0.8,
}

# Train on the whole training set (GPU histogram tree builder, device 0),
# then predict the test set.
xgb_model = XGBRegressor(**params, tree_method='gpu_hist', gpu_id=0)
xgb_model.fit(X, y)
y_pred = xgb_model.predict(test)

# Load a fresh submission template, fill its 'target' column with the
# predictions and write the submission file.
sample_submission = pd.read_csv('./jeju_data/sample_submission.csv').assign(target=y_pred)
sample_submission.to_csv("./submit_xgb_optunadelout.csv", index=False)

In [None]:
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error


# XGBoost hyper-parameters shared by every fold model.
params = {
    'n_estimators': 4519,
    'max_depth': 13,
    'min_child_weight': 22,
    'gamma': 3,
    'learning_rate': 0.016,
    'colsample_bytree': 0.7242144738048932,
    'lambda': 0.007923992135983554,
    'alpha': 0.13203244419302776,
    'subsample': 0.8,
}

# NOTE(review): StratifiedKFold stratifies on y, so it only accepts a target
# with discrete values; if y ever becomes continuous, switch to KFold — confirm.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Materialise the (train_idx, val_idx) pairs once so the loop below iterates
# over exactly the folds that were generated (no separately hard-coded count).
folds = list(skf.split(X, y))

# Fold index (0-based) -> fitted model; read later for ensembling and saving.
XGB_model = {}

for f, (train_idx, val_idx) in enumerate(folds):
    print(f'===================================={f+1}============================================')

    # Fold-local names on purpose: rebinding x_train / y_train here would
    # replace the full training data loaded at the top of the notebook with a
    # single fold's subset and corrupt any re-run of the cells that copy it.
    x_fold_train, x_fold_val = X.iloc[train_idx], X.iloc[val_idx]
    y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]

    XGB = XGBRegressor(**params, tree_method='gpu_hist', gpu_id=0)
    XGB.fit(x_fold_train, y_fold_train)

    # Validation predictions get their own name so the test-set predictions
    # stored in y_pred by the earlier cell are not overwritten.
    val_pred = XGB.predict(x_fold_val)
    mae = mean_absolute_error(y_fold_val, val_pred)
    print(f"{f + 1} Fold MAE = {mae}")
    XGB_model[f] = XGB
    print(f'================================================================================\n\n')
              



1 Fold MAE = 3.0206840155728276


2 Fold MAE = 3.0195202375346724


3 Fold MAE = 3.016215505847796


4 Fold MAE = 3.0167651937080127


5 Fold MAE = 3.015737466135109




In [None]:
sample_submission = pd.read_csv('./jeju_data/sample_submission.csv')

# Average the test predictions of all fold models.
# The sum is accumulated in a fresh float64 zero Series rather than with `+=`
# on the CSV's own 'target' column, so the result does not depend on the
# template's placeholder values, and the divisor follows the number of models
# actually trained instead of a hard-coded 5.
n_models = len(XGB_model)
ensemble_pred = pd.Series(0.0, index=sample_submission.index)
for fold_model in XGB_model.values():
    ensemble_pred += fold_model.predict(test) / n_models

sample_submission['target'] = ensemble_pred
sample_submission.to_csv("./submit_xgb_fold.csv", index=False)

In [None]:
# Feature importances of XGB, i.e. the model left bound by the last iteration
# of the cross-validation loop, indexed by feature name.
importance_table = pd.DataFrame(
    {'importances': XGB.feature_importances_},
    index=XGB.feature_names_in_,
)

# Keep only features with a positive importance, sorted ascending.
has_importance = importance_table.importances > 0
df_imp = importance_table[has_importance].sort_values('importances').copy()
df_imp

Unnamed: 0,importances
day_of_week,0.000774
post_holiday,0.000799
pre_holiday,0.000986
rest,0.001031
holiday,0.001039
vacation,0.001168
week,0.001405
month,0.00148
multi_linked,0.001504
sin_time,0.003641


In [28]:
import joblib

# Persist the {fold index: fitted model} dict to disk; joblib.dump returns the
# list of written file names, which is shown as the cell output.
joblib.dump(value=XGB_model, filename='xgb_model.pkl')

['xgb_model.pkl']