In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn. metrics import mean_absolute_error as mae

from catboost import CatBoostRegressor

import optuna
from optuna import trial
from optuna.samplers import TPESampler



In [61]:
# Load: read both data splits from parquet files in the working directory.

train, test = (
    pd.read_parquet(split_path)
    for split_path in ('./train.parquet', './test.parquet')
)

# One shared encoder instance, reused (re-fitted) by the preprocessing cells.
le = LabelEncoder()

In [62]:
# Drop: discard the same set of columns from both splits.

_unused_columns = [
    'id', 'road_name', 'vehicle_restricted', 'height_restricted',
    'start_node_name', 'end_node_name', 'start_longitude', 'end_latitude',
]
train = train.drop(columns=_unused_columns)
test = test.drop(columns=_unused_columns)

print("Drop Done.")

Drop Done.


In [63]:
# Factor: label-encode the categorical road attributes.
#
# The encoder is fitted once per column on the union of the train and test
# values, then used to transform both splits, so a given raw value always maps
# to the same integer code in train and in test. Fitting the encoder separately
# on each split assigns codes by the sorted position of the values present in
# that split only, which silently yields different codes for the same value
# whenever the two splits do not contain exactly the same set of categories.

for column in ('road_rating', 'lane_count', 'road_type'):
    le.fit(pd.concat([train[column], test[column]], ignore_index=True))
    train[column] = le.transform(train[column])
    test[column] = le.transform(test[column])

print("Factor Done.")


Factor Done.


In [64]:
# Custom Preprocess

# Cast the speed-limit and weight-restriction columns to 32-bit integers,
# identically for both splits.
_int32_casts = {'maximum_speed_limit': 'int32', 'weight_restricted': 'int32'}
train = train.astype(_int32_casts)
test = test.astype(_int32_casts)

## Node turn restrictions
def combine_turns(df):
    """Merge the start/end turn-restriction flags of every row into one code.

    Parameters
    ----------
    df : DataFrame (or any mapping of column name -> iterable)
        Must provide 'start_turn_restricted' and 'end_turn_restricted'.

    Returns
    -------
    list of int
        One code per row, in row order:
        0 for ('없음', '없음'), 1 for ('없음', '있음'), 2 for ('있음', '없음'),
        and 3 for every other (start, end) combination.
    """
    pair_codes = {
        ('없음', '없음'): 0,
        ('없음', '있음'): 1,
        ('있음', '없음'): 2,
    }
    flag_pairs = zip(df['start_turn_restricted'], df['end_turn_restricted'])
    # Any pair that is not listed above falls through to code 3.
    return [pair_codes.get(pair, 3) for pair in flag_pairs]

# Collapse the two turn-restriction columns into one encoded feature.
# The combined code is written into the existing 'start_turn_restricted'
# column (renamed afterwards) instead of being appended as a new column, so the
# column order of both frames is preserved.
train['start_turn_restricted'] = combine_turns(train)
test['start_turn_restricted'] = combine_turns(test)

# Fit the encoder once on the codes of both splits so that train and test share
# one mapping; fitting per split would renumber the codes differently whenever
# a combination is missing from only one of the splits.
le.fit(pd.concat([train['start_turn_restricted'], test['start_turn_restricted']],
                 ignore_index=True))
train['start_turn_restricted'] = le.transform(train['start_turn_restricted'])
test['start_turn_restricted'] = le.transform(test['start_turn_restricted'])

# Rename/drop via reassignment (no inplace mutation of the frames).
train = (train.rename(columns={'start_turn_restricted': 'turn_restricted'})
              .drop(columns=['end_turn_restricted']))
test = (test.rename(columns={'start_turn_restricted': 'turn_restricted'})
            .drop(columns=['end_turn_restricted']))

## day_of_week: weekday (Mon-Fri) -> 0, weekend (Sat, Sun) -> 1
day_dict = {'월':0, '화':0, '수':0, '목':0, '금':0,
            '토':1, '일':1}
# Assign the mapped column back to the frame. Calling
# `.replace(..., inplace=True)` on `train['day_of_week']` operates on a column
# object selected from the frame; with pandas Copy-on-Write that object is
# independent of the frame, so the frame itself would stay unchanged.
# NOTE: any label that is not a key of `day_dict` becomes NaN.
train['day_of_week'] = train['day_of_week'].map(day_dict)
test['day_of_week'] = test['day_of_week'].map(day_dict)

print("Custom Preprocess Done.")

Custom Preprocess Done.


In [None]:
# Optuna

def objective(trial):
    """Optuna objective: validation MAE of one sampled CatBoost configuration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters listed in `param`.

    Returns
    -------
    float
        Mean absolute error on a 16 % hold-out split of the global `train`
        frame (lower is better).
    """
    param = {
      "random_state" : 42,
      # Train with the loss that is reported below and that the final model
      # uses, so the tuned parameters are tuned for the right objective.
      "loss_function" : "MAE",
      # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
      'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
      'bagging_temperature' : trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
      "n_estimators":trial.suggest_int("n_estimators", 2000, 7000),
      "max_depth":trial.suggest_int("max_depth", 4, 16),
      'random_strength' :trial.suggest_int('random_strength', 0, 100),
      "colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0),
      "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
      "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
      "max_bin": trial.suggest_int("max_bin", 200, 500),
      'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
    }

    # Select the features by name instead of by position, so the split does not
    # depend on 'target' being the last column.
    features = train.drop(columns=['target'])
    # A fixed random_state gives every trial the same hold-out rows; otherwise
    # differences between trials are partly split noise, not parameter effects.
    train_data, val_data, train_label, val_label = train_test_split(
        features, train['target'], test_size=0.16, random_state=42)
    cat = CatBoostRegressor(**param)
    cat = cat.fit(train_data, train_label,
                  eval_set=[(val_data, val_label)],
                  verbose=0,
                  early_stopping_rounds=35)

    return mae(val_label, cat.predict(val_data))

# `seed` must be passed by keyword: the first positional parameter of
# TPESampler is `consider_prior`, so `TPESampler(42)` does not seed the sampler.
sampler = TPESampler(seed=42)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(objective, n_trials=15)
print("Best Score:", study.best_value)
print("Best trial", study.best_trial.params)

In [None]:
# Model Train

# Select the features by name so the split does not depend on 'target' being
# the last column of `train`.
data = train.drop(columns=['target'])
label = train['target']

# Fixed random_state so the hold-out rows (and therefore the early-stopping
# point) are the same on every run.
train_data, val_data, train_label, val_label = train_test_split(
    data, label, test_size=0.2, random_state=42)

# Hyper-parameter values below are hard-coded; random_state matches the value
# used inside the Optuna objective.
model = CatBoostRegressor(
        random_state=42,
        learning_rate=0.04557838437986454,
        bagging_temperature=8.869692984509859,
        n_estimators=6627,
        max_depth=13,
        random_strength=38,
        colsample_bylevel=0.7379492985439079,
        l2_leaf_reg=1.980433349323205e-05,
        min_child_samples=14,
        max_bin=337,
        od_type='Iter',
        loss_function='MAE'
).fit(X=train_data, y=train_label,
    eval_set=(val_data, val_label), verbose=700,
    # Keyword was misspelled as `early_stoping_rounds`, which fit() rejects.
    early_stopping_rounds=25)

In [19]:
## Inference: write the model's predictions for `test` into the sample
## submission layout and save it as CSV (without the index column).

sub = pd.read_csv('./sample_submission.csv').assign(target=model.predict(test))
sub.to_csv('./result/submit.csv', index=False)