In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error as mae

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, StackingRegressor
from catboost import CatBoostRegressor

from pycaret.regression import *

import optuna
from optuna import trial
from optuna.samplers import TPESampler

In [6]:
# Load

# Read both splits from the parquet files next to the notebook.
train, test = (
    pd.read_parquet('./train.parquet'),
    pd.read_parquet('./test.parquet'),
)

In [7]:
# Drop

# The same set of columns is removed from both splits.
dropped_columns = [
    'id', 'road_name', 'vehicle_restricted', 'height_restricted',
    'start_node_name', 'end_node_name', 'start_longitude', 'end_latitude',
]
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

print("Drop Done.")

Drop Done.


In [8]:
# Custom Preprocess

# Shared encoder instance reused by the later encoding steps.
le = LabelEncoder()

# Cast the speed-limit and weight-restriction columns to int32 in both splits.
for int_col in ('maximum_speed_limit', 'weight_restricted'):
    for frame in (train, test):
        frame[int_col] = frame[int_col].astype('int32')

## node turn restricted
def combine_turns(df):
    """Merge the start/end turn-restriction flags of each row into one code.

    Reads ``df['start_turn_restricted']`` and ``df['end_turn_restricted']``
    pairwise and returns a list with one integer per row:

    * 0 -- start is '없음' and end is '없음'
    * 1 -- start is '없음' and end is '있음'
    * 2 -- start is '있음' and end is '없음'
    * 3 -- every other combination
    """
    pair_to_code = {
        ('없음', '없음'): 0,
        ('없음', '있음'): 1,
        ('있음', '없음'): 2,
    }
    flag_pairs = zip(df['start_turn_restricted'], df['end_turn_restricted'])
    # Any pair missing from the table falls through to code 3.
    return [pair_to_code.get(pair, 3) for pair in flag_pairs]

# Collapse the start/end turn-restriction flags into one combined code per row.
train_turn_codes = combine_turns(train)
test_turn_codes = combine_turns(test)

# Fit the encoder ONCE on the codes present in either split. Re-fitting it on
# test separately would assign different labels to the same code whenever one
# split is missing a combination that the other split contains.
le.fit(train_turn_codes + test_turn_codes)

# Write the encoded values into the existing column (keeps its position in the
# frame), then rename it and drop the now-redundant end flag.
train['start_turn_restricted'] = le.transform(train_turn_codes)
test['start_turn_restricted'] = le.transform(test_turn_codes)

train = (
    train
    .rename(columns={'start_turn_restricted': 'turn_restricted'})
    .drop(columns=['end_turn_restricted'])
)
test = (
    test
    .rename(columns={'start_turn_restricted': 'turn_restricted'})
    .drop(columns=['end_turn_restricted'])
)

## day_of_week: weekday (0) vs. weekend (1)
day_dict = {'월':0, '화':0, '수':0, '목':0, '금':0,
            '토':1, '일':1}

# Assign the mapped column back to the frame. Calling
# `df[col].replace(..., inplace=True)` operates on a selected column object;
# pandas 2.x warns about it and under Copy-on-Write it leaves the frame
# untouched. `map` also yields a numeric column directly.
# NOTE: a label missing from `day_dict` becomes NaN instead of being kept as-is.
train['day_of_week'] = train['day_of_week'].map(day_dict)
test['day_of_week'] = test['day_of_week'].map(day_dict)

print("Custom Preprocess Done.")

Custom Preprocess Done.


In [9]:
# Factor

# Label-encode the categorical road attributes.
# The encoder is fitted once per column on the values of BOTH splits and then
# applied with `transform`, so a given raw value gets the same integer label in
# train and test. Fitting separately on each split (fit_transform twice) can
# produce mismatched labels when the sets of distinct values differ.
for factor_col in ['road_rating', 'lane_count', 'road_type']:
    le.fit(pd.concat([train[factor_col], test[factor_col]], ignore_index=True))
    train[factor_col] = le.transform(train[factor_col])
    test[factor_col] = le.transform(test[factor_col])

print("Factor Done.")

Factor Done.


In [94]:
# Stacking

# Select the features by name rather than by position: `iloc[:, :-1]` would
# silently keep `target` as a feature if it were not the last column.
data = train.drop(columns=['target'])
label = train['target']

# Fixed seed so the hold-out split is reproducible.
train_data, val_data, train_label, val_label = train_test_split(data, label, test_size=0.16, random_state=42)

# Base learners
xgb = XGBRegressor(learning_rate=0.1)
lgb = LGBMRegressor()
# CatBoost hyperparameters -- presumably the result of a tuning run (optuna is
# imported at the top of the notebook); TODO confirm their origin.
cb = CatBoostRegressor(learning_rate=0.04557838437986454,
        bagging_temperature=8.869692984509859,
        n_estimators=6627,
        max_depth=13,
        random_strength=38,
        colsample_bylevel=0.7379492985439079,
        l2_leaf_reg=1.980433349323205e-05,
        min_child_samples=14,
        max_bin=337,
        od_type='Iter',
        one_hot_max_size=24,
        loss_function='MAE')

# Stacking model: the three base learners feed a CatBoost final estimator.
# `passthrough=True` gives the final estimator the original features in
# addition to the base learners' predictions.
estimators = [('xgb',xgb), ('lgb',lgb), ('cb',cb)]
stackingmodel = StackingRegressor(estimators=estimators, final_estimator=cb, n_jobs=-1, passthrough=True, verbose=300)
stackingmodel.fit(train_data, train_label)

# Hold-out MAE. scikit-learn metrics take (y_true, y_pred) in that order.
stack_pred = stackingmodel.predict(val_data)
score = mae(val_label, stack_pred)
print(score)

In [None]:
## Inference

# Fill the sample submission's `target` column with the stacking model's
# predictions for the test split and write it out.
sub = pd.read_csv('./sample_submission.csv').assign(
    target=stackingmodel.predict(test)
)
sub.to_csv('./result/submit26.csv', index=False)

In [None]:
# Model Train

# Select the features by name rather than by position: `iloc[:, :-1]` would
# silently keep `target` as a feature if it were not the last column.
data = train.drop(columns=['target'])
label = train['target']

# Seed the split (same seed as the stacking cell) so the validation set used
# for early stopping -- and therefore the fitted model -- is reproducible.
train_data, val_data, train_label, val_label = train_test_split(data, label, test_size=0.16, random_state=42)

# Single CatBoost model trained on MAE; training stops once the validation
# metric has not improved for 25 consecutive rounds.
model = CatBoostRegressor(
        learning_rate=0.035,
        n_estimators=10000,
        loss_function='MAE'
).fit(X=train_data, y=train_label, 
    eval_set=(val_data, val_label), verbose=1000,
    early_stopping_rounds=25)

In [None]:
## Inference

sub = pd.read_csv('./sample_submission.csv')

# Predict with `model`, the CatBoost regressor trained in the preceding
# "Model Train" cell. The earlier version called `stackingmodel` here, which
# left the freshly trained model unused and duplicated the stacking submission.
sub['target'] = model.predict(test)
sub.to_csv('./result/submit24.csv', index = False)