In [1]:
#데이터 분석 4종 세트
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 모델들, 성능 평가
# (저는 일반적으로 정형데이터로 머신러닝 분석할 때는 이 2개 모델은 그냥 돌려봅니다. 특히 RF가 테스트하기 좋습니다.)
from lightgbm.sklearn import LGBMRegressor
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor

# 상관관계 분석, VIF : 다중공산성 제거
from statsmodels.stats.outliers_influence import variance_inflation_factor

# KFold(CV), partial : optuna를 사용하기 위함
from sklearn.model_selection import KFold
from functools import partial

# hyper-parameter tuning을 위한 라이브러리, optuna
import optuna

In [2]:
train = pd.read_csv("../../data/train.csv")
test = pd.read_csv("../../data/test.csv")

In [3]:
train

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,...,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도,사망자수,중상자수,경상자수,부상자수,ECLO
0,ACCIDENT_00000,2019-01-01 00,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,길가장자리구역통행중,안전운전불이행,...,상해없음,보행자,여,70세,중상,0,1,0,0,5
1,ACCIDENT_00001,2019-01-01 00,화요일,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,보도통행중,기타,...,상해없음,보행자,남,61세,경상,0,0,1,0,3
2,ACCIDENT_00002,2019-01-01 01,화요일,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,차도통행중,안전운전불이행,...,상해없음,보행자,남,38세,경상,0,0,1,0,3
3,ACCIDENT_00003,2019-01-01 02,화요일,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,남,36세,중상,0,1,0,0,5
4,ACCIDENT_00004,2019-01-01 04,화요일,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,남,52세,경상,0,0,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,ACCIDENT_39604,2021-12-31 19,금요일,맑음,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,측면충돌,신호위반,...,상해없음,이륜,남,28세,경상,0,0,1,0,3
39605,ACCIDENT_39605,2021-12-31 19,금요일,맑음,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,측면충돌,안전거리미확보,...,상해없음,승용,남,52세,경상,0,0,1,0,3
39606,ACCIDENT_39606,2021-12-31 21,금요일,맑음,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,측면충돌,교차로운행방법위반,...,중상,승용,남,73세,중상,0,2,0,0,10
39607,ACCIDENT_39607,2021-12-31 22,금요일,맑음,대구광역시 달서구 장동,기타 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,여,57세,경상,0,0,1,0,3


In [4]:
test

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형
0,ACCIDENT_39609,2022-01-01 01,토요일,맑음,대구광역시 수성구 상동,교차로 - 교차로안,건조,차대사람
1,ACCIDENT_39610,2022-01-01 01,토요일,맑음,대구광역시 수성구 지산동,단일로 - 기타,건조,차대사람
2,ACCIDENT_39611,2022-01-01 04,토요일,맑음,대구광역시 수성구 수성동2가,교차로 - 교차로안,건조,차대차
3,ACCIDENT_39612,2022-01-01 04,토요일,맑음,대구광역시 수성구 신매동,단일로 - 기타,건조,차대차
4,ACCIDENT_39613,2022-01-01 06,토요일,맑음,대구광역시 달서구 감삼동,교차로 - 교차로안,건조,차대차
...,...,...,...,...,...,...,...,...
10958,ACCIDENT_50567,2022-12-31 18,토요일,맑음,대구광역시 남구 대명동,단일로 - 터널,건조,차대차
10959,ACCIDENT_50568,2022-12-31 18,토요일,맑음,대구광역시 수성구 시지동,단일로 - 기타,건조,차대차
10960,ACCIDENT_50569,2022-12-31 20,토요일,맑음,대구광역시 수성구 연호동,단일로 - 기타,건조,차대차
10961,ACCIDENT_50570,2022-12-31 20,토요일,맑음,대구광역시 수성구 범물동,교차로 - 교차로부근,건조,차대차


## Data Wrangiling

In [5]:
def wrangiling(data_set):
    print("==================")
    print("       Shape      ")
    print("==================")

    display(data_set.shape)

    print("==================")
    print("        Info      ")
    print("==================")

    display(data_set.info())

    print("==================")
    print("      Columns     ")
    print("==================")

    display(data_set.columns)

    print("==================")
    print("        NaN       ")
    print("==================")

    display(data_set.isna().sum())

    print("==================")
    print("    Duplicated    ")
    print("==================")

    display(data_set[data_set.duplicated()])

    print("==================")
    print("    Description   ")
    print("==================")

    display(data_set.describe())

    print("==================")
    print("      Unique      ")
    print("==================")

    display(data_set.nunique())

In [6]:
wrangiling(train)

       Shape      


(39609, 23)

        Info      
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39609 entries, 0 to 39608
Data columns (total 23 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           39609 non-null  object
 1   사고일시         39609 non-null  object
 2   요일           39609 non-null  object
 3   기상상태         39609 non-null  object
 4   시군구          39609 non-null  object
 5   도로형태         39609 non-null  object
 6   노면상태         39609 non-null  object
 7   사고유형         39609 non-null  object
 8   사고유형 - 세부분류  39609 non-null  object
 9   법규위반         39609 non-null  object
 10  가해운전자 차종     39609 non-null  object
 11  가해운전자 성별     39609 non-null  object
 12  가해운전자 연령     39609 non-null  object
 13  가해운전자 상해정도   39609 non-null  object
 14  피해운전자 차종     38618 non-null  object
 15  피해운전자 성별     38618 non-null  object
 16  피해운전자 연령     38618 non-null  object
 17  피해운전자 상해정도   38618 non-null  object
 18  사망자수         39609 non-null  int64 
 19  중상자수  

None

      Columns     


Index(['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형',
       '사고유형 - 세부분류', '법규위반', '가해운전자 차종', '가해운전자 성별', '가해운전자 연령', '가해운전자 상해정도',
       '피해운전자 차종', '피해운전자 성별', '피해운전자 연령', '피해운전자 상해정도', '사망자수', '중상자수',
       '경상자수', '부상자수', 'ECLO'],
      dtype='object')

        NaN       


ID               0
사고일시             0
요일               0
기상상태             0
시군구              0
도로형태             0
노면상태             0
사고유형             0
사고유형 - 세부분류      0
법규위반             0
가해운전자 차종         0
가해운전자 성별         0
가해운전자 연령         0
가해운전자 상해정도       0
피해운전자 차종       991
피해운전자 성별       991
피해운전자 연령       991
피해운전자 상해정도     991
사망자수             0
중상자수             0
경상자수             0
부상자수             0
ECLO             0
dtype: int64

    Duplicated    


Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,...,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도,사망자수,중상자수,경상자수,부상자수,ECLO


    Description   


Unnamed: 0,사망자수,중상자수,경상자수,부상자수,ECLO
count,39609.0,39609.0,39609.0,39609.0,39609.0
mean,0.007776,0.262365,1.070085,0.126865,4.726704
std,0.090109,0.500845,0.992034,0.39467,3.207206
min,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,3.0
50%,0.0,0.0,1.0,0.0,3.0
75%,0.0,0.0,1.0,0.0,6.0
max,2.0,6.0,22.0,10.0,74.0


      Unique      


ID             39609
사고일시           18057
요일                 7
기상상태               6
시군구              199
도로형태              11
노면상태               6
사고유형               3
사고유형 - 세부분류       14
법규위반              11
가해운전자 차종          12
가해운전자 성별           3
가해운전자 연령          89
가해운전자 상해정도         6
피해운전자 차종          13
피해운전자 성별           3
피해운전자 연령          97
피해운전자 상해정도         6
사망자수               3
중상자수               7
경상자수              18
부상자수               9
ECLO              46
dtype: int64

In [7]:
wrangiling(test)

       Shape      


(10963, 8)

        Info      
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10963 entries, 0 to 10962
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      10963 non-null  object
 1   사고일시    10963 non-null  object
 2   요일      10963 non-null  object
 3   기상상태    10963 non-null  object
 4   시군구     10963 non-null  object
 5   도로형태    10963 non-null  object
 6   노면상태    10963 non-null  object
 7   사고유형    10963 non-null  object
dtypes: object(8)
memory usage: 685.3+ KB


None

      Columns     


Index(['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형'], dtype='object')

        NaN       


ID      0
사고일시    0
요일      0
기상상태    0
시군구     0
도로형태    0
노면상태    0
사고유형    0
dtype: int64

    Duplicated    


Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형


    Description   


Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형
count,10963,10963,10963,10963,10963,10963,10963,10963
unique,10963,5548,7,5,192,11,6,3
top,ACCIDENT_39609,2022-11-10 18,금요일,맑음,대구광역시 남구 대명동,단일로 - 기타,건조,차대차
freq,1,10,1743,10321,422,5039,10394,8559


      Unique      


ID      10963
사고일시     5548
요일          7
기상상태        5
시군구       192
도로형태       11
노면상태        6
사고유형        3
dtype: int64

## Data Preprocessing

In [8]:
train = train.drop(["ID", "사고유형 - 세부분류", "법규위반", "가해운전자 차종", "가해운전자 성별", "가해운전자 연령",
                    "가해운전자 상해정도", "피해운전자 차종", "피해운전자 성별", "피해운전자 연령", "피해운전자 상해정도",
                    "사망자수", "중상자수", "부상자수", "ECLO"], axis=1)

In [9]:
train["date_time"] = pd.to_datetime(train['사고일시'])
train['Month'] = train.date_time.dt.month
train['Day'] = train.date_time.dt.day
train['Hour'] = train.date_time.dt.hour
train = train.drop(["사고일시"], axis = 1)
train = train.drop(["date_time"], axis = 1)

In [10]:
train["Week"]=np.where(train["요일"].isin(["월요일", "화요일", "수요일", "목요일", "금요일"]), 0, 1)

In [11]:
train["Week"].unique()

array([0, 1])

In [12]:
train = train.drop(["요일"], axis = 1)
train

Unnamed: 0,기상상태,시군구,도로형태,노면상태,사고유형,경상자수,Month,Day,Hour,Week
0,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,0,1,1,0,0
1,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,1,1,1,0,0
2,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,1,1,1,1,0
3,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,0,1,1,2,0
4,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,1,1,1,4,0
...,...,...,...,...,...,...,...,...,...,...
39604,맑음,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,1,12,31,19,0
39605,맑음,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,1,12,31,19,0
39606,맑음,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,0,12,31,21,0
39607,맑음,대구광역시 달서구 장동,기타 - 기타,건조,차대차,1,12,31,22,0


In [13]:
train["Weather"] = np.where(train["기상상태"].isin(["맑음"]), 0, 1)
train = train.drop(["기상상태"], axis=1)
train

Unnamed: 0,시군구,도로형태,노면상태,사고유형,경상자수,Month,Day,Hour,Week,Weather
0,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,0,1,1,0,0,0
1,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,1,1,1,0,0,1
2,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,1,1,1,1,0,0
3,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,0,1,1,2,0,0
4,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,1,1,1,4,0,0
...,...,...,...,...,...,...,...,...,...,...
39604,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,1,12,31,19,0,0
39605,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,1,12,31,19,0,0
39606,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,0,12,31,21,0,0
39607,대구광역시 달서구 장동,기타 - 기타,건조,차대차,1,12,31,22,0,0


In [14]:
train[["Road_Shape1", "Road_Shape2"]] = train["도로형태"].str.split("-", expand=True)
train = train.drop(["도로형태"], axis=1)
train

Unnamed: 0,시군구,노면상태,사고유형,경상자수,Month,Day,Hour,Week,Weather,Road_Shape1,Road_Shape2
0,대구광역시 중구 대신동,건조,차대사람,0,1,1,0,0,0,단일로,기타
1,대구광역시 달서구 감삼동,건조,차대사람,1,1,1,0,0,1,단일로,기타
2,대구광역시 수성구 두산동,건조,차대사람,1,1,1,1,0,0,단일로,기타
3,대구광역시 북구 복현동,건조,차대차,0,1,1,2,0,0,단일로,기타
4,대구광역시 동구 신암동,건조,차대차,1,1,1,4,0,0,단일로,기타
...,...,...,...,...,...,...,...,...,...,...,...
39604,대구광역시 수성구 수성동3가,건조,차대차,1,12,31,19,0,0,교차로,교차로안
39605,대구광역시 달서구 상인동,건조,차대차,1,12,31,19,0,0,단일로,기타
39606,대구광역시 달서구 월성동,건조,차대차,0,12,31,21,0,0,교차로,교차로안
39607,대구광역시 달서구 장동,건조,차대차,1,12,31,22,0,0,기타,기타


In [15]:
train["Road_stat"]=np.where(train["노면상태"].isin(["건조"]), 0, 1)
train = train.drop(["노면상태"], axis=1)
train

Unnamed: 0,시군구,사고유형,경상자수,Month,Day,Hour,Week,Weather,Road_Shape1,Road_Shape2,Road_stat
0,대구광역시 중구 대신동,차대사람,0,1,1,0,0,0,단일로,기타,0
1,대구광역시 달서구 감삼동,차대사람,1,1,1,0,0,1,단일로,기타,0
2,대구광역시 수성구 두산동,차대사람,1,1,1,1,0,0,단일로,기타,0
3,대구광역시 북구 복현동,차대차,0,1,1,2,0,0,단일로,기타,0
4,대구광역시 동구 신암동,차대차,1,1,1,4,0,0,단일로,기타,0
...,...,...,...,...,...,...,...,...,...,...,...
39604,대구광역시 수성구 수성동3가,차대차,1,12,31,19,0,0,교차로,교차로안,0
39605,대구광역시 달서구 상인동,차대차,1,12,31,19,0,0,단일로,기타,0
39606,대구광역시 달서구 월성동,차대차,0,12,31,21,0,0,교차로,교차로안,0
39607,대구광역시 달서구 장동,차대차,1,12,31,22,0,0,기타,기타,0


In [16]:
train["사고유형"].value_counts()

사고유형
차대차     31785
차대사람     6833
차량단독      991
Name: count, dtype: int64

In [17]:
onehot = pd.get_dummies(train[['사고유형']], drop_first=False)
train = pd.concat([train, onehot], axis = 1).drop(['사고유형'], axis = 1)
train

Unnamed: 0,시군구,경상자수,Month,Day,Hour,Week,Weather,Road_Shape1,Road_Shape2,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독
0,대구광역시 중구 대신동,0,1,1,0,0,0,단일로,기타,0,True,False,False
1,대구광역시 달서구 감삼동,1,1,1,0,0,1,단일로,기타,0,True,False,False
2,대구광역시 수성구 두산동,1,1,1,1,0,0,단일로,기타,0,True,False,False
3,대구광역시 북구 복현동,0,1,1,2,0,0,단일로,기타,0,False,True,False
4,대구광역시 동구 신암동,1,1,1,4,0,0,단일로,기타,0,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,대구광역시 수성구 수성동3가,1,12,31,19,0,0,교차로,교차로안,0,False,True,False
39605,대구광역시 달서구 상인동,1,12,31,19,0,0,단일로,기타,0,False,True,False
39606,대구광역시 달서구 월성동,0,12,31,21,0,0,교차로,교차로안,0,False,True,False
39607,대구광역시 달서구 장동,1,12,31,22,0,0,기타,기타,0,False,True,False


In [18]:
train[["시", "구", "동"]] = train["시군구"].str.split(" ", expand=True)
train = train.drop(["시군구"], axis=1)
train

Unnamed: 0,경상자수,Month,Day,Hour,Week,Weather,Road_Shape1,Road_Shape2,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독,시,구,동
0,0,1,1,0,0,0,단일로,기타,0,True,False,False,대구광역시,중구,대신동
1,1,1,1,0,0,1,단일로,기타,0,True,False,False,대구광역시,달서구,감삼동
2,1,1,1,1,0,0,단일로,기타,0,True,False,False,대구광역시,수성구,두산동
3,0,1,1,2,0,0,단일로,기타,0,False,True,False,대구광역시,북구,복현동
4,1,1,1,4,0,0,단일로,기타,0,False,True,False,대구광역시,동구,신암동
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,1,12,31,19,0,0,교차로,교차로안,0,False,True,False,대구광역시,수성구,수성동3가
39605,1,12,31,19,0,0,단일로,기타,0,False,True,False,대구광역시,달서구,상인동
39606,0,12,31,21,0,0,교차로,교차로안,0,False,True,False,대구광역시,달서구,월성동
39607,1,12,31,22,0,0,기타,기타,0,False,True,False,대구광역시,달서구,장동


In [19]:
train = train.drop(["시"], axis=1)
train

Unnamed: 0,경상자수,Month,Day,Hour,Week,Weather,Road_Shape1,Road_Shape2,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독,구,동
0,0,1,1,0,0,0,단일로,기타,0,True,False,False,중구,대신동
1,1,1,1,0,0,1,단일로,기타,0,True,False,False,달서구,감삼동
2,1,1,1,1,0,0,단일로,기타,0,True,False,False,수성구,두산동
3,0,1,1,2,0,0,단일로,기타,0,False,True,False,북구,복현동
4,1,1,1,4,0,0,단일로,기타,0,False,True,False,동구,신암동
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,1,12,31,19,0,0,교차로,교차로안,0,False,True,False,수성구,수성동3가
39605,1,12,31,19,0,0,단일로,기타,0,False,True,False,달서구,상인동
39606,0,12,31,21,0,0,교차로,교차로안,0,False,True,False,달서구,월성동
39607,1,12,31,22,0,0,기타,기타,0,False,True,False,달서구,장동


In [20]:
onehot = pd.get_dummies(train[['Road_Shape1']], drop_first=False)
train = pd.concat([train, onehot], axis = 1).drop(['Road_Shape1'], axis = 1)
train

Unnamed: 0,경상자수,Month,Day,Hour,Week,Weather,Road_Shape2,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독,구,동,Road_Shape1_교차로,Road_Shape1_기타,Road_Shape1_단일로,Road_Shape1_미분류,Road_Shape1_주차장
0,0,1,1,0,0,0,기타,0,True,False,False,중구,대신동,False,False,True,False,False
1,1,1,1,0,0,1,기타,0,True,False,False,달서구,감삼동,False,False,True,False,False
2,1,1,1,1,0,0,기타,0,True,False,False,수성구,두산동,False,False,True,False,False
3,0,1,1,2,0,0,기타,0,False,True,False,북구,복현동,False,False,True,False,False
4,1,1,1,4,0,0,기타,0,False,True,False,동구,신암동,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,1,12,31,19,0,0,교차로안,0,False,True,False,수성구,수성동3가,True,False,False,False,False
39605,1,12,31,19,0,0,기타,0,False,True,False,달서구,상인동,False,False,True,False,False
39606,0,12,31,21,0,0,교차로안,0,False,True,False,달서구,월성동,True,False,False,False,False
39607,1,12,31,22,0,0,기타,0,False,True,False,달서구,장동,False,True,False,False,False


In [21]:
onehot = pd.get_dummies(train[['Road_Shape2']], drop_first=False)
train = pd.concat([train, onehot], axis = 1).drop(['Road_Shape2'], axis = 1)
train

Unnamed: 0,경상자수,Month,Day,Hour,Week,Weather,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독,...,Road_Shape2_ 고가도로위,Road_Shape2_ 교량,Road_Shape2_ 교차로부근,Road_Shape2_ 교차로안,Road_Shape2_ 교차로횡단보도내,Road_Shape2_ 기타,Road_Shape2_ 미분류,Road_Shape2_ 주차장,Road_Shape2_ 지하차도(도로)내,Road_Shape2_ 터널
0,0,1,1,0,0,0,0,True,False,False,...,False,False,False,False,False,True,False,False,False,False
1,1,1,1,0,0,1,0,True,False,False,...,False,False,False,False,False,True,False,False,False,False
2,1,1,1,1,0,0,0,True,False,False,...,False,False,False,False,False,True,False,False,False,False
3,0,1,1,2,0,0,0,False,True,False,...,False,False,False,False,False,True,False,False,False,False
4,1,1,1,4,0,0,0,False,True,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,1,12,31,19,0,0,0,False,True,False,...,False,False,False,True,False,False,False,False,False,False
39605,1,12,31,19,0,0,0,False,True,False,...,False,False,False,False,False,True,False,False,False,False
39606,0,12,31,21,0,0,0,False,True,False,...,False,False,False,True,False,False,False,False,False,False
39607,1,12,31,22,0,0,0,False,True,False,...,False,False,False,False,False,True,False,False,False,False


In [22]:
onehot = pd.get_dummies(train[['구']], drop_first=False)
train = pd.concat([train, onehot], axis = 1).drop(['구'], axis = 1)
train

Unnamed: 0,경상자수,Month,Day,Hour,Week,Weather,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독,...,Road_Shape2_ 지하차도(도로)내,Road_Shape2_ 터널,구_남구,구_달서구,구_달성군,구_동구,구_북구,구_서구,구_수성구,구_중구
0,0,1,1,0,0,0,0,True,False,False,...,False,False,False,False,False,False,False,False,False,True
1,1,1,1,0,0,1,0,True,False,False,...,False,False,False,True,False,False,False,False,False,False
2,1,1,1,1,0,0,0,True,False,False,...,False,False,False,False,False,False,False,False,True,False
3,0,1,1,2,0,0,0,False,True,False,...,False,False,False,False,False,False,True,False,False,False
4,1,1,1,4,0,0,0,False,True,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,1,12,31,19,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,True,False
39605,1,12,31,19,0,0,0,False,True,False,...,False,False,False,True,False,False,False,False,False,False
39606,0,12,31,21,0,0,0,False,True,False,...,False,False,False,True,False,False,False,False,False,False
39607,1,12,31,22,0,0,0,False,True,False,...,False,False,False,True,False,False,False,False,False,False


In [23]:
onehot = pd.get_dummies(train[['동']], drop_first=False)
train = pd.concat([train, onehot], axis = 1).drop(['동'], axis = 1)
train

Unnamed: 0,경상자수,Month,Day,Hour,Week,Weather,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독,...,동_하서동,동_학정동,동_향촌동,동_현풍읍,동_호림동,동_호산동,동_화원읍,동_화전동,동_황금동,동_효목동
0,0,1,1,0,0,0,0,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,1,1,0,0,1,0,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1,1,1,1,0,0,0,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,1,1,2,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,1,1,1,4,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,1,12,31,19,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False
39605,1,12,31,19,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False
39606,0,12,31,21,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False
39607,1,12,31,22,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False


## Modeling

In [24]:
## 3. hold-out train test split
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
x_train, x_valid, y_train, y_valid = train_test_split(train[train.columns.difference(['경상자수'])],
                                                      train['경상자수'],
                                                      test_size = 0.3,
                                                      random_state = 42
                                                    )

In [25]:
# Define a function to calculate RMSLE
from sklearn.metrics import make_scorer
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))

# # Create a scorer for RMSLE
# rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [26]:
def optimizer_xgb(trial):
    # 조절할 hyper-parameter 조합을 적어줍니다.  # bayesian optimization
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = 5 # trial.suggest_int('max_depth', 5, 10)
    colsample_bynode = trial.suggest_float('colsample_bynode', 0.5, 0.8)
    reg_lambda = trial.suggest_float('reg_lambda', 10, 20)
    learning_rate = trial.suggest_float('learning_rate', 0.1, 0.3)
    tree_method = "exact"

    model = XGBRegressor(n_estimators=n_estimators,
                         max_depth=max_depth,
                         colsample_bynode = colsample_bynode,
                         reg_lambda = reg_lambda,
                         learning_rate = learning_rate,
                         tree_method = "exact",
                         random_state=42)

    model.fit(x_train, y_train)
    preds = model.predict(x_valid)
    RMSLE = rmsle(y_true=y_valid, y_pred=preds)

    return RMSLE

In [153]:
study_xgb = optuna.create_study(direction="minimize") # 최소/최대 어느 방향의 최적값을 구할 건지.
study_xgb.optimize(optimizer_xgb, n_trials=100)

[I 2023-12-06 05:50:59,552] A new study created in memory with name: no-name-c34bc499-a5df-4f04-a483-be0a4d98e72c
[I 2023-12-06 05:51:20,324] Trial 0 finished with value: 0.4329637632935846 and parameters: {'n_estimators': 81, 'colsample_bynode': 0.6842552608526418, 'reg_lambda': 14.49061662845417, 'learning_rate': 0.2972349186068183}. Best is trial 0 with value: 0.4329637632935846.
[I 2023-12-06 05:51:55,052] Trial 1 finished with value: 0.43245108807031873 and parameters: {'n_estimators': 145, 'colsample_bynode': 0.7494196626365461, 'reg_lambda': 10.34906834878745, 'learning_rate': 0.1677687509933547}. Best is trial 1 with value: 0.43245108807031873.
[I 2023-12-06 05:52:32,037] Trial 2 finished with value: 0.43234742456342046 and parameters: {'n_estimators': 144, 'colsample_bynode': 0.6608273012009788, 'reg_lambda': 10.029156480034949, 'learning_rate': 0.1524519300346741}. Best is trial 2 with value: 0.43234742456342046.
[I 2023-12-06 05:53:10,493] Trial 3 finished with value: 0.4323

In [154]:
# optuna가 시도했던 모든 실험 관련 데이터
study_xgb.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bynode,params_learning_rate,params_n_estimators,params_reg_lambda,state
0,0,0.432964,2023-12-06 05:50:59.555236,2023-12-06 05:51:20.324445,0 days 00:00:20.769209,0.684255,0.297235,81,14.490617,COMPLETE
1,1,0.432451,2023-12-06 05:51:20.329275,2023-12-06 05:51:55.052353,0 days 00:00:34.723078,0.749420,0.167769,145,10.349068,COMPLETE
2,2,0.432347,2023-12-06 05:51:55.058504,2023-12-06 05:52:32.037182,0 days 00:00:36.978678,0.660827,0.152452,144,10.029156,COMPLETE
3,3,0.432377,2023-12-06 05:52:32.042233,2023-12-06 05:53:10.493187,0 days 00:00:38.450954,0.734475,0.238399,167,14.907755,COMPLETE
4,4,0.432061,2023-12-06 05:53:10.495554,2023-12-06 05:53:23.907484,0 days 00:00:13.411930,0.627988,0.153468,57,18.703168,COMPLETE
...,...,...,...,...,...,...,...,...,...,...
95,95,0.432166,2023-12-06 06:21:45.258174,2023-12-06 06:22:08.818934,0 days 00:00:23.560760,0.507451,0.113231,148,12.757138,COMPLETE
96,96,0.432079,2023-12-06 06:22:08.826468,2023-12-06 06:22:30.015143,0 days 00:00:21.188675,0.514389,0.135387,118,11.303856,COMPLETE
97,97,0.432119,2023-12-06 06:22:30.019520,2023-12-06 06:22:54.024500,0 days 00:00:24.004980,0.527853,0.105031,123,11.694332,COMPLETE
98,98,0.432042,2023-12-06 06:22:54.030453,2023-12-06 06:23:08.968062,0 days 00:00:14.937609,0.521305,0.122531,78,10.818257,COMPLETE


In [155]:
print("Best Score: %.4f" % study_xgb.best_value) # best score 출력
print("Best params: ", study_xgb.best_trial.params) # best score일 때의 하이퍼파라미터들

Best Score: 0.4318
Best params:  {'n_estimators': 72, 'colsample_bynode': 0.5007395732540035, 'reg_lambda': 11.724858323815726, 'learning_rate': 0.1308869026452198}


In [156]:
# 실험 기록 시각화
optuna.visualization.plot_optimization_history(study_xgb)

In [157]:
# hyper-parameter들의 중요도
optuna.visualization.plot_param_importances(study_xgb)

In [168]:
# model finalization
print("Validation ACC")
best_params_xgb = study_xgb.best_params
best_model_xgb = XGBRegressor(**best_params_xgb,
                          random_state=42)
best_model_xgb.fit(x_train, y_train)
print("Validation Score : %.3f" % rmsle(y_valid, best_model_xgb.predict(x_valid)))

Validation ACC
Validation Score : 0.432


## LGBM

In [27]:
def optimizer_lgbm(trial):
    # 조절할 hyper-parameter 조합을 적어줍니다.  # bayesian optimization
    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    max_depth = trial.suggest_int('max_depth', 10, 25)
    num_leaves = trial.suggest_categorical('num_leaves', [63, 127, 255, 511, 1023, 2047])
    colsample_bytree = trial.suggest_float('colsample_bynode', 0.5, 0.8)
    reg_lambda = trial.suggest_float('reg_lambda', 0.5, 50)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1)

    model = LGBMRegressor(n_estimators=n_estimators,
                         max_depth=max_depth,
                         num_leaves = num_leaves,
                         colsample_bytree = colsample_bytree,
                         reg_lambda = reg_lambda,
                         learning_rate = learning_rate,
                         tree_method = "exact",
                         random_state=42)

    model.fit(x_train, y_train)
    preds = model.predict(x_valid)
    RMSLE = rmsle(y_true=y_valid, y_pred=preds)

    return RMSLE

In [160]:
study_lgbm = optuna.create_study(direction="minimize") # 최소/최대 어느 방향의 최적값을 구할 건지.
study_lgbm.optimize(optimizer_xgb, n_trials=100)

[I 2023-12-06 06:24:30,244] A new study created in memory with name: no-name-3d502004-96e8-4ae8-8f14-ee0875503b52
[I 2023-12-06 06:24:44,887] Trial 0 finished with value: 0.4322765871091752 and parameters: {'n_estimators': 75, 'colsample_bynode': 0.5275952157139778, 'reg_lambda': 19.22335751553654, 'learning_rate': 0.2121307358479619}. Best is trial 0 with value: 0.4322765871091752.
[I 2023-12-06 06:25:19,311] Trial 1 finished with value: 0.4323507611083985 and parameters: {'n_estimators': 149, 'colsample_bynode': 0.7074862754177531, 'reg_lambda': 19.157923335548837, 'learning_rate': 0.16925580866420306}. Best is trial 0 with value: 0.4322765871091752.
[I 2023-12-06 06:25:36,361] Trial 2 finished with value: 0.43199085228939965 and parameters: {'n_estimators': 74, 'colsample_bynode': 0.6806031064908146, 'reg_lambda': 15.297135574145642, 'learning_rate': 0.15079346421173898}. Best is trial 2 with value: 0.43199085228939965.
[I 2023-12-06 06:26:05,867] Trial 3 finished with value: 0.4327

In [161]:
# optuna가 시도했던 모든 실험 관련 데이터
study_lgbm.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bynode,params_learning_rate,params_n_estimators,params_reg_lambda,state
0,0,0.432277,2023-12-06 06:24:30.246583,2023-12-06 06:24:44.886803,0 days 00:00:14.640220,0.527595,0.212131,75,19.223358,COMPLETE
1,1,0.432351,2023-12-06 06:24:44.894299,2023-12-06 06:25:19.311118,0 days 00:00:34.416819,0.707486,0.169256,149,19.157923,COMPLETE
2,2,0.431991,2023-12-06 06:25:19.315904,2023-12-06 06:25:36.360918,0 days 00:00:17.045014,0.680603,0.150793,74,15.297136,COMPLETE
3,3,0.432718,2023-12-06 06:25:36.363435,2023-12-06 06:26:05.867473,0 days 00:00:29.504038,0.536733,0.267381,155,15.833630,COMPLETE
4,4,0.432170,2023-12-06 06:26:05.874037,2023-12-06 06:26:27.775557,0 days 00:00:21.901520,0.550963,0.109387,125,12.456294,COMPLETE
...,...,...,...,...,...,...,...,...,...,...
95,95,0.432066,2023-12-06 06:55:15.764677,2023-12-06 06:55:27.811036,0 days 00:00:12.046359,0.632326,0.162065,67,11.671958,COMPLETE
96,96,0.432105,2023-12-06 06:55:27.813951,2023-12-06 06:55:54.727974,0 days 00:00:26.914023,0.585343,0.141121,125,12.362862,COMPLETE
97,97,0.431995,2023-12-06 06:55:54.730479,2023-12-06 06:56:10.008666,0 days 00:00:15.278187,0.574486,0.149152,74,14.012582,COMPLETE
98,98,0.432053,2023-12-06 06:56:10.012449,2023-12-06 06:56:18.561225,0 days 00:00:08.548776,0.602473,0.154300,53,13.103824,COMPLETE


In [162]:
print("Best Score: %.4f" % study_lgbm.best_value) # best score 출력
print("Best params: ", study_lgbm.best_trial.params) # best score일 때의 하이퍼파라미터들

Best Score: 0.4319
Best params:  {'n_estimators': 71, 'colsample_bynode': 0.5479778599318706, 'reg_lambda': 11.597145167459137, 'learning_rate': 0.15766010908190956}


In [163]:
# 실험 기록 시각화
optuna.visualization.plot_optimization_history(study_lgbm)

In [164]:
# hyper-parameter들의 중요도
optuna.visualization.plot_param_importances(study_lgbm)

In [167]:
# model finalization
print("Validation ACC")
best_params_lgbm = study_lgbm.best_params
best_model_lgbm = LGBMRegressor(**best_params_lgbm,
                          random_state=42)
best_model_lgbm.fit(x_train, y_train)
print("Validation Score : %.3f" % rmsle(y_valid, best_model_lgbm.predict(x_valid)))

Validation ACC
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 397
[LightGBM] [Info] Number of data points in the train set: 27726, number of used features: 167
[LightGBM] [Info] Start training from score 1.071485
Validation Score : 0.432
