In [1]:
#데이터 분석 4종 세트
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 모델들, 성능 평가
# (저는 일반적으로 정형데이터로 머신러닝 분석할 때는 이 2개 모델은 그냥 돌려봅니다. 특히 RF가 테스트하기 좋습니다.)
from lightgbm.sklearn import LGBMRegressor
from xgboost.sklearn import XGBRegressor
from catboost import CatBoostRegressor

# 상관관계 분석, VIF : 다중공산성 제거
from statsmodels.stats.outliers_influence import variance_inflation_factor

# KFold(CV), partial : optuna를 사용하기 위함
from sklearn.model_selection import KFold
from functools import partial

# hyper-parameter tuning을 위한 라이브러리, optuna
import optuna

In [3]:
train = pd.read_csv("../../data/train.csv")
test = pd.read_csv("../../data/test.csv")

In [4]:
train

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,...,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도,사망자수,중상자수,경상자수,부상자수,ECLO
0,ACCIDENT_00000,2019-01-01 00,화요일,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,길가장자리구역통행중,안전운전불이행,...,상해없음,보행자,여,70세,중상,0,1,0,0,5
1,ACCIDENT_00001,2019-01-01 00,화요일,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,보도통행중,기타,...,상해없음,보행자,남,61세,경상,0,0,1,0,3
2,ACCIDENT_00002,2019-01-01 01,화요일,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,차도통행중,안전운전불이행,...,상해없음,보행자,남,38세,경상,0,0,1,0,3
3,ACCIDENT_00003,2019-01-01 02,화요일,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,남,36세,중상,0,1,0,0,5
4,ACCIDENT_00004,2019-01-01 04,화요일,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,남,52세,경상,0,0,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,ACCIDENT_39604,2021-12-31 19,금요일,맑음,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,측면충돌,신호위반,...,상해없음,이륜,남,28세,경상,0,0,1,0,3
39605,ACCIDENT_39605,2021-12-31 19,금요일,맑음,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,측면충돌,안전거리미확보,...,상해없음,승용,남,52세,경상,0,0,1,0,3
39606,ACCIDENT_39606,2021-12-31 21,금요일,맑음,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,측면충돌,교차로운행방법위반,...,중상,승용,남,73세,중상,0,2,0,0,10
39607,ACCIDENT_39607,2021-12-31 22,금요일,맑음,대구광역시 달서구 장동,기타 - 기타,건조,차대차,추돌,안전운전불이행,...,상해없음,승용,여,57세,경상,0,0,1,0,3


In [5]:
test

Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형
0,ACCIDENT_39609,2022-01-01 01,토요일,맑음,대구광역시 수성구 상동,교차로 - 교차로안,건조,차대사람
1,ACCIDENT_39610,2022-01-01 01,토요일,맑음,대구광역시 수성구 지산동,단일로 - 기타,건조,차대사람
2,ACCIDENT_39611,2022-01-01 04,토요일,맑음,대구광역시 수성구 수성동2가,교차로 - 교차로안,건조,차대차
3,ACCIDENT_39612,2022-01-01 04,토요일,맑음,대구광역시 수성구 신매동,단일로 - 기타,건조,차대차
4,ACCIDENT_39613,2022-01-01 06,토요일,맑음,대구광역시 달서구 감삼동,교차로 - 교차로안,건조,차대차
...,...,...,...,...,...,...,...,...
10958,ACCIDENT_50567,2022-12-31 18,토요일,맑음,대구광역시 남구 대명동,단일로 - 터널,건조,차대차
10959,ACCIDENT_50568,2022-12-31 18,토요일,맑음,대구광역시 수성구 시지동,단일로 - 기타,건조,차대차
10960,ACCIDENT_50569,2022-12-31 20,토요일,맑음,대구광역시 수성구 연호동,단일로 - 기타,건조,차대차
10961,ACCIDENT_50570,2022-12-31 20,토요일,맑음,대구광역시 수성구 범물동,교차로 - 교차로부근,건조,차대차


## Data Wrangiling

In [6]:
def wrangiling(data_set):
    print("==================")
    print("       Shape      ")
    print("==================")

    display(data_set.shape)

    print("==================")
    print("        Info      ")
    print("==================")

    display(data_set.info())

    print("==================")
    print("      Columns     ")
    print("==================")

    display(data_set.columns)

    print("==================")
    print("        NaN       ")
    print("==================")

    display(data_set.isna().sum())

    print("==================")
    print("    Duplicated    ")
    print("==================")

    display(data_set[data_set.duplicated()])

    print("==================")
    print("    Description   ")
    print("==================")

    display(data_set.describe())

    print("==================")
    print("      Unique      ")
    print("==================")

    display(data_set.nunique())

In [7]:
wrangiling(train)

       Shape      


(39609, 23)

        Info      
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39609 entries, 0 to 39608
Data columns (total 23 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           39609 non-null  object
 1   사고일시         39609 non-null  object
 2   요일           39609 non-null  object
 3   기상상태         39609 non-null  object
 4   시군구          39609 non-null  object
 5   도로형태         39609 non-null  object
 6   노면상태         39609 non-null  object
 7   사고유형         39609 non-null  object
 8   사고유형 - 세부분류  39609 non-null  object
 9   법규위반         39609 non-null  object
 10  가해운전자 차종     39609 non-null  object
 11  가해운전자 성별     39609 non-null  object
 12  가해운전자 연령     39609 non-null  object
 13  가해운전자 상해정도   39609 non-null  object
 14  피해운전자 차종     38618 non-null  object
 15  피해운전자 성별     38618 non-null  object
 16  피해운전자 연령     38618 non-null  object
 17  피해운전자 상해정도   38618 non-null  object
 18  사망자수         39609 non-null  int64 
 19  중상자수  

None

      Columns     


Index(['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형',
       '사고유형 - 세부분류', '법규위반', '가해운전자 차종', '가해운전자 성별', '가해운전자 연령', '가해운전자 상해정도',
       '피해운전자 차종', '피해운전자 성별', '피해운전자 연령', '피해운전자 상해정도', '사망자수', '중상자수',
       '경상자수', '부상자수', 'ECLO'],
      dtype='object')

        NaN       


ID               0
사고일시             0
요일               0
기상상태             0
시군구              0
도로형태             0
노면상태             0
사고유형             0
사고유형 - 세부분류      0
법규위반             0
가해운전자 차종         0
가해운전자 성별         0
가해운전자 연령         0
가해운전자 상해정도       0
피해운전자 차종       991
피해운전자 성별       991
피해운전자 연령       991
피해운전자 상해정도     991
사망자수             0
중상자수             0
경상자수             0
부상자수             0
ECLO             0
dtype: int64

    Duplicated    


Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형,사고유형 - 세부분류,법규위반,...,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도,사망자수,중상자수,경상자수,부상자수,ECLO


    Description   


Unnamed: 0,사망자수,중상자수,경상자수,부상자수,ECLO
count,39609.0,39609.0,39609.0,39609.0,39609.0
mean,0.007776,0.262365,1.070085,0.126865,4.726704
std,0.090109,0.500845,0.992034,0.39467,3.207206
min,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,3.0
50%,0.0,0.0,1.0,0.0,3.0
75%,0.0,0.0,1.0,0.0,6.0
max,2.0,6.0,22.0,10.0,74.0


      Unique      


ID             39609
사고일시           18057
요일                 7
기상상태               6
시군구              199
도로형태              11
노면상태               6
사고유형               3
사고유형 - 세부분류       14
법규위반              11
가해운전자 차종          12
가해운전자 성별           3
가해운전자 연령          89
가해운전자 상해정도         6
피해운전자 차종          13
피해운전자 성별           3
피해운전자 연령          97
피해운전자 상해정도         6
사망자수               3
중상자수               7
경상자수              18
부상자수               9
ECLO              46
dtype: int64

In [8]:
wrangiling(test)

       Shape      


(10963, 8)

        Info      
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10963 entries, 0 to 10962
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      10963 non-null  object
 1   사고일시    10963 non-null  object
 2   요일      10963 non-null  object
 3   기상상태    10963 non-null  object
 4   시군구     10963 non-null  object
 5   도로형태    10963 non-null  object
 6   노면상태    10963 non-null  object
 7   사고유형    10963 non-null  object
dtypes: object(8)
memory usage: 685.3+ KB


None

      Columns     


Index(['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형'], dtype='object')

        NaN       


ID      0
사고일시    0
요일      0
기상상태    0
시군구     0
도로형태    0
노면상태    0
사고유형    0
dtype: int64

    Duplicated    


Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형


    Description   


Unnamed: 0,ID,사고일시,요일,기상상태,시군구,도로형태,노면상태,사고유형
count,10963,10963,10963,10963,10963,10963,10963,10963
unique,10963,5548,7,5,192,11,6,3
top,ACCIDENT_39609,2022-11-10 18,금요일,맑음,대구광역시 남구 대명동,단일로 - 기타,건조,차대차
freq,1,10,1743,10321,422,5039,10394,8559


      Unique      


ID      10963
사고일시     5548
요일          7
기상상태        5
시군구       192
도로형태       11
노면상태        6
사고유형        3
dtype: int64

## Data Preprocessing

In [9]:
train = train.drop(["ID", "사고유형 - 세부분류", "법규위반", "가해운전자 차종", "가해운전자 성별", "가해운전자 연령",
                    "가해운전자 상해정도", "피해운전자 차종", "피해운전자 성별", "피해운전자 연령", "피해운전자 상해정도",
                    "중상자수", "경상자수", "부상자수", "ECLO"], axis=1)

In [10]:
train["date_time"] = pd.to_datetime(train['사고일시'])
train['Month'] = train.date_time.dt.month
train['Day'] = train.date_time.dt.day
train['Hour'] = train.date_time.dt.hour
train = train.drop(["사고일시"], axis = 1)
train = train.drop(["date_time"], axis = 1)

In [11]:
train["Week"]=np.where(train["요일"].isin(["월요일", "화요일", "수요일", "목요일", "금요일"]), 0, 1)

In [12]:
train["Week"].unique()

array([0, 1])

In [13]:
train = train.drop(["요일"], axis = 1)
train

Unnamed: 0,기상상태,시군구,도로형태,노면상태,사고유형,사망자수,Month,Day,Hour,Week
0,맑음,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,0,1,1,0,0
1,흐림,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,0,1,1,0,0
2,맑음,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,0,1,1,1,0
3,맑음,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,0,1,1,2,0
4,맑음,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,0,1,1,4,0
...,...,...,...,...,...,...,...,...,...,...
39604,맑음,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,0,12,31,19,0
39605,맑음,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,0,12,31,19,0
39606,맑음,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,0,12,31,21,0
39607,맑음,대구광역시 달서구 장동,기타 - 기타,건조,차대차,0,12,31,22,0


In [14]:
train["Weather"] = np.where(train["기상상태"].isin(["맑음"]), 0, 1)
train = train.drop(["기상상태"], axis=1)
train

Unnamed: 0,시군구,도로형태,노면상태,사고유형,사망자수,Month,Day,Hour,Week,Weather
0,대구광역시 중구 대신동,단일로 - 기타,건조,차대사람,0,1,1,0,0,0
1,대구광역시 달서구 감삼동,단일로 - 기타,건조,차대사람,0,1,1,0,0,1
2,대구광역시 수성구 두산동,단일로 - 기타,건조,차대사람,0,1,1,1,0,0
3,대구광역시 북구 복현동,단일로 - 기타,건조,차대차,0,1,1,2,0,0
4,대구광역시 동구 신암동,단일로 - 기타,건조,차대차,0,1,1,4,0,0
...,...,...,...,...,...,...,...,...,...,...
39604,대구광역시 수성구 수성동3가,교차로 - 교차로안,건조,차대차,0,12,31,19,0,0
39605,대구광역시 달서구 상인동,단일로 - 기타,건조,차대차,0,12,31,19,0,0
39606,대구광역시 달서구 월성동,교차로 - 교차로안,건조,차대차,0,12,31,21,0,0
39607,대구광역시 달서구 장동,기타 - 기타,건조,차대차,0,12,31,22,0,0


In [15]:
train[["Road_Shape1", "Road_Shape2"]] = train["도로형태"].str.split("-", expand=True)
train = train.drop(["도로형태"], axis=1)
train

Unnamed: 0,시군구,노면상태,사고유형,사망자수,Month,Day,Hour,Week,Weather,Road_Shape1,Road_Shape2
0,대구광역시 중구 대신동,건조,차대사람,0,1,1,0,0,0,단일로,기타
1,대구광역시 달서구 감삼동,건조,차대사람,0,1,1,0,0,1,단일로,기타
2,대구광역시 수성구 두산동,건조,차대사람,0,1,1,1,0,0,단일로,기타
3,대구광역시 북구 복현동,건조,차대차,0,1,1,2,0,0,단일로,기타
4,대구광역시 동구 신암동,건조,차대차,0,1,1,4,0,0,단일로,기타
...,...,...,...,...,...,...,...,...,...,...,...
39604,대구광역시 수성구 수성동3가,건조,차대차,0,12,31,19,0,0,교차로,교차로안
39605,대구광역시 달서구 상인동,건조,차대차,0,12,31,19,0,0,단일로,기타
39606,대구광역시 달서구 월성동,건조,차대차,0,12,31,21,0,0,교차로,교차로안
39607,대구광역시 달서구 장동,건조,차대차,0,12,31,22,0,0,기타,기타


In [16]:
train["Road_stat"]=np.where(train["노면상태"].isin(["건조"]), 0, 1)
train = train.drop(["노면상태"], axis=1)
train

Unnamed: 0,시군구,사고유형,사망자수,Month,Day,Hour,Week,Weather,Road_Shape1,Road_Shape2,Road_stat
0,대구광역시 중구 대신동,차대사람,0,1,1,0,0,0,단일로,기타,0
1,대구광역시 달서구 감삼동,차대사람,0,1,1,0,0,1,단일로,기타,0
2,대구광역시 수성구 두산동,차대사람,0,1,1,1,0,0,단일로,기타,0
3,대구광역시 북구 복현동,차대차,0,1,1,2,0,0,단일로,기타,0
4,대구광역시 동구 신암동,차대차,0,1,1,4,0,0,단일로,기타,0
...,...,...,...,...,...,...,...,...,...,...,...
39604,대구광역시 수성구 수성동3가,차대차,0,12,31,19,0,0,교차로,교차로안,0
39605,대구광역시 달서구 상인동,차대차,0,12,31,19,0,0,단일로,기타,0
39606,대구광역시 달서구 월성동,차대차,0,12,31,21,0,0,교차로,교차로안,0
39607,대구광역시 달서구 장동,차대차,0,12,31,22,0,0,기타,기타,0


In [17]:
train["사고유형"].value_counts()

사고유형
차대차     31785
차대사람     6833
차량단독      991
Name: count, dtype: int64

In [18]:
onehot = pd.get_dummies(train[['사고유형']], drop_first=False)
train = pd.concat([train, onehot], axis = 1).drop(['사고유형'], axis = 1)
train

Unnamed: 0,시군구,사망자수,Month,Day,Hour,Week,Weather,Road_Shape1,Road_Shape2,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독
0,대구광역시 중구 대신동,0,1,1,0,0,0,단일로,기타,0,True,False,False
1,대구광역시 달서구 감삼동,0,1,1,0,0,1,단일로,기타,0,True,False,False
2,대구광역시 수성구 두산동,0,1,1,1,0,0,단일로,기타,0,True,False,False
3,대구광역시 북구 복현동,0,1,1,2,0,0,단일로,기타,0,False,True,False
4,대구광역시 동구 신암동,0,1,1,4,0,0,단일로,기타,0,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,대구광역시 수성구 수성동3가,0,12,31,19,0,0,교차로,교차로안,0,False,True,False
39605,대구광역시 달서구 상인동,0,12,31,19,0,0,단일로,기타,0,False,True,False
39606,대구광역시 달서구 월성동,0,12,31,21,0,0,교차로,교차로안,0,False,True,False
39607,대구광역시 달서구 장동,0,12,31,22,0,0,기타,기타,0,False,True,False


In [19]:
train[["시", "구", "동"]] = train["시군구"].str.split(" ", expand=True)
train = train.drop(["시군구"], axis=1)
train

Unnamed: 0,사망자수,Month,Day,Hour,Week,Weather,Road_Shape1,Road_Shape2,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독,시,구,동
0,0,1,1,0,0,0,단일로,기타,0,True,False,False,대구광역시,중구,대신동
1,0,1,1,0,0,1,단일로,기타,0,True,False,False,대구광역시,달서구,감삼동
2,0,1,1,1,0,0,단일로,기타,0,True,False,False,대구광역시,수성구,두산동
3,0,1,1,2,0,0,단일로,기타,0,False,True,False,대구광역시,북구,복현동
4,0,1,1,4,0,0,단일로,기타,0,False,True,False,대구광역시,동구,신암동
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,0,12,31,19,0,0,교차로,교차로안,0,False,True,False,대구광역시,수성구,수성동3가
39605,0,12,31,19,0,0,단일로,기타,0,False,True,False,대구광역시,달서구,상인동
39606,0,12,31,21,0,0,교차로,교차로안,0,False,True,False,대구광역시,달서구,월성동
39607,0,12,31,22,0,0,기타,기타,0,False,True,False,대구광역시,달서구,장동


In [20]:
train = train.drop(["시"], axis=1)
train

Unnamed: 0,사망자수,Month,Day,Hour,Week,Weather,Road_Shape1,Road_Shape2,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독,구,동
0,0,1,1,0,0,0,단일로,기타,0,True,False,False,중구,대신동
1,0,1,1,0,0,1,단일로,기타,0,True,False,False,달서구,감삼동
2,0,1,1,1,0,0,단일로,기타,0,True,False,False,수성구,두산동
3,0,1,1,2,0,0,단일로,기타,0,False,True,False,북구,복현동
4,0,1,1,4,0,0,단일로,기타,0,False,True,False,동구,신암동
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,0,12,31,19,0,0,교차로,교차로안,0,False,True,False,수성구,수성동3가
39605,0,12,31,19,0,0,단일로,기타,0,False,True,False,달서구,상인동
39606,0,12,31,21,0,0,교차로,교차로안,0,False,True,False,달서구,월성동
39607,0,12,31,22,0,0,기타,기타,0,False,True,False,달서구,장동


In [21]:
onehot = pd.get_dummies(train[['Road_Shape1']], drop_first=False)
train = pd.concat([train, onehot], axis = 1).drop(['Road_Shape1'], axis = 1)
train

Unnamed: 0,사망자수,Month,Day,Hour,Week,Weather,Road_Shape2,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독,구,동,Road_Shape1_교차로,Road_Shape1_기타,Road_Shape1_단일로,Road_Shape1_미분류,Road_Shape1_주차장
0,0,1,1,0,0,0,기타,0,True,False,False,중구,대신동,False,False,True,False,False
1,0,1,1,0,0,1,기타,0,True,False,False,달서구,감삼동,False,False,True,False,False
2,0,1,1,1,0,0,기타,0,True,False,False,수성구,두산동,False,False,True,False,False
3,0,1,1,2,0,0,기타,0,False,True,False,북구,복현동,False,False,True,False,False
4,0,1,1,4,0,0,기타,0,False,True,False,동구,신암동,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,0,12,31,19,0,0,교차로안,0,False,True,False,수성구,수성동3가,True,False,False,False,False
39605,0,12,31,19,0,0,기타,0,False,True,False,달서구,상인동,False,False,True,False,False
39606,0,12,31,21,0,0,교차로안,0,False,True,False,달서구,월성동,True,False,False,False,False
39607,0,12,31,22,0,0,기타,0,False,True,False,달서구,장동,False,True,False,False,False


In [22]:
onehot = pd.get_dummies(train[['Road_Shape2']], drop_first=False)
train = pd.concat([train, onehot], axis = 1).drop(['Road_Shape2'], axis = 1)
train

Unnamed: 0,사망자수,Month,Day,Hour,Week,Weather,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독,...,Road_Shape2_ 고가도로위,Road_Shape2_ 교량,Road_Shape2_ 교차로부근,Road_Shape2_ 교차로안,Road_Shape2_ 교차로횡단보도내,Road_Shape2_ 기타,Road_Shape2_ 미분류,Road_Shape2_ 주차장,Road_Shape2_ 지하차도(도로)내,Road_Shape2_ 터널
0,0,1,1,0,0,0,0,True,False,False,...,False,False,False,False,False,True,False,False,False,False
1,0,1,1,0,0,1,0,True,False,False,...,False,False,False,False,False,True,False,False,False,False
2,0,1,1,1,0,0,0,True,False,False,...,False,False,False,False,False,True,False,False,False,False
3,0,1,1,2,0,0,0,False,True,False,...,False,False,False,False,False,True,False,False,False,False
4,0,1,1,4,0,0,0,False,True,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,0,12,31,19,0,0,0,False,True,False,...,False,False,False,True,False,False,False,False,False,False
39605,0,12,31,19,0,0,0,False,True,False,...,False,False,False,False,False,True,False,False,False,False
39606,0,12,31,21,0,0,0,False,True,False,...,False,False,False,True,False,False,False,False,False,False
39607,0,12,31,22,0,0,0,False,True,False,...,False,False,False,False,False,True,False,False,False,False


In [23]:
onehot = pd.get_dummies(train[['구']], drop_first=False)
train = pd.concat([train, onehot], axis = 1).drop(['구'], axis = 1)
train

Unnamed: 0,사망자수,Month,Day,Hour,Week,Weather,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독,...,Road_Shape2_ 지하차도(도로)내,Road_Shape2_ 터널,구_남구,구_달서구,구_달성군,구_동구,구_북구,구_서구,구_수성구,구_중구
0,0,1,1,0,0,0,0,True,False,False,...,False,False,False,False,False,False,False,False,False,True
1,0,1,1,0,0,1,0,True,False,False,...,False,False,False,True,False,False,False,False,False,False
2,0,1,1,1,0,0,0,True,False,False,...,False,False,False,False,False,False,False,False,True,False
3,0,1,1,2,0,0,0,False,True,False,...,False,False,False,False,False,False,True,False,False,False
4,0,1,1,4,0,0,0,False,True,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,0,12,31,19,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,True,False
39605,0,12,31,19,0,0,0,False,True,False,...,False,False,False,True,False,False,False,False,False,False
39606,0,12,31,21,0,0,0,False,True,False,...,False,False,False,True,False,False,False,False,False,False
39607,0,12,31,22,0,0,0,False,True,False,...,False,False,False,True,False,False,False,False,False,False


In [24]:
onehot = pd.get_dummies(train[['동']], drop_first=False)
train = pd.concat([train, onehot], axis = 1).drop(['동'], axis = 1)
train

Unnamed: 0,사망자수,Month,Day,Hour,Week,Weather,Road_stat,사고유형_차대사람,사고유형_차대차,사고유형_차량단독,...,동_하서동,동_학정동,동_향촌동,동_현풍읍,동_호림동,동_호산동,동_화원읍,동_화전동,동_황금동,동_효목동
0,0,1,1,0,0,0,0,True,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,1,1,0,0,1,0,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,1,1,1,0,0,0,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,1,1,2,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,0,1,1,4,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,0,12,31,19,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False
39605,0,12,31,19,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False
39606,0,12,31,21,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False
39607,0,12,31,22,0,0,0,False,True,False,...,False,False,False,False,False,False,False,False,False,False


## Modeling

In [25]:
## 3. hold-out train test split
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
x_train, x_valid, y_train, y_valid = train_test_split(train[train.columns.difference(['사망자수'])],
                                                      train['사망자수'],
                                                      test_size = 0.3,
                                                      random_state = 42
                                                    )

In [26]:
# Define a function to calculate RMSLE
from sklearn.metrics import make_scorer
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))

# # Create a scorer for RMSLE
# rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

In [27]:
def optimizer_xgb(trial):
    # 조절할 hyper-parameter 조합을 적어줍니다.  # bayesian optimization
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 5, 10)
    colsample_bynode = trial.suggest_float('colsample_bynode', 0.5, 0.8)
    reg_lambda = trial.suggest_float('reg_lambda', 10, 20)
    learning_rate = trial.suggest_float('learning_rate', 0.1, 0.3)
    tree_method = "exact"

    model = XGBRegressor(n_estimators=n_estimators,
                         max_depth=max_depth,
                         colsample_bynode = colsample_bynode,
                         reg_lambda = reg_lambda,
                         learning_rate = learning_rate,
                         tree_method = tree_method,
                         random_state=42)

    model.fit(x_train, y_train)
    preds = model.predict(x_valid)
    RMSLE = rmsle(y_true=y_valid, y_pred=preds)

    return RMSLE

In [52]:
study_xgb = optuna.create_study(direction="minimize") # 최소/최대 어느 방향의 최적값을 구할 건지.
study_xgb.optimize(optimizer_xgb, n_trials=100)

[I 2023-12-06 06:43:32,877] A new study created in memory with name: no-name-a6756904-c927-45e2-a900-d65231139f41
[I 2023-12-06 06:44:32,924] Trial 0 finished with value: 0.06423724301094393 and parameters: {'n_estimators': 182, 'max_depth': 7, 'colsample_bynode': 0.7814606713094024, 'reg_lambda': 12.002911535880617, 'learning_rate': 0.16998145472661513}. Best is trial 0 with value: 0.06423724301094393.
[I 2023-12-06 06:45:14,832] Trial 1 finished with value: 0.06406559195982568 and parameters: {'n_estimators': 175, 'max_depth': 6, 'colsample_bynode': 0.6167191242662596, 'reg_lambda': 12.209969346416278, 'learning_rate': 0.22891386886300227}. Best is trial 1 with value: 0.06406559195982568.
[I 2023-12-06 06:46:22,558] Trial 2 finished with value: 0.06645037286646985 and parameters: {'n_estimators': 152, 'max_depth': 10, 'colsample_bynode': 0.7278443161400475, 'reg_lambda': 13.688968106521724, 'learning_rate': 0.26334123513428565}. Best is trial 1 with value: 0.06406559195982568.
[I 202

In [44]:
# optuna가 시도했던 모든 실험 관련 데이터
study_xgb.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bynode,params_learning_rate,params_reg_lambda,state
0,0,,2023-12-06 06:42:31.270306,2023-12-06 06:42:31.271651,0 days 00:00:00.001345,0.600987,0.264801,17.51673,FAIL


In [53]:
print("Best Score: %.4f" % study_xgb.best_value) # best score 출력
print("Best params: ", study_xgb.best_trial.params) # best score일 때의 하이퍼파라미터들

Best Score: 0.0631
Best params:  {'n_estimators': 60, 'max_depth': 5, 'colsample_bynode': 0.5003659401745417, 'reg_lambda': 16.663370609249696, 'learning_rate': 0.21940444649503582}


In [54]:
# 실험 기록 시각화
optuna.visualization.plot_optimization_history(study_xgb)

In [55]:
# hyper-parameter들의 중요도
optuna.visualization.plot_param_importances(study_xgb)

In [48]:
# # model
# xgb = XGBRegressor(objective='count:poisson',
#                     random_state = 42,
#                     use_label_encoder=False,
#                     enable_categorical=False,
#                     tree_method='hist',
#                     n_estimators=80
#                     )
# lgbm = LGBMRegressor(
#     objective='poisson',
#     random_state = 42,
#     n_estimators=80
# )
# cb = CatBoostRegressor(
#     cat_features = categorical_cols,
#     objective = 'Poisson',
#     random_state = 42
# )

In [None]:
# fitting
# xgb.fit(x_train, y_train)
# lgbm.fit(x_train, y_train)
# cb.fit(x_train, y_train,
#         eval_set=(x_valid, y_valid),
#         use_best_model = True,
#         plot=True)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 397
[LightGBM] [Info] Number of data points in the train set: 27726, number of used features: 167
[LightGBM] [Info] Start training from score 0.069046


In [None]:
# predict
pred_xgb = xgb.predict(x_valid)
pred_lgbm = lgbm.predict(x_valid)
# pred_cb = cb.predict(x_valid)

In [None]:
# metrics
rmsle_xgb = mean_squared_log_error(y_valid, pred_xgb, squared = False)
rmsle_lgbm = mean_squared_log_error(y_valid, pred_lgbm, squared = False)
# rmsle_cb = mean_squared_log_error(y_valid, pred_cb, squared = False)

print(f'xgboost : {rmsle_xgb}')
print(f'lightgbm : {rmsle_lgbm}')
# print(f'catboost : {rmsle_cb}')

xgboost : 0.43257084685807645
lightgbm : 0.4325627422810081
