Setting Data

In [3]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [4]:
# load modules
import numpy as np
import pandas as pd

# split
from sklearn.model_selection import train_test_split

# models
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# metrics
from sklearn.metrics import mean_squared_log_error

In [5]:
# load data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [6]:
# 필요한 데이터만 가져오기
use_train = train[['사고일시', '요일', '기상상태', '도로형태', '노면상태', '사고유형', '시군구', '중상자수']]
#  'ID'를 제외한 모든 열을 선택한 새로운 데이터프레임을 생성
use_test = test[test.columns.difference(['ID'])]

Data Preprocessing

In [7]:
# train 데이터프레임에 대한 one-hot encoding 및 열 삭제
use_train2 = pd.concat([use_train, pd.get_dummies(use_train[['기상상태', '도로형태', '노면상태', '사고유형']], drop_first=True)], axis=1).drop(['기상상태', '도로형태', '노면상태', '사고유형'], axis=1)

# test 데이터프레임에 대한 one-hot encoding 및 열 삭제
use_test2 = pd.concat([use_test, pd.get_dummies(test[['기상상태', '도로형태', '노면상태', '사고유형']], drop_first=True)], axis=1).drop(['기상상태', '도로형태', '노면상태', '사고유형'], axis=1)

In [8]:
# 요일 월~금: 0 토~일:1
# train 데이터프레임에 대한 '주말' 열 생성 및 '요일' 열 삭제
use_train2['주말'] = use_train2['요일'].apply(lambda x: 0 if x in ['월요일', '화요일', '수요일', '목요일', '금요일'] else 1)
use_train3 = use_train2.drop(['요일'], axis=1)

# test 데이터프레임에 대한 '주말' 열 생성 및 '요일' 열 삭제
use_test2['주말'] = use_test2['요일'].apply(lambda x: 0 if x in ['월요일', '화요일', '수요일', '목요일', '금요일'] else 1)
use_test3 = use_test2.drop(['요일'], axis=1)

In [9]:
# 사고일시 나누기
## date type data
use_train3['사고일시'] = pd.to_datetime(use_train3['사고일시'])
use_test3['사고일시'] = pd.to_datetime(use_test3['사고일시'])

In [10]:
# split year, month, day, hour
use_train3['year'] = use_train3['사고일시'].dt.year
use_train3['month'] = use_train3['사고일시'].dt.month
use_train3['day'] = use_train3['사고일시'].dt.day
use_train3['hour'] = use_train3['사고일시'].dt.hour

use_test3['year'] = use_test3['사고일시'].dt.year
use_test3['month'] = use_test3['사고일시'].dt.month
use_test3['day'] = use_test3['사고일시'].dt.day
use_test3['hour'] = use_test3['사고일시'].dt.hour

In [11]:
# 시군구 나누기
use_train3[['시', '구', '동가']] = use_train3['시군구'].str.split(' ', expand = True)
use_train4 = use_train3.drop(['사고일시', '시군구', '시', 'year'], axis = 1)

use_test3[['시', '구', '동가']] = use_test3['시군구'].str.split(' ', expand = True)
use_test4 = use_test3.drop(['사고일시', '시군구', '시', 'year'], axis = 1)

In [12]:
# type change
use_train4['구'] = use_train4['구'].astype('category')
use_train4['동가'] = use_train4['동가'].astype('category')

use_test4['구'] = use_test4['구'].astype('category')
use_test4['동가'] = use_test4['동가'].astype('category')

In [13]:
# 구와 동가 제거
use_train5 = use_train4[use_train4.columns.difference(['구', '동가'])]
use_test5 = use_test4[use_test4.columns.difference(['구', '동가'])]

In [24]:
# 기상정보_안개 추가
use_test4['기상상태_안개']  = 0

In [15]:
# use_train5.columns.to_list()
# use_train4.dtypes
# use_train4['중상자수']
# use_train4['중상자수'] = use_train4['중상자수'].astype('int64')

Modeling

In [16]:
## 3. hold-out train test split
x_train, x_valid, y_train, y_valid = train_test_split(use_train4[use_train4.columns.difference(['중상자수'])],
                                                      use_train4['중상자수'],
                                                      test_size = 0.3,
                                                      random_state = 42
                                                    )

In [17]:
# category values
labels = x_train.dtypes.reset_index()
categorical_cols = list(labels[labels[0] == 'category'].index)

In [18]:
# model
xgb = XGBRegressor(objective='count:poisson',
                    random_state = 42,
                    use_label_encoder=False,
                    enable_categorical=True,
                    tree_method='hist',
                    n_estimators=80
                    )
lgbm = LGBMRegressor(
    objective='poisson',
    random_state = 42,
     n_estimators=80
)
cb = CatBoostRegressor(
    cat_features = categorical_cols,
    objective = 'Poisson',
    random_state = 42,
    iterations=80
)

In [19]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27726 entries, 29542 to 15795
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   day                   27726 non-null  int32   
 1   hour                  27726 non-null  int32   
 2   month                 27726 non-null  int32   
 3   구                     27726 non-null  category
 4   기상상태_눈                27726 non-null  bool    
 5   기상상태_맑음               27726 non-null  bool    
 6   기상상태_비                27726 non-null  bool    
 7   기상상태_안개               27726 non-null  bool    
 8   기상상태_흐림               27726 non-null  bool    
 9   노면상태_기타               27726 non-null  bool    
 10  노면상태_서리/결빙            27726 non-null  bool    
 11  노면상태_적설               27726 non-null  bool    
 12  노면상태_젖음/습기            27726 non-null  bool    
 13  노면상태_침수               27726 non-null  bool    
 14  도로형태_교차로 - 교차로안       27726 non-null  bool    
 15  도로형

In [20]:
x_train.dtypes

day                        int32
hour                       int32
month                      int32
구                       category
기상상태_눈                      bool
기상상태_맑음                     bool
기상상태_비                      bool
기상상태_안개                     bool
기상상태_흐림                     bool
노면상태_기타                     bool
노면상태_서리/결빙                  bool
노면상태_적설                     bool
노면상태_젖음/습기                  bool
노면상태_침수                     bool
도로형태_교차로 - 교차로안             bool
도로형태_교차로 - 교차로횡단보도내         bool
도로형태_기타 - 기타                bool
도로형태_단일로 - 고가도로위            bool
도로형태_단일로 - 교량               bool
도로형태_단일로 - 기타               bool
도로형태_단일로 - 지하차도(도로)내        bool
도로형태_단일로 - 터널               bool
도로형태_미분류 - 미분류              bool
도로형태_주차장 - 주차장              bool
동가                      category
사고유형_차대차                    bool
사고유형_차량단독                   bool
주말                         int64
dtype: object

In [21]:
# fitting
xgb.fit(x_train, y_train)
lgbm.fit(x_train, y_train)
cb.fit(x_train, y_train,
        eval_set=(x_valid, y_valid),
        use_best_model = True,
        plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.9709695	test: 0.9705568	best: 0.9705568 (0)	total: 375ms	remaining: 29.6s
1:	learn: 0.9446251	test: 0.9438926	best: 0.9438926 (1)	total: 488ms	remaining: 19s
2:	learn: 0.9200798	test: 0.9189530	best: 0.9189530 (2)	total: 638ms	remaining: 16.4s
3:	learn: 0.8978606	test: 0.8964162	best: 0.8964162 (3)	total: 722ms	remaining: 13.7s
4:	learn: 0.8768227	test: 0.8750279	best: 0.8750279 (4)	total: 865ms	remaining: 13s
5:	learn: 0.8580159	test: 0.8559081	best: 0.8559081 (5)	total: 948ms	remaining: 11.7s
6:	learn: 0.8407160	test: 0.8383088	best: 0.8383088 (6)	total: 1.05s	remaining: 11s
7:	learn: 0.8242951	test: 0.8215979	best: 0.8215979 (7)	total: 1.18s	remaining: 10.7s
8:	learn: 0.8095130	test: 0.8065655	best: 0.8065655 (8)	total: 1.3s	remaining: 10.2s
9:	learn: 0.7957326	test: 0.7924987	best: 0.7924987 (9)	total: 1.37s	remaining: 9.62s
10:	learn: 0.7830974	test: 0.7796117	best: 0.7796117 (10)	total: 1.48s	remaining: 9.27s
11:	learn: 0.7713389	test: 0.7675961	best: 0.7675961 (11)	t

<catboost.core.CatBoostRegressor at 0x7fd96aa202b0>

In [22]:
# predict
pred_xgb = xgb.predict(x_valid)
pred_lgbm = lgbm.predict(x_valid)
pred_cb = cb.predict(x_valid)

In [25]:
# use_train4의 열 순서 가져오기
desired_order_train = ['day', 'hour', 'month', '구', '기상상태_눈', '기상상태_맑음', '기상상태_비', '기상상태_안개', '기상상태_흐림', '노면상태_기타', '노면상태_서리/결빙', '노면상태_적설', '노면상태_젖음/습기', '노면상태_침수', '도로형태_교차로 - 교차로안', '도로형태_교차로 - 교차로횡단보도내', '도로형태_기타 - 기타', '도로형태_단일로 - 고가도로위', '도로형태_단일로 - 교량', '도로형태_단일로 - 기타', '도로형태_단일로 - 지하차도(도로)내', '도로형태_단일로 - 터널', '도로형태_미분류 - 미분류', '도로형태_주차장 - 주차장', '동가', '사고유형_차대차', '사고유형_차량단독', '주말']

# use_test4의 열을 use_train4의 열 순서에 맞춰 재배치
use_test4 = use_test4[desired_order_train]

In [26]:
# metrics
rmsle_xgb = mean_squared_log_error(y_valid, pred_xgb, squared = False)
rmsle_lgbm = mean_squared_log_error(y_valid, pred_lgbm, squared = False)
rmsle_cb = mean_squared_log_error(y_valid, pred_cb, squared = False)

print(f'xgboost : {rmsle_xgb}')
print(f'lightgbm : {rmsle_lgbm}')
print(f'catboost : {rmsle_cb}')

xgboost : 0.324583323813243
lightgbm : 0.3195920865893019
catboost : 0.32404003398906145


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [27]:
# predict
test_xgb = xgb.predict(use_test4)
test_lgbm = lgbm.predict(use_test4)
test_cb = cb.predict(use_test4)

In [29]:
sample_submission = pd.read_csv('./data/sample_submission.csv')
baseline_submission = sample_submission.copy()
baseline_submission['ECLO'] = test_xgb
baseline_submission

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,0.127859
1,ACCIDENT_39610,0.079349
2,ACCIDENT_39611,0.225567
3,ACCIDENT_39612,0.153497
4,ACCIDENT_39613,0.290050
...,...,...
10958,ACCIDENT_50567,0.061406
10959,ACCIDENT_50568,0.044746
10960,ACCIDENT_50569,0.118067
10961,ACCIDENT_50570,0.035119


In [179]:
sample_submission = pd.read_csv('sample_submission.csv')
baseline_submission = sample_submission.copy()
baseline_submission['ECLO'] = test_lgbm
baseline_submission

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,0.677294
1,ACCIDENT_39610,0.468718
2,ACCIDENT_39611,1.153290
3,ACCIDENT_39612,1.027721
4,ACCIDENT_39613,1.187340
...,...,...
10958,ACCIDENT_50567,1.577597
10959,ACCIDENT_50568,1.174221
10960,ACCIDENT_50569,1.291134
10961,ACCIDENT_50570,1.510382


In [31]:
sample_submission = pd.read_csv('./data/sample_submission.csv')
baseline_submission = sample_submission.copy()
baseline_submission['ECLO'] = test_cb
baseline_submission

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,0.404699
1,ACCIDENT_39610,0.368024
2,ACCIDENT_39611,0.341992
3,ACCIDENT_39612,0.308780
4,ACCIDENT_39613,0.330044
...,...,...
10958,ACCIDENT_50567,0.275696
10959,ACCIDENT_50568,0.268619
10960,ACCIDENT_50569,0.268089
10961,ACCIDENT_50570,0.274448


In [32]:
def calculate_eclo(row):
    m, n, l, o = row['사망자수'], row['중상자수'], row['경상자수'], row['부상자수']
    return m * 10 + n * 5 + l * 3 + o * 1

def add_eclo_column(csv_path):
    # CSV 파일 읽기
    df = pd.read_csv(csv_path, encoding ='cp949')

    # ECLO 값 계산 및 'ECLO' 열 추가
    df['ECLO'] = df.apply(calculate_eclo, axis=1)

    # 결과를 새로운 CSV 파일로 저장 (선택사항)
    df.to_csv('output_with_eclo.csv', index=False)

In [33]:
sample_submission = pd.read_csv('./data/sample_submission.csv')
baseline_submission = sample_submission.copy()
baseline_submission['중상자수'] = test_lgbm
baseline_submission

Unnamed: 0,ID,ECLO,중상자수
0,ACCIDENT_39609,0,0.280747
1,ACCIDENT_39610,0,0.437957
2,ACCIDENT_39611,0,0.307829
3,ACCIDENT_39612,0,0.290756
4,ACCIDENT_39613,0,0.262292
...,...,...,...
10958,ACCIDENT_50567,0,0.166153
10959,ACCIDENT_50568,0,0.328809
10960,ACCIDENT_50569,0,0.243574
10961,ACCIDENT_50570,0,0.246647


In [34]:
baseline_submission.to_csv('submission.csv', index=False)

In [35]:
# ECLO 합친 후 csv 파일 저장
add_eclo_column('baseline.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'baseline.csv'