## 0. 라이브러리 불러오기

In [23]:
import os
import random
from datetime import datetime
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# random seed 고정
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(0)

## 1. 데이터 불러오기
- train.csv
- test.csv
- sample_submission.csv
- 대구 보안등 정보.csv
- 대구 어린이 보호 구역 정보.csv
- 대구 주차장 정보.csv

In [24]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [25]:
light_df = pd.read_csv('data/external_open/대구 보안등 정보.csv', encoding='cp949')[['설치개수', '소재지지번주소']]

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

light_df[['시', '구', '동', '번지']] = light_df['소재지지번주소'].str.extract(location_pattern)
light_df = light_df.drop(columns=['소재지지번주소', '번지'])

light_df = light_df.groupby(['시', '구', '동']).sum().reset_index()
light_df.reset_index(inplace=True, drop=True)

In [26]:
child_area_df = pd.read_csv('data/external_open/대구 어린이 보호 구역 정보.csv', encoding='cp949').drop_duplicates()[['소재지지번주소']]
child_area_df['cnt'] = 1

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

child_area_df[['시', '구', '동', '번지']] = child_area_df['소재지지번주소'].str.extract(location_pattern)
child_area_df = child_area_df.drop(columns=['소재지지번주소', '번지'])

child_area_df = child_area_df.groupby(['시', '구', '동']).sum().reset_index()
child_area_df.reset_index(inplace=True, drop=True)

In [27]:
parking_df = pd.read_csv('data/external_open/대구 주차장 정보.csv', encoding='cp949')[['소재지지번주소', '급지구분']]
parking_df = pd.get_dummies(parking_df, columns=['급지구분'])

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

parking_df[['시', '구', '동', '번지']] = parking_df['소재지지번주소'].str.extract(location_pattern)
parking_df = parking_df.drop(columns=['소재지지번주소', '번지'])

parking_df = parking_df.groupby(['시', '구', '동']).sum().reset_index()
parking_df.reset_index(inplace=True, drop=True)

## 2. 데이터 합치기 (train, test + 외부데이터)

In [28]:
location_pattern = r'(\S+) (\S+) (\S+)'

train_df[['시', '구', '동']] = train_df['시군구'].str.extract(location_pattern)
train_df = train_df.drop(columns=['시군구'])

test_df[['시', '구', '동']] = test_df['시군구'].str.extract(location_pattern)
test_df = test_df.drop(columns=['시군구'])

In [29]:
# train_df와 test_df에, light_df와 child_area_df, parking_df를 merge하세요.
train_df = pd.merge(train_df, light_df, how='left', on=['시', '구', '동'])
train_df = pd.merge(train_df, child_area_df, how='left', on=['시', '구', '동'])
train_df = pd.merge(train_df, parking_df, how='left', on=['시', '구', '동'])

test_df = pd.merge(test_df, light_df, how='left', on=['시', '구', '동'])
test_df = pd.merge(test_df, child_area_df, how='left', on=['시', '구', '동'])
test_df = pd.merge(test_df, parking_df, how='left', on=['시', '구', '동'])

In [30]:
len(train_df['동'].unique())

196

## 3. One - Hot Encoding
구에 대해서 원핫인코딩 수행

In [31]:
from sklearn.preprocessing import OneHotEncoder

# train_df와 test_df의 기상상태 및 노면상태 열 선택
train_categorical_data = train_df[['구']]
test_categorical_data = test_df[['구']]

# OneHotEncoder 인스턴스 생성 및 fit_transform 수행
encoder = OneHotEncoder()
train_encoded = encoder.fit_transform(train_categorical_data)
test_encoded = encoder.transform(test_categorical_data)

# OneHotEncoder가 사용한 카테고리 목록을 가져와서 카테고리 이름을 열 이름으로 변환
feature_names = encoder.get_feature_names_out(['구'])

# 밀집 행렬로 변환 (선택 사항)
train_encoded_dense = train_encoded.toarray()
test_encoded_dense = test_encoded.toarray()

# 데이터프레임으로 변환 (선택 사항)
train_encoded_df = pd.DataFrame(train_encoded_dense, columns=feature_names, index=train_df.index)
test_encoded_df = pd.DataFrame(test_encoded_dense, columns=feature_names, index=test_df.index)

# 기존 열 제거
train_df = train_df.drop(['구'], axis=1)
test_df = test_df.drop(['구'], axis=1)

# 인코딩된 열 추가
train_df = pd.concat([train_df, train_encoded_df], axis=1)
test_df = pd.concat([test_df, test_encoded_df], axis=1)

## 4. 동별 사고 발생횟수 추가

In [32]:
accident_counts = train_df['동'].value_counts().reset_index()
accident_counts.columns = ['동', '사고횟수']

In [33]:
# '시군구'별 사고 횟수를 train_df에 매핑하기 위해 '시군구' 컬럼을 기준으로 사고 횟수를 가져옵니다.
train_df['사고발생횟수'] = train_df['동'].map(accident_counts.set_index('동')['사고횟수'])

# '시군구'별 사고 횟수를 test_df에 매핑하기 위해 '시군구' 컬럼을 기준으로 사고 횟수를 가져옵니다.
test_df['사고발생횟수'] = test_df['동'].map(accident_counts.set_index('동')['사고횟수'])

## 5. 연, 월, 일, 시간, 요일, 공휴일 추가

In [34]:
# datetime 컬럼 처리
train_df['사고일시'] = pd.to_datetime(train_df['사고일시'])
test_df['사고일시'] = pd.to_datetime(test_df['사고일시'])

# datetime을 여러 파생 변수로 변환
for df in [train_df, test_df]:
    df['year'] = df['사고일시'].dt.year
    df['month'] = df['사고일시'].dt.month
    df['day'] = df['사고일시'].dt.day
    df['hour'] = df['사고일시'].dt.hour
    df['minute'] = df['사고일시'].dt.minute
    df['weekday'] = df['사고일시'].dt.weekday

In [35]:
holi_weekday = ['2019-01-01', '2019-02-04', '2019-02-05', '2019-02-06', '2019-03-01', '2019-05-05', '2019-05-12', '2019-06-06', '2019-08-15', '2019-09-12', '2019-09-13', '2019-09-14', '2019-10-03', '2019-10-09', '2019-12-25',
                '2020-01-01' ,'2020-01-24' ,'2020-01-25', '2020-01-26', '2020-03-01', '2020-04-30', '2020-05-05', '2020-06-06', '2020-08-15', '2020-08-17', '2020-09-30', '2020-10-01', '2020-10-02', '2020-10-03', '2020-10-09', '2020-12-25',
                '2021-01-01' ,'2021-02-11' ,'2021-02-12', '2021-02-13', '2021-03-01', '2021-05-05', '2021-05-19', '2021-06-06', '2021-08-15', '2021-09-20', '2021-09-21', '2021-09-22', '2021-10-03', '2021-10-09', '2021-12-25',
                '2022-01-01' ,'2022-01-31' ,'2022-02-01', '2022-02-02', '2022-03-01', '2022-05-05', '2022-05-08', '2022-06-06', '2022-08-15', '2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12', '2022-10-03', '2022-10-09', '2020-10-10', '2022-12-25',
                '2023-01-01' ,'2023-01-21' ,'2023-01-22', '2023-01-23', '2023-01-24', '2023-03-01']

In [36]:
train_df['사고일시'] = pd.to_datetime(train_df['사고일시'])
train_df['day_of_week'] = train_df['사고일시'].dt.dayofweek
train_df['holiday'] = np.where((train_df.사고일시.dt.strftime('%Y-%m-%d').isin(holi_weekday)), 1, 0)
train_df['weekend'] = np.where((train_df.day_of_week >= 5), 1, 0)

test_df['사고일시'] = pd.to_datetime(test_df['사고일시'])
test_df['day_of_week'] = test_df['사고일시'].dt.dayofweek
test_df['holiday'] = np.where((test_df.사고일시.dt.strftime('%Y-%m-%d').isin(holi_weekday)), 1, 0)
test_df['weekend'] = np.where((test_df.day_of_week >= 5), 1, 0)

## 6. 계절, 코사인 시간 추가

In [37]:
def group_season(df):
    df.loc[(df['month'] == 3) | (df['month'] == 4) | (df['month'] == 5), 'season'] = '봄'
    df.loc[(df['month'] == 6) | (df['month'] == 7) | (df['month'] == 8), 'season'] = '여름'
    df.loc[(df['month'] == 9) | (df['month'] == 10) | (df['month'] == 11), 'season'] = '가을'
    df.loc[(df['month'] == 12) | (df['month'] == 1) | (df['month'] == 2), 'season'] = '겨울'
    return df['season']

train_df['Cosine_Time'] = np.cos(2 * np.pi * train_df['hour'] / 24)
test_df['Cosine_Time'] = np.cos(2 * np.pi * test_df['hour'] / 24)
train_df['season'] = group_season(train_df)
test_df['season'] = group_season(test_df)

## 7. 새벽 추가

In [38]:
def is_dawn(x):
    dawn_time = [0, 1, 2, 3, 4, 5]
    if x in dawn_time:
        return 1
    else:
        return 0

In [39]:
train_df['dawn'] = train_df['hour'].map(lambda x:is_dawn(x))
test_df['dawn'] = test_df['hour'].map(lambda x:is_dawn(x))

## 8. Target Encoding

In [40]:
# Target encoding
from category_encoders.target_encoder import TargetEncoder

categorical_features = ['ID','요일','도로형태','사고유형','동','기상상태', '노면상태','season']

for i in categorical_features:
    tr_encoder = TargetEncoder(cols=[i])
    train_df[i] = tr_encoder.fit_transform(train_df[i], train_df['ECLO'])
    test_df[i] = tr_encoder.transform(test_df[i])

In [41]:
train_df.corr(numeric_only=True)

Unnamed: 0,ID,요일,기상상태,도로형태,노면상태,사고유형,사망자수,중상자수,경상자수,부상자수,...,day,hour,minute,weekday,day_of_week,holiday,weekend,Cosine_Time,season,dawn
ID,1.0,0.061771,0.01475,0.086154,0.018617,0.137211,0.218507,0.46452,0.63637,-0.118713,...,-0.012303,-0.017355,,0.039407,0.039407,0.020762,0.0564,0.037052,0.013063,0.04685
요일,0.061771,1.0,0.005726,0.009456,0.003387,0.010758,0.010257,0.0166,0.048458,0.007813,...,-0.001398,-0.028288,,0.637956,0.637956,0.029704,0.91305,0.031918,0.010607,0.083368
기상상태,0.01475,0.005726,1.0,0.005441,0.811933,-0.019851,0.017421,0.018294,-0.004281,-0.003704,...,-0.012132,0.025395,,-0.007867,-0.007867,-0.00205,0.001084,0.099668,0.019916,0.040733
도로형태,0.086154,0.009456,0.005441,1.0,0.0193,0.238355,0.001049,0.005993,0.085933,0.011694,...,0.006277,-0.01481,,0.008161,0.008161,0.003234,0.006813,0.022999,-0.002123,0.024622
노면상태,0.018617,0.003387,0.811933,0.0193,1.0,-0.008089,0.008569,0.014281,0.005747,-0.002227,...,-0.017716,0.019702,,-0.013314,-0.013314,-0.001212,-0.002283,0.081766,0.016906,0.034035
사고유형,0.137211,0.010758,-0.019851,0.238355,-0.008089,1.0,-0.074449,-0.099021,0.251997,0.013058,...,-0.000257,0.009275,,0.009591,0.009591,-0.005532,0.011374,-0.042967,0.001176,-0.045544
사망자수,0.218507,0.010257,0.017421,0.001049,0.008569,-0.074449,1.0,-0.004368,-0.060607,-0.022771,...,-0.001296,-0.037692,,0.003101,0.003101,0.003162,0.009693,0.028351,-0.00183,0.057583
중상자수,0.46452,0.0166,0.018294,0.005993,0.014281,-0.099021,-0.004368,1.0,-0.325585,-0.105167,...,-0.000712,-0.038748,,0.010008,0.010008,0.001592,0.015315,0.022461,0.00802,0.042798
경상자수,0.63637,0.048458,-0.004281,0.085933,0.005747,0.251997,-0.060607,-0.325585,1.0,-0.165157,...,-0.012349,0.024487,,0.031849,0.031849,0.020084,0.043803,0.011654,0.007585,-0.004176
부상자수,-0.118713,0.007813,-0.003704,0.011694,-0.002227,0.013058,-0.022771,-0.105167,-0.165157,1.0,...,0.000619,0.006234,,0.009486,0.009486,-5.4e-05,0.008715,0.005969,0.002245,0.00918


## AutoML

In [21]:
test_x = test_df.drop(columns=['ID','year','month','day','hour','minute','day_of_week','시','사고일시']).copy()
train_x = train_df[test_x.columns].copy()
train_y = train_df['ECLO'].copy()

In [73]:
import matplotlib
matplotlib.use('Agg')

In [74]:
from supervised.automl import AutoML
automl = AutoML(mode="Compete",
                algorithms = ['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost'],
                n_jobs = -1,total_time_limit=43200, eval_metric="rmse", ml_task = "regression",random_state=0)



In [None]:
automl.fit(train_x, train_y)

In [None]:
pred = automl.predict(test_x)

In [None]:
submission['ECLO'] = pred