In [1]:
import numpy as np
import pandas as pd

import os
dirname = os.getcwd()

In [2]:
def read_file(file_path):
    full_path = os.path.join(dirname, file_path)
    return pd.read_csv(full_path)

training_file_path = '../dataset_kor_OSX/traffic-accident-data.csv'
train_original = read_file(training_file_path)
list(enumerate(train_original))

[(0, '발생년'),
 (1, '발생년월일시'),
 (2, '발생분'),
 (3, '주야'),
 (4, '요일'),
 (5, '사망자수'),
 (6, '사상자수'),
 (7, '중상자수'),
 (8, '경상자수'),
 (9, '부상신고자수'),
 (10, '발생지시도'),
 (11, '발생지시군구'),
 (12, '사고유형_대분류'),
 (13, '사고유형_중분류'),
 (14, '사고유형'),
 (15, '법규위반_대분류'),
 (16, '법규위반'),
 (17, '도로형태_대분류'),
 (18, '도로형태'),
 (19, '당사자종별_1당_대분류'),
 (20, '당사자종별_1당'),
 (21, '당사자종별_2당_대분류'),
 (22, '당사자종별_2당'),
 (23, '발생위치X_UTMK'),
 (24, '발생위치Y_UTMK'),
 (25, '경도'),
 (26, '위도')]

In [9]:
def one_hot_encode_column(series):
    df = pd.get_dummies(series)
    columns_map = dict(
        (value, '_'.join([series.name, value])) for value in df.columns
    )
    return df.rename(columns=columns_map)

def preprocess(df, dropping_indices, merging_indices, merging_names, numerical_columns):
    dropping_columns = df.columns[dropping_indices]
    merging_columns = [df.columns[indices] for indices in merging_indices]
    numerical_columns = df.columns[numerical_columns]
    
    dropped = df.drop(dropping_columns, axis=1)

    flattened_merging_columns = [column for columns in merging_columns for column in columns]
    not_merging_columns = list(set(dropped.columns) - set(flattened_merging_columns))
    not_merged = [dropped.loc[:, column] for column in not_merging_columns]
    merged = [dropped.loc[:, columns]
               .apply(lambda s: s.str.cat(sep='_'), axis=1)
               .rename(name)
             for columns, name in zip(merging_columns, merging_names)]
    merged_final = pd.concat(not_merged + merged, axis=1)

    categorical_columns = list(set(merged_final.columns) - set(numerical_columns))
    numerical = merged_final.loc[:, numerical_columns]
    categorical = merged_final.loc[:, categorical_columns]
    one_hot_encoded_columns = [one_hot_encode_column(merged_final.loc[:, column])
         for column in categorical_columns]
    one_hot_encoded = pd.concat(one_hot_encoded_columns, axis=1, sort=False)
    one_hot_encoded_column_map = dict(zip(
        categorical_columns,
        [one_hot_encoded_column.columns for one_hot_encoded_column in one_hot_encoded_columns]
    ))

    return numerical, categorical, one_hot_encoded, one_hot_encoded_column_map

dropping_indices = [0, 1, 2, 6, 20, 22, 23, 24, 25, 26]
merging_indices = [[10, 11], [12, 13, 14], [15, 16], [17, 18]]
merging_names = ['발생지시도', '사고유형', '법규위반', '도로형태']
numerical_columns = [5, 7, 8, 9]

numerical, categorical, one_hot_encoded, one_hot_encoded_column_map = preprocess(
    train_original, dropping_indices, merging_indices, merging_names, numerical_columns)

In [5]:
numerical

Unnamed: 0,사망자수,중상자수,경상자수,부상신고자수
0,1,0,0,0
1,1,2,1,0
2,1,0,0,0
3,1,0,0,0
4,1,1,0,0
5,1,0,0,0
6,1,2,0,0
7,1,0,0,0
8,1,0,0,0
9,1,0,1,0


In [6]:
categorical

Unnamed: 0,주야,당사자종별_1당_대분류,법규위반,요일,도로형태,사고유형,당사자종별_2당_대분류,발생지시도
0,야간,승용차,운전자법규위반_안전운전 의무 불이행,금,단일로_기타단일로,차대사람_횡단중_횡단중,보행자,경기_성남시
1,야간,승용차,운전자법규위반_안전운전 의무 불이행,금,단일로_기타단일로,차대차_추돌_추돌,승용차,전남_곡성군
2,야간,승용차,운전자법규위반_안전운전 의무 불이행,금,단일로_기타단일로,차대사람_차도통행중_차도통행중,보행자,충남_서산시
3,야간,승용차,운전자법규위반_안전운전 의무 불이행,금,교차로_교차로부근,차대차_측면충돌_측면충돌,이륜차,대구_서구
4,주간,이륜차,운전자법규위반_중앙선 침범,금,단일로_교량위,차대차_측면충돌_측면충돌,화물차,서울_영등포구
5,주간,이륜차,운전자법규위반_안전거리 미확보,금,단일로_기타단일로,차대차_측면충돌_측면충돌,승용차,광주_서구
6,주간,승용차,운전자법규위반_과속,금,교차로_교차로내,차대차_측면충돌_측면충돌,화물차,대구_달성군
7,주간,화물차,운전자법규위반_신호위반,금,단일로_기타단일로,차대사람_횡단중_횡단중,보행자,경기_용인시
8,주간,화물차,운전자법규위반_안전운전 의무 불이행,금,단일로_기타단일로,차대차_측면충돌_측면충돌,이륜차,충남_서산시
9,야간,화물차,운전자법규위반_안전운전 의무 불이행,금,단일로_기타단일로,차대차_기타_기타,승용차,전남_광양시


In [7]:
one_hot_encoded

Unnamed: 0,주야_야간,주야_주간,당사자종별_1당_대분류_개인형이동수단(PM),당사자종별_1당_대분류_건설기계,당사자종별_1당_대분류_농기계,당사자종별_1당_대분류_불명,당사자종별_1당_대분류_사륜오토바이(ATV),당사자종별_1당_대분류_승용차,당사자종별_1당_대분류_승합차,당사자종별_1당_대분류_원동기장치자전거,...,발생지시도_충북_보은군,발생지시도_충북_영동군,발생지시도_충북_옥천군,발생지시도_충북_음성군,발생지시도_충북_제천시,발생지시도_충북_증평군,발생지시도_충북_진천군,발생지시도_충북_청원군,발생지시도_충북_청주시,발생지시도_충북_충주시
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
one_hot_encoded_column_map

{'주야': Index(['주야_야간', '주야_주간'], dtype='object'),
 '당사자종별_1당_대분류': Index(['당사자종별_1당_대분류_개인형이동수단(PM)', '당사자종별_1당_대분류_건설기계', '당사자종별_1당_대분류_농기계',
        '당사자종별_1당_대분류_불명', '당사자종별_1당_대분류_사륜오토바이(ATV)', '당사자종별_1당_대분류_승용차',
        '당사자종별_1당_대분류_승합차', '당사자종별_1당_대분류_원동기장치자전거', '당사자종별_1당_대분류_이륜차',
        '당사자종별_1당_대분류_자전거', '당사자종별_1당_대분류_특수차', '당사자종별_1당_대분류_화물차'],
       dtype='object'),
 '법규위반': Index(['법규위반_보행자과실_보행자과실', '법규위반_운전자법규위반_과로', '법규위반_운전자법규위반_과속',
        '법규위반_운전자법규위반_교차로 통행방법 위반', '법규위반_운전자법규위반_기타(운전자법규위반)',
        '법규위반_운전자법규위반_보행자 보호의무 위반', '법규위반_운전자법규위반_부당한 회전',
        '법규위반_운전자법규위반_서행 및 일시정지위반', '법규위반_운전자법규위반_신호위반',
        '법규위반_운전자법규위반_안전거리 미확보', '법규위반_운전자법규위반_안전운전 의무 불이행',
        '법규위반_운전자법규위반_앞지르기 금지위반', '법규위반_운전자법규위반_앞지르기 방법위반',
        '법규위반_운전자법규위반_중앙선 침범', '법규위반_운전자법규위반_직진 및 우회전차의 통행방해',
        '법규위반_운전자법규위반_진로양보 의무 불이행', '법규위반_운전자법규위반_차로위반(진로변경 위반)',
        '법규위반_운전자법규위반_철길건널목 통과방법위반', '법규위반_운전자법규위반_통행우선 순위위반',
        '법규위반_정비불량_정비불량 제차의 운전금지위반'],
       dtyp