In [1]:
import import_ipynb
from preprocessor import numerical, one_hot_encoded, one_hot_encoded_column_map

import random
import numpy as np
import pandas as pd
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    AdaBoostClassifier, AdaBoostRegressor)
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import KFold

importing Jupyter notebook from preprocessor.ipynb


In [2]:
# Lookup table: integer index -> candidate target name.
# Names from `numerical` come first, followed by the keys of
# `one_hot_encoded_column_map`; pick a "target_index" from the keys shown below.
labels = {
    index: name
    for index, name in enumerate([*numerical, *one_hot_encoded_column_map])
}
labels

{0: '사망자수',
 1: '중상자수',
 2: '경상자수',
 3: '부상신고자수',
 4: '당사자종별_2당_대분류',
 5: '법규위반',
 6: '당사자종별_1당_대분류',
 7: '도로형태',
 8: '발생지시도',
 9: '요일',
 10: '사고유형',
 11: '주야'}

In [4]:
def is_numerical(target_index):
    """Tell whether the target chosen by ``target_index`` is a numerical one.

    Looks the index up in ``labels`` and reports whether the resulting name is
    contained in ``numerical``.
    """
    target_name = labels[target_index]
    return target_name in numerical

def _to_row_target(y_values):
    """Reduce a target array to exactly one value per row.

    A 1-D array, or a 2-D array with a single column, is returned as a flat
    vector of its raw values. A 2-D array with several columns is collapsed to
    the position of the largest entry in each row.
    """
    if y_values.ndim == 1:
        return y_values
    if y_values.shape[1] == 1:
        return y_values.reshape(-1)
    # Flattening an (n, k) block with reshape(-1) would yield n * k values that
    # no longer line up with the n feature rows. Assumes the k columns are 0/1
    # indicators with one active column per row (as the "one_hot_encoded" name
    # suggests) -- TODO confirm against the preprocessor notebook.
    return y_values.argmax(axis=1)


def split(data, target_index):
    """Split ``data`` into train/test features and per-row target vectors.

    The target is selected through ``labels[target_index]``. A numerical target
    is the single column of that name; any other target uses the columns listed
    for it in ``one_hot_encoded_column_map``. The target columns are removed
    from the feature matrix.

    Args:
        data: DataFrame containing both feature and target columns.
        target_index: Key into ``labels`` identifying the target.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays from an
        80/20 ``train_test_split`` with ``random_state=0``. ``y_train`` and
        ``y_test`` hold one entry per row: the raw value for a single-column
        target, or the index (within the target's column list) of the largest
        indicator for a multi-column target.
    """
    label = labels[target_index]
    target_columns = ([label] if is_numerical(target_index)
                      else one_hot_encoded_column_map[label])
    X = data.drop(target_columns, axis=1)
    y = data[target_columns]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    return (X_train.values, X_test.values,
            _to_row_target(y_train.values), _to_row_target(y_test.values))

def calc_numerical_score(y_pred, y_true):
    """Score numerical predictions with a Gaussian-shaped kernel in [0, 1].

    Each residual ``y_pred - y_true`` is divided by the standard deviation of
    ``y_true``, squared, and mapped through ``exp(-x)``; the mean over all
    samples is returned. A perfect prediction scores 1.

    Args:
        y_pred: Array-like of predicted values.
        y_true: Array-like of reference values, same shape as ``y_pred``.

    Returns:
        Mean kernel score as a NumPy float.
    """
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    residual = y_pred - y_true
    scale = np.std(y_true)
    if scale == 0:
        # A constant y_true would cause a division by zero (NaN for exact hits).
        # Use the limit of exp(-(r / s)^2) as s -> 0: 1 where the residual is
        # exactly zero, 0 everywhere else.
        return np.mean(residual == 0)
    return np.mean(np.exp(-np.square(residual / scale)))

In [6]:
def train(data, target_index, n_splits):
    """Cross-validate a random forest on the training part of ``data``.

    ``split`` provides the training features and targets. A
    ``RandomForestRegressor`` scored with ``calc_numerical_score`` is used for
    numerical targets; otherwise a ``RandomForestClassifier`` scored with
    ``accuracy_score`` is used. For every fold the model is refit and one line
    is printed: the score on the fold's training part followed by the score on
    its validation part.

    Args:
        data: DataFrame containing both feature and target columns.
        target_index: Key into ``labels`` identifying the target.
        n_splits: Number of K-fold splits.
    """
    # The held-out test portion is not used during cross-validation.
    X_train, _, y_train, _ = split(data, target_index)

    if is_numerical(target_index):
        model = RandomForestRegressor(random_state=0)
        score_method = calc_numerical_score
    else:
        model = RandomForestClassifier(random_state=0)
        score_method = accuracy_score

    # random_state only has an effect together with shuffle=True, and recent
    # scikit-learn versions raise ValueError when it is set without shuffling.
    # Leaving it out keeps the same contiguous, unshuffled folds.
    kf = KFold(n_splits=n_splits)
    for train_index, val_index in kf.split(X_train):
        X_train_fold = X_train[train_index]
        X_val_fold = X_train[val_index]
        y_train_fold = y_train[train_index]
        y_val_fold = y_train[val_index]
        model.fit(X_train_fold, y_train_fold)

        # Predictions go first: that is the argument order of
        # calc_numerical_score (accuracy_score is symmetric in its arguments).
        print(
            score_method(model.predict(X_train_fold), y_train_fold),
            score_method(model.predict(X_val_fold), y_val_fold)
        )

# Run configuration: which entry of `labels` to predict and how many CV folds.
target_index = 1
n_splits = 2
# Join the numerical and one-hot encoded frames column-wise into a single table.
data = pd.concat([numerical, one_hot_encoded], axis="columns", sort=False)
train(data, target_index, n_splits)

0.9464095478880815 0.8522973318173194
0.9460480752622382 0.843206072864385
