In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    AdaBoostClassifier, AdaBoostRegressor)
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score

import import_ipynb
from preprocessor import preprocessor, numerical_columns

def split(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    return (X_train.values, X_test.values,
            y_train.values.reshape(-1), y_test.values.reshape(-1))

def generate_column_key(target_column, dropping_columns):
    return target_column, frozenset(dropping_columns)

def calc_numerical_score(y_pred, y_true):
    return np.mean(np.exp(-np.square(- (y_pred - y_true) / np.std(y_true))))

class Tree(object):
    def __init__(self):
        self.models = dict()
        
    def train(self, X, y, target_column, dropping_columns):
        column_key = generate_column_key(target_column, dropping_columns)
        if column_key not in self.models:
            self.models[column_key] = train_model(X, y)
    
    def predict(self, preprocessed, target_column, dropping_columns):
        column_key = generate_column_key(target_column, dropping_columns)
        return self.models[column_key].predict(preprocessed)    

In [None]:
n_splits = 3

def train_model(X, y, target_column):
    X_train, X_test, y_train, y_test = split(X, y)
    
    Model = (RandomForestRegressor
                 if target_column in numerical_columns
                 else RandomForestClassifier
            )
    score_method = (calc_numerical_score
       if target_column in numerical_columns
       else accuracy_score)
    
    avg_train_score = 0
    avg_val_score = 0
    kf = KFold(n_splits=n_splits, random_state=0)
    for train_index, val_index in kf.split(X_train):
        model = Model(random_state=0)
        X_train_fold = X_train[train_index]
        X_val_fold = X_train[val_index]
        y_train_fold = y_train[train_index]
        y_val_fold = y_train[val_index]
        model.fit(X_train_fold, y_train_fold)
        
        train_score = score_method(model.predict(X_train_fold), y_train_fold)
        val_score = score_method(model.predict(X_val_fold), y_val_fold)
        avg_train_score += train_score / n_splits
        avg_val_score += val_score / n_splits

    test_score = score_method(model.predict(X_test), y_test)

    model = Model()
    model.fit(X, y)
    return model, test_score, avg_val_score, avg_train_score

def baseline(X, y, target_column):
    baselines = {
        '사망자수': np.ones(y.shape),
        '경상자수': np.zeros(y.shape),
        '부상신고자수': np.zeros(y.shape),
        '중상자수': np.zeros(y.shape),
        '사고유형_대분류': np.full(y.shape, '차대차'),
        '사고유형_중분류': np.full(y.shape, '횡단중'),
        '법규위반': np.full(y.shape, '안전운전 의무 불이행'),
        '주야': np.full(y.shape, '야간'),
        '요일': np.full(y.shape, '금'),
        '발생지시도': np.full(y.shape, '경기'),
        '발생지시군구': np.full(y.shape, '서구'),
        '사고유형_대분류': np.full(y.shape, '차대차'),
        '사고유형_중분류': np.full(y.shape, '기타'),
        '도로형태_대분류': np.full(y.shape, '단일로'),
        '당사자종별_1당_대분류': np.full(y.shape, '승용차'),
        '당사자종별_2당_대분류': np.full(y.shape, '보행자'),
    }
    base = (baselines[target_column]
            if target_column in baselines else y)
    score_method = (calc_numerical_score 
                    if target_column in numerical_columns
                    else accuracy_score)
    return score_method(base, y)

condition = preprocessor.conditions[0]
target_column = condition.target_columns[0]
dropping_columns = condition.target_columns.drop(target_column)
X, y = preprocessor.preprocessed(target_column, dropping_columns)
# print(baseline(X, y, target_column))
# train_model(X, y, target_column)

for condition in preprocessor.conditions:
    target_columns = condition.target_columns
    for target_column in target_columns:
        dropping_columns = target_columns.drop(target_column)
        X, y = preprocessor.preprocessed(target_column, dropping_columns)
        model, test_score, avg_val_score, avg_train_score = train_model(X, y, target_column)
        base_score = baseline(X, y, target_column)
        print(target_column, list(dropping_columns), test_score, base_score)

사망자수 ['경상자수'] 0.9997402255255545 0.9668091268418382
경상자수 ['사망자수'] 0.9993622968726122 0.9113787339852367
사망자수 ['경상자수'] 0.9997402255255545 0.9668091268418382
경상자수 ['사망자수'] 0.9993622968726122 0.9113787339852367
중상자수 ['부상신고자수'] 0.9992579223248373 0.8762999464342794
부상신고자수 ['중상자수'] 0.999594630915079 0.9828530191957248
중상자수 ['부상신고자수'] 0.9992579223248373 0.8762999464342794
부상신고자수 ['중상자수'] 0.999594630915079 0.9828530191957248
중상자수 ['경상자수'] 0.9992123883370527 0.8762999464342794
경상자수 ['중상자수'] 0.9993328143999257 0.9113787339852367
중상자수 ['경상자수'] 0.9992123883370527 0.8762999464342794
경상자수 ['중상자수'] 0.9993328143999257 0.9113787339852367
중상자수 ['경상자수'] 0.9992123883370527 0.8762999464342794
경상자수 ['중상자수'] 0.9993328143999257 0.9113787339852367
사망자수 ['중상자수'] 0.9997702725071861 0.9668091268418382
중상자수 ['사망자수'] 0.9993413648773295 0.8762999464342794
사망자수 ['중상자수'] 0.9997702725071861 0.9668091268418382
중상자수 ['사망자수'] 0.9993413648773295 0.8762999464342794
사망자수 ['중상자수'] 0.9997702725071861 0.9668091268418382
중상자수 [

In [72]:
# for column in preprocessor.categorical:
#     print(preprocessor.loc[column])
for column in preprocessor.categorical:
    print(column, preprocessor.categorical.loc[:, column].value_counts().idxmax())

주야 야간
요일 금
발생지시도 경기
발생지시군구 서구
사고유형_대분류 차대차
사고유형_중분류 기타
법규위반 안전운전 의무 불이행
도로형태_대분류 단일로
도로형태 기타단일로
당사자종별_1당_대분류 승용차
당사자종별_2당_대분류 보행자
