In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    AdaBoostClassifier, AdaBoostRegressor)
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score
from imblearn.over_sampling import RandomOverSampler

import import_ipynb
from preprocessor import preprocessor, numerical_columns

def split(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    return (X_train.values, X_test.values,
            y_train.values.reshape(-1), y_test.values.reshape(-1))

def generate_column_key(target_column, dropping_columns):
    return target_column, frozenset(dropping_columns)

def calc_numerical_score(y_pred, y_true):
    return np.mean(np.exp(-np.square(- (y_pred - y_true) / np.std(y_true))))

class Tree(object):
    def __init__(self):
        self.models = dict()
        
    def train(self, target_column, dropping_columns):
        column_key = generate_column_key(target_column, dropping_columns)
        if column_key not in self.models:
            print("Training target: ", target_column, " (dropped {})".format(dropping_columns))
            X, y = preprocessor.preprocessed(target_column, dropping_columns)
            model, test_score = select_model(X, y, target_column)
            self.models[column_key] = model
            base_score = baseline(target_column, dropping_columns)
            print("    Test score: {}, (base score: {})".format(test_score, base_score))
    
    def predict(self, preprocessed, target_column, dropping_columns):
        column_key = generate_column_key(target_column, dropping_columns)
        return self.models[column_key].predict(preprocessed), score, base_score
    
def baseline(target_column, dropping_columns):
    X, y = preprocessor.preprocessed(target_column, dropping_columns)
    baselines = {
        '사망자수': np.ones(y.shape),
        '경상자수': np.zeros(y.shape),
        '부상신고자수': np.zeros(y.shape),
        '중상자수': np.zeros(y.shape),
        '사고유형_대분류': np.full(y.shape, '차대차'),
        '사고유형_중분류': np.full(y.shape, '횡단중'),
        '법규위반': np.full(y.shape, '안전운전 의무 불이행'),
        '주야': np.full(y.shape, '야간'),
        '요일': np.full(y.shape, '금'),
        '발생지시도': np.full(y.shape, '경기'),
        '발생지시군구': np.full(y.shape, '서구'),
        '사고유형_대분류': np.full(y.shape, '차대차'),
        '사고유형_중분류': np.full(y.shape, '기타'),
        '도로형태_대분류': np.full(y.shape, '단일로'),
        '당사자종별_1당_대분류': np.full(y.shape, '승용차'),
        '당사자종별_2당_대분류': np.full(y.shape, '보행자'),
    }
    base = (baselines[target_column]
            if target_column in baselines else y)
    score_method = (calc_numerical_score 
                    if target_column in numerical_columns
                    else accuracy_score)
    return score_method(base, y)

importing Jupyter notebook from preprocessor.ipynb


In [75]:
n_splits = 3

def select_model(X, y, target_column):
    X_train, X_test, y_train, y_test = split(X, y)
    Model = (lambda **kwargs: RandomForestRegressor(random_state=0, **kwargs)
                 if target_column in numerical_columns
                 else RandomForestClassifier(random_state=0, **kwargs))
    score_method = (calc_numerical_score
       if target_column in numerical_columns
       else accuracy_score)

    optimal_max_depth = 0
    opmimal_max_score = 0
    for max_depth in range(3, 4):
        val_score = validate(
            X_train, y_train, Model,
            max_depth, score_method)
        if val_score > optimal_max_depth:
            optimal_max_score = val_score
            optimal_max_depth = max_depth

    model = Model(max_depth=optimal_max_depth).fit(X, y)
    test_score = score_method(model.predict(X_test), y_test)
    return model, test_score

def calc_weights(y):
    y = pd.Series(y)
    weight_dict = dict(1 / y.value_counts(normalize=True))
    return y.apply(lambda x: weight_dict[x])

def validate(X_train, y_train, Model, max_depth, score_method):
    avg_val_score = 0
    kf = KFold(n_splits=n_splits)
    
    ros = RandomOverSampler(random_state=0)
    X_train, y_train = ros.fit_sample(X_train, y_train)
    
    for train_index, val_index in kf.split(X_train):
        X_train_fold = X_train[train_index]
        X_val_fold = X_train[val_index]
        y_train_fold = y_train[train_index]
        y_val_fold = y_train[val_index]
        
        val_score = score_method(Model(max_depth=max_depth)
                    .fit(X_train_fold, y_train_fold)
                    .predict(X_val_fold), y_val_fold)
        avg_val_score += val_score / n_splits
    return avg_val_score

In [73]:
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_sample(X, y)
from collections import Counter
print(sorted(Counter(y_resampled).items()))

[(1, 24206), (2, 24206), (3, 24206), (4, 24206), (5, 24206), (6, 24206), (8, 24206), (10, 24206)]


In [77]:
condition = preprocessor.conditions[20]
target_column = condition.target_columns[0]
dropping_columns = condition.target_columns.drop(target_column)
X, y = preprocessor.preprocessed(target_column, dropping_columns)
base_score = baseline(target_column, dropping_columns)
model, test_score = select_model(X, y, target_column)
print("Test score: {}, (base_score: {})".format(test_score, base_score))

Test score: 0.71685303514377, (base_score: 0.6688501018492631)


In [80]:
# Most common values for categorical columns
# for column in preprocessor.categorical:
#     print(column, preprocessor.categorical.loc[:, column].value_counts().idxmax())

주야 야간
요일 금
발생지시도 경기
발생지시군구 서구
사고유형_대분류 차대차
사고유형_중분류 기타
법규위반 안전운전 의무 불이행
도로형태_대분류 단일로
도로형태 기타단일로
당사자종별_1당_대분류 승용차
당사자종별_2당_대분류 보행자
