In [78]:
import os
from collections import namedtuple
import numpy as np
import pandas as pd

dirname = os.getcwd()

file_path_train = '../dataset_kor_OSX/traffic-accident-data.csv'
file_path_predict = '../test_kor_OSX.csv'

dropping_columns_train = [
    '발생년', '발생년월일시', '발생분', '사상자수',
    '사고유형', '법규위반_대분류', '당사자종별_1당', '당사자종별_2당',
    '발생위치X_UTMK', '발생위치Y_UTMK', '경도', '위도']
dropping_columns_predict = ['사상자수']
merging_columns_map = {
    '발생지': ['발생지시도', '발생지시군구'],
    '사고유형': ['사고유형_대분류', '사고유형_중분류'],
    '도로형태': ['도로형태_대분류', '도로형태'],
}
numerical_columns = ['사망자수', '중상자수', '경상자수', '부상신고자수']

def read_file(file_path):
    full_path = os.path.join(dirname, file_path)
    return pd.read_csv(full_path)

def drop(original, dropping_columns):
    return original.drop(dropping_columns, axis=1)

def merge(dropped):
    mergeds = [dropped.reindex(columns, axis='columns')
       .apply(lambda s: s.str.cat(sep='-'), axis=1)
       .replace('', np.nan)
       .rename(name)
     for name, columns in merging_columns_map.items()]
    merging_columns = [merging_column
        for merging_columns in merging_columns_map.values()
        for merging_column in merging_columns]
    not_merged = dropped.drop(merging_columns, axis='columns')
    return pd.concat(mergeds + [not_merged], axis=1)

def one_hot_encode(df, one_hot_encode_map):
    one_hot_encoded_columns = []
    for column in df:
        series = df.loc[:, column]        
        one_hot_encoded_column = pd.get_dummies(series).rename(columns=one_hot_encode_map[column])    
        one_hot_encoded_columns.append(one_hot_encoded_column)
    return pd.concat(one_hot_encoded_columns, axis=1, sort=False)

vectorize = np.vectorize(lambda value, columns: int(value in columns))

def one_hot_encode_for_predict(series, categorize_map, one_hot_encode_map):
    one_hot_encoded_columns = []
    for column, value in series.items():
        vector = vectorize(value, categorize_map[column])
        one_hot_encoded_column = pd.Series(
            dict(zip(categorize_map[column], vector))
        ).rename(one_hot_encode_map[column])
        one_hot_encoded_columns.append(one_hot_encoded_column)
    return pd.concat(one_hot_encoded_columns)

def categorize_column(series):
    cat = series.astype('category').cat
    return cat.codes, cat.categories

def filter_numerical(df):
    return df.reindex(df.columns[df.columns.isin(numerical_columns)], axis='columns')

def filter_categorical(df):
    return df.reindex(df.columns[~df.columns.isin(numerical_columns)], axis='columns')

def drop_index(series, dropping_columns):
    return series[~series.index.isin(dropping_columns)]

def filter_row_numerical(series):
    print(series.index)
    return series[series.index.isin(numerical_columns)]

def filter_row_categorical(series):
    return drop_index(series, numerical_columns)

def nan_columns(series):
    return series[series.isna()].index
    
Condition = namedtuple('Condition', ['data', 'target_columns'])

class Preprocessor(object):
    def __init__(self):
        self.numerical = pd.DataFrame()
        self.categorical = pd.DataFrame()
        self.categorized = pd.DataFrame()
        self.categorize_map = {}
        self.one_hot_encoded = pd.DataFrame()
        self.conditions = []
        
        self.preprocess()
        self.generate_conditions()

    def categorize(self):
        categorized_columns = []
        for column in self.categorical:
            categorize_coded, categorize_map = categorize_column(
                self.categorical.loc[:, column])
            categorized_columns.append(categorize_coded)
            self.categorize_map[column] = categorize_map
        self.categorized =  pd.concat(categorized_columns, axis=1, sort=False)
        
    def preprocess(self):
        original = read_file(file_path_train)
        dropped = drop(original, dropping_columns_train)
        merged = merge(dropped)
        self.numerical = filter_numerical(merged)
        self.categorical = filter_categorical(merged)
        self.categorize()
        self.one_hot_encoded = one_hot_encode(
                self.categorical, one_hot_encode_map=self.one_hot_encode_map)
        
    def generate_conditions(self):
        original = read_file(file_path_predict)
        dropped = drop(original, dropping_columns_predict)
        merged = merge(dropped)
        for index in merged.index:
            row = merged.loc[index]
            numerical = filter_row_numerical(row)
            categorical = filter_row_categorical(row)

            numerical_target_columns = nan_columns(numerical)
            categorical_target_columns = nan_columns(categorical)
            target_columns = numerical_target_columns.append(categorical_target_columns)

            numerical_dropped = drop_index(numerical, numerical_target_columns)
            categorical_dropped = drop_index(categorical, categorical_target_columns)
            one_hot_encoded = one_hot_encode_for_predict(
                categorical_dropped,
                categorize_map=self.categorize_map,
                one_hot_encode_map=self.one_hot_encode_map)
            
            data = numerical_dropped.append(one_hot_encoded)
            condition = Condition(data=data, target_columns=target_columns)
            self.conditions.append(condition)
        
    def preprocessed(self, target_column=None):
        if not target_column:
            X = pd.concat([self.numerical, self.one_hot_encoded], axis=1, sort=False)
            y = None
        elif target_column in self.numerical:
            X = pd.concat(
                    [self.numerical, self.categorized],
                    axis=1,
                    sort=False
                ).drop(target_column, axis=1)
            y = self.numerical.loc[:, target_column]
        else:
            X = pd.concat([self.numerical, self.one_hot_encoded], axis=1, sort=False)
            y = self.categorical.loc[:, target_column]
        return X, y
    
    @property
    def one_hot_encode_map(self):
        return dict((category, dict(
                            (value, '.'.join([category, value])) for value in values))
                    for category, values in self.categorize_map.items())        

    @property
    def reverse_one_hot_encode_map(self):
        return dict(('.'.join([column, value]), column)
                    for column, values in self.one_hot_encode_map.items()
                    for value in values)
    
    @property
    def category_vectorizing_map(self):
        return dict(
            (category, dict((v, k)
                            for k, v in enumerate(indices)))
            for category, indices in self.categorize_map.items())
    
    def category_vectorize(self, categorical_data):
        return pd.Series(
                dict((category, self.category_vectorizing_map[category][value])
                     for category, value in categorical_data.items())
            )
            
    def restore(self, series):
        numerical = filter_row_numerical(series)
        categorical = filter_row_categorical(series)
        one_hot_decoded = pd.Series(dict((self.reverse_one_hot_encode_map[k], (k.split('.')[1]))
               for k, v in categorical.items() if v))
        decoded = numerical.append(one_hot_decoded)

        not_merged = decoded[~decoded.index.isin(merging_columns_map)]
        merged = decoded[decoded.index.isin(merging_columns_map)]
        unmerged = pd.Series(dict((merging_columns_map[k][i], value)
                                  for k, merged_value in merged.items()
                                  for i, value in enumerate(merged_value.split('-'))))
        return not_merged.append(unmerged)

preprocessor = Preprocessor()

Index(['발생지', '사고유형', '도로형태', '주야', '요일', '사망자수', '중상자수', '경상자수', '부상신고자수',
       '법규위반', '당사자종별_1당_대분류', '당사자종별_2당_대분류'],
      dtype='object')
Index(['발생지', '사고유형', '도로형태', '주야', '요일', '사망자수', '중상자수', '경상자수', '부상신고자수',
       '법규위반', '당사자종별_1당_대분류', '당사자종별_2당_대분류'],
      dtype='object')
Index(['발생지', '사고유형', '도로형태', '주야', '요일', '사망자수', '중상자수', '경상자수', '부상신고자수',
       '법규위반', '당사자종별_1당_대분류', '당사자종별_2당_대분류'],
      dtype='object')
Index(['발생지', '사고유형', '도로형태', '주야', '요일', '사망자수', '중상자수', '경상자수', '부상신고자수',
       '법규위반', '당사자종별_1당_대분류', '당사자종별_2당_대분류'],
      dtype='object')
Index(['발생지', '사고유형', '도로형태', '주야', '요일', '사망자수', '중상자수', '경상자수', '부상신고자수',
       '법규위반', '당사자종별_1당_대분류', '당사자종별_2당_대분류'],
      dtype='object')
Index(['발생지', '사고유형', '도로형태', '주야', '요일', '사망자수', '중상자수', '경상자수', '부상신고자수',
       '법규위반', '당사자종별_1당_대분류', '당사자종별_2당_대분류'],
      dtype='object')
Index(['발생지', '사고유형', '도로형태', '주야', '요일', '사망자수', '중상자수', '경상자수', '부상신고자수',
       '법규위반', '당사자종별_1당_대분류', '당사자종별_2당_대분류'],


In [80]:
preprocessor.restore(preprocessor.conditions[3].data)
p = preprocessor.conditions[3]
p.data['중상자수'] = 5
preprocessor.restore(p)

Index(['사망자수', '경상자수', '발생지.강원-강릉시', '발생지.강원-고성군', '발생지.강원-동해시', '발생지.강원-삼척시',
       '발생지.강원-속초시', '발생지.강원-양구군', '발생지.강원-양양군', '발생지.강원-영월군',
       ...
       '당사자종별_2당_대분류.사륜오토바이(ATV)', '당사자종별_2당_대분류.승용차', '당사자종별_2당_대분류.승합차',
       '당사자종별_2당_대분류.없음', '당사자종별_2당_대분류.열차', '당사자종별_2당_대분류.원동기장치자전거',
       '당사자종별_2당_대분류.이륜차', '당사자종별_2당_대분류.자전거', '당사자종별_2당_대분류.특수차',
       '당사자종별_2당_대분류.화물차'],
      dtype='object', length=327)
<built-in method index of Condition object at 0x121884558>


AttributeError: 'builtin_function_or_method' object has no attribute 'isin'