In [1]:
import sys
import pandas as pd
import os
from sklearn.metrics import accuracy_score,precision_score,recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from feature_engine import imputation as mdi
from feature_engine import encoding as ce
import mean_median2 as mm
import warnings
warnings.filterwarnings('ignore')


def get_path():
    cur_path = os.getcwd()
    parent_path = os.path.dirname(cur_path)
    return cur_path, parent_path


def file_path(data_path, file):
    return os.path.abspath(os.path.join(data_path, f'{file}'))


def df_write(data_path, df, file):
    df = df.copy()
    df.to_csv(os.path.abspath(os.path.join(data_path, file)), index=False)


def split_train_test(df, configs):
    df = df.copy()
    X = df.drop(columns=configs['y_col'][0])
    y = df[configs['y_col'][0]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=configs['y_col'][0])
    return X_train, X_test, y_train, y_test


def model_selection(option='logic'):
    if option == 'light':
        return lgb.LGBMClassifier(random_state=0)
    else:
        return LogisticRegression(random_state=0)


def read_data(configs):
    if configs['date_col'][0] == ' ':
        df = pd.read_csv(configs['file_name'][0])
    else:
        df = pd.read_csv(configs['file_name'][0], parse_dates=configs['date_col'])

    if configs['remove_col'][0] == ' ':
        pass
    else:
        if configs['remove_col'][0] in df.columns.to_list():
            df = df.drop(configs['remove_col'][0], axis=1)

    return df


def y_label_enc(df, configs):
    df = df.copy()
    Y_col = configs['y_col'][0]
    if df[Y_col].isnull().any():
        Y_null = True
    else:
        Y_null = False
    labeler = LabelEncoder()
    df[Y_col] = labeler.fit_transform(df[Y_col])
    return df, Y_null


def organize_data(df, configs, y_null):
    df = df.copy()
    cols = df.columns.to_list()
    null_threshhold_cols = []
    no_null_cols = []
    date_time = configs['date_col']
    Y_col = configs['y_col'][0]

    for col in cols:
        null_mean = df[col].isnull().mean()
        if null_mean >= configs['null_threshhold'][0]:
            null_threshhold_cols.append(col)
        if null_mean == 0:
            no_null_cols.append(col)

    cols_stayed = [item for item in cols if item not in null_threshhold_cols]
    data = df[cols_stayed].copy()

    # numerical: discrete vs continuous
    discrete = [var for var in cols_stayed if
                data[var].dtype != 'O' and var != Y_col and var not in date_time and data[var].nunique() < 10]
    continuous = [var for var in cols_stayed if
                  data[var].dtype != 'O' and var != Y_col and var not in date_time and var not in discrete]

    # categorical
    categorical = [var for var in cols_stayed if data[var].dtype == 'O' and var != Y_col]

    print('There are {} date_time variables'.format(len(date_time)))
    print('There are {} discrete variables'.format(len(discrete)))
    print('There are {} continuous variables'.format(len(continuous)))
    print('There are {} categorical variables'.format(len(categorical)))

    if y_null:
        data = data[data[Y_col] != data[Y_col].max()].copy()
    else:
        data = data.copy()

    return data, discrete, continuous, categorical


def make_train_test(df, configs):
    df = df.copy()
    X = df.drop(columns=configs['y_col'][0])
    y = df[configs['y_col'][0]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=configs['test_size'][0], random_state=0, stratify=y)
    return X_train, X_test, y_train, y_test


def make_imputer_pipe(continuous, discrete, categorical, null_impute_type=None):
    numberImputer = continuous + discrete
    categoricalImputer = categorical
    
    if null_impute_type is None:
        pipe = []
    else:
        if (len(numberImputer) > 0) & (len(categoricalImputer) > 0):
            pipe = Pipeline([
                ("imputer",
                 mm.MeanMedianImputer2(
                     imputation_method=null_impute_type, variables=numberImputer),),
    
                ('imputer_cat',
                 mdi.CategoricalImputer(variables=categoricalImputer)),
    
                ('categorical_encoder',
                 ce.OrdinalEncoder(encoding_method='ordered',
                                   variables=categoricalImputer))
            ])
        else:
            if (len(numberImputer) > 0) & (len(categoricalImputer) == 0):
                pipe = Pipeline([
                    ("imputer",
                     mm.MeanMedianImputer2(
                         imputation_method=null_impute_type, variables=numberImputer),)
                ])
            else:
                if (len(numberImputer) == 0) & (len(categoricalImputer) > 0):
                    pipe = Pipeline([
                        ('imputer_cat',
                         mdi.CategoricalImputer(variables=categoricalImputer)),
    
                        ('categorical_encoder',
                         ce.OrdinalEncoder(encoding_method='ordered',
                                           variables=categoricalImputer))
                    ])
                else:
                    pipe = []
    return pipe


def do_imputation(df, configs, pipe):
    if pipe != []:
        df = df.copy()
        xtrain, xtest, y_train, y_test = make_train_test(df, configs)
        
        # pipe.fit(X_train, y_train)
        pipe.fit(xtrain)
        
        X_train = pipe.transform(xtrain)
        X_test = pipe.transform(xtest)

        X_train[configs['y_col'][0]] = y_train        
        X_train['split'] = 'train'
        X_test[configs['y_col'][0]] = y_test
        X_test['split'] = 'test'        
        return pd.concat([X_train, X_test]).reset_index(drop=True)
    else:
        print('no pipe applied')
        return df    


def do_train(X_train, X_test, y_train, y_test, option):
    X_train, X_test, y_train, y_test = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()
    model = model_selection(option)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics(y_test, y_pred, option)


def min_max_scale(df):
    df = df.copy()
    scaler = MinMaxScaler()
    scaler.fit(df)
    return scaler.transform(df)


def metrics(y_test, pred, option):
    y_test = y_test.copy()
    pred = pred.copy()
    accuracy = round(accuracy_score(y_test, pred), 2)
    precision = round(precision_score(y_test, pred), 2)
    recall = round(recall_score(y_test, pred), 2)
    f1 = round(f1_score(y_test, pred), 2)
    print(option, "f1 점수:", f1, "정확도:", accuracy, "정밀도:", precision, "재현율:", recall)
    print(confusion_matrix(y_test, pred))

In [2]:
# arv 예1: credit argumet_credit.xlsx
# arv 예2: metro argumet_metro.xlsx

try:
    folder_name = 'metro'
    config_file_name = 'argumet_metro.xlsx'
    cur_path = os.getcwd()
    parent = os.path.abspath(os.path.join(cur_path, os.pardir))
    config_file = os.path.join(parent, os.path.join('config', f'{config_file_name}'))
    configs = pd.read_excel(config_file, header=None).set_index(0).T
    configs = configs.to_dict('list')
    ori_file_name = configs['file_name'][0]
    configs['file_name'][0] = os.path.join(parent, os.path.join(f'data/{folder_name}', configs['file_name'][0]))
    df_initial = read_data(configs)

    # 전처리 저장 경로 정의
    dest_path = os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}'))
    dest_path = os.path.join(parent, os.path.join(f'{dest_path}/imputed', f'draft_{ori_file_name}.csv'))       

    # 오리지널 데이터셋 저장
#     df_initial.to_csv(dest_path, index=False)        

    # 1. Label 칼럼 인코딩   
    df, y_null = y_label_enc(df_initial, configs)

    # 2. discrete, continuous, categorical 구분작업
    df_organized, discrete, continuous, categorical = organize_data(df, configs, y_null)

    # null_impute_types 정의
    null_impute_types = ['median', 'mean', 'max', 'min']

    for null_impute_type in null_impute_types:        
        # 3. pipe 작업
        pipe = make_imputer_pipe(discrete, continuous, categorical, null_impute_type=null_impute_type)

        # 4. imputation with train/test split
        df_imputed = do_imputation(df_organized, configs, pipe)            

        # 5. 전처리 셋 저장    
        dest_path = os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}'))
        dest_path = os.path.join(parent, os.path.join(f'{dest_path}/imputed', f'imputed_{ori_file_name}_{null_impute_type}.csv'))
#         df_imputed.to_csv(dest_path, index=False)

    print('Completed.')

except Exception as e:
    exc_type, exc_obj, exc_tb = sys.exc_info()
    print('비정상종료', e)
    print(exc_type, exc_tb.tb_lineno)

There are 1 date_time variables
There are 8 discrete variables
There are 7 continuous variables
There are 0 categorical variables
Completed.


In [24]:
x = df_imputed.copy().reset_index()
x[con]

Unnamed: 0,index,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Pressure_switch,Oil_level,Caudal_impulses,y,split
734002,734002,-0.014,8.782,8.770,-0.032,8.782,58.700,0.0425,1,0,1,1,0,1,1,1,0,test
734003,734003,-0.012,9.738,9.722,-0.020,9.738,60.975,3.7375,1,0,1,1,0,1,1,1,0,test
734004,734004,8.600,9.452,-0.004,0.654,9.452,77.100,5.7575,0,1,1,0,0,1,1,1,0,test
734005,734005,10.170,9.794,-0.012,-0.026,9.788,70.450,6.0850,0,1,1,0,0,1,1,0,0,test
734006,734006,-0.012,9.384,9.368,-0.022,9.382,60.350,0.0425,1,0,1,1,0,1,1,1,0,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,1048570,-0.016,8.330,8.322,-0.024,8.332,52.275,0.0400,1,0,1,1,0,1,1,1,0,test
1048571,1048571,-0.010,8.196,8.182,-0.018,8.198,15.400,0.0375,1,0,1,1,0,1,1,1,0,test
1048572,1048572,-0.012,9.002,8.988,-0.018,9.002,62.850,0.0425,1,0,1,1,0,1,1,1,0,test
1048573,1048573,8.110,7.922,-0.008,1.922,7.924,75.000,5.5050,0,1,0,0,0,1,1,1,1,test


In [44]:
null_impute_types = ['median', 'mean', 'max', 'min']
tmp = df_imputed.copy().reset_index()

for null_impute_type in null_impute_types[0]:        

# 6. 스케일링 작업 및 저장
    Y_COL = configs['y_col'][0]
    # 6.1 X_train 스케일링
    con = df_imputed['split']=='train'                        
    X_train_scaled = min_max_scale(df_imputed[con].drop(columns=[Y_COL,'split']))
    X_train_scaled = pd.DataFrame(X_train_scaled)
    X_train_scaled[Y_COL] = df_imputed[con][Y_COL]
    X_train_scaled['split'] = df_imputed[con]['split']
    X_train_scaled.columns = df_imputed.columns

    # 6.2 X_test 스케일링
    con = df_imputed['split']=='test'                        
    X_test_scaled = min_max_scale(df_imputed[con].drop(columns=[Y_COL,'split']))
    X_test_scaled = pd.DataFrame(X_test_scaled)
    X_test_scaled['index'] =  tmp[con]['index'].values
    X_test_scaled = X_test_scaled.set_index('index')
    X_test_scaled[Y_COL] = df_imputed[con][Y_COL]
    X_test_scaled['split'] = df_imputed[con]['split']
    X_test_scaled.columns = df_imputed.columns
    X_test_scaled.index.name = None

#     # 6.2 X_test 스케일링
#     con = df_imputed['split']=='test'
#     X_test = df_imputed[con].reset_index(drop=True)
#     y_test = X_test[Y_COL].reset_index(drop=True)
#     X_test = X_test.drop(columns=[Y_COL, 'split'])                        
#     X_test_scaled = min_max_scale(X_test)

#     # 6.3 data frame으로 변환
#     xtrains = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
#     xtrains['split'] = 'train'
#     xtrains[Y_COL] = y_train
#     xtests = pd.DataFrame(data=X_test_scaled, columns=X_test.columns)
#     xtests['split'] = 'test'
#     xtests[Y_COL] = y_test
#     df_scaled = pd.concat([xtrains, xtests]).reset_index(drop=True)
#     # 6.4 scaling 저장
#     dest_path = os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}'))
#     dest_path = os.path.join(parent, os.path.join(f'{dest_path}/scaled', f'scaled_{ori_file_name}_{null_impute_type}.csv'))
#     df_scaled.to_csv(dest_path, index=False)




In [26]:
X_train_scaled

Unnamed: 0,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Pressure_switch,Oil_level,Caudal_impulses,y,split
0,0.789168,0.810868,0.002131,0.001823,0.811396,0.734588,0.598922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,train
1,0.001687,0.832772,0.846377,0.000810,0.833474,0.636028,0.002426,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,train
2,0.001687,0.801179,0.817513,0.000000,0.802145,0.589516,0.002426,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,train
3,0.763306,0.747894,0.002712,0.197448,0.748528,0.883352,0.598922,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1,train
4,0.002061,0.875105,0.884928,0.001418,0.875736,0.643780,0.001617,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733997,0.001499,0.818029,0.832817,0.001013,0.818755,0.580657,0.002156,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,train
733998,0.001874,0.951348,0.955250,0.001823,0.951640,0.725729,0.399191,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,train
733999,0.002061,0.886689,0.895971,0.001215,0.887511,0.637874,0.001887,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,train
734000,0.001499,0.855307,0.867106,0.001215,0.856602,0.590624,0.001887,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,train


In [45]:
X_test_scaled

Unnamed: 0,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Pressure_switch,Oil_level,Caudal_impulses,y,split
734002,0.001681,0.843318,0.854122,0.000000,0.843789,0.640770,0.002432,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,test
734003,0.001868,0.943444,0.946460,0.001498,0.943747,0.674436,0.401892,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,test
734004,0.806126,0.913490,0.003104,0.085622,0.913844,0.913060,0.620270,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0,test
734005,0.952746,0.949309,0.002328,0.000749,0.948975,0.814650,0.655676,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0,test
734006,0.001868,0.906368,0.912124,0.001248,0.906524,0.665187,0.002432,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,0.001494,0.795978,0.810669,0.000999,0.796738,0.545690,0.002162,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,test
1048571,0.002055,0.781944,0.797090,0.001747,0.782727,0.000000,0.001892,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,test
1048572,0.001868,0.866359,0.875267,0.001747,0.866792,0.702183,0.002432,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,test
1048573,0.760366,0.753247,0.002716,0.243884,0.754078,0.881983,0.592973,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1,test


In [46]:
pd.concat([X_train_scaled, X_test_scaled])

Unnamed: 0,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Pressure_switch,Oil_level,Caudal_impulses,y,split
0,0.789168,0.810868,0.002131,0.001823,0.811396,0.734588,0.598922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,train
1,0.001687,0.832772,0.846377,0.000810,0.833474,0.636028,0.002426,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,train
2,0.001687,0.801179,0.817513,0.000000,0.802145,0.589516,0.002426,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,train
3,0.763306,0.747894,0.002712,0.197448,0.748528,0.883352,0.598922,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1,train
4,0.002061,0.875105,0.884928,0.001418,0.875736,0.643780,0.001617,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,0.001494,0.795978,0.810669,0.000999,0.796738,0.545690,0.002162,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,test
1048571,0.002055,0.781944,0.797090,0.001747,0.782727,0.000000,0.001892,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,test
1048572,0.001868,0.866359,0.875267,0.001747,0.866792,0.702183,0.002432,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0,test
1048573,0.760366,0.753247,0.002716,0.243884,0.754078,0.881983,0.592973,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1,test


In [42]:
 (X_train_scaled.index.name)

NoneType