In [1]:
import sys
import pandas as pd
import os
from sklearn.metrics import accuracy_score,precision_score,recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from feature_engine import imputation as mdi
from feature_engine import encoding as ce
import warnings
warnings.filterwarnings('ignore')


def get_path():
    cur_path = os.getcwd()
    parent_path = os.path.dirname(cur_path)
    return cur_path, parent_path


def file_path(data_path, file):
    return os.path.abspath(os.path.join(data_path, f'{file}'))


def df_write(data_path, df, file):
    df = df.copy()
    df.to_csv(os.path.abspath(os.path.join(data_path, file)), index=False)


def split_train_test(df, configs):
    df = df.copy()
    X = df.drop(columns=configs['y_col'][0])
    y = df[configs['y_col'][0]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=configs['y_col'][0])
    return X_train, X_test, y_train, y_test


def model_selection(option='logic'):
    if option == 'light':
        return lgb.LGBMClassifier(random_state=0)
    else:
        return LogisticRegression(random_state=0)


def read_data(configs):
    if configs['date_col'][0] == ' ':
        df = pd.read_csv(configs['file_name'][0])
    else:
        df = pd.read_csv(configs['file_name'][0], parse_dates=configs['date_col'])

    if configs['remove_col'][0] == ' ':
        pass
    else:
        if configs['remove_col'][0] in df.columns.to_list():
            df = df.drop(configs['remove_col'][0], axis=1)

    return df


def y_label_enc(df, configs):
    df = df.copy()
    Y_col = configs['y_col'][0]
    if df[Y_col].isnull().any():
        Y_null = True
    else:
        Y_null = False
    labeler = LabelEncoder()
    df[Y_col] = labeler.fit_transform(df[Y_col])
    return df, Y_null


def organize_data(df, configs, y_null):
    df = df.copy()
    cols = df.columns.to_list()
    null_threshhold_cols = []
    no_null_cols = []
    date_time = configs['date_col']
    Y_col = configs['y_col'][0]

    for col in cols:
        null_mean = df[col].isnull().mean()
        if null_mean >= configs['null_threshhold'][0]:
            null_threshhold_cols.append(col)
        if null_mean == 0:
            no_null_cols.append(col)

    cols_stayed = [item for item in cols if item not in null_threshhold_cols]
    data = df[cols_stayed].copy()

    # numerical: discrete vs continuous
    discrete = [var for var in cols_stayed if
                data[var].dtype != 'O' and var != Y_col and var not in date_time and data[var].nunique() < 10]
    continuous = [var for var in cols_stayed if
                  data[var].dtype != 'O' and var != Y_col and var not in date_time and var not in discrete]

    # categorical
    categorical = [var for var in cols_stayed if data[var].dtype == 'O' and var != Y_col]

    print('There are {} date_time variables'.format(len(date_time)))
    print('There are {} discrete variables'.format(len(discrete)))
    print('There are {} continuous variables'.format(len(continuous)))
    print('There are {} categorical variables'.format(len(categorical)))

    if y_null:
        data = data[data[Y_col] != data[Y_col].max()].copy()
    else:
        data = data.copy()

    return data, discrete, continuous, categorical


def split_train_test(df, configs):
    df = df.copy()
    X = df.drop(columns=configs['y_col'][0])
    y = df[configs['y_col'][0]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=configs['test_size'][0], random_state=0, stratify=y)
    return X_train, X_test, y_train, y_test


def make_imputer_pipe(continuous, discrete, categorical):
    numberImputer = continuous + discrete
    categoricalImputer = categorical

    if (len(numberImputer) > 0) & (len(categoricalImputer) > 0):
        pipe = Pipeline([
            ("median_imputer",
             mdi.MeanMedianImputer(
                 imputation_method="median", variables=numberImputer),),

            ('imputer_cat',
             mdi.CategoricalImputer(variables=categoricalImputer)),

            ('categorical_encoder',
             ce.OrdinalEncoder(encoding_method='ordered',
                               variables=categoricalImputer))
        ])
    else:
        if (len(numberImputer) > 0) & (len(categoricalImputer) == 0):
            pipe = Pipeline([
                ("median_imputer",
                 mdi.MeanMedianImputer(
                     imputation_method="median", variables=numberImputer),)
            ])
        else:
            if (len(numberImputer) == 0) & (len(categoricalImputer) > 0):
                pipe = Pipeline([
                    ('imputer_cat',
                     mdi.CategoricalImputer(variables=categoricalImputer)),

                    ('categorical_encoder',
                     ce.OrdinalEncoder(encoding_method='ordered',
                                       variables=categoricalImputer))
                ])
            else:
                pipe = []
    return pipe


def do_imputation(X_train, X_test, y_train, y_test, pipe):
    X_train, X_test, y_train, y_test = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()
    if pipe != []:
        pipe.fit(X_train, y_train)
        X_train = pipe.transform(X_train)
        X_test = pipe.transform(X_test)
    else:
        print('no pipe applied')
    return X_train, X_test, y_train, y_test


def do_train(X_train, X_test, y_train, y_test, option):
    X_train, X_test, y_train, y_test = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()
    model = model_selection(option)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics(y_test, y_pred, option)


def min_max_scale(df):
    df = df.copy()
    scaler = MinMaxScaler()
    scaler.fit(df)
    return scaler.transform(df)


def metrics(y_test, pred, option):
    y_test = y_test.copy()
    pred = pred.copy()
    accuracy = round(accuracy_score(y_test, pred), 2)
    precision = round(precision_score(y_test, pred), 2)
    recall = round(recall_score(y_test, pred), 2)
    f1 = round(f1_score(y_test, pred), 2)
    print(option, "f1 점수:", f1, "정확도:", accuracy, "정밀도:", precision, "재현율:", recall)
    print(confusion_matrix(y_test, pred))



In [2]:

    # arv 예1: credit argumet_credit.xlsx
    # arv 예2: metro argumet_metro.xlsx

# folder_name = sys.argv[1]
# config_file_name = sys.argv[2]

folder_name = 'credit'
config_file_name = 'argumet_credit.xlsx'


cur_path = os.getcwd()
parent = os.path.abspath(os.path.join(cur_path, os.pardir))
config_file = os.path.join(parent, os.path.join('config', f'{config_file_name}'))
configs = pd.read_excel(config_file, header=None).set_index(0).T
configs = configs.to_dict('list')
ori_file_name = configs['file_name'][0]
configs['file_name'][0] = os.path.join(parent, os.path.join('data', configs['file_name'][0]))
df_initial = read_data(configs)

df, y_null = y_label_enc(df_initial, configs)
df_organized, discrete, continuous, categorical = organize_data(df, configs, y_null)
X_train, X_test, y_train, y_test = split_train_test(df_organized, configs)
pipe = make_imputer_pipe(discrete, continuous, categorical)
X_train, X_test, y_train, y_test = do_imputation(X_train, X_test, y_train, y_test, pipe)

# X_train_scaled = min_max_scale(X_train)
# X_test_scaled = min_max_scale(X_test)
# xtrains = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
# xtests = pd.DataFrame(data=X_test_scaled, columns=X_test.columns)

There are 1 date_time variables
There are 0 discrete variables
There are 29 continuous variables
There are 0 categorical variables


In [39]:
configs

{'file_name': ['C:\\Users\\jh\\0py_dev\\digitalship\\data\\creditcard.csv'],
 'y_col': ['Class'],
 'date_col': [' '],
 'remove_col': ['Time'],
 'null_threshhold': [0.3],
 'fold': [1.5],
 'test_size': [0.3],
 'drop': [False]}

In [35]:
import numpy as np
corr_mat = df_organized.corr(method='pearson')
upper_corr_mat = corr_mat.where(
    np.triu(np.ones(corr_mat.shape), k=1).astype(np.bool))
  
# Convert to 1-D series and drop Null values
unique_corr_pairs = upper_corr_mat.unstack().dropna()
  
# Sort correlation pairs
sorted_mat = unique_corr_pairs.sort_values()
df_corr = pd.DataFrame(sorted_mat).reset_index()
df_corr = df_corr[df_corr['level_0']=='Class']
df_corr[1] = abs(df_corr[0])
df_corr = df_corr.sort_values(by=[1], ascending=False)
# df_corr
list(df_corr[df_corr['level_0']=='Class'].iloc[:1].values[0])

['Class', 'V17', -0.3264810676503034, 0.3264810676503034]

In [13]:
def get_outlier(df=None, column=None, weight=1.5):
    # fraud에 해당하는 column 데이터만 추출, 1/4 분위와 3/4 분위 지점을 np.percentile로 구함. 
    fraud = df[df['Class']==1][column]
    quantile_25 = np.percentile(fraud.values, 25)
    quantile_75 = np.percentile(fraud.values, 75)
    # IQR을 구하고, IQR에 1.5를 곱하여 최대값과 최소값 지점 구함. 
    iqr = quantile_75 - quantile_25
    iqr_weight = iqr * weight
    lowest_val = quantile_25 - iqr_weight
    highest_val = quantile_75 + iqr_weight
    # 최대값 보다 크거나, 최소값 보다 작은 값을 아웃라이어로 설정하고 DataFrame index 반환. 
    outlier_index = fraud[(fraud < lowest_val) | (fraud > highest_val)].index
    return outlier_index
def get_preprocessed_df(df):
    df_copy = df.copy()
    amount_n = np.log1p(df_copy['Amount'])
    df_copy.insert(0, 'Amount_Scaled', amount_n)
    df_copy.drop(['Amount'], axis=1, inplace=True)
    # 이상치 데이터 삭제하는 로직 추가
    outlier_index = get_outlier(df=df_copy, column='V14', weight=1.5)
    df_copy.drop(outlier_index, axis=0, inplace=True)
    return df_copy


cur_path = os.getcwd()
parent = os.path.abspath(os.path.join(cur_path, os.pardir))
config_file = os.path.join(parent, os.path.join('config', f'{config_file_name}'))
configs = pd.read_excel(config_file, header=None).set_index(0).T
configs = configs.to_dict('list')
ori_file_name = configs['file_name'][0]
configs['file_name'][0] = os.path.join(parent, os.path.join('data', configs['file_name'][0]))
df_initial = read_data(configs)
df_initial = df_initial.drop(0)
df_initial



X_train, X_test, y_train, y_test = split_train_test(df_initial, configs)
do_train(X_train, X_test, y_train, y_test,'logic')
do_train(X_train, X_test, y_train, y_test,'light')

logic f1 점수: 0.71 정확도: 1.0 정밀도: 0.87 재현율: 0.59
[[85282    13]
 [   60    88]]
light f1 점수: 0.06 정확도: 0.97 정밀도: 0.03 재현율: 0.65
[[82460  2835]
 [   52    96]]


In [14]:
df_processed=get_preprocessed_df(df_initial)

In [12]:
X_train, X_test, y_train, y_test = split_train_test(df_processed, configs)
do_train(X_train, X_test, y_train, y_test,'logic')
do_train(X_train, X_test, y_train, y_test,'light')

logic f1 점수: 0.76 정확도: 1.0 정밀도: 0.88 재현율: 0.67
[[85281    14]
 [   48    98]]
light f1 점수: 0.32 정확도: 1.0 정밀도: 0.22 재현율: 0.55
[[85014   281]
 [   66    80]]


In [15]:
X_train, X_test, y_train, y_test = split_train_test(df_processed, configs)
do_train(X_train, X_test, y_train, y_test,'logic')
do_train(X_train, X_test, y_train, y_test,'light')

logic f1 점수: 0.76 정확도: 1.0 정밀도: 0.88 재현율: 0.67
[[85281    14]
 [   48    98]]
light f1 점수: 0.32 정확도: 1.0 정밀도: 0.22 재현율: 0.55
[[85014   281]
 [   66    80]]


In [2]:
def get_path():
    cur_path = os.getcwd()
    parent_path = os.path.dirname(cur_path)
    return cur_path, parent_path

def file_path(data_path, file):
    return os.path.abspath(os.path.join(data_path, f'{file}'))

def df_write(data_path, df, file):
    df = df.copy()
    df.to_csv(os.path.abspath(os.path.join(data_path, file)), index=False)

def write_processed(train, test, naming, drop):
    _, parent_path = get_path()
    df = train.copy()
    del train
    df[Y_col] = test.to_list()
    del test
    df.to_csv(f'{parent_path}/result_data/{naming}.csv', index=drop)

def split_train_test(df, configs):
    df = df.copy()
    X = df.drop(columns=configs['y_col'][0])
    y = df[configs['y_col'][0]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=configs['y_col'][0])    
    return X_train, X_test, y_train, y_test

def model_selection(option='logic'):
    if option == 'light':
        return lgb.LGBMClassifier(random_state=0)
    elif option == 'boost':
        return GradientBoostingClassifier(random_state=0)
    else:
        return LogisticRegression(random_state=0)
    
def read_data(configs):
    if configs['date_col'][0] ==' ':
        df = pd.read_csv(configs['file_name'][0])
    else:
        df = pd.read_csv(configs['file_name'][0], parse_dates=configs['date_col'])

    if configs['remove_col'][0] == ' ':
        pass        
    else:
        if configs['remove_col'][0] in df.columns.to_list():
            df = df.drop(configs['remove_col'][0], axis=1)

    return df

def y_label_enc(df, configs):
    df = df.copy()
    Y_col = configs['y_col'][0]
    if df[Y_col].isnull().any():
        Y_null = True
    else:
        Y_null = False
    labeler = LabelEncoder()
    df[Y_col] = labeler.fit_transform(df[Y_col])
    return df, Y_null


def organize_data(df, configs, y_null):
    df = df.copy()
    cols = df.columns.to_list()
    null_threshhold_cols = []
    no_null_cols = []
    date_time = configs['date_col']
    Y_col = configs['y_col'][0]
    
#     if configs['remove_col'][0] == ' ':
#         pass
#     else:
#         df = df.drop(columns=configs['remove_col'][0])
    
    for col in cols:
        null_mean = df[col].isnull().mean()
        if null_mean >= configs['null_threshhold'][0]:
            null_threshhold_cols.append(col)
        if null_mean == 0:
            no_null_cols.append(col)

    cols_stayed = [item for item in cols if item not in null_threshhold_cols]
    data = df[cols_stayed].copy()

    # numerical: discrete vs continuous
    discrete = [var for var in cols_stayed if
                data[var].dtype != 'O' and var != Y_col and var not in date_time and data[var].nunique() < 10]
    continuous = [var for var in cols_stayed if
                  data[var].dtype != 'O' and var != Y_col and var not in date_time and var not in discrete]

    # categorical
    categorical = [var for var in cols_stayed if data[var].dtype == 'O' and var != Y_col]

    print('There are {} date_time variables'.format(len(date_time)))
    print('There are {} discrete variables'.format(len(discrete)))
    print('There are {} continuous variables'.format(len(continuous)))
    print('There are {} categorical variables'.format(len(categorical)))

    if y_null:
        data = data[data[Y_col] != data[Y_col].max()].copy()
    else:
        data = data.copy()

    return data, discrete, continuous, categorical

def split_train_test(df, configs):
    df = df.copy()
    X = df.drop(columns=configs['y_col'][0])
    y = df[configs['y_col'][0]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=configs['test_size'][0], random_state=0, stratify=y)  
    return X_train, X_test, y_train, y_test

def make_imputer_pipe(continuous, discrete, categorical):
#     numberImputer = [item for item in continuous + discrete if item not in no_null_cols]
#     categoricalImputer = [item for item in categorical if item not in no_null_cols]
    numberImputer = continuous + discrete
    categoricalImputer = categorical

    if (len(numberImputer)>0) & (len(categoricalImputer)>0):
        pipe = Pipeline([
                ("median_imputer",
                 mdi.MeanMedianImputer(
                     imputation_method="median", variables=numberImputer),),

                ('imputer_cat',
                 mdi.CategoricalImputer(variables=categoricalImputer)),

                ('categorical_encoder',
                 ce.OrdinalEncoder(encoding_method='ordered',
                                   variables=categoricalImputer))
            ])
    else:
        if (len(numberImputer)>0) & (len(categoricalImputer)==0):
            pipe = Pipeline([
                ("median_imputer",
                 mdi.MeanMedianImputer(
                     imputation_method="median", variables=numberImputer),)
            ])
        else:
            if (len(numberImputer)==0) & (len(categoricalImputer)>0):
                pipe = Pipeline([
                    ('imputer_cat',
                     mdi.CategoricalImputer(variables=categoricalImputer)),

                    ('categorical_encoder',
                     ce.OrdinalEncoder(encoding_method='ordered',
                                       variables=categoricalImputer))
                ])
            else:
                pipe = []
    return pipe

def do_imputation(X_train, X_test, y_train, y_test, pipe):
    X_train, X_test, y_train, y_test = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()
    if pipe != []:        
        pipe.fit(X_train, y_train)
        X_train = pipe.transform(X_train)
        X_test = pipe.transform(X_test)
    else:
        print ('no pipe')
    return X_train, X_test, y_train, y_test

def do_train(X_train, X_test, y_train, y_test, option):
    X_train, X_test, y_train, y_test = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()
    model = model_selection(option)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics(y_test, y_pred, option)
    # write_processed(X_test_t, y_test, 'X_test_transform', drop)
    # write_processed(X_train_t, y_train, 'X_train_transform', drop)
                
def min_max_scale(df):
    df = df.copy()    
    scaler = MinMaxScaler()
    scaler.fit(df)
    return scaler.transform(df)
    
def metrics(y_test,pred, option):
    y_test = y_test.copy()
    pred = pred.copy()
    accuracy = round(accuracy_score(y_test, pred),2)
    precision = round(precision_score(y_test, pred),2)
    recall = round(recall_score(y_test, pred),2)
    f1 = round(f1_score(y_test, pred),2)
    print(option, "f1 점수:", f1, "정확도:", accuracy, "정밀도:", precision, "재현율:", recall)
    print(confusion_matrix(y_test,pred))

In [3]:
# df = pd.read_csv(r"C:\Users\jh\0py_dev\digitalship\data\MetroPT3(AirCompressor).csv")
# df = df.append(df.iloc[0])
# import numpy as np
# df.iloc[-1,1] = np.nan
# df.to_csv(r"C:\Users\jh\0py_dev\digitalship\data\MetroPT3(AirCompressor)2.csv", index=False)

In [4]:
config_file_name = 'argumet_metro.xlsx'
# config_file_name = sys.argv[1]
cur_path = os.getcwd()
parent = os.path.abspath(os.path.join(cur_path, os.pardir))
config_file = os.path.join(parent, os.path.join('config', f'{config_file_name}'))
configs = pd.read_excel(config_file, header=None).set_index(0).T
configs = configs.to_dict('list')
configs['file_name'][0] = os.path.join(parent,os.path.join('data',configs['file_name'][0]))
df = read_data(configs)
df

Unnamed: 0,TP2,TP3,H1,DV_pressure,Reservoirs,Oil_temperature,Motor_current,COMP,DV_eletric,Towers,MPG,LPS,Pressure_switch,Oil_level,Caudal_impulses,y
0,-0.012,9.358,9.340,-0.024,9.358,53.600,0.0400,1,0,1,1,0,1,1,1,0
1,-0.014,9.348,9.332,-0.022,9.348,53.675,0.0400,1,0,1,1,0,1,1,1,0
2,-0.012,9.338,9.322,-0.022,9.338,53.600,0.0425,1,0,1,1,0,1,1,1,0
3,-0.012,9.328,9.312,-0.022,9.328,53.425,0.0400,1,0,1,1,0,1,1,1,0
4,-0.012,9.318,9.302,-0.022,9.318,53.475,0.0400,1,0,1,1,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,-0.010,9.382,9.370,-0.018,9.384,68.300,3.7425,1,0,1,1,0,1,1,1,0
1048571,-0.010,9.366,9.356,-0.018,9.368,68.050,3.7625,1,0,1,1,0,1,1,1,0
1048572,-0.012,9.356,9.346,-0.018,9.358,67.825,3.6550,1,0,1,1,0,1,1,1,0
1048573,-0.012,9.344,9.332,-0.018,9.344,67.650,3.7600,1,0,1,1,0,1,1,1,0


#### initial score before pre_processing

In [5]:
df = read_data(configs)
X_train, X_test, y_train, y_test = split_train_test(df, configs)
do_train(X_train, X_test, y_train, y_test,'logic')

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

#### second trail for f1 score after pre_processing with Logistic Regression

In [5]:
df = read_data(configs)
df, y_null = y_label_enc(df, configs)
df, discrete, continuous, categorical = organize_data(df, configs, y_null)
X_train, X_test, y_train, y_test = split_train_test(df, configs)
pipe = make_imputer_pipe(discrete, continuous, categorical)
X_train, X_test, y_train, y_test = do_imputation(X_train, X_test, y_train, y_test, pipe)
do_train(X_train, X_test, y_train, y_test,'logic')
do_train(X_train, X_test, y_train, y_test,'light')

There are 1 date_time variables
There are 8 discrete variables
There are 7 continuous variables
There are 0 categorical variables
logic f1 점수: 0.62 정확도: 0.97 정밀도: 0.64 재현율: 0.6
[[299708   3795]
 [  4426   6644]]
light f1 점수: 0.87 정확도: 0.99 정밀도: 0.98 재현율: 0.78
[[303293    210]
 [  2458   8612]]


#### third trail for f1 score with extra pre_processing with scaling

In [12]:
df = read_data(configs)
df, y_null = y_label_enc(df, configs)
df, discrete, continuous, categorical = organize_data(df, configs, y_null)
X_train, X_test, y_train, y_test = split_train_test(df, configs)
pipe = make_imputer_pipe(discrete, continuous, categorical)
X_train, X_test, y_train, y_test = do_imputation(X_train, X_test, y_train, y_test, pipe)

X_train_scaled = min_max_scale(X_train)
X_test_scaled = min_max_scale(X_test)
do_train(X_train_scaled, X_test_scaled, y_train, y_test,'logic')
do_train(X_train_scaled, X_test_scaled, y_train, y_test,'light')

There are 1 date_time variables
There are 8 discrete variables
There are 7 continuous variables
There are 0 categorical variables
logic f1 점수: 0.69 정확도: 0.98 정밀도: 0.62 재현율: 0.77
[[298155   5348]
 [  2513   8557]]
light f1 점수: 0.74 정확도: 0.98 정밀도: 0.93 재현율: 0.61
[[302960    543]
 [  4295   6775]]


In [9]:
# xtrains = pd.DataFrame(data = X_train_scaled, columns=X_train.columns)
# xtests = pd.DataFrame(data = X_test_scaled, columns=X_test.columns)

In [10]:
do_train(xtrains, xtests, y_train, y_test,'logic')

logic f1 점수: 0.69 정확도: 0.98 정밀도: 0.62 재현율: 0.77
[[298155   5348]
 [  2513   8557]]
