In [1]:
import sys
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score,precision_score,recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV
from feature_engine import imputation as mdi
from feature_engine import encoding as ce
import warnings
warnings.filterwarnings('ignore')


def get_path():
    cur_path = os.getcwd()
    parent_path = os.path.dirname(cur_path)
    return cur_path, parent_path


def file_path(data_path, file):
    return os.path.abspath(os.path.join(data_path, f'{file}'))


def df_write(data_path, df, file):
    df = df.copy()
    df.to_csv(os.path.abspath(os.path.join(data_path, file)), index=False)


def split_train_test(df, configs):
    df = df.copy()
    X = df.drop(columns=configs['y_col'][0])
    y = df[configs['y_col'][0]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=configs['y_col'][0])
    return X_train, X_test, y_train, y_test


def model_selection(option='logistic'):
    if option == 'LGBM':
        return lgb.LGBMClassifier(random_state=0)
    elif option == 'logistic':
        return LogisticRegression(random_state=0)
    elif option == 'knn':
        return KNeighborsClassifier(n_neighbors=3)
    elif option =='cat':
        return CatBoostClassifier(random_seed=0)        


def read_data(configs):
    if configs['date_col'][0] == ' ':
        df = pd.read_csv(configs['file_name'][0])
    else:
        df = pd.read_csv(configs['file_name'][0], parse_dates=configs['date_col'])

    if configs['remove_col'][0] == ' ':
        pass
    else:
        if configs['remove_col'][0] in df.columns.to_list():
            df = df.drop(configs['remove_col'][0], axis=1)

    return df


def y_label_enc(df, configs):
    df = df.copy()
    Y_col = configs['y_col'][0]
    if df[Y_col].isnull().any():
        Y_null = True
    else:
        Y_null = False
    labeler = LabelEncoder()
    df[Y_col] = labeler.fit_transform(df[Y_col])
    return df, Y_null


def organize_data(df, configs, y_null):
    df = df.copy()
    cols = df.columns.to_list()
    null_threshhold_cols = []
    no_null_cols = []
    date_time = configs['date_col']
    Y_col = configs['y_col'][0]

    for col in cols:
        null_mean = df[col].isnull().mean()
        if null_mean >= configs['null_threshhold'][0]:
            null_threshhold_cols.append(col)
        if null_mean == 0:
            no_null_cols.append(col)

    cols_stayed = [item for item in cols if item not in null_threshhold_cols]
    data = df[cols_stayed].copy()

    # numerical: discrete vs continuous
    discrete = [var for var in cols_stayed if
                data[var].dtype != 'O' and var != Y_col and var not in date_time and data[var].nunique() < 10]
    continuous = [var for var in cols_stayed if
                  data[var].dtype != 'O' and var != Y_col and var not in date_time and var not in discrete]

    # categorical
    categorical = [var for var in cols_stayed if data[var].dtype == 'O' and var != Y_col]

    print('There are {} date_time variables'.format(len(date_time)))
    print('There are {} discrete variables'.format(len(discrete)))
    print('There are {} continuous variables'.format(len(continuous)))
    print('There are {} categorical variables'.format(len(categorical)))

    if y_null:
        data = data[data[Y_col] != data[Y_col].max()].copy()
    else:
        data = data.copy()

    return data, discrete, continuous, categorical


def split_train_test(df, configs):
    df = df.copy()
    X = df.drop(columns=configs['y_col'][0])
    y = df[configs['y_col'][0]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=configs['test_size'][0], random_state=0, stratify=y)
    return X_train, X_test, y_train, y_test


def make_imputer_pipe(continuous, discrete, categorical):
    numberImputer = continuous + discrete
    categoricalImputer = categorical

    if (len(numberImputer) > 0) & (len(categoricalImputer) > 0):
        pipe = Pipeline([
            ("median_imputer",
             mdi.MeanMedianImputer(
                 imputation_method="median", variables=numberImputer),),

            ('imputer_cat',
             mdi.CategoricalImputer(variables=categoricalImputer)),

            ('categorical_encoder',
             ce.OrdinalEncoder(encoding_method='ordered',
                               variables=categoricalImputer))
        ])
    else:
        if (len(numberImputer) > 0) & (len(categoricalImputer) == 0):
            pipe = Pipeline([
                ("median_imputer",
                 mdi.MeanMedianImputer(
                     imputation_method="median", variables=numberImputer),)
            ])
        else:
            if (len(numberImputer) == 0) & (len(categoricalImputer) > 0):
                pipe = Pipeline([
                    ('imputer_cat',
                     mdi.CategoricalImputer(variables=categoricalImputer)),

                    ('categorical_encoder',
                     ce.OrdinalEncoder(encoding_method='ordered',
                                       variables=categoricalImputer))
                ])
            else:
                pipe = []
    return pipe


def do_imputation(X_train, X_test, y_train, y_test, pipe):
    X_train, X_test, y_train, y_test = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()
    if pipe != []:
        pipe.fit(X_train, y_train)
        X_train = pipe.transform(X_train)
        X_test = pipe.transform(X_test)
    else:
        print('no pipe applied')
    return X_train, X_test, y_train, y_test

def do_imputation_whole(df, pipe):
    df = df.copy()
    if pipe != []:
        df = pipe.fit_transform(df)        
    else:
        print('no pipe applied')
    return df


def do_train(X_train, y_train, option):
    X_train, y_train = X_train.copy(), y_train.copy()
    model = model_selection(option)
    model.fit(X_train, y_train)
    return model

def do_train_cat(X_train, y_train):
    X_train, y_train = X_train.copy(), y_train.copy()
    model = model_selection('cat')
    model.fit(X_train, y_train, silent=True)
    return model

    
def do_predict(model, X_test):
    return model.predict(X_test)


def min_max_scale(df):
    df = df.copy()
    scaler = MinMaxScaler()
    scaler.fit(df)
    return scaler.transform(df)


def metrics(file, y_test, pred, option, display_confusion=False, logtran=None, out_col=None):
    y_test = y_test.copy()
    pred = pred.copy()
    accuracy = round(accuracy_score(y_test, pred), 3)
    precision = round(precision_score(y_test, pred), 3)
    recall = round(recall_score(y_test, pred), 3)
    f1 = round(f1_score(y_test, pred), 3)
#     print(file, option, "f1 점수:", f1, "정확도:", accuracy, "정밀도:", precision, "재현율:", recall)
    if display_confusion:        
        print(confusion_matrix(y_test, pred))
    
    scores = [file, option, logtran, out_col, f1, accuracy, precision, recall]
    cols = ['File', 'Model', 'LogTrans', 'Outliered', 'F1', 'Accuracy', 'Precision', 'Recall']
    return pd.DataFrame(data=[scores], columns=cols)


def drop_outlier(df=None, corr_highest=None, y_col=None, yes_value=None, weight=1.5):
    df = df.copy()
    targeted = df[df[y_col]==yes_value][corr_highest]
    quantile_25 = np.percentile(targeted.values, 25)
    quantile_75 = np.percentile(targeted.values, 75)

    iqr = quantile_75 - quantile_25
    iqr_weight = iqr * weight
    lowest_val = quantile_25 - iqr_weight
    highest_val = quantile_75 + iqr_weight

    outlier_index = targeted [(targeted  < lowest_val) | (targeted > highest_val)].index
    df = df.drop(outlier_index, axis=0)
    return df


def log_trans(df, trans_col):
    df = df.copy()
    trans_values = np.log1p(df[trans_col])
    df.drop([trans_col], axis=1, inplace=True)
    df.insert(0, trans_col, trans_values)
    return df


def get_skew_top3(df):
    df = df.copy()
    skewed = df.skew()
    skewed = pd.DataFrame(data=skewed).reset_index()
    idx = skewed[skewed['index']==Y_col].index
    skewed = skewed.drop(idx, axis=0)
    skewed = skewed.rename(columns={'index':'col', 0:'skewed_value'})
    skewed['abs_skewed_value'] = abs(skewed['skewed_value'])
    skewed = skewed.sort_values(by=['abs_skewed_value'], ascending=False).reset_index(drop=True)
    skewed_higher = skewed.iloc[0:3]
    return skewed_higher


def get_corr_top5(df, y_col, yes_value):
    df = df.copy()
    corr_mat = df.corr(method='pearson')
    upper_corr_mat = corr_mat.where(np.triu(np.ones(corr_mat.shape), k=1).astype(np.bool))

    # Convert to 1-D series and drop Null values
    unique_corr_pairs = upper_corr_mat.unstack().dropna()

    # Sort correlation pairs
    sorted_mat = unique_corr_pairs.sort_values()
    df_corr = pd.DataFrame(data=sorted_mat).reset_index()
    df_corr = df_corr[df_corr['level_0']==y_col]
    df_corr = df_corr.rename(columns={'level_0':'Y_col', 'level_1':'col', 0:'corr_value'})
    df_corr['abs_corr_value'] = abs(df_corr['corr_value'])
    df_corr = df_corr.sort_values(by=['abs_corr_value'], ascending=False).reset_index(drop=True)
    corr_higher = df_corr.iloc[0:3]
    return corr_higher

def split_impute_train2(file, df, configs, option='logic', logtran=None, out_col=None):
    df = df.copy()
    Y_col = configs['y_col'][0]
    
    con = df['split']=='train'
    X_train = df[con].drop(columns=['split', Y_col]) 
    y_train = df[con][Y_col]
    con = df['split']=='test'
    X_test = df[con].drop(columns=['split', Y_col]) 
    y_test = df[con][Y_col]
    
    if option=='cat':
        model = do_train_cat(X_train, y_train)
    else:
        model = do_train(X_train, y_train, option)
    pred = do_predict(model, X_test)
    result = metrics(file, y_test, pred, option, display_confusion=False, logtran=logtran, out_col=out_col)
    return result
    

def split_impute_train(df, configs, discrete, continuous, categorical, option='logic', logtran=None, out_col=None):
    X_train, X_test, y_train, y_test = split_train_test(df, configs)
    pipe = make_imputer_pipe(discrete, continuous, categorical)
    X_train, X_test, y_train, y_test = do_imputation(X_train, X_test, y_train, y_test, pipe)
    model = do_train(X_train, y_train, option)
    pred = do_predict(model, X_test)
    result = metrics(y_test, pred, option, display_confusion=False, logtran=logtran, out_col=out_col)
    return result

def pca_train_metric(df, configs, option='logic', logtran=None, out_col=None):
    df = df.copy()
    X = df.drop(columns=configs['y_col'][0])
    y = df[configs['y_col'][0]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=configs['test_size'][0], random_state=0, stratify=y)
    model = do_train(X_train, y_train, option)
    pred = do_predict(model, X_test)
    result = metrics(y_test, pred, option, display_confusion=False, logtran=logtran, out_col=out_col)
    return result

def pca_train_metric2(file, df, configs, option='logic', logtran=None, out_col=None):
    df = df.copy()
    Y_col = configs['y_col'][0]
    
    con = df['split']=='train'
    X_train = df[con].drop(columns=['split', Y_col]) 
    y_train = df[con][Y_col]
    con = df['split']=='test'
    X_test = df[con].drop(columns=['split', Y_col]) 
    y_test = df[con][Y_col]    
    model = do_train(X_train, y_train, option)
    pred = do_predict(model, X_test)
    result = metrics(file, y_test, pred, option, display_confusion=False, logtran=logtran, out_col=out_col)
    return result

In [2]:
folder_name = 'higgs'
config_file_name = f'argumet_{folder_name}.xlsx'

cur_path = os.getcwd()
parent = os.path.abspath(os.path.join(cur_path, os.pardir))
config_file = os.path.join(parent, os.path.join('config', f'{config_file_name}'))
configs = pd.read_excel(config_file, header=None).set_index(0).T
configs = configs.to_dict('list')
ori_file_name = configs['file_name'][0]
configs['file_name'][0] = os.path.join(parent, os.path.join('data', configs['file_name'][0]))
Y_col = configs['y_col'][0]

### before log transformation

In [3]:
files = os.listdir(os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}/imputed')))
df = {}
for file in files:
#     df[file] = pd.read_csv( os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}/scaled', f'{file}')))
    df[file] = pd.read_csv( os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}/imputed', f'{file}')))
    
results = pd.DataFrame()
# result = split_impute_train(df.drop(columns=['split']), configs, discrete, continuous, categorical, option='logistic')
for file in files:
    try:
        result = split_impute_train2(file, df[file], configs, option='logistic', logtran=None, out_col=None)
        results = results.append(result)
        result = split_impute_train2(file, df[file], configs, option='LGBM', logtran=None, out_col=None)
        results = results.append(result)
    #     result = split_impute_train2(file, df[file], configs, option='knn', logtran=None, out_col=None)
    #     results = results.append(result)
    except Exception as e:    
        print(e)
    
results.sort_values('F1', ascending=True)

'split'


Unnamed: 0,File,Model,LogTrans,Outliered,F1,Accuracy,Precision,Recall
0,imputed_higgs_max.csv,logistic,,,0.567,0.585,0.592,0.544
0,imputed_higgs_mean.csv,logistic,,,0.706,0.736,0.797,0.634
0,imputed_higgs_min.csv,logistic,,,0.776,0.732,0.667,0.926
0,imputed_higgs_median.csv,logistic,,,0.779,0.741,0.68,0.912
0,imputed_higgs_median.csv,LGBM,,,0.835,0.826,0.794,0.881
0,imputed_higgs_min.csv,LGBM,,,0.845,0.844,0.837,0.853
0,imputed_higgs_max.csv,LGBM,,,0.85,0.85,0.848,0.852
0,imputed_higgs_mean.csv,LGBM,,,0.851,0.85,0.848,0.854


In [4]:
files = os.listdir(os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}/scaled')))
df = {}
for file in files:
    df[file] = pd.read_csv( os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}/scaled', f'{file}')))

for file in files:
    try:
        result = split_impute_train2(file, df[file], configs, option='logistic', logtran=None, out_col=None)
        results = results.append(result)
#         result = split_impute_train2(file, df[file], configs, option='LGBM', logtran=None, out_col=None)
#         results = results.append(result)
    except Exception as e:    
        print(e)
    
results[results['Model']=='logistic'].sort_values('F1', ascending=True)

Unnamed: 0,File,Model,LogTrans,Outliered,F1,Accuracy,Precision,Recall
0,imputed_higgs_max.csv,logistic,,,0.567,0.585,0.592,0.544
0,scaled_higgs_max.csv,logistic,,,0.582,0.588,0.591,0.573
0,scaled_higgs_mean.csv,logistic,,,0.693,0.735,0.821,0.6
0,imputed_higgs_mean.csv,logistic,,,0.706,0.736,0.797,0.634
0,scaled_higgs_median.csv,logistic,,,0.774,0.755,0.718,0.839
0,imputed_higgs_min.csv,logistic,,,0.776,0.732,0.667,0.926
0,imputed_higgs_median.csv,logistic,,,0.779,0.741,0.68,0.912
0,scaled_higgs_min.csv,logistic,,,0.784,0.742,0.674,0.937


### after log transformation

In [5]:
files = os.listdir(os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}/scaled')))
df = {}
for file in files:
#     df[file] = pd.read_csv( os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}/scaled', f'{file}')))
    df[file] = pd.read_csv( os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}/scaled', f'{file}')))

try:
    for file in files:
        skewed = get_skew_top3(df[file])
        for idx, row in skewed.iterrows():
            trans_col = row[0]
#             print('Logtrans 칼럼', trans_col)
            df_log_processed = log_trans(df[file], trans_col)
            result = split_impute_train2(file, df_log_processed, configs, option='logistic', logtran=trans_col, out_col=None)
            results = results.append(result)
#             result = split_impute_train2(file, df_log_processed, configs, option='LGBM', logtran=trans_col, out_col=None)
#             results = results.append(result)
#             print()
except Exception as e:    
    print(e)
results[results['Model']=='logistic'].sort_values('F1', ascending=True)

Unnamed: 0,File,Model,LogTrans,Outliered,F1,Accuracy,Precision,Recall
0,imputed_higgs_max.csv,logistic,,,0.567,0.585,0.592,0.544
0,scaled_higgs_max.csv,logistic,DER_prodeta_jet_jet,,0.582,0.589,0.591,0.573
0,scaled_higgs_max.csv,logistic,DER_lep_eta_centrality,,0.582,0.589,0.591,0.574
0,scaled_higgs_max.csv,logistic,,,0.582,0.588,0.591,0.573
0,scaled_higgs_max.csv,logistic,DER_deltaeta_jet_jet,,0.582,0.589,0.591,0.573
0,scaled_higgs_mean.csv,logistic,DER_mass_MMC,,0.68,0.729,0.832,0.574
0,scaled_higgs_mean.csv,logistic,PRI_tau_pt,,0.68,0.73,0.832,0.576
0,scaled_higgs_mean.csv,logistic,PRI_jet_subleading_pt,,0.689,0.733,0.826,0.59
0,scaled_higgs_mean.csv,logistic,,,0.693,0.735,0.821,0.6
0,imputed_higgs_mean.csv,logistic,,,0.706,0.736,0.797,0.634


### after removing outlier with log transformation

In [6]:
yes_value = 1

for file in files:
    corr_higher = get_corr_top5(df[file], Y_col, yes_value)

    for idx_s, row_s in skewed.iterrows():
        trans_col = row_s[0]    
        for idx_c, row_c in corr_higher.iterrows():
#             print('Logtrans 칼럼', trans_col, 'Outlier 처리 칼럼', row_c['col'], row_c['corr_value'])
            corr_highest = row_c[1]
            df_processed = log_trans(df[file], trans_col)
            df_processed = drop_outlier(df_processed, corr_highest, Y_col, yes_value, weight=1.5)
            result = split_impute_train2(file, df_processed, configs, option='logistic', logtran=trans_col, out_col=corr_highest)           
            results = results.append(result)
#             result = split_impute_train2(file, df_processed, configs, option='LGBM', logtran=trans_col, out_col=corr_highest)           
#             results = results.append(result)
#             print()
results[results['Model']=='logistic'].sort_values('F1', ascending=True)

Unnamed: 0,File,Model,LogTrans,Outliered,F1,Accuracy,Precision,Recall
0,scaled_higgs_max.csv,logistic,DER_pt_h,PRI_jet_subleading_pt,0.547,0.617,0.582,0.516
0,scaled_higgs_max.csv,logistic,DER_mass_jet_jet,PRI_jet_subleading_pt,0.547,0.617,0.582,0.517
0,scaled_higgs_max.csv,logistic,PRI_jet_subleading_pt,PRI_jet_subleading_pt,0.548,0.617,0.582,0.517
0,imputed_higgs_max.csv,logistic,,,0.567,0.585,0.592,0.544
0,scaled_higgs_max.csv,logistic,,,0.582,0.588,0.591,0.573
0,scaled_higgs_max.csv,logistic,PRI_jet_subleading_pt,Weight,0.582,0.588,0.591,0.573
0,scaled_higgs_max.csv,logistic,DER_mass_jet_jet,Weight,0.582,0.588,0.591,0.573
0,scaled_higgs_max.csv,logistic,DER_lep_eta_centrality,,0.582,0.589,0.591,0.574
0,scaled_higgs_max.csv,logistic,DER_deltaeta_jet_jet,,0.582,0.589,0.591,0.573
0,scaled_higgs_max.csv,logistic,DER_prodeta_jet_jet,,0.582,0.589,0.591,0.573


In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use(['dark_background'])

In [8]:
results_ori = results.reset_index(drop=True)
results_ori['cat'] = results_ori['LogTrans']+':'+results_ori['Outliered'] 
results_ori

Unnamed: 0,File,Model,LogTrans,Outliered,F1,Accuracy,Precision,Recall,cat
0,imputed_higgs_max.csv,logistic,,,0.567,0.585,0.592,0.544,
1,imputed_higgs_max.csv,LGBM,,,0.85,0.85,0.848,0.852,
2,imputed_higgs_mean.csv,logistic,,,0.706,0.736,0.797,0.634,
3,imputed_higgs_mean.csv,LGBM,,,0.851,0.85,0.848,0.854,
4,imputed_higgs_median.csv,logistic,,,0.779,0.741,0.68,0.912,
5,imputed_higgs_median.csv,LGBM,,,0.835,0.826,0.794,0.881,
6,imputed_higgs_min.csv,logistic,,,0.776,0.732,0.667,0.926,
7,imputed_higgs_min.csv,LGBM,,,0.845,0.844,0.837,0.853,
8,scaled_higgs_max.csv,logistic,,,0.582,0.588,0.591,0.573,
9,scaled_higgs_mean.csv,logistic,,,0.693,0.735,0.821,0.6,


In [9]:
results_sorted = results_ori.sort_values(['F1','cat'], ascending=[True,False])#.set_index('cat')
results_sorted

Unnamed: 0,File,Model,LogTrans,Outliered,F1,Accuracy,Precision,Recall,cat
28,scaled_higgs_max.csv,logistic,DER_pt_h,PRI_jet_subleading_pt,0.547,0.617,0.582,0.516,DER_pt_h:PRI_jet_subleading_pt
25,scaled_higgs_max.csv,logistic,DER_mass_jet_jet,PRI_jet_subleading_pt,0.547,0.617,0.582,0.517,DER_mass_jet_jet:PRI_jet_subleading_pt
31,scaled_higgs_max.csv,logistic,PRI_jet_subleading_pt,PRI_jet_subleading_pt,0.548,0.617,0.582,0.517,PRI_jet_subleading_pt:PRI_jet_subleading_pt
0,imputed_higgs_max.csv,logistic,,,0.567,0.585,0.592,0.544,
30,scaled_higgs_max.csv,logistic,PRI_jet_subleading_pt,Weight,0.582,0.588,0.591,0.573,PRI_jet_subleading_pt:Weight
27,scaled_higgs_max.csv,logistic,DER_pt_h,Weight,0.582,0.588,0.591,0.573,DER_pt_h:Weight
24,scaled_higgs_max.csv,logistic,DER_mass_jet_jet,Weight,0.582,0.588,0.591,0.573,DER_mass_jet_jet:Weight
8,scaled_higgs_max.csv,logistic,,,0.582,0.588,0.591,0.573,
12,scaled_higgs_max.csv,logistic,DER_lep_eta_centrality,,0.582,0.589,0.591,0.574,
13,scaled_higgs_max.csv,logistic,DER_deltaeta_jet_jet,,0.582,0.589,0.591,0.573,


In [10]:
results_sorted[results_sorted['Model']=='logistic']

Unnamed: 0,File,Model,LogTrans,Outliered,F1,Accuracy,Precision,Recall,cat
28,scaled_higgs_max.csv,logistic,DER_pt_h,PRI_jet_subleading_pt,0.547,0.617,0.582,0.516,DER_pt_h:PRI_jet_subleading_pt
25,scaled_higgs_max.csv,logistic,DER_mass_jet_jet,PRI_jet_subleading_pt,0.547,0.617,0.582,0.517,DER_mass_jet_jet:PRI_jet_subleading_pt
31,scaled_higgs_max.csv,logistic,PRI_jet_subleading_pt,PRI_jet_subleading_pt,0.548,0.617,0.582,0.517,PRI_jet_subleading_pt:PRI_jet_subleading_pt
0,imputed_higgs_max.csv,logistic,,,0.567,0.585,0.592,0.544,
30,scaled_higgs_max.csv,logistic,PRI_jet_subleading_pt,Weight,0.582,0.588,0.591,0.573,PRI_jet_subleading_pt:Weight
27,scaled_higgs_max.csv,logistic,DER_pt_h,Weight,0.582,0.588,0.591,0.573,DER_pt_h:Weight
24,scaled_higgs_max.csv,logistic,DER_mass_jet_jet,Weight,0.582,0.588,0.591,0.573,DER_mass_jet_jet:Weight
8,scaled_higgs_max.csv,logistic,,,0.582,0.588,0.591,0.573,
12,scaled_higgs_max.csv,logistic,DER_lep_eta_centrality,,0.582,0.589,0.591,0.574,
13,scaled_higgs_max.csv,logistic,DER_deltaeta_jet_jet,,0.582,0.589,0.591,0.573,


In [11]:
1/0

ZeroDivisionError: division by zero

In [None]:
sns.reset_orig()
plt.style.use(['dark_background'])
ax = sns.catplot(x="cat", y="F1", data=results_sorted.iloc[:], kind='point',height=5,aspect=25/6)
sns.set(font_scale = 10)
plt.xticks(rotation=45)
ax.set_xticklabels(size=15)
ax.set_yticklabels(size=15)
plt.xlabel("LogTransformation & OutlierRemoved", size=15)
plt.ylabel("F1 Score", size=15)
plt.show()

### PCA

In [None]:
cur_path = os.getcwd()
parent = os.path.abspath(os.path.join(cur_path, os.pardir))
config_file = os.path.join(parent, os.path.join('config', f'{config_file_name}'))
configs = pd.read_excel(config_file, header=None).set_index(0).T
configs = configs.to_dict('list')
ori_file_name = configs['file_name'][0]
configs['file_name'][0] = os.path.join(parent, os.path.join('data', configs['file_name'][0]))
Y_col = configs['y_col'][0]
# df_initial = read_data(configs)

# df, y_null = y_label_enc(df_initial, configs)
# df_organized, discrete, continuous, categorical = organize_data(df, configs, y_null)
# pipe = make_imputer_pipe(discrete, continuous, categorical)
# df_imputed = do_imputation_whole(df_organized, pipe)
# df_scaled = StandardScaler().fit_transform(df_imputed.drop(columns=Y_col))

In [None]:
r11 = results.copy()

In [None]:
from sklearn.decomposition import PCA

Y_COL = configs['y_col'][0]
files = os.listdir(os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}/scaled')))
df = {}
for file in files:
    df[file] = pd.read_csv( os.path.join(parent, os.path.join('data_preprocessed', f'{folder_name}/scaled', f'{file}')))

    for n_component in range(2,int(len(df[file].columns)/2)):
        pca = PCA(n_components=n_component)
        pca.fit(df[file].drop(columns=['split', Y_COL]))
        df_pca = pca.transform(df[file].drop(columns=['split', Y_COL]))
        df_pca = pd.DataFrame(df_pca)
        df_pca[Y_COL]=df[file][Y_COL]
        df_pca['split']=df[file]['split']
        print('PCA n_component', n_component)
        result = pca_train_metric2(file, df_pca, configs, option='logic', logtran='PCA', out_col=n_component)
    results = results.append(result)

In [None]:
results_ori2 = results.reset_index(drop=True)
results_ori2['cat'] = results_ori2['LogTrans']+':'+results_ori2['Outliered'].astype(str)
results_sorted = results_ori2.sort_values(['F1','cat'], ascending=[True,False])#.set_index('cat')
results_sorted

In [None]:
results_sorted[results_sorted['LogTrans']=='PCA'][results_sorted.columns[3:]]

In [None]:
sns.reset_orig()
plt.style.use(['dark_background'])
ax = sns.catplot(x="cat", y="F1", data=results_sorted.iloc[:], kind='point',height=5,aspect=12/5)
sns.set(font_scale = 10)
plt.xticks(rotation=45)
ax.set_xticklabels(size=15)
ax.set_yticklabels(size=15)
plt.xlabel("LogTransformation & OutlierRemoved & PCA", size=15)
plt.ylabel("F1 Score", size=15)
plt.show()

### feature importance & permutation importance

#### Feature Importance

In [None]:
log_col = 'Pressure_switch'
corr_col = 'H1'
df_processed = log_trans(df_organized, log_col)
df_processed = drop_outlier(df_processed, corr_col, Y_col, yes_value, weight=1.5)
X_train, X_test, y_train, y_test = split_train_test(df_processed, configs)
pipe = make_imputer_pipe(discrete, continuous, categorical)
X_train, X_test, y_train, y_test = do_imputation(X_train, X_test, y_train, y_test, pipe)
model = model_selection('LGBM')
model.fit(X_train, y_train)
scaler = MinMaxScaler()
m_feature_importances = scaler.fit_transform(model.feature_importances_.reshape(-1,1)).reshape(-1)
feature_imp = pd.DataFrame(sorted(zip(m_feature_importances,X_train.columns)), columns=['Value','Feature'])

sns.reset_orig()
plt.style.use(['dark_background'])

plt.figure(figsize=(12, 6))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features Importance')
plt.tight_layout()
plt.show()

#### Permutaion Feature Importance

In [None]:
from sklearn.inspection import permutation_importance

p_feature_importance = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=0, n_jobs=2)
scaler = MinMaxScaler()
p_feature_importance = scaler.fit_transform(p_feature_importance.importances_mean.reshape(-1,1)).reshape(-1)

feature_imp = pd.DataFrame(sorted(zip(p_feature_importance,X_test.columns)), columns=['Value','Feature'])
sns.reset_orig()
plt.style.use(['dark_background'])

plt.figure(figsize=(12, 6))
sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
plt.title('LightGBM Features Importance (Test Set)')
plt.tight_layout()
plt.show()

### Multicollinearity

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
df_temp = df_imputed.drop(columns=Y_col).copy()
vif["VIF Factor"] = [variance_inflation_factor(df_temp.values, i) for i in range(df_temp.shape[1])]
vif["features"] = df_temp.columns
vif_top5 = vif.sort_values(by='VIF Factor',ascending=False).head(5)
vif_top5

In [None]:
feature_c = pd.DataFrame(continuous, columns=['features'])
feature_c['type'] = 'c'
feature_d = pd.DataFrame(discrete, columns=['features'])
feature_d['type'] = 'd'
feature_ca = pd.DataFrame(categorical, columns=['features'])
feature_ca['type'] = 'ca'
feature_type = pd.concat([feature_c,feature_d,feature_ca]).reset_index(drop=True)
feature_type =vif_top5.merge(feature_type, on='features')
feature_type

In [None]:
# base score
r0 = split_impute_train(df_organized, configs, discrete, continuous, categorical, option='logistic', logtran='Base Score', out_col=None)

In [None]:
c_list = continuous.copy()
c_list.remove(vif_top5.iloc[0,1])
df_temp = df_organized.drop(columns=vif_top5.iloc[0,1])
r1 = split_impute_train(df_temp, configs, discrete, c_list, categorical, option='logistic', logtran=None, out_col=vif_top5.iloc[0,1])

In [None]:
c_list = continuous.copy()
c_list.remove(vif_top5.iloc[1,1])
df_temp = df_organized.drop(columns=vif_top5.iloc[1,1])
r2 = split_impute_train(df_temp, configs, discrete, c_list, categorical, option='logistic', logtran=None, out_col=vif_top5.iloc[1,1])

In [None]:
d_list = discrete.copy()
d_list.remove(vif_top5.iloc[2,1])
df_temp = df_organized.drop(columns=vif_top5.iloc[2,1])
r3 = split_impute_train(df_temp, configs, d_list, continuous, categorical, option='logistic', logtran=None, out_col=vif_top5.iloc[2,1])

In [None]:
c_list = continuous.copy()
c_list.remove(vif_top5.iloc[0,1])
c_list.remove(vif_top5.iloc[1,1])
df_temp = df_organized.drop(columns=[vif_top5.iloc[0,1], vif_top5.iloc[1,1]])
cols = vif_top5.iloc[0,1]+'/'+ vif_top5.iloc[1,1]
r4 = split_impute_train(df_temp, configs, discrete, c_list, categorical, option='logistic', logtran=None, out_col=cols)

In [None]:
c_list = continuous.copy()
c_list.remove(vif_top5.iloc[0,1])
c_list.remove(vif_top5.iloc[1,1])
d_list = discrete.copy()
d_list.remove(vif_top5.iloc[2,1])
df_temp = df_organized.drop(columns=[vif_top5.iloc[0,1], vif_top5.iloc[1,1], vif_top5.iloc[2,1]])
cols = vif_top5.iloc[0,1]+'/'+ vif_top5.iloc[1,1]+'/'+vif_top5.iloc[2,1]
r5 = split_impute_train(df_temp, configs, d_list, c_list, categorical, option='logistic', logtran=None, out_col=cols)

In [None]:
pd.concat([r0,r1,r2,r3,r4,r5])