In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Install a global "ignore" filter: every warning raised after this point
# (including deprecation notices from the libraries imported above) is hidden.
warnings.filterwarnings('ignore')

In [None]:
def _text_columns_to_float(df):
    """Return a copy of ``df`` whose text columns are stripped of punctuation and cast to float64."""
    # NOTE: '-' is in this list, so a leading minus sign is removed as well and
    # negative numbers stored as text lose their sign.
    special_characters = ['!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '=',
                          "'", '"', ';', ':', '<', '>', ',', '?', '/', '_']

    df = df.copy()
    for column, dtype in df.dtypes.items():
        is_text = pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
        if not is_text:
            continue
        text = df[column].astype('string')
        for special_character in special_characters:
            # regex=False: characters such as '(', '*', '?' or '$' must be treated
            # literally, whatever the default of the installed pandas version is.
            text = text.str.replace(special_character, '', regex=False)
        df[column] = text.astype('float64')
    return df


def _min_max_scale(df):
    """Scale every column of ``df`` to the [0, 1] range; constant columns become all zeros."""
    lowest = df.min(axis=0)
    span = df.max(axis=0) - lowest
    # A constant column has a zero range; dividing by it would yield 0/0 = NaN.
    span = span.where(span != 0, 1.0)
    return (df - lowest) / span


def _plot_correlation_heatmap(df_corr):
    """Draw an annotated heatmap of the correlation matrix ``df_corr``."""
    fig, ax = plt.subplots(figsize=(50, 50))
    ax = sns.heatmap(df_corr,
                     linewidths=0.1, vmax=1.0,
                     square=True, cmap=plt.cm.PuBu,
                     linecolor='white', annot=True, annot_kws={'size': 20},
                     ax=ax)
    ax.tick_params(labelsize=30)


def binary_data_preprocessing(df, id_col_name, label_col_name, positive_name, negative_name,
                              null_ratio=0.5, corr_threshold=0.4, plot=True):
    """Clean a binary-classification table and keep the features correlated with the label.

    The steps are, in order: drop the id column, encode the label as 1.0 / 0.0,
    drop sparse columns, convert text columns to float, impute the remaining
    nulls with the column mean, drop duplicated rows, min-max normalise, and
    finally drop every feature whose absolute Pearson correlation with the
    label is not greater than ``corr_threshold``.

    The input DataFrame is not modified.

    Args:
        df: Input DataFrame.
        id_col_name: Name of the identifier column; it is dropped.
        label_col_name: Name of the label column.
        positive_name: Label value that is encoded as 1.0.
        negative_name: Label value that is encoded as 0.0.
        null_ratio: A column is kept only if it has at least
            ``int(len(df) * null_ratio)`` non-null values.
        corr_threshold: Features with ``abs(correlation) <= corr_threshold``
            (or with an undefined correlation) are dropped.
        plot: When True, draw a heatmap of the full correlation matrix.

    Returns:
        A tuple ``(df, rank_df)``. ``df`` is the normalised DataFrame holding
        the label and the retained features. ``rank_df`` is indexed by
        ``rank`` (1 = strongest) and has the columns ``'columns'`` (feature
        name) and ``'vlaues'`` (signed correlation with the label; the
        misspelled column name is kept for backward compatibility).
    """
    # drop() returns a new frame, so the caller's DataFrame is never touched.
    df = df.drop(id_col_name, axis=1)
    df[label_col_name] = df[label_col_name].replace({positive_name: 1.0, negative_name: 0.0})

    # Null handling: drop columns with too few non-null values ...
    df = df.dropna(axis=1, thresh=int(len(df) * null_ratio))

    # ... convert text columns to numbers BEFORE imputing, so their nulls are
    # imputed too and the mean is never computed over strings ...
    df = _text_columns_to_float(df)

    # ... then replace the remaining nulls with the mean of their column.
    df = df.fillna(df.mean(axis=0, numeric_only=True))

    # Remove duplicated rows.
    df = df.drop_duplicates()

    # Min-max normalisation.
    df = _min_max_scale(df)

    # Correlation analysis (and optional visualisation).
    df_corr = df.corr(method='pearson')
    if plot:
        _plot_correlation_heatmap(df_corr)

    # Drop weakly correlated features. The test is written as "not strong" so
    # that an undefined (NaN) correlation also counts as weak; the label
    # itself is excluded from the candidates and therefore always kept.
    label_corr = df_corr[label_col_name].drop(label_col_name)
    is_strong = label_corr.abs() > corr_threshold
    df = df.drop(columns=label_corr[~is_strong].index)

    # Rank the retained features by absolute correlation, strongest first.
    # Pairwise Pearson values do not depend on the other columns, so the
    # correlations computed above are reused instead of recomputed.
    kept_corr = label_corr[is_strong]
    kept_corr = kept_corr.reindex(kept_corr.abs().sort_values(ascending=False).index)

    rank_df = pd.DataFrame({'rank': list(range(1, len(kept_corr) + 1)),
                            'columns': list(kept_corr.index),
                            'vlaues': kept_corr.tolist()}).set_index('rank')

    return df, rank_df