In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import matplotlib as mpl
# Render all matplotlib text with the "Malgun Gothic" font family
# (presumably so Korean labels display correctly; confirm the font is installed).
mpl.rc('font', family="Malgun Gothic")

from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler 
from sklearn.model_selection import train_test_split
# Turn off pandas' chained-assignment check, so SettingWithCopyWarning is
# neither raised nor printed anywhere in this notebook.
pd.set_option('mode.chained_assignment',  None)

In [32]:
def preprocessing(df_scaling):
    """Clean and feature-engineer one DataFrame of the smoking data set.

    The ``ID`` column is dropped when it is present. If exactly one column
    remains afterwards (the target-only frames used in this notebook), that
    frame is returned without further processing. Otherwise the frame is
    treated as a feature table and the following steps are applied:

    * ``oral`` is dropped;
    * ``gender`` and ``tartar`` are label-encoded to integer codes;
    * ``hearing(left)`` / ``hearing(right)`` are remapped 1 -> 0 and 2 -> 1
      (any other value becomes NaN);
    * ``bmi`` = weight(kg) / (height in metres)^2 and
      ``wwi`` = waist(cm) / sqrt(weight(kg)) are appended;
    * ``eyesight(left)`` / ``eyesight(right)`` are binned into the ordered
      categories 1-4 using the right-closed bins (0, 0.9], (0.9, 1.5],
      (1.5, 2.0], (2.0, 9.9]; values outside (0, 9.9] become NaN;
    * ``Urine protein`` is collapsed to three levels: 1 -> 0, 2 -> 1,
      everything else -> 2.

    Parameters
    ----------
    df_scaling : pandas.DataFrame
        Either a feature table containing the columns named above, or a
        table holding a single target column (optionally plus ``ID``).

    Returns
    -------
    pandas.DataFrame
        A new DataFrame object; columns are never assigned on ``df_scaling``
        itself.

    Raises
    ------
    KeyError
        If a multi-column frame lacks one of the feature columns used here.
    """
    # Remove the unneeded identifier column. errors='ignore' lets frames that
    # have no ID column continue instead of falling through and returning None.
    frame = df_scaling.drop(columns=['ID'], errors='ignore')

    # Target-only frame: nothing left to engineer.
    if len(frame.columns) == 1:
        return frame

    # Drop the unused 'oral' column. The explicit copy is defensive: it
    # guarantees the column writes below can never reach the caller's data.
    frame = frame.drop(columns=['oral']).copy()

    # Label-encode the categorical features.
    # NOTE(review): the encoder is fitted on whatever frame is passed in, so
    # codes agree between two frames only if both contain the same set of
    # category values.
    for column in ('gender', 'tartar'):
        frame[column] = LabelEncoder().fit_transform(frame[column])

    # Recode hearing from {1, 2} to {0, 1}.
    for column in ('hearing(left)', 'hearing(right)'):
        frame[column] = frame[column].map({2.0: 1.0, 1.0: 0.0})

    # Derived body-composition features (height is converted from cm to m).
    frame['bmi'] = frame['weight(kg)'] / ((frame['height(cm)'] * 0.01) ** 2)
    frame['wwi'] = frame['waist(cm)'] / np.sqrt(frame['weight(kg)'])

    # Bin eyesight into four ordered categories.
    eyesight_bins = [0.0, 0.9, 1.5, 2.0, 9.9]
    for column in ('eyesight(left)', 'eyesight(right)'):
        frame[column] = pd.cut(frame[column], bins=eyesight_bins, labels=[1, 2, 3, 4])

    # Urine protein: simplify the raw scale (six levels, per the original
    # author's note) to 0/1/2. Vectorised, so it is independent of the row
    # index; the column keeps its original dtype.
    urine = frame['Urine protein']
    frame['Urine protein'] = np.select(
        [urine == 1.0, urine == 2.0], [0, 1], default=2
    ).astype(urine.dtype)

    return frame

In [33]:
def scaling(train_data, test_data , scaled_form = 'MinMax scaler()'):
    """Preprocess both splits and scale their non-categorical features.

    Both frames are first passed through ``preprocessing``. The categorical
    columns listed in the body are carried over unchanged; every other column
    is scaled by a scaler that is fitted on the training split only and then
    applied to both splits.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Raw feature frames in the form accepted by ``preprocessing``.
    scaled_form : str, default 'MinMax scaler()'
        ``'StandardScaler()'`` selects ``StandardScaler``,
        ``'RobustScaler()'`` selects ``RobustScaler``; any other value
        (including the default) selects ``MinMaxScaler``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``. In each frame the scaled columns come first,
        followed by the categorical columns in the order listed in the body.
        The row index of the preprocessed frame is kept.
    """
    categorical_columns = [
        'Urine protein', 'eyesight(left)', 'eyesight(right)', 'gender',
        'tartar', 'hearing(right)', 'hearing(left)', 'dental caries',
    ]

    train_data = preprocessing(train_data)
    test_data = preprocessing(test_data)

    train_numeric = train_data.drop(columns=categorical_columns)
    test_numeric = test_data.drop(columns=categorical_columns)

    # Pick the scaler; unrecognised names fall back to min-max scaling.
    if scaled_form == 'StandardScaler()':
        scaler = StandardScaler()
    elif scaled_form == 'RobustScaler()':
        scaler = RobustScaler()
    else:
        scaler = MinMaxScaler()

    # Fit on the training split only, so no test-set statistics leak in.
    scaler.fit(train_numeric)

    def _reassemble(preprocessed, numeric):
        """Scale ``numeric`` and re-attach the untouched categorical columns."""
        # Reuse the source index: the categorical columns are attached by
        # index alignment, and a fresh RangeIndex would turn them into NaN
        # for any frame whose index is not 0..n-1.
        scaled = pd.DataFrame(
            scaler.transform(numeric),
            columns=numeric.columns,
            index=numeric.index,
        )
        categorical = preprocessed[categorical_columns]
        scaled[categorical.columns] = categorical
        return scaled

    return _reassemble(train_data, train_numeric), _reassemble(test_data, test_numeric)

In [13]:
# Load the pre-split feature (x_*) and target (y_*) tables from ../data.
x_train, x_test = pd.read_csv("../data/x_train.csv"), pd.read_csv("../data/x_test.csv")
y_train, y_test = pd.read_csv("../data/y_train.csv"), pd.read_csv("../data/y_test.csv")

In [34]:
# Preprocess and scale the feature splits (default scaled_form -> MinMaxScaler).
x_tr, x_te = scaling(train_data=x_train, test_data=x_test)

In [35]:
# Run the target tables through preprocessing (which strips the ID column),
# then display the training target.
y_tr, y_te = preprocessing(y_train), preprocessing(y_test)
y_tr

Unnamed: 0,smoking
0,0
1,0
2,1
3,0
4,0
...,...
44548,0
44549,0
44550,0
44551,1


In [39]:
# Display the scaled training features (rendered as the cell's output).
x_tr

Unnamed: 0,age,height(cm),weight(kg),waist(cm),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,...,bmi,wwi,Urine protein,eyesight(left),eyesight(right),gender,tartar,hearing(right),hearing(left),dental caries
0,0.307692,0.416667,0.285714,0.388462,0.254438,0.311321,0.104575,0.410256,0.074672,0.112378,...,0.379885,0.447066,0,2,2,0,1,0.0,0.0,0.0
1,0.307692,0.500000,0.285714,0.384615,0.284024,0.283019,0.183007,0.351282,0.107972,0.061889,...,0.325362,0.442914,0,1,1,0,1,0.0,0.0,0.0
2,0.538462,0.666667,0.285714,0.371795,0.396450,0.433962,0.093682,0.479487,0.175580,0.083062,...,0.230392,0.429073,0,1,1,1,0,0.0,0.0,0.0
3,0.307692,0.583333,0.380952,0.474359,0.171598,0.188679,0.108932,0.684615,0.248234,0.066775,...,0.406062,0.449449,0,2,2,1,1,0.0,0.0,0.0
4,0.307692,0.416667,0.285714,0.448718,0.289941,0.320755,0.074074,0.330769,0.066599,0.094463,...,0.379885,0.512118,0,2,2,0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44548,0.461538,0.666667,0.333333,0.474359,0.455621,0.509434,0.122004,0.325641,0.161453,0.058632,...,0.291787,0.492016,0,2,2,1,1,0.0,0.0,0.0
44549,0.230769,0.750000,0.380952,0.423077,0.201183,0.320755,0.091503,0.338462,0.052472,0.086319,...,0.304767,0.398193,0,1,1,1,1,0.0,0.0,0.0
44550,0.230769,0.750000,0.380952,0.255128,0.266272,0.283019,0.106754,0.230769,0.050454,0.060261,...,0.304767,0.230330,0,2,2,1,1,0.0,0.0,0.0
44551,0.461538,0.583333,0.571429,0.715385,0.301775,0.330189,0.098039,0.317949,0.132190,0.070033,...,0.666748,0.528752,0,1,1,1,1,0.0,0.0,0.0
