In [None]:
# 요약
# 데이터 encoding 완료 후 진행해야 함
# 스케일링 4가지 중 어떤 것 사용할지 고민해보기 -> 스케일링 4가지 모두 모델에 학습시켜서 성능이 가장 좋은 것 적용

In [1]:
import pandas as pd
from scipy import stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

In [2]:
# 데이터 지은언니 0615버전 데이터 (데이터 encoding 완료 후 진행해야 함)

train = pd.read_csv('../../ieee-fraud-detection/0615_train_pp_ver1.csv')
test =pd.read_csv('../../ieee-fraud-detection/0615_test_pp_ver1.csv')

In [3]:
class MultiColLabelEncoder:
    def __init__(self):
        self.encoder_dict = defaultdict(LabelEncoder)

    def fit_transform(self, X: pd.DataFrame, columns: list):  # 컬럼명 리스트 기준으로 레이블인코딩
        if not isinstance(columns, list):
            columns = [columns]

        output = X.copy()
        output[columns] = X[columns].apply(lambda x: self.encoder_dict[x.name].fit_transform(x))

        return output

    def inverse_transform(self, X: pd.DataFrame, columns: list):  # 인코딩 된 열 레이블 복구
        if not isinstance(columns, list):
            columns = [columns]

        if not all(key in self.encoder_dict for key in columns):
            raise KeyError(f'At least one of {columns} is not encoded before')

        output = X.copy()
        try:
            output[columns] = X[columns].apply(lambda x: self.encoder_dict[x.name].inverse_transform(x))
        except ValueError:
            print(f'Need assignment when do "fit_transform" function')
            raise

        return output

In [4]:
# encoding
cat_cols = ["ProductCD", "card1", "card2", "card3", "card4", "card5", "card6", "addr1", "addr2", "P_emaildomain", 
            "R_emaildomain", "M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8", "M9"]
train_encoded = MultiColLabelEncoder().fit_transform(train, columns=cat_cols)
train_encoded

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,M3,M4,M5,M6,M7,M8,M9,PC1,PC2,PC3
0,2987000,0.0,86400,68.50,4,10095,500,42,1,38,...,1,2,0,1,2,2,2,-414.605554,-99.819259,100.066514
1,2987001,0.0,86401,29.00,4,1372,303,42,2,2,...,2,0,1,1,2,2,2,-581.849403,-115.593458,-18.591293
2,2987002,0.0,86469,59.00,4,2833,389,42,3,58,...,1,0,0,0,0,0,0,-581.849421,-115.593437,-18.591291
3,2987003,0.0,86499,50.00,4,13341,466,42,2,14,...,2,0,1,0,2,2,2,2675.079606,77.806453,998.550570
4,2987004,0.0,86506,50.00,1,2712,413,42,2,2,...,2,3,2,2,2,2,2,-581.849185,-115.593577,-18.591272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0.0,15811047,49.00,4,4305,500,42,3,108,...,1,0,1,0,0,0,1,-543.343239,-96.650015,-2.084821
590536,3577536,0.0,15811049,39.50,4,7354,124,42,2,106,...,0,0,0,1,0,0,0,-581.849421,-115.593437,-18.591291
590537,3577537,0.0,15811079,30.95,4,8621,494,42,2,106,...,0,3,2,1,2,2,2,-581.849441,-115.593445,-18.591311
590538,3577538,0.0,15811088,117.00,4,5297,380,42,2,106,...,1,0,0,1,2,2,2,2175.884363,1163.337741,2170.418130


In [5]:
# 데이터 분할
X = train_encoded.drop(['isFraud'], axis = 1)
Y = train_encoded['isFraud']

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [6]:
# data scailing
# fit은 스케일링의 기준을 세우며 변환하는 기능, transform은 기준에 따라 변환만 하는 기능
# 따라서 훈련데이터에는 fit_transform()함수를, 평가 데이터에는 transform()함수를 사용

In [7]:
# Standard Scaler(이상치에 민감, 각 피쳐의 평균을 0 분산을 1로 조정, 모든 특성들이 같은 스케일을 갖게 됨, 단위가 서로 다른 특성의 변화를 비교할 때 사용)
from sklearn.preprocessing import StandardScaler

# 스케일링 실행
scaler_s = StandardScaler()
X_train_scaled_s = scaler_s.fit_transform(X_train)
X_test_scaled_s = scaler_s.transform(X_test)

# 배열형태로 변환되기 때문에 데이터 프레임 형태로 변환
X_train_scaled_DF_s = pd.DataFrame(data = X_train_scaled_s, columns = X_train.columns)
X_test_scaled_DF_s = pd.DataFrame(data = X_test_scaled_s, columns = X_test.columns)

# 스케일링 전 후 비교
print('feature 평균')
print(X_train_scaled_DF_s.mean())
print('\nfeature 분산')
print(X_train_scaled_DF_s.var())
print('##############')
print(X_train.head(5))
print(X_train_scaled_DF_s.head(5))

feature 평균
TransactionID     4.218048e-16
TransactionDT     8.823529e-18
TransactionAmt    1.581818e-17
ProductCD         4.310695e-17
card1             1.940775e-17
card2            -3.172460e-18
card3            -1.046631e-16
card4            -5.177005e-17
card5             1.848128e-16
card6             1.479385e-16
addr1             1.398730e-16
addr2             3.264064e-16
dist1            -1.174914e-17
dist2            -2.556362e-17
P_emaildomain    -1.460936e-16
R_emaildomain     1.323690e-16
C3                4.942179e-18
C5                2.866043e-17
C9               -4.136631e-17
C13              -1.461497e-17
D1                2.786127e-17
D2                6.622881e-17
D3                2.738640e-17
D10               2.261116e-17
D11              -4.096645e-17
D15               5.440189e-17
M1                4.450267e-17
M2                1.020481e-16
M3                1.728409e-16
M4               -1.055294e-16
M5                1.345668e-16
M6                1.443850e-

In [8]:
# Robust Scaler(모든 특성들이 같은 크기를 갖음, 이상치에 영향을 받지 않음, 평균과 분산대신 median과 quartile을 사용, 서로 다른 단위를 통일 시킬 때 사용)
from sklearn.preprocessing import RobustScaler

# 스케일링 실행
scaler_r = RobustScaler()
X_train_scaled_r = scaler_r.fit_transform(X_train)
X_test_scaled_r = scaler_r.transform(X_test)

# 배열형태로 변환되기 때문에 데이터 프레임 형태로 변환
X_train_scaled_DF_r = pd.DataFrame(data = X_train_scaled_r, columns = X_train.columns)
X_test_scaled_DF_r = pd.DataFrame(data = X_test_scaled_r, columns = X_test.columns)

# 스케일링 전 후 비교
print('feature 특징')
print(X_train_scaled_DF_r.describe())
print('##############')
print(X_train.head(5))
print(X_train_scaled_DF_r.head(5))

feature 특징
       TransactionID  TransactionDT  TransactionAmt      ProductCD  \
count  442905.000000  442905.000000   442905.000000  442905.000000   
mean        0.000123       0.008314        0.809525      -0.779522   
std         0.577733       0.562139        2.899058       1.424235   
min        -1.000478      -0.878753       -0.838989      -4.000000   
25%        -0.499956      -0.520595       -0.315278      -1.000000   
50%         0.000000       0.000000        0.000000       0.000000   
75%         0.500044       0.479405        0.684722       0.000000   
max         1.001112       1.036137      389.313702       0.000000   

               card1          card2          card3          card4  \
count  442905.000000  442905.000000  442905.000000  442905.000000   
mean        0.028919       0.017278       3.014414      -0.382265   
std         0.596539       0.532648      10.616383       0.589912   
min        -1.052755      -0.869565     -42.000000      -3.000000   
25%        -0

In [9]:
# MinMax Scaler(모든 피쳐가 0과 1사이로 조정, 데이터가 2차원 셋일 경우 모든 데이터는 x축과 y축의 각각 0과 1사이에 위치)
from sklearn.preprocessing import MinMaxScaler

# 스케일링 실행
scaler_m = MinMaxScaler()
X_train_scaled_m = scaler_m.fit_transform(X_train)
X_test_scaled_m = scaler_m.transform(X_test)

# 배열형태로 변환되기 때문에 데이터 프레임 형태로 변환
X_train_scaled_DF_m = pd.DataFrame(data = X_train_scaled_m, columns = X_train.columns)
X_test_scaled_DF_m = pd.DataFrame(data = X_test_scaled_m, columns = X_test.columns)

# 스케일링 전 후 비교
print('feature 최솟값')
print(X_train_scaled_DF_m.min())
print('fearure 최댓값')
print(X_train_scaled_DF_m.max())
print('##############')
print(X_train.head(5))
print(X_train_scaled_DF_m.head(5))

feature 최솟값
TransactionID     0.0
TransactionDT     0.0
TransactionAmt    0.0
ProductCD         0.0
card1             0.0
card2             0.0
card3             0.0
card4             0.0
card5             0.0
card6             0.0
addr1             0.0
addr2             0.0
dist1             0.0
dist2             0.0
P_emaildomain     0.0
R_emaildomain     0.0
C3                0.0
C5                0.0
C9                0.0
C13               0.0
D1                0.0
D2                0.0
D3                0.0
D10               0.0
D11               0.0
D15               0.0
M1                0.0
M2                0.0
M3                0.0
M4                0.0
M5                0.0
M6                0.0
M7                0.0
M8                0.0
M9                0.0
PC1               0.0
PC2               0.0
PC3               0.0
dtype: float64
fearure 최댓값
TransactionID     1.0
TransactionDT     1.0
TransactionAmt    1.0
ProductCD         1.0
card1             1.0
card2          

In [10]:
# Normalizer Scaler(row마다 각각 정규화, 유클리드 거리가 1이 되더록 데이터 조정, 좀 더 빠르게 학습 가능, 과대적합 확률 낮춤)
# 이건 모든 cate, nume 의 NaN값 처리 후 돌리기!
from sklearn.preprocessing import Normalizer

# 스케일링 실행
scaler_n = Normalizer()
X_train_scaled_n = scaler_n.fit_transform(X_train)
X_test_scaled_n = scaler_n.transform(X_test)

# 배열형태로 변환되기 때문에 데이터 프레임 형태로 변환
X_train_scaled_DF_n = pd.DataFrame(data = X_train_scaled_n, columns = X_train.columns)
X_test_scaled_DF_n = pd.DataFrame(data = X_test_scaled_n, columns = X_test.columns)

# 스케일링 전 후 비교
print(X_train.head(5))
print(X_train_scaled_DF_n.head(5))

ValueError: Input X contains NaN.
Normalizer does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values