In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test

df = pd.read_csv("../data/2_Train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Reached.on.Time_Y.N', id_name='ID')

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

X_train.info()
y_train.info()

(8799, 11) (2200, 11) (8799, 2) (2200, 2)
<class 'pandas.core.frame.DataFrame'>
Index: 8799 entries, 3999 to 9332
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   8799 non-null   int64 
 1   Warehouse_block      8799 non-null   object
 2   Mode_of_Shipment     8799 non-null   object
 3   Customer_care_calls  8799 non-null   int64 
 4   Customer_rating      8799 non-null   int64 
 5   Cost_of_the_Product  8799 non-null   int64 
 6   Prior_purchases      8799 non-null   int64 
 7   Product_importance   8799 non-null   object
 8   Gender               8799 non-null   object
 9   Discount_offered     8799 non-null   int64 
 10  Weight_in_gms        8799 non-null   int64 
dtypes: int64(7), object(4)
memory usage: 824.9+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 8799 entries, 3999 to 9332
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype
---  ------ 

# 범주형 변수 인코딩

| 방식                   | 특징                                                             | 사용해야 하는 상황/주의점                                                                                                                                       |
| -------------------- | -------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Label Encoding**   | - 각 고유값에 **순서 없이** 0, 1, 2, ... 등 숫자 할당<br>- 값에 **숫자 간 의미 없음** | - 트리 계열(Decision Tree, RandomForest 등)에서 범주형 변수 인코딩 시<br>- **순서 의미가 필요 없는** 단순 범주형 변수<br>- \*\*선형 모델(회귀/로지스틱)\*\*에는 사용하면 안 됨(숫자에 순서/크기 의미가 생겨 성능 저하) |
| **One-Hot Encoding** | - 각 고유값을 \*\*각각의 컬럼(더미 변수)\*\*로 분리, 0/1로 표시<br>- **변수 수가 늘어남** | - **범주형 변수에 순서가 없고**, 각 값이 동등할 때(성별, 지역 등)<br>- 대부분의 머신러닝 모델에 안전하게 사용 가능<br>- 값 종류가 너무 많으면 차원 폭발(High Cardinality) 문제 주의                             |
| **Ordinal Encoding** | - **순서가 있는** 범주형 변수에 직접 지정한 순서대로 0, 1, 2 등 숫자 할당<br>- 순서 정보 반영 | - **명확한 순서**가 있는 범주형 변수(등급, 우선순위, 크기 등)<br>- 순서 정보가 모델에 중요한 경우(선형 회귀 등에서 유리)                                                                         |


In [2]:
# label-encoding
df = pd.read_csv("../data/2_Train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Reached.on.Time_Y.N', id_name='ID')

s_col = X_train.select_dtypes(include="object").columns
n_col = X_train.select_dtypes(exclude="object").columns

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

for col in s_col:
    X_train[col] = encoder.fit_transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])

In [3]:
# one-hot-encoding
df = pd.read_csv("../data/2_Train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Reached.on.Time_Y.N', id_name='ID')
s_col = X_train.select_dtypes(include="object").columns
n_col = X_train.select_dtypes(exclude="object").columns

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()

encoded = encoder.fit_transform(X_train[s_col]).toarray()
en_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(s_col))
X_train = pd.concat([X_train.drop(s_col, axis=1), en_df], axis=1)
print(X_train.info())

encoded = encoder.transform(X_test[s_col]).toarray()
en_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(s_col))
X_test = pd.concat([X_test.drop(s_col, axis=1), en_df], axis=1)

<class 'pandas.core.frame.DataFrame'>
Index: 10541 entries, 3999 to 8785
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         8799 non-null   float64
 1   Customer_care_calls        8799 non-null   float64
 2   Customer_rating            8799 non-null   float64
 3   Cost_of_the_Product        8799 non-null   float64
 4   Prior_purchases            8799 non-null   float64
 5   Discount_offered           8799 non-null   float64
 6   Weight_in_gms              8799 non-null   float64
 7   Warehouse_block_A          8799 non-null   float64
 8   Warehouse_block_B          8799 non-null   float64
 9   Warehouse_block_C          8799 non-null   float64
 10  Warehouse_block_D          8799 non-null   float64
 11  Warehouse_block_F          8799 non-null   float64
 12  Mode_of_Shipment_Flight    8799 non-null   float64
 13  Mode_of_Shipment_Road      8799 non-null   float6

In [4]:
# ordinal-encoding
df = pd.read_csv("../data/2_Train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Reached.on.Time_Y.N', id_name='ID')
s_col = X_train.select_dtypes(include="object").columns
n_col = X_train.select_dtypes(exclude="object").columns

from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder(categories=[["high", "medium", "low"], ["F", "M"]])

X_train[["Product_importance", "Gender"]] = encoder.fit_transform(X_train[["Product_importance", "Gender"]])
X_test[["Product_importance", "Gender"]] = encoder.transform(X_test[["Product_importance", "Gender"]])

# 숫자형 변수 스케일링

| 방식                   | sklearn 함수/클래스   | 특징 및 사용 상황                                               |
| -------------------- | ---------------- | -------------------------------------------------------- |
| **Min-Max Scaling**  | `MinMaxScaler`   | 0\~1 범위로 변환<br>극단값(이상치)에 민감<br>딥러닝/신경망, 이미지 처리 등에서 자주 사용 |
| **Standard Scaling** | `StandardScaler` | 평균 0, 표준편차 1로 변환(정규화)<br>이상치의 영향 있음<br>선형모델, PCA 등에서 활용  |
| **Robust Scaling**   | `RobustScaler`   | 중앙값(중앙값 0)과 IQR로 변환<br>이상치 영향 최소화                        |
| **MaxAbs Scaling**   | `MaxAbsScaler`   | -1\~1 범위로 변환(음수 포함 데이터에 적합)<br>희소 행렬(sparse data)에 적합    |
| **Normalizer**       | `Normalizer`     | 각 샘플(행)별로 벡터의 길이가 1이 되도록 변환(개별 피처가 아니라 샘플 전체 정규화)        |


In [5]:
df = pd.read_csv("../data/2_Train.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='Reached.on.Time_Y.N', id_name='ID')
s_col = X_train.select_dtypes(include="object").columns
n_col = X_train.select_dtypes(exclude="object").columns

for col in n_col:
    print(col, X_train[col].skew())

ID 0.008133293695891518
Customer_care_calls 0.3949356694303243
Customer_rating 0.005080151359232916
Cost_of_the_Product -0.1483342088338509
Prior_purchases 1.6963061584555015
Discount_offered 1.7617689795888485
Weight_in_gms -0.23349804462110674
