# [Module 1.5] 피쳐 엔지니어링

이 노트북은 아래와 같은 피쳐 엔지니어링을 통하여 새로운 피쳐를 생성 합니다.
- 날짜관련 피쳐 생성(월, 일, 요일)
- 기존의 피쳐들을 결합하여 새로운 피쳐 생성 (피쳐1 + 피쳐2 = 뉴피쳐)
- Product_ID를 기준으로 Target Encoding 하여 새로운 피쳐 생성
- Product_ID를 기준으로 Target Encoding Smoothing 하여 새로운 피쳐 생성
- Category 피쳐를 레이블 인코딩 함
- 로컬에 데이터 저장
    - 최종 레이블 인코딩 된 데이터 세트 저장 (XGBoost, CatBoost 용)
    - 레이블 인코딩 안한 데이터 세트 저장 (AutoGluon 용)

In [1]:
import pandas as pd
pd.options.display.max_rows=5
import numpy as np

In [2]:
%store -r full_data_file_name

### 데이터 로딩 및 셔플링

In [3]:
df = pd.read_csv(full_data_file_name)
df = df.sample(frac=1.0, random_state=1000)
df

Unnamed: 0,classes,order_approved_at,customer_id,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_id,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english,seller_zip_code_prefix,seller_city,seller_state
37413,3,2018-07-14 13:04:07,4ee61c3905a5c398d44b089108961bb3,28950,armacao dos buzios,RJ,105.00,27.04,2f13d1dc8b4e1d9d8027be50339546a9,2650.0,30.0,30.0,30.0,furniture_decor,3204,sao paulo,SP
54762,3,2017-03-25 10:25:16,959292edcade77d6b60dc8f49f01cd71,37880,cabo verde,MG,23.99,14.52,b000447e24e31a4d7e628ca4d0622131,250.0,19.0,4.0,11.0,telephony,3504,sao paulo,SP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18782,3,2018-07-16 18:20:34,609b34a18fd0d61719d0b3190ec05231,21240,rio de janeiro,RJ,200.00,19.50,92d5ae8e42c4599266f210bf0469ae9b,439.0,18.0,11.0,13.0,watches_gifts,14050,ribeirao preto,SP
3776,1,2018-05-25 07:06:30,dc275c5e585b7c3a3cddef527e34fc19,26540,nilopolis,RJ,18.90,7.55,15de022edf1005363381e66bed514528,100.0,16.0,3.0,23.0,furniture_decor,20270,rio de janeiro,RJ


In [4]:
df.columns

Index(['classes', 'order_approved_at', 'customer_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state', 'price',
       'freight_value', 'product_id', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm',
       'product_category_name_english', 'seller_zip_code_prefix',
       'seller_city', 'seller_state'],
      dtype='object')

## 날짜 피쳐 생성: Month, Day, WeeoOfDay(요일)

In [5]:
def create_date_feature(raw_df):
    df = raw_df.copy()
    df['order_date'] = pd.to_datetime(df['order_approved_at'])    
    df['order_weekday'] = df['order_date'].dt.weekday
    df['order_day'] = df['order_date'].dt.day    
    df['order_month'] = df['order_date'].dt.month        
    return df

f_df = create_date_feature(df)
f_df

Unnamed: 0,classes,order_approved_at,customer_id,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_id,product_weight_g,...,product_height_cm,product_width_cm,product_category_name_english,seller_zip_code_prefix,seller_city,seller_state,order_date,order_weekday,order_day,order_month
37413,3,2018-07-14 13:04:07,4ee61c3905a5c398d44b089108961bb3,28950,armacao dos buzios,RJ,105.00,27.04,2f13d1dc8b4e1d9d8027be50339546a9,2650.0,...,30.0,30.0,furniture_decor,3204,sao paulo,SP,2018-07-14 13:04:07,5,14,7
54762,3,2017-03-25 10:25:16,959292edcade77d6b60dc8f49f01cd71,37880,cabo verde,MG,23.99,14.52,b000447e24e31a4d7e628ca4d0622131,250.0,...,4.0,11.0,telephony,3504,sao paulo,SP,2017-03-25 10:25:16,5,25,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18782,3,2018-07-16 18:20:34,609b34a18fd0d61719d0b3190ec05231,21240,rio de janeiro,RJ,200.00,19.50,92d5ae8e42c4599266f210bf0469ae9b,439.0,...,11.0,13.0,watches_gifts,14050,ribeirao preto,SP,2018-07-16 18:20:34,0,16,7
3776,1,2018-05-25 07:06:30,dc275c5e585b7c3a3cddef527e34fc19,26540,nilopolis,RJ,18.90,7.55,15de022edf1005363381e66bed514528,100.0,...,3.0,23.0,furniture_decor,20270,rio de janeiro,RJ,2018-05-25 07:06:30,4,25,5


## 기존 피쳐 결합하여 새로운 피쳐 생성 (컬럼1 + 컬럼2 = 뉴피쳐)

In [6]:
def change_var_type(f_df):
    df = f_df.copy()
    df['customer_zip_code_prefix'] = df['customer_zip_code_prefix'].astype(str)
    df['seller_zip_code_prefix'] = df['seller_zip_code_prefix'].astype(str)    
    return df

def comnbine_columns(f_df,src_col1, src_col2,new_col):
    df = f_df.copy()
    df[new_col] = df[str(src_col1)] + '_' + df[str(src_col2)]
    print("df shape: ", df.shape)
    return df



f_df = change_var_type(f_df)

### custoemr_state + seller_state

In [7]:
f_df = comnbine_columns(f_df,src_col1='customer_state', src_col2='seller_state',new_col='customer_seller_state')

df shape:  (67176, 22)


### custoemr_city + seller_city

In [8]:
f_df = comnbine_columns(f_df,src_col1='customer_city', src_col2='seller_city',new_col='customer_seller_city')

df shape:  (67176, 23)


### custoemr_zip + seller_zip

In [9]:
f_df = comnbine_columns(f_df,src_col1='customer_zip_code_prefix', 
                        src_col2='seller_zip_code_prefix',new_col='customer_seller_zip_code_prefix')

df shape:  (67176, 24)


In [10]:
f_df

Unnamed: 0,classes,order_approved_at,customer_id,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_id,product_weight_g,...,seller_zip_code_prefix,seller_city,seller_state,order_date,order_weekday,order_day,order_month,customer_seller_state,customer_seller_city,customer_seller_zip_code_prefix
37413,3,2018-07-14 13:04:07,4ee61c3905a5c398d44b089108961bb3,28950,armacao dos buzios,RJ,105.00,27.04,2f13d1dc8b4e1d9d8027be50339546a9,2650.0,...,3204,sao paulo,SP,2018-07-14 13:04:07,5,14,7,RJ_SP,armacao dos buzios_sao paulo,28950_3204
54762,3,2017-03-25 10:25:16,959292edcade77d6b60dc8f49f01cd71,37880,cabo verde,MG,23.99,14.52,b000447e24e31a4d7e628ca4d0622131,250.0,...,3504,sao paulo,SP,2017-03-25 10:25:16,5,25,3,MG_SP,cabo verde_sao paulo,37880_3504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18782,3,2018-07-16 18:20:34,609b34a18fd0d61719d0b3190ec05231,21240,rio de janeiro,RJ,200.00,19.50,92d5ae8e42c4599266f210bf0469ae9b,439.0,...,14050,ribeirao preto,SP,2018-07-16 18:20:34,0,16,7,RJ_SP,rio de janeiro_ribeirao preto,21240_14050
3776,1,2018-05-25 07:06:30,dc275c5e585b7c3a3cddef527e34fc19,26540,nilopolis,RJ,18.90,7.55,15de022edf1005363381e66bed514528,100.0,...,20270,rio de janeiro,RJ,2018-05-25 07:06:30,4,25,5,RJ_RJ,nilopolis_rio de janeiro,26540_20270


In [11]:
f_df.columns

Index(['classes', 'order_approved_at', 'customer_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state', 'price',
       'freight_value', 'product_id', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm',
       'product_category_name_english', 'seller_zip_code_prefix',
       'seller_city', 'seller_state', 'order_date', 'order_weekday',
       'order_day', 'order_month', 'customer_seller_state',
       'customer_seller_city', 'customer_seller_zip_code_prefix'],
      dtype='object')

## Target Encoding 관련 피쳐 생성
- Product_ID 별 Classes의 평균, 갯수 (te_pdid_mean, te_pdid_count)
- Target Error (classes - te_pdid_mean)

In [12]:

def create_target_encoding(cat, raw_df):
    df = raw_df.copy()
    te = df.groupby(cat).classes.agg(['mean','count']).reset_index()
    te.columns = cat + ['te_pdid_mean','te_pdid_count']
    te_df = df.merge(te, on=cat, how='left')
    te_df['te_pdid_error'] = (te_df['classes']  - te_df['te_pdid_mean']).abs()
    return te_df

cat = ['product_id']
te_df = create_target_encoding(cat, f_df)

In [13]:
te_df.head(2)

Unnamed: 0,classes,order_approved_at,customer_id,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_id,product_weight_g,...,order_date,order_weekday,order_day,order_month,customer_seller_state,customer_seller_city,customer_seller_zip_code_prefix,te_pdid_mean,te_pdid_count,te_pdid_error
0,3,2018-07-14 13:04:07,4ee61c3905a5c398d44b089108961bb3,28950,armacao dos buzios,RJ,105.0,27.04,2f13d1dc8b4e1d9d8027be50339546a9,2650.0,...,2018-07-14 13:04:07,5,14,7,RJ_SP,armacao dos buzios_sao paulo,28950_3204,2.705882,17,0.294118
1,3,2017-03-25 10:25:16,959292edcade77d6b60dc8f49f01cd71,37880,cabo verde,MG,23.99,14.52,b000447e24e31a4d7e628ca4d0622131,250.0,...,2017-03-25 10:25:16,5,25,3,MG_SP,cabo verde_sao paulo,37880_3504,2.6,5,0.4


In [14]:
pd.options.display.max_rows=10
te_df.te_pdid_error.describe()

count    67176.000000
mean         0.597436
std          0.602626
min          0.000000
25%          0.000000
50%          0.500000
75%          1.000000
max          3.095238
Name: te_pdid_error, dtype: float64

In [15]:
te_df.columns

Index(['classes', 'order_approved_at', 'customer_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state', 'price',
       'freight_value', 'product_id', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm',
       'product_category_name_english', 'seller_zip_code_prefix',
       'seller_city', 'seller_state', 'order_date', 'order_weekday',
       'order_day', 'order_month', 'customer_seller_state',
       'customer_seller_city', 'customer_seller_zip_code_prefix',
       'te_pdid_mean', 'te_pdid_count', 'te_pdid_error'],
      dtype='object')

## Target Encoding with Smoothing
아래 비디오 및 코드 참조 함
- Feature Engineering
    - RecSys 2020 Tutorial: Feature Engineering for Recommender Systems
        - https://www.youtube.com/watch?v=uROvhp7cj6Q
    - Git Repo
        - https://github.com/rapidsai/deeplearning/tree/main/RecSys2020Tutorial


\begin{equation} \label{eq:te}
TE_{target}([Categories]) = \frac{count([Categories]) * mean_{target}([Categories]) + w_{smoothing} * mean_{target}(global)}{count([Categories]) + w_{smoothing}}
\end{equation}

In [16]:
w = 20 # global 평균에 더 높은 가중치를 주는 값
def create_target_encoding_smoothe(cat, raw_df, w):
    df = raw_df.copy()
    classes_mean_global = df.classes.mean()
    print(" classes_mean_global: ", classes_mean_global)
    df['te_pdid_mean_smoothed'] = (df['te_pdid_mean'] * df['te_pdid_count']) + (classes_mean_global * w) / (te_df['te_pdid_count'] + w)
    df['te_pdid_error_smoothed'] = (df['classes']  - df['te_pdid_mean_smoothed']).abs()

    return df

tes_df = create_target_encoding_smoothe(cat, te_df, w)

 classes_mean_global:  2.08827557460998


In [17]:
tes_df.head()

Unnamed: 0,classes,order_approved_at,customer_id,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_id,product_weight_g,...,order_day,order_month,customer_seller_state,customer_seller_city,customer_seller_zip_code_prefix,te_pdid_mean,te_pdid_count,te_pdid_error,te_pdid_mean_smoothed,te_pdid_error_smoothed
0,3,2018-07-14 13:04:07,4ee61c3905a5c398d44b089108961bb3,28950,armacao dos buzios,RJ,105.0,27.04,2f13d1dc8b4e1d9d8027be50339546a9,2650.0,...,14,7,RJ_SP,armacao dos buzios_sao paulo,28950_3204,2.705882,17,0.294118,47.128798,44.128798
1,3,2017-03-25 10:25:16,959292edcade77d6b60dc8f49f01cd71,37880,cabo verde,MG,23.99,14.52,b000447e24e31a4d7e628ca4d0622131,250.0,...,25,3,MG_SP,cabo verde_sao paulo,37880_3504,2.6,5,0.4,14.67062,11.67062
2,1,2018-03-24 12:27:28,0d4a5cdbedc3aa83305c5d5ea18a19de,4537,sao paulo,SP,99.0,9.05,8983a3b149303c013fceabef902e835a,450.0,...,24,3,SP_SP,sao paulo_sao paulo,4537_4102,1.238095,21,0.238095,27.018671,26.018671
3,0,2017-11-28 03:21:09,d2d34922fed7b3a54130c786b7cb0ecd,5397,sao paulo,SP,503.34,25.53,7614c62b86a81021243e438cfde78ccc,9900.0,...,28,11,SP_SP,sao paulo_salto,5397_13328,1.666667,12,1.666667,21.305172,21.305172
4,2,2018-03-25 16:10:31,3fb1b736500b371cd8576c7050fe7256,4037,sao paulo,SP,64.9,20.18,23227b07fc69250d2fa4be6602011aea,1250.0,...,25,3,SP_DF,sao paulo_brasilia,4037_70740,2.0,5,0.0,11.67062,9.67062


## 최종 사용할 컬럼 지정

In [18]:
def filter_df(raw_df, cols):
    df = raw_df.copy()
    df = df[cols]
    return df


cols = ['classes','customer_zip_code_prefix','customer_city',
       'customer_state', 'price', 'freight_value',
       'product_weight_g', 'product_length_cm', 'product_height_cm',
       'product_width_cm', 'product_category_name_english',
       'seller_zip_code_prefix','seller_city',
       'seller_state',
       'order_weekday',
       'order_day', 'order_month',
       'customer_seller_state',
       'customer_seller_city', 'customer_seller_zip_code_prefix',
#        'product_id',        
#       'te_pdid_mean', 'te_pdid_count', 'te_pdid_error',
       'te_pdid_mean_smoothed','te_pdid_error_smoothed',

       ]


tes_df = filter_df(tes_df, cols)
tes_df



Unnamed: 0,classes,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_weight_g,product_length_cm,product_height_cm,product_width_cm,...,seller_city,seller_state,order_weekday,order_day,order_month,customer_seller_state,customer_seller_city,customer_seller_zip_code_prefix,te_pdid_mean_smoothed,te_pdid_error_smoothed
0,3,28950,armacao dos buzios,RJ,105.00,27.04,2650.0,30.0,30.0,30.0,...,sao paulo,SP,5,14,7,RJ_SP,armacao dos buzios_sao paulo,28950_3204,47.128798,44.128798
1,3,37880,cabo verde,MG,23.99,14.52,250.0,19.0,4.0,11.0,...,sao paulo,SP,5,25,3,MG_SP,cabo verde_sao paulo,37880_3504,14.670620,11.670620
2,1,4537,sao paulo,SP,99.00,9.05,450.0,16.0,3.0,15.0,...,sao paulo,SP,5,24,3,SP_SP,sao paulo_sao paulo,4537_4102,27.018671,26.018671
3,0,5397,sao paulo,SP,503.34,25.53,9900.0,30.0,39.0,38.0,...,salto,SP,1,28,11,SP_SP,sao paulo_salto,5397_13328,21.305172,21.305172
4,2,4037,sao paulo,SP,64.90,20.18,1250.0,45.0,10.0,15.0,...,brasilia,DF,6,25,3,SP_DF,sao paulo_brasilia,4037_70740,11.670620,9.670620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67171,4,49015,aracaju,SE,330.00,25.03,400.0,34.0,4.0,22.0,...,sao bernardo do campo,SP,1,10,7,SE_SP,aracaju_sao bernardo do campo,49015_9628,262.278437,258.278437
67172,2,12570,aparecida,SP,55.00,25.67,250.0,22.0,14.0,11.0,...,goiania,GO,2,10,1,SP_GO,aparecida_goiania,12570_74323,24.392184,22.392184
67173,0,1532,sao paulo,SP,399.00,11.54,975.0,19.0,18.0,23.0,...,santo andre,SP,1,26,6,SP_SP,sao paulo_santo andre,1532_9230,1.988834,1.988834
67174,3,21240,rio de janeiro,RJ,200.00,19.50,439.0,18.0,11.0,13.0,...,ribeirao preto,SP,0,16,7,RJ_SP,rio de janeiro_ribeirao preto,21240_14050,6.898432,3.898432


## Category 레이블 Encoding

In [19]:
def category_encoding(raw_df, cate_cols, encoding_option=0):
    '''
    encoding_option = 0 : One hot Encoding
    encoding_option = 1 : Label Encoding
    
    '''
    f_df = raw_df.copy()
    if encoding_option == 0: # one-hot-encoding
        f_df = pd.get_dummies(f_df)
    else:
        for item in cate_cols:
            f_df[item] = f_df[item].astype("category").cat.codes +1
    print(f_df.shape)            
        
    return f_df

cate_cols = ['customer_state','customer_city','customer_zip_code_prefix',
             'product_category_name_english',
             'seller_state','seller_city','seller_zip_code_prefix',
             'customer_seller_city',
             'customer_seller_state',
             'customer_seller_zip_code_prefix',
             'order_weekday',
             'order_day',
             'order_month',
#             'product_id'
            ]


encoding_cate = category_encoding(tes_df,cate_cols, encoding_option=1)
encoding_cate.head()

(67176, 22)


Unnamed: 0,classes,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_weight_g,product_length_cm,product_height_cm,product_width_cm,...,seller_city,seller_state,order_weekday,order_day,order_month,customer_seller_state,customer_seller_city,customer_seller_zip_code_prefix,te_pdid_mean_smoothed,te_pdid_error_smoothed
0,3,3442,219,19,105.0,27.04,2650.0,30.0,30.0,30.0,...,475,21,6,14,7,240,1223,23560,47.128798,44.128798
1,3,5183,443,11,23.99,14.52,250.0,19.0,4.0,11.0,...,475,21,6,25,3,132,3422,32864,14.67062,11.67062
2,1,6294,2730,26,99.0,9.05,450.0,16.0,3.0,15.0,...,475,21,6,24,3,326,21388,37862,27.018671,26.018671
3,0,7161,2730,26,503.34,25.53,9900.0,30.0,39.0,38.0,...,428,21,2,28,11,326,21356,41304,21.305172,21.305172
4,2,5687,2730,26,64.9,20.18,1250.0,45.0,10.0,15.0,...,75,3,7,25,3,308,21115,35333,11.67062,9.67062


## 레이블 Encoding 안하고 바로 사용(AutoGluon 용)

In [20]:
no_encoding_cate = tes_df

## Train, Val, Test 분리

In [21]:
# One-Hot-Encoding
a,b,c = np.split(encoding_cate.sample(frac=1, random_state=1729), 
                 [int(0.7 * len(encoding_cate)), 
                  int(0.9 * len(encoding_cate))])   # Randomly sort the data then split out first 70%, second 20%, and last 10%
preproc_train_data =a 
preproc_val_data =b
preproc_test_data =c   
# No one-hot-encoding
a,b,c = np.split(no_encoding_cate.sample(frac=1, random_state=1729), 
                 [int(0.7 * len(no_encoding_cate)), 
                  int(0.9 * len(no_encoding_cate))])   # Randomly sort the data then split out first 70%, second 20%, and last 10%
train_data=a
val_data=b 
test_data=c    

## 로컬에 데이터 저장

In [22]:
import os

def save_local(train_data, validation_data, test_data, preproc_folder):
    train_df = pd.concat([train_data['classes'], train_data.drop(['classes'], axis=1)], axis=1)
    train_file_name = os.path.join(preproc_folder, 'train.csv')
    train_df.to_csv(train_file_name, index=False)
    print(f'{train_file_name} is saved')

    val_df = pd.concat([validation_data['classes'], validation_data.drop(['classes'], axis=1)], axis=1)
    val_file_name = os.path.join(preproc_folder, 'val.csv')
    val_df.to_csv(val_file_name, index=False)
    print(f'{val_file_name} is saved')    

    test_df = pd.concat([test_data['classes'], test_data.drop(['classes'], axis=1)], axis=1)
    test_file_name = os.path.join(preproc_folder, 'test.csv')
    test_df.to_csv(test_file_name, index=False)
    print(f'{test_file_name} is saved')        
    
    return train_file_name, val_file_name, test_file_name

preproc_folder = 'preproc_data/feature_engineering'
os.makedirs(preproc_folder, exist_ok=True)    
pre_train_file,pre_val_file, pre_test_file = save_local(preproc_train_data, preproc_val_data, preproc_test_data, preproc_folder)

preproc_folder = 'preproc_data/non_feature_engineering'
os.makedirs(preproc_folder, exist_ok=True)    
train_file,val_file, test_file  = save_local(train_data, val_data, test_data, preproc_folder)

preproc_data/feature_engineering/train.csv is saved
preproc_data/feature_engineering/val.csv is saved
preproc_data/feature_engineering/test.csv is saved
preproc_data/non_feature_engineering/train.csv is saved
preproc_data/non_feature_engineering/val.csv is saved
preproc_data/non_feature_engineering/test.csv is saved


In [23]:
%store pre_train_file
%store pre_val_file
%store pre_test_file

%store train_file
%store val_file
%store test_file

Stored 'pre_train_file' (str)
Stored 'pre_val_file' (str)
Stored 'pre_test_file' (str)
Stored 'train_file' (str)
Stored 'val_file' (str)
Stored 'test_file' (str)
