# [Module 2.1] 피쳐 엔지니어링

이 노트북은 아래와 같은 피쳐 엔지니어링을 통하여 새로운 피쳐를 생성 합니다.
- 날짜관련 피쳐 생성(월, 일, 요일)
- 기존의 피쳐들을 결합하여 새로운 피쳐 생성 (피쳐1 + 피쳐2 = 뉴피쳐)
- Product_ID를 기준으로 Target Encoding 하여 새로운 피쳐 생성
- Product_ID를 기준으로 Target Encoding Smoothing 하여 새로운 피쳐 생성
- Category 피쳐를 레이블 인코딩 함
- 로컬에 데이터 저장
    - 최종 레이블 인코딩 된 데이터 세트 저장 (XGBoost, CatBoost 용)
    - 레이블 인코딩 안한 데이터 세트 저장 (AutoGluon 용)

In [1]:
import pandas as pd
pd.options.display.max_rows=5
import numpy as np

In [2]:
%store -r full_data_file_name

### 데이터 로딩 및 셔플링

In [3]:
df = pd.read_csv(full_data_file_name)
df = df.sample(frac=1.0, random_state=1000)
df

Unnamed: 0,classes,order_approved_at,customer_id,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_id,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_category_name_english,seller_zip_code_prefix,seller_city,seller_state
37413,3,2018-07-14 13:04:07,4ee61c3905a5c398d44b089108961bb3,28950,armacao dos buzios,RJ,105.00,27.04,2f13d1dc8b4e1d9d8027be50339546a9,2650.0,30.0,30.0,30.0,furniture_decor,3204,sao paulo,SP
54762,3,2017-03-25 10:25:16,959292edcade77d6b60dc8f49f01cd71,37880,cabo verde,MG,23.99,14.52,b000447e24e31a4d7e628ca4d0622131,250.0,19.0,4.0,11.0,telephony,3504,sao paulo,SP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18782,3,2018-07-16 18:20:34,609b34a18fd0d61719d0b3190ec05231,21240,rio de janeiro,RJ,200.00,19.50,92d5ae8e42c4599266f210bf0469ae9b,439.0,18.0,11.0,13.0,watches_gifts,14050,ribeirao preto,SP
3776,1,2018-05-25 07:06:30,dc275c5e585b7c3a3cddef527e34fc19,26540,nilopolis,RJ,18.90,7.55,15de022edf1005363381e66bed514528,100.0,16.0,3.0,23.0,furniture_decor,20270,rio de janeiro,RJ


In [4]:
df.columns

Index(['classes', 'order_approved_at', 'customer_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state', 'price',
       'freight_value', 'product_id', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm',
       'product_category_name_english', 'seller_zip_code_prefix',
       'seller_city', 'seller_state'],
      dtype='object')

## 날짜 피쳐 생성: Month, Day, WeeoOfDay(요일)

In [5]:
def create_date_feature(raw_df):
    df = raw_df.copy()
    df['order_date'] = pd.to_datetime(df['order_approved_at'])    
    df['order_weekday'] = df['order_date'].dt.weekday
    df['order_day'] = df['order_date'].dt.day    
    df['order_month'] = df['order_date'].dt.month        
    return df

f_df = create_date_feature(df)
f_df

Unnamed: 0,classes,order_approved_at,customer_id,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_id,product_weight_g,...,product_height_cm,product_width_cm,product_category_name_english,seller_zip_code_prefix,seller_city,seller_state,order_date,order_weekday,order_day,order_month
37413,3,2018-07-14 13:04:07,4ee61c3905a5c398d44b089108961bb3,28950,armacao dos buzios,RJ,105.00,27.04,2f13d1dc8b4e1d9d8027be50339546a9,2650.0,...,30.0,30.0,furniture_decor,3204,sao paulo,SP,2018-07-14 13:04:07,5,14,7
54762,3,2017-03-25 10:25:16,959292edcade77d6b60dc8f49f01cd71,37880,cabo verde,MG,23.99,14.52,b000447e24e31a4d7e628ca4d0622131,250.0,...,4.0,11.0,telephony,3504,sao paulo,SP,2017-03-25 10:25:16,5,25,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18782,3,2018-07-16 18:20:34,609b34a18fd0d61719d0b3190ec05231,21240,rio de janeiro,RJ,200.00,19.50,92d5ae8e42c4599266f210bf0469ae9b,439.0,...,11.0,13.0,watches_gifts,14050,ribeirao preto,SP,2018-07-16 18:20:34,0,16,7
3776,1,2018-05-25 07:06:30,dc275c5e585b7c3a3cddef527e34fc19,26540,nilopolis,RJ,18.90,7.55,15de022edf1005363381e66bed514528,100.0,...,3.0,23.0,furniture_decor,20270,rio de janeiro,RJ,2018-05-25 07:06:30,4,25,5


## 기존 피쳐 결합하여 새로운 피쳐 생성 (컬럼1 + 컬럼2 = 뉴피쳐)

In [6]:
def change_var_type(f_df):
    df = f_df.copy()
    df['customer_zip_code_prefix'] = df['customer_zip_code_prefix'].astype(str)
    df['seller_zip_code_prefix'] = df['seller_zip_code_prefix'].astype(str)    
    return df

def comnbine_columns(f_df,src_col1, src_col2,new_col):
    df = f_df.copy()
    df[new_col] = df[str(src_col1)] + '_' + df[str(src_col2)]
    print("df shape: ", df.shape)
    return df



f_df = change_var_type(f_df)

### custoemr_state + seller_state

In [7]:
f_df = comnbine_columns(f_df,src_col1='customer_state', src_col2='seller_state',new_col='customer_seller_state')

df shape:  (67176, 22)


### custoemr_city + seller_city

In [8]:
f_df = comnbine_columns(f_df,src_col1='customer_city', src_col2='seller_city',new_col='customer_seller_city')

df shape:  (67176, 23)


### custoemr_zip + seller_zip

In [9]:
f_df = comnbine_columns(f_df,src_col1='customer_zip_code_prefix', 
                        src_col2='seller_zip_code_prefix',new_col='customer_seller_zip_code_prefix')

df shape:  (67176, 24)


In [10]:
f_df

Unnamed: 0,classes,order_approved_at,customer_id,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_id,product_weight_g,...,seller_zip_code_prefix,seller_city,seller_state,order_date,order_weekday,order_day,order_month,customer_seller_state,customer_seller_city,customer_seller_zip_code_prefix
37413,3,2018-07-14 13:04:07,4ee61c3905a5c398d44b089108961bb3,28950,armacao dos buzios,RJ,105.00,27.04,2f13d1dc8b4e1d9d8027be50339546a9,2650.0,...,3204,sao paulo,SP,2018-07-14 13:04:07,5,14,7,RJ_SP,armacao dos buzios_sao paulo,28950_3204
54762,3,2017-03-25 10:25:16,959292edcade77d6b60dc8f49f01cd71,37880,cabo verde,MG,23.99,14.52,b000447e24e31a4d7e628ca4d0622131,250.0,...,3504,sao paulo,SP,2017-03-25 10:25:16,5,25,3,MG_SP,cabo verde_sao paulo,37880_3504
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18782,3,2018-07-16 18:20:34,609b34a18fd0d61719d0b3190ec05231,21240,rio de janeiro,RJ,200.00,19.50,92d5ae8e42c4599266f210bf0469ae9b,439.0,...,14050,ribeirao preto,SP,2018-07-16 18:20:34,0,16,7,RJ_SP,rio de janeiro_ribeirao preto,21240_14050
3776,1,2018-05-25 07:06:30,dc275c5e585b7c3a3cddef527e34fc19,26540,nilopolis,RJ,18.90,7.55,15de022edf1005363381e66bed514528,100.0,...,20270,rio de janeiro,RJ,2018-05-25 07:06:30,4,25,5,RJ_RJ,nilopolis_rio de janeiro,26540_20270


## product volume 컬럼 생성 (가로 * 세로 * 높이 의 계산값)

In [11]:
def add_product_volume(raw_df):
    df = raw_df.copy()
    df['product_volume'] = df.product_length_cm * df.product_width_cm * df.product_height_cm
    return df

f_df = add_product_volume(f_df)

In [12]:
f_df.columns

Index(['classes', 'order_approved_at', 'customer_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state', 'price',
       'freight_value', 'product_id', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm',
       'product_category_name_english', 'seller_zip_code_prefix',
       'seller_city', 'seller_state', 'order_date', 'order_weekday',
       'order_day', 'order_month', 'customer_seller_state',
       'customer_seller_city', 'customer_seller_zip_code_prefix',
       'product_volume'],
      dtype='object')

## Train, Test 데이터 셋 분리

In [13]:

def split_data_2(raw_df, sort_col='order_approved_at',val_ratio=0.3):
    '''
    train, test 데이터 분리
    '''
    df = raw_df.copy()
    val_ratio = 1 - val_ratio # 1 - 0.3  = 0.7

    
    df = df.sort_values(by= sort_col) # 시간 순으로 정렬
    # One-Hot-Encoding
    data1,data2, = np.split(df, 
                     [int(val_ratio * len(df))])   # Randomly sort the data then split out first 70%, second 20%, and last 10%
    
    print(f"data1, data2 shape: {data1.shape},{data2.shape}")
    
    return data1, data2

train_df, test_df = split_data_2(f_df, val_ratio=0.2)




data1, data2 shape: (53740, 25),(13436, 25)


## Target Encoding 관련 피쳐 생성
- Product_ID 별 Classes의 평균, 갯수 (te_pdid_mean, te_pdid_count)
- Target Error (classes - te_pdid_mean)

## Target Encoding with Smoothing
아래 비디오 및 코드 참조 함
- Feature Engineering
    - RecSys 2020 Tutorial: Feature Engineering for Recommender Systems
        - https://www.youtube.com/watch?v=uROvhp7cj6Q
    - Git Repo
        - https://github.com/rapidsai/deeplearning/tree/main/RecSys2020Tutorial



\begin{equation} \label{eq:te}
TE_{target}([Categories]) = \frac{count([Categories]) * mean_{target}([Categories]) + w_{smoothing} * mean_{target}(global)}{count([Categories]) + w_{smoothing}}
\end{equation}

In [14]:

def create_target_encoding(cat, raw_df):
    '''
    te_mean, te_count 피쳐 생성
    '''
    df = raw_df.copy()
    te = df.groupby(cat).classes.agg(['mean','count']).reset_index()
    te_mean_col = 'te_' + cat + '_mean'
    te_count_col = 'te_' + cat + '_count'    

    cat = [cat]
    te.columns = cat + [te_mean_col,te_count_col]
    te_df = df.merge(te, on=cat, how='left')
    
    return te_df

w = 20 # global 평균에 더 높은 가중치를 주는 값
def create_target_encoding_smoothe(cat, raw_df, w):
    '''
    te_mean, te_count를 기반으로 te_mean_smoothed 생성
    '''
    df = raw_df.copy()
    te_mean_col = 'te_' + cat + '_mean'
    te_count_col = 'te_' + cat + '_count'    
    te_target_col = 'te_' + cat + '_mean_smoothed'
    
    classes_mean_global = df.classes.mean()
    # print(" classes_mean_global: ", classes_mean_global)
    df[te_target_col] = (df[te_mean_col] * df[te_count_col]) + (classes_mean_global * w) / (df[te_count_col] + w)
    df.drop([te_mean_col,te_count_col],axis=1, inplace=True)

    return df

def add_te_on_test(raw_train_df, raw_test_df, join_col, te_col_name):
    '''
    train의 te_mean_smoothed를 기반으로 test데이터에도 te_mean_smoothed를 생성함.
    만일 train에 해당 값이 없으면, 전체 평균값으로 대체 함.
    '''
    train_df = raw_train_df[[join_col,te_col_name]]
    test_df = raw_test_df.copy()
    
    # global_mean = train_df[te_col_name].mean() # 전체 레코드의 평균
    avg_train = train_df.groupby(join_col)[te_col_name].mean() # join_col 의 평균값을 구함
    avg_train_df = pd.DataFrame(avg_train).reset_index()
    # display(avg_train_df)
        
    target_df = pd.merge(
        test_df,
        avg_train_df,
        on = join_col,
        how = 'left'        
    )
    
    global_mean = avg_train_df[te_col_name].mean() # 전체 레코드의 평균    
    
    # print("global mean: ", global_mean)
    # train에 있고 test에 없으면 train의 평균값을 채움
    target_df[te_col_name].fillna(global_mean, inplace=True)
    
    print("targe_df and  test shape: ", target_df.shape, test_df.shape)    
    assert(test_df.shape[0] == target_df.shape[0])
    
    return target_df




In [15]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
#    display(averages)
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # display(smoothing)
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    display(averages)    
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

## Target Encoding 실행

In [17]:
def add_new_te(raw_train, raw_test):
    train_df = raw_train.copy()
    test_df = raw_test.copy()
    
    cat = 'product_id'
    trn, sub = target_encode(train_df[cat], 
                             test_df[cat], 
                             target=train_df.classes, 
                             min_samples_leaf=100,
                             smoothing=10,
                             noise_level=0.01)
    te_col_name = 'te_' + cat + '_mean_smoothed'
    train_df[te_col_name] = trn
    test_df[te_col_name] = sub
    
    return train_df, test_df
    

train_df, test_df = add_new_te(train_df, test_df)    
    
    

Unnamed: 0_level_0,classes
product_id,Unnamed: 1_level_1
00088930e925c41fd95ebfe695fd2655,2.178716
000d9be29b5207b54e86aa1b1ac54872,2.178666
...,...
fff9553ac224cec9d15d49f5a263411f,2.178716
fffdb2d0ec8d6a61f0a0a0db3f25b441,2.178610


In [18]:
display(train_df.head(2))
display(test_df.head(2))

Unnamed: 0,classes,order_approved_at,customer_id,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_id,product_weight_g,...,seller_state,order_date,order_weekday,order_day,order_month,customer_seller_state,customer_seller_city,customer_seller_zip_code_prefix,product_volume,te_product_id_mean_smoothed
2605,3,2016-10-04 10:19:23,7812fcebfc5e8065d31e1bb5f0017dae,12030,taubate,SP,29.99,10.96,e2a1d45a73dc7f5a7f9236b043431b89,9000.0,...,SP,2016-10-04 10:19:23,1,4,10,SP_SP,taubate_limeira,12030_13481,2640.0,2.166231
41063,2,2016-10-04 13:46:31,aadd27185177fc7ac9b364898ac09343,78075,cuiaba,MT,23.9,26.82,43bb8825dd6838251606e5e4130cfff4,1500.0,...,SP,2016-10-04 13:46:31,1,4,10,MT_SP,cuiaba_bauru,78075_17051,14625.0,2.180797


Unnamed: 0,classes,order_approved_at,customer_id,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_id,product_weight_g,...,seller_state,order_date,order_weekday,order_day,order_month,customer_seller_state,customer_seller_city,customer_seller_zip_code_prefix,product_volume,te_product_id_mean_smoothed
4927,3,2018-06-19 03:36:38,956346db615f7adb9ea991c5e4648c55,89219,joinville,SC,105.0,23.89,a62e25e09e05e6faf31d90c6ec1aa3d1,1000.0,...,RJ,2018-06-19 03:36:38,1,19,6,SC_RJ,joinville_rio de janeiro,89219_21840,7632.0,2.3996
6617,0,2018-06-19 03:36:39,a3777228aa1a73a3bf9a3127609cf68a,9950,diadema,SP,99.97,15.8,ed1e39b938c6cc6867d8f5fd408ce319,650.0,...,MG,2018-06-19 03:36:39,1,19,6,SP_MG,diadema_betim,9950_32677,9600.0,2.181332


In [19]:
def create_fe_target_encoding(raw_train, raw_test, cat_cols):
    train_df = raw_train.copy()
    test_df = raw_test.copy()
    for col in cat_cols:
        print("target col: ", col)
        train_df = create_target_encoding(col, train_df)
        train_df = create_target_encoding_smoothe(col, train_df, w)
        te_col_name = 'te_' + col + '_mean_smoothed'
        test_df = add_te_on_test(train_df, test_df, join_col = col, te_col_name= te_col_name)    
        
    return train_df, test_df

cat_cols = ['product_category_name_english',
            'seller_state','seller_city','seller_zip_code_prefix',
            'customer_seller_city','customer_seller_state','customer_seller_zip_code_prefix']

# cat_cols = ['product_id','product_category_name_english',
#             'seller_state','seller_city','seller_zip_code_prefix',
#             'customer_seller_city','customer_seller_state','customer_seller_zip_code_prefix']



train2_df, test2_df = create_fe_target_encoding(train_df, test_df, cat_cols)



target col:  product_category_name_english
targe_df and  test shape:  (13436, 27) (13436, 26)
target col:  seller_state
targe_df and  test shape:  (13436, 28) (13436, 27)
target col:  seller_city
targe_df and  test shape:  (13436, 29) (13436, 28)
target col:  seller_zip_code_prefix
targe_df and  test shape:  (13436, 30) (13436, 29)
target col:  customer_seller_city
targe_df and  test shape:  (13436, 31) (13436, 30)
target col:  customer_seller_state
targe_df and  test shape:  (13436, 32) (13436, 31)
target col:  customer_seller_zip_code_prefix
targe_df and  test shape:  (13436, 33) (13436, 32)


In [20]:
train2_df.head(2)

Unnamed: 0,classes,order_approved_at,customer_id,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_id,product_weight_g,...,customer_seller_zip_code_prefix,product_volume,te_product_id_mean_smoothed,te_product_category_name_english_mean_smoothed,te_seller_state_mean_smoothed,te_seller_city_mean_smoothed,te_seller_zip_code_prefix_mean_smoothed,te_customer_seller_city_mean_smoothed,te_customer_seller_state_mean_smoothed,te_customer_seller_zip_code_prefix_mean_smoothed
0,3,2016-10-04 10:19:23,7812fcebfc5e8065d31e1bb5f0017dae,12030,taubate,SP,29.99,10.96,e2a1d45a73dc7f5a7f9236b043431b89,9000.0,...,12030_13481,2640.0,2.166231,313.325175,80621.001131,704.106277,289.234266,5.980614,35917.001966,5.074929
1,2,2016-10-04 13:46:31,aadd27185177fc7ac9b364898ac09343,78075,cuiaba,MT,23.9,26.82,43bb8825dd6838251606e5e4130cfff4,1500.0,...,78075_17051,14625.0,2.180797,7805.012365,80621.001131,204.396123,40.117269,4.074929,614.20457,4.074929


In [21]:
test2_df

Unnamed: 0,classes,order_approved_at,customer_id,customer_zip_code_prefix,customer_city,customer_state,price,freight_value,product_id,product_weight_g,...,customer_seller_zip_code_prefix,product_volume,te_product_id_mean_smoothed,te_product_category_name_english_mean_smoothed,te_seller_state_mean_smoothed,te_seller_city_mean_smoothed,te_seller_zip_code_prefix_mean_smoothed,te_customer_seller_city_mean_smoothed,te_customer_seller_state_mean_smoothed,te_customer_seller_zip_code_prefix_mean_smoothed
0,3,2018-06-19 03:36:38,956346db615f7adb9ea991c5e4648c55,89219,joinville,SC,105.00,23.89,a62e25e09e05e6faf31d90c6ec1aa3d1,1000.0,...,89219_21840,7632.0,2.399600,6506.014877,4961.018757,2484.036771,365.281119,17.815563,175.544669,4.34846
1,0,2018-06-19 03:36:39,a3777228aa1a73a3bf9a3127609cf68a,9950,diadema,SP,99.97,15.80,ed1e39b938c6cc6867d8f5fd408ce319,650.0,...,9950_32677,9600.0,2.181332,6718.013196,10105.010047,391.201729,116.495153,5.074929,3818.025616,4.34846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13434,0,2018-08-29 15:05:22,496630b6740bcca28fce9ba50d8a26ef,9541,sao caetano do sul,SP,24.90,8.33,c7f27c5bef2338541c772b5776403e6a,450.0,...,9541_1320,4788.0,2.182778,32.177662,80621.001131,27766.003039,68.310888,75.537944,35917.001966,4.34846
13435,0,2018-08-29 15:10:26,898b7fee99c4e42170ab69ba59be0a8b,13483,limeira,SP,84.99,8.76,d04857e7b4b708ee8b8b9921163edba3,450.0,...,13483_4102,1280.0,2.189203,8250.012020,80621.001131,27766.003039,293.203614,41.854382,35917.001966,4.34846


In [22]:
print(train2_df.shape)
print(test2_df.shape)

(53740, 33)
(13436, 33)


## Category 레이블 Encoding

In [23]:
# from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
class LabelEncoderExt(object):
    '''
    Source:
    # https://stackoverflow.com/questions/21057621/sklearn-labelencoder-with-never-seen-before-values
    '''
    def __init__(self):
        """
        It differs from LabelEncoder by handling new classes and providing a value for it [Unknown]
        Unknown will be added in fit and transform will take care of new item. It gives unknown class id
        """
        self.label_encoder = LabelEncoder()
        # self.classes_ = self.label_encoder.classes_

    def fit(self, data_list):
        """
        This will fit the encoder for all the unique values and introduce unknown value
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        This will transform the data_list to id list where the new values get assigned to Unknown class
        :param data_list:
        :return:
        """
        new_data_list = list(data_list)
        for unique_item in np.unique(data_list):
            if unique_item not in self.label_encoder.classes_:
                new_data_list = ['Unknown' if x==unique_item else x for x in new_data_list]

        return self.label_encoder.transform(new_data_list)
    
def make_test_label_encoding(raw_train_df, raw_test_df,cols):
    train_df = raw_train_df.copy()
    test_df = raw_test_df.copy()
    
    for lb_col in cols:
        print(lb_col)
        le = LabelEncoderExt()
        le = le.fit(train_df[lb_col])
        
        train_en = le.transform(train_df[lb_col])
        test_en = le.transform(test_df[lb_col])        
        lb_col_name = 'lb_' + lb_col
        print("new col name: ", lb_col_name)
        train_df[lb_col_name] = train_en
        test_df[lb_col_name] = test_en        
    
    return train_df, test_df



### Category 변수의 레이블 인코딩 실행

In [24]:
label_cols = ['customer_city','customer_state','customer_zip_code_prefix']
train2_lb, test2_lb = make_test_label_encoding(train2_df, test2_df, label_cols)

customer_city
new col name:  lb_customer_city
customer_state
new col name:  lb_customer_state
customer_zip_code_prefix
new col name:  lb_customer_zip_code_prefix


In [25]:
pd.options.display.max_rows = 10
show_rows = 5
print(train2_lb.customer_state.value_counts()[0:show_rows])
# print(train2_lb[train2_lb.lb_customer_city == 185])
print(test2_lb.customer_state.value_counts()[0:show_rows])

SP    28232
MG     6763
RJ     6034
PR     2912
RS     2385
Name: customer_state, dtype: int64
SP    6642
MG    1541
RJ    1491
PR     715
RS     663
Name: customer_state, dtype: int64


## 레이블 Encoding 안하고 바로 사용(AutoGluon 용)

In [26]:
# no_encoding_cate = tes_df

## 최종 사용할 컬럼 지정
### XGBoost, CatBoost 알고리즘 용

In [27]:
def filter_df(raw_df, cols):
    df = raw_df.copy()
    df = df[cols]
    return df


cols = ['classes',
        'lb_customer_city',
        'lb_customer_state',  
        'lb_customer_zip_code_prefix',        
        'price', 'freight_value',
        'product_weight_g', 
        'product_volume',    
        'order_weekday',
        'order_day', 'order_month',        
        'te_product_id_mean_smoothed',
        'te_product_category_name_english_mean_smoothed',        
        'te_seller_state_mean_smoothed', 'te_seller_city_mean_smoothed',
        'te_seller_zip_code_prefix_mean_smoothed',
        'te_customer_seller_city_mean_smoothed',
        'te_customer_seller_state_mean_smoothed',
        'te_customer_seller_zip_code_prefix_mean_smoothed',
       ]


encode_train = filter_df(train2_lb, cols)
encode_test = filter_df(test2_lb, cols)

# no_encode_train = filter_df(encode_train, cols)
# no_encode_val = filter_df(encode_val, cols)
# no_encode_test = filter_df(encode_test, cols)


## 피쳐 변환한  AutoGluon 용

In [28]:
cols = ['classes',
        'customer_city',
        'customer_state',     
        'customer_zip_code_prefix',                
        'product_category_name_english',        
        'price', 'freight_value',
        'product_weight_g', 
        'product_volume',     
        'order_weekday',
        'order_day', 'order_month',        
        'te_product_id_mean_smoothed',
        'te_product_category_name_english_mean_smoothed',                
        'te_seller_state_mean_smoothed', 'te_seller_city_mean_smoothed',
        'te_seller_zip_code_prefix_mean_smoothed',
        'te_customer_seller_city_mean_smoothed',
        'te_customer_seller_state_mean_smoothed',
        'te_customer_seller_zip_code_prefix_mean_smoothed',
       ]


auto_train = filter_df(train2_lb, cols)
auto_test = filter_df(test2_lb, cols)


## 펴쳐 변환 없이 AutoGluon 용

In [29]:
train_df.columns

Index(['classes', 'order_approved_at', 'customer_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state', 'price',
       'freight_value', 'product_id', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm',
       'product_category_name_english', 'seller_zip_code_prefix',
       'seller_city', 'seller_state', 'order_date', 'order_weekday',
       'order_day', 'order_month', 'customer_seller_state',
       'customer_seller_city', 'customer_seller_zip_code_prefix',
       'product_volume', 'te_product_id_mean_smoothed'],
      dtype='object')

In [30]:
cols = ['classes', 
       'customer_zip_code_prefix', 'customer_city', 'customer_state', 'price',
       'freight_value', 'product_weight_g', 
       'product_category_name_english', 'seller_zip_code_prefix',
       'seller_city', 'seller_state', 'order_date', 'order_weekday',
       'order_day', 'order_month', 'customer_seller_state',
       'customer_seller_city', 'customer_seller_zip_code_prefix',
       'product_volume']


no_auto_train = filter_df(train_df, cols)
no_auto_test = filter_df(test_df, cols)



## 로컬에 데이터 저장

In [31]:
import os

def save_local(train_data, test_data, preproc_folder):
    train_df = pd.concat([train_data['classes'], train_data.drop(['classes'], axis=1)], axis=1)
    train_file_name = os.path.join(preproc_folder, 'train.csv')
    train_df.to_csv(train_file_name, index=False)
    print(f'{train_file_name} is saved')

    test_df = pd.concat([test_data['classes'], test_data.drop(['classes'], axis=1)], axis=1)
    test_file_name = os.path.join(preproc_folder, 'test.csv')
    test_df.to_csv(test_file_name, index=False)
    print(f'{test_file_name} is saved')        
    
    return train_file_name, test_file_name

preproc_folder = 'preproc_data/feature_engineering'
os.makedirs(preproc_folder, exist_ok=True)    
pre_train_file, pre_test_file = save_local(encode_train, encode_test, preproc_folder)

preproc_folder = 'preproc_data/auto_feature_engineering'
os.makedirs(preproc_folder, exist_ok=True)    
auto_train_file,auto_test_file  = save_local(auto_train, auto_test, preproc_folder)

preproc_folder = 'preproc_data/auto_no_fe'
os.makedirs(preproc_folder, exist_ok=True)    
no_auto_train_file,no_auto_test_file  = save_local(no_auto_train, no_auto_test, preproc_folder)

preproc_data/feature_engineering/train.csv is saved
preproc_data/feature_engineering/test.csv is saved
preproc_data/auto_feature_engineering/train.csv is saved
preproc_data/auto_feature_engineering/test.csv is saved
preproc_data/auto_no_fe/train.csv is saved
preproc_data/auto_no_fe/test.csv is saved


In [32]:
%store pre_train_file
%store pre_test_file

%store auto_train_file
%store auto_test_file

%store no_auto_train_file
%store no_auto_test_file

Stored 'pre_train_file' (str)
Stored 'pre_test_file' (str)
Stored 'auto_train_file' (str)
Stored 'auto_test_file' (str)
Stored 'no_auto_train_file' (str)
Stored 'no_auto_test_file' (str)
