## Feature Related (Categorical)

### Concatenate Categorical Columns

In [3]:
def category_concat(df, subject_cols, print_option=True):
    """Add a column that concatenates several categorical columns row by row.

    The new column is named by joining ``subject_cols`` with ``'_'`` and holds
    the string concatenation (no separator) of those columns' values.
    ``df`` is modified in place; nothing is returned.

    Args:
        df: DataFrame to modify.
        subject_cols: Names of the columns to concatenate, in order.
        print_option: When true, print the name of the generated column.

    Note:
        Missing values are replaced with ``''`` in *every* column of ``df``
        that contains any, not only in ``subject_cols``.
        NOTE(review): kept for backward compatibility -- confirm that filling
        unrelated columns is really intended.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the df[col] intermediate: the chained in-place form does not update df
    # when pandas Copy-on-Write is active.
    for na_col in list(df.columns[df.isna().any()]):
        df[na_col] = df[na_col].fillna('')

    new_col = '_'.join(subject_cols)
    df[new_col] = ''
    for col in subject_cols:
        # astype(str) is a no-op for string columns and lets non-string
        # (e.g. integer-coded) categories be concatenated as well.
        df[new_col] += df[col].astype(str)

    if print_option:
        print("Generated features: category_concat")
        print(f"'{new_col}',")
        print()
    gc.collect()

### Target Encoder (OOF / smoothing)

In [164]:
class apply_target_encode:
    """Target-encoding helpers for a single categorical column.

    All methods are static and are called on the class itself, e.g.
    ``apply_target_encode.oof(...)``. Every encoded column is named
    ``f'{cat_col}_{suffix}'``, where the suffix is ``MEAN``,
    ``SMTH_MEAN_{m}`` or one of the extra statistics from ``fit_train``.
    """

    @staticmethod
    def fit_train(df, target_col, cat_col, m, statistic):
        """Fit per-category target statistics on ``df`` (train part of a fold).

        Args:
            df: DataFrame holding ``cat_col`` and ``target_col``.
            target_col: Name of the target column.
            cat_col: Name of the categorical column to group by.
            m: Smoothing weight. ``0`` gives the plain group mean; ``m > 0``
                gives ``(count * mean + m * global_mean) / (count + m)``.
            statistic: When true, also compute MAX, MIN, RNG, STD, Q1, Q2, Q3
                and IQR of the target per category.

        Returns:
            List of ``(mapper, suffix)`` tuples; each mapper is a float16
            Series indexed by category. The mean entry is always last.

        Raises:
            ValueError: If ``m`` is negative.
        """
        if m < 0:
            # The original left the mean undefined here and crashed later
            # with an UnboundLocalError.
            raise ValueError(f"m must be non-negative, got {m}")

        df_group = df.groupby(cat_col)[target_col]
        group_mean = df_group.mean()
        temp_stat = []
        # Arithmetic is done at full precision and only the final mappers are
        # stored as float16. Casting the group counts to float16 first (as
        # before) overflows to inf above 65504 rows per category, which turns
        # the smoothed mean into NaN.
        # NOTE(review): target values beyond the float16 range still overflow
        # in the stored mappers -- confirm float16 is acceptable for your data.
        # ===== smoothing =====
        if m > 0:
            global_mean = df[target_col].mean()
            group_count = df_group.count()
            smoother = ((group_count * group_mean) + (m * global_mean)) / (group_count + m)
            temp_mean = (smoother.astype(np.float16), f'SMTH_MEAN_{m}')
        # ===== no smoothing =====
        else:
            temp_mean = (group_mean.astype(np.float16), 'MEAN')
        # ===== more target statistic =====
        if statistic:
            group_min = df_group.min()
            group_max = df_group.max()
            group_q1 = df_group.quantile(0.25)
            group_q3 = df_group.quantile(0.75)
            extra_stats = [(group_max, 'MAX'), (group_min, 'MIN'),
                           (group_max - group_min, 'RNG'), (df_group.std(), 'STD'),
                           (group_q1, 'Q1'), (df_group.median(), 'Q2'),
                           (group_q3, 'Q3'), (group_q3 - group_q1, 'IQR')]
            temp_stat = [(stat.astype(np.float16), suffix) for stat, suffix in extra_stats]
        temp_stat.append(temp_mean)
        return temp_stat

    @staticmethod
    def transform_valid(temp_stat, df, valid_idx, cat_col, print_option, fold=None):
        """Encode the validation rows of ``df`` with statistics from ``fit_train``.

        Args:
            temp_stat: ``(mapper, suffix)`` tuples returned by ``fit_train``.
            df: Full training DataFrame; modified in place.
            valid_idx: Index labels of the rows to encode.
            cat_col: Name of the categorical column.
            print_option: When true, print each encoded column name.
            fold: Fold number. The encoded columns are (re)initialised with
                the ``'te_empty'`` placeholder when ``fold == 0`` or when the
                column does not exist yet. The original read an undefined
                ``fold`` name here.
        """
        import pandas as pd

        for mapper, agg_str in temp_stat:
            col_name = f'{cat_col}_{agg_str}'
            if fold == 0 or col_name not in df.columns:
                # Explicit object dtype so the placeholder column can later
                # receive numeric values regardless of pandas' default string
                # dtype.
                df[col_name] = pd.Series('te_empty', index=df.index, dtype=object)
            df.loc[valid_idx, col_name] = df.loc[valid_idx, cat_col].map(mapper)
            if print_option:
                print(f"'{col_name}',")

    @staticmethod
    def transform_test(df, test_df, target_col, cat_col, m, statistic, print_option):
        """Fit statistics on all of ``df`` and encode ``test_df`` in place.

        Arguments have the same meaning as in ``fit_train``; ``test_df`` gains
        one column per fitted statistic. Categories absent from ``df`` map to
        NaN.
        """
        temp_stat = apply_target_encode.fit_train(df, target_col, cat_col, m, statistic)
        for mapper, agg_str in temp_stat:
            col_name = f'{cat_col}_{agg_str}'
            test_df[col_name] = test_df[cat_col].map(mapper)
            if print_option:
                print(f"'{col_name}',")

    @staticmethod
    def fit_transform(df, test_df, target_col, cat_col, m=0, statistic=False, print_option=True):
        """Fit statistics on all of ``df`` and encode both ``df`` and ``test_df``.

        This is the plain (non out-of-fold) variant: the rows of ``df`` are
        encoded with statistics computed from those same rows. Both frames
        are modified in place.
        """
        temp_stat = apply_target_encode.fit_train(df, target_col, cat_col, m, statistic)
        for mapper, agg_str in temp_stat:
            col_name = f'{cat_col}_{agg_str}'
            df[col_name] = df[cat_col].map(mapper)
            test_df[col_name] = test_df[cat_col].map(mapper)
            if print_option:
                print(f"'{col_name}',")

    @staticmethod
    def oof(df, test_df, target_col, cat_col, split, m=0, statistic=False, print_option=True):
        """Out-of-fold target encoding of ``df`` plus full-train encoding of ``test_df``.

        For each ``(train_idx, valid_idx)`` pair in ``split`` the statistics
        are fitted on the train rows and written to the validation rows.
        ``test_df`` is then encoded with statistics fitted on all of ``df``.
        The same ``m`` / ``statistic`` settings are used everywhere.

        Args:
            df: Training DataFrame; modified in place.
            test_df: Test DataFrame; modified in place.
            target_col: Name of the target column.
            cat_col: Name of the categorical column.
            split: Iterable of ``(train_idx, valid_idx)`` pairs. The indices
                are used with ``.loc``, i.e. as index labels.
            m: Smoothing weight, see ``fit_train``.
            statistic: Whether to compute the extra statistics.
            print_option: When true, print the generated column names once.
        """
        encoded_cols = []
        # train/valid target encode
        for fold, (train_idx, valid_idx) in enumerate(split):
            temp_stat = apply_target_encode.fit_train(df=df.loc[train_idx, :],
                                                      target_col=target_col, cat_col=cat_col,
                                                      m=m, statistic=statistic)
            apply_target_encode.transform_valid(temp_stat=temp_stat,
                                                df=df, valid_idx=valid_idx, cat_col=cat_col,
                                                print_option=False, fold=fold)
            encoded_cols = [f'{cat_col}_{agg_str}' for _, agg_str in temp_stat]

        # Check for leftover placeholders only after all folds have run, and
        # compare values: `'te_empty' in series` tests the index labels.
        for col_name in encoded_cols:
            if (df[col_name] == 'te_empty').any():
                print(f"te_empty still left in '{col_name}'")
            else:
                # Every row was encoded: store the column as numeric, with the
                # same dtype as the mappers written to test_df.
                df[col_name] = df[col_name].astype(np.float16)

        # test oof (=train) target encode
        apply_target_encode.transform_test(df=df, test_df=test_df,
                                           target_col=target_col, cat_col=cat_col,
                                           m=m, statistic=statistic,
                                           print_option=print_option)

### Label Encode (not ordinal)

In [476]:
from sklearn.preprocessing import LabelEncoder

In [488]:
def apply_label_encode(df, test_df, subject_cols):
    """Label-encode the given columns of ``df`` and ``test_df`` in place.

    For each column a ``LabelEncoder`` is fitted on the distinct values of
    both frames, and the column is replaced by its integer codes in each
    frame. Nothing is returned.

    Args:
        df: Training DataFrame; modified in place.
        test_df: Test DataFrame; modified in place.
        subject_cols: Names of the columns to encode.
    """
    lbl = LabelEncoder()
    for str_col in subject_cols:
        # Fit on the labels of both frames: fitting on df alone makes
        # transform() raise ValueError for any category that occurs only in
        # test_df. When test_df has no unseen category the fitted classes,
        # and therefore the codes, are the same as with a train-only fit.
        labels = np.concatenate([np.asarray(df[str_col].unique()),
                                 np.asarray(test_df[str_col].unique())])
        lbl.fit(labels)
        df[str_col] = lbl.transform(df[str_col])
        test_df[str_col] = lbl.transform(test_df[str_col])
    # No `del str_col` here: the loop variable is unbound when subject_cols
    # is empty, and locals are released on return anyway.
    gc.collect()

In [None]:
# (for meta)
def apply_label_encode(df, subject_cols):
    """Replace each given column of ``df`` with integer codes, in place.

    Codes are assigned in order of first appearance of each distinct value
    in the column (0, 1, 2, ...). Nothing is returned.

    Args:
        df: DataFrame to modify.
        subject_cols: Names of the columns to encode.
    """
    # Number of distinct codes (0 .. 32767) that fit in an int16.
    int16_capacity = np.iinfo(np.int16).max + 1
    for str_col in subject_cols:
        # ===== assumes Series of string =====
        code_of = {value: i for i, value in enumerate(df[str_col].unique())}
        # int16 silently wraps around once there are more distinct values
        # than it can hold, so widen to int32 for high-cardinality columns.
        code_dtype = np.int16 if len(code_of) <= int16_capacity else np.int32
        df[str_col] = df[str_col].map(code_of).astype(code_dtype)
    # No `del` of the loop locals: they are unbound when subject_cols is
    # empty, and they are released on return anyway.
    gc.collect()

### Frequency / Count Encode

In [6]:
def apply_freq_encode(df, str_col, print_option=True):
    """Add count and ratio (frequency) encodings of ``str_col`` to ``df``.

    Two columns are added in place:
    ``f'{str_col}_COUNT'`` -- number of rows sharing the row's value, and
    ``f'{str_col}_RATIO'`` -- that count divided by the number of rows.
    Rows whose value is missing get a count (and ratio) of 0.

    Args:
        df: DataFrame to modify.
        str_col: Name of the column to encode.
        print_option: When true, print the names of the generated columns.
    """
    # One value_counts() pass replaces a full boolean scan of the column per
    # distinct value.
    value_counts = df[str_col].value_counts()
    # value_counts() skips missing values, so those rows map to NaN; fill
    # with 0 to keep an integer count column.
    row_counts = df[str_col].map(value_counts).fillna(0).astype('int64')
    df[f'{str_col}_COUNT'] = row_counts
    df[f'{str_col}_RATIO'] = row_counts / df[str_col].shape[0]

    if print_option:
        print(f"'{str_col}_COUNT',")
        print(f"'{str_col}_RATIO',")
        print()
    gc.collect()