In [1]:
import pandas as pd
import math
from setup import config

In [2]:
feature_config = config.cast('feature')
feature_ind_config = feature_config.cast('ind')

In [3]:
# 获取某一类的特征列名
def get_feature_column_name_list(df: pd.DataFrame, prefix: str) -> list:
    columns = list(df.columns)
    return [col for col in columns if col.startswith(prefix)]


In [4]:
# 检查特征列里有没有缺失值
def get_name_list_of_feature_with_missing_value(df: pd.DataFrame, feature: list, placeholder) -> list:
    col_has_missing_value = []
    for col in feature:
        value_counts = df[col].value_counts()
        value_counts_index = list(value_counts.index)
        if placeholder in value_counts_index:
            col_has_missing_value.append(col)
    return col_has_missing_value   

In [5]:
data_set_train_path = config.data.path('train.csv')
data_set_train_ind_path = config.data.path('train_ind.csv')
data_set_train_ind_uniquified_path = config.data.path('train_ind_uniquified.csv')

In [6]:
data_set_train = pd.read_csv(data_set_train_path)

In [7]:
_data_set_train_ind_columns = get_feature_column_name_list(data_set_train, prefix='ps_ind')

In [8]:
# 获取属性 ps_ind 相关列
_data_set_train_ind = data_set_train[['id', 'target']+_data_set_train_ind_columns]

In [9]:
feature_with_missing_value = get_name_list_of_feature_with_missing_value(
    _data_set_train_ind, 
    feature=_data_set_train_ind_columns,
    placeholder=-1
)
print('存在缺失值的列: ' + str(feature_with_missing_value))

存在缺失值的列: ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat']


In [10]:
def fix_feature_ps_ind(df: pd.DataFrame) -> pd.DataFrame:
    
    # 存在缺失值的都是 category, 直接 one-hot编码
    df_columns = list(df.columns)
    _df = pd.get_dummies(
        data=df, 
        columns=[col for col in df_columns if '_cat' in col]
    )
    
    return _df


In [11]:
data_set_train_ind = fix_feature_ps_ind(_data_set_train_ind)

In [18]:
data_set_train_ind_columns = list(data_set_train_ind.columns)
feature_ind_length = len(get_feature_column_name_list(df=data_set_train_ind, prefix='ps_ind'))
feature_ind_config.parameter.put(tag='length', value=feature_ind_length)

In [13]:
# 保存
data_set_train_ind.to_csv(path_or_buf=data_set_train_ind_path, index=False)

In [14]:
# 给每个列加上一个基数
def uniquify(df: pd.DataFrame, columns: list, offset: int=0, neat: bool = True) -> (pd.DataFrame, int):
    
    df = df.copy()
    
    for col in columns:
        # 如果要精简, 每列先减去其最小值
        
        col_min = df[col].min()
        col_max = df[col].max()
        
        col_value_range_length = col_max - col_min + 1
        
        # 每列减去最小值
        if col_min != 0 and neat:
            df[col] = df[col] - col_min
            
        col_unique_value = sorted(pd.unique(df[col]))
        col_unique_value_number = \
            len(col_unique_value) if col_value_range_length <= len(col_unique_value) else col_value_range_length
        
        print(col, col_max, col_min, col_unique_value_number, col_unique_value)
        
        df[col] = df[col] + offset
        offset = offset + col_unique_value_number
    
    return df, offset


In [19]:

feature_ind_embedding_offset = 0
    
data_set_train_ind_uniquified, limit = \
    uniquify(
        data_set_train_ind, 
        get_feature_column_name_list(df=data_set_train_ind, prefix='ps_ind'), 
        offset=feature_ind_embedding_offset
    )

feature_ind_embedding_limit = math.ceil(limit/100)*100


ps_ind_01 7 0 8 [0, 1, 2, 3, 4, 5, 6, 7]
ps_ind_03 11 0 12 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
ps_ind_06_bin 1 0 2 [0, 1]
ps_ind_07_bin 1 0 2 [0, 1]
ps_ind_08_bin 1 0 2 [0, 1]
ps_ind_09_bin 1 0 2 [0, 1]
ps_ind_10_bin 1 0 2 [0, 1]
ps_ind_11_bin 1 0 2 [0, 1]
ps_ind_12_bin 1 0 2 [0, 1]
ps_ind_13_bin 1 0 2 [0, 1]
ps_ind_14 4 0 5 [0, 1, 2, 3, 4]
ps_ind_15 13 0 14 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
ps_ind_16_bin 1 0 2 [0, 1]
ps_ind_17_bin 1 0 2 [0, 1]
ps_ind_18_bin 1 0 2 [0, 1]
ps_ind_02_cat_-1 1 0 2 [0, 1]


ps_ind_02_cat_1 1 0 2 [0, 1]
ps_ind_02_cat_2 1 0 2 [0, 1]
ps_ind_02_cat_3 1 0 2 [0, 1]
ps_ind_02_cat_4 1 0 2 [0, 1]
ps_ind_04_cat_-1 1 0 2 [0, 1]
ps_ind_04_cat_0 1 0 2 [0, 1]
ps_ind_04_cat_1 1 0 2 [0, 1]
ps_ind_05_cat_-1 1 0 2 [0, 1]
ps_ind_05_cat_0 1 0 2 [0, 1]
ps_ind_05_cat_1 1 0 2 [0, 1]


ps_ind_05_cat_2 1 0 2 [0, 1]
ps_ind_05_cat_3 1 0 2 [0, 1]
ps_ind_05_cat_4 1 0 2 [0, 1]
ps_ind_05_cat_5 1 0 2 [0, 1]
ps_ind_05_cat_6 1 0 2 [0, 1]


In [20]:
# 将每一列的特征社转为embedding的idx

print(feature_ind_embedding_limit)

feature_ind_config.parameter.put('embedding.offset', feature_ind_embedding_offset)
feature_ind_config.parameter.put('embedding.limit', feature_ind_embedding_limit)

100


In [21]:
data_set_train_ind_uniquified.to_csv(path_or_buf=data_set_train_ind_uniquified_path, index=False)

In [22]:
data_set_train_ind_uniquified['target'].value_counts() / data_set_train_ind_uniquified['target'].size

0    0.963552
1    0.036448
Name: target, dtype: float64

In [23]:

data_set_test_path = config.data.path('test.csv')
data_set_test_ind_path = config.data.path('test_ind.csv')
data_set_test_ind_uniquified_path = config.data.path('test_ind_uniquified.csv')
data_set_test = pd.read_csv(data_set_test_path)

In [24]:
# 处理测试数据

_data_set_test_ind_columns = get_feature_column_name_list(data_set_test, prefix='ps_ind')
_data_set_test_ind = data_set_test[['id']+_data_set_test_ind_columns]
data_set_test_ind = fix_feature_ps_ind(_data_set_test_ind)

data_set_test_ind.to_csv(path_or_buf=data_set_test_ind_path, index=False)

data_set_test_ind_columns_ind_only = get_feature_column_name_list(data_set_test_ind, prefix='ps_ind')

feature_ind_embedding_offset = 0
    
data_set_test_ind_uniquified, limit = \
    uniquify(data_set_test_ind, data_set_test_ind_columns_ind_only, offset=feature_ind_embedding_offset)

feature_ind_embedding_limit = math.ceil(limit/100)*100

ps_ind_01 7 0 8 [0, 1, 2, 3, 4, 5, 6, 7]
ps_ind_03 11 0 12 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
ps_ind_06_bin 1 0 2 [0, 1]
ps_ind_07_bin 1 0 2 [0, 1]
ps_ind_08_bin 1 0 2 [0, 1]
ps_ind_09_bin 1 0 2 [0, 1]
ps_ind_10_bin 1 0 2 [0, 1]
ps_ind_11_bin 1 0 2 [0, 1]
ps_ind_12_bin 1 0 2 [0, 1]
ps_ind_13_bin 1 0 2 [0, 1]
ps_ind_14 4 0 5 [0, 1, 2, 3, 4]
ps_ind_15 13 0 14 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
ps_ind_16_bin 1 0 2 [0, 1]


ps_ind_17_bin 1 0 2 [0, 1]
ps_ind_18_bin 1 0 2 [0, 1]
ps_ind_02_cat_-1 1 0 2 [0, 1]
ps_ind_02_cat_1 1 0 2 [0, 1]
ps_ind_02_cat_2 1 0 2 [0, 1]
ps_ind_02_cat_3 1 0 2 [0, 1]
ps_ind_02_cat_4 1 0 2 [0, 1]
ps_ind_04_cat_-1 1 0 2 [0, 1]
ps_ind_04_cat_0 1 0 2 [0, 1]


ps_ind_04_cat_1 1 0 2 [0, 1]
ps_ind_05_cat_-1 1 0 2 [0, 1]
ps_ind_05_cat_0 1 0 2 [0, 1]
ps_ind_05_cat_1 1 0 2 [0, 1]
ps_ind_05_cat_2 1 0 2 [0, 1]
ps_ind_05_cat_3 1 0 2 [0, 1]
ps_ind_05_cat_4 1 0 2 [0, 1]
ps_ind_05_cat_5 1 0 2 [0, 1]
ps_ind_05_cat_6 1 0 2 [0, 1]


In [25]:
print(data_set_test_ind_uniquified.head(5))

In [26]:
data_set_test_ind_uniquified.to_csv(path_or_buf=data_set_test_ind_uniquified_path, index=False)