In [2]:
import pandas as pd
from setup import config
import preprocesss.utils as preprocess_utils
import numpy as np
from sklearn import preprocessing
import math

In [2]:
feature_all_config = config.cast('feature.all')

## 加载训练数据和测试数据

In [3]:
data_train_path = config.data.path('train.csv')
data_test_path = config.data.path('test.csv')

In [4]:
data_train = pd.read_csv(data_train_path)
data_test = pd.read_csv(data_test_path)

### 检查训练数据和测试数据是否一致

In [5]:
data_train_feature = preprocess_utils.get_column_type_pair_list(df=data_train, prefix='ps')
data_test_feature = preprocess_utils.get_column_type_pair_list(df=data_test, prefix='ps')

In [6]:
data_train_feature_type_int = list(filter(lambda x: 'float' not in x[1], data_train_feature))
data_test_feature_type_int = list(filter(lambda x: 'float' not in x[1], data_test_feature))

In [7]:
for col, _type in data_train_feature_type_int:
    train_col_unique = sorted(list(pd.unique(data_train[col])))
    test_col_unique = sorted(list(pd.unique(data_test[col])))
    if train_col_unique != test_col_unique:
        print('col: {0}\ntrain: {1}\ntest:  {2}'.format(col, train_col_unique, test_col_unique))

col: ps_calc_06
train: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
test:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
col: ps_calc_08
train: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
test:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
col: ps_calc_11
train: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
test:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
col: ps_calc_12
train: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
test:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11]
col: ps_calc_13
train: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
test:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
col: ps_calc_14
train: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
test:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 28]


## 合并数据

In [8]:
data_train['set'] = 'train'
data_test['set'] = 'test'

In [9]:
data = pd.concat([data_train, data_test], axis=0)

In [10]:
data_path = config.data.path('data.csv')

In [11]:
data.to_csv = data.to_csv(data_path, index=False)

In [None]:
data = pd.read_csv(data_path)

## 处理特征

In [12]:
data_original_column_name_list = list(data.columns)

In [13]:
# 获取特征列

data_feature_list = preprocess_utils.get_column_name_list_by_prefix(
    df=data,
    prefix='ps'
)
data_feature_list = sorted(data_feature_list, reverse=False)

In [14]:
print(data_feature_list)

['ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11', 'ps_car_11_cat', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03']


In [15]:
# 检查下存在缺失参数的列

feature_list_with_missing_value = preprocess_utils.get_column_name_list_with_missing_value(
    df=data,
    feature=data_feature_list,
    placeholder=-1,
)
feature_list_with_missing_value = \
    sorted(feature_list_with_missing_value, reverse=False)


In [16]:
print(feature_list_with_missing_value)

['ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_11', 'ps_car_12', 'ps_car_14', 'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_reg_03']


In [7]:
def show_feature_with_missing_value(df: pd.DataFrame, prefix: str, placeholder: int = -1):
    sample_number = len(df)
    column_name_list_with_missing_value = preprocess_utils.get_column_name_list_with_missing_value(
        df=df,
        feature=preprocess_utils.get_column_name_list_by_prefix(df=df, prefix=prefix),
        placeholder=placeholder,
    )
    column_name_list_with_missing_value = sorted(column_name_list_with_missing_value, reverse=False)
    
    for col in column_name_list_with_missing_value:
        missing_row_num = len(list(filter(lambda x: x, df[col] == placeholder)))
        print("col: {0}, dtype: {1}, missing rate: {2}".format(col, df[col].dtype, missing_row_num/sample_number))

In [18]:
show_feature_with_missing_value(data, prefix='ps', placeholder=-1)

col: ps_car_01_cat, dtype: int64, missing rate: 0.00017943210746034348


col: ps_car_02_cat, dtype: int64, missing rate: 6.72030365020013e-06
col: ps_car_03_cat, dtype: int64, missing rate: 0.6909426435524063


col: ps_car_05_cat, dtype: int64, missing rate: 0.4481837707354969
col: ps_car_07_cat, dtype: int64, missing rate: 0.019367915119876778


col: ps_car_09_cat, dtype: int64, missing rate: 0.0009717559078189389
col: ps_car_11, dtype: int64, missing rate: 4.032182190120078e-06


col: ps_car_12, dtype: float64, missing rate: 6.720303650200131e-07
col: ps_car_14, dtype: float64, missing rate: 0.07152083159725489


col: ps_ind_02_cat, dtype: int64, missing rate: 0.00035147188090546685
col: ps_ind_04_cat, dtype: int64, missing rate: 0.00015322292322456297


col: ps_ind_05_cat, dtype: int64, missing rate: 0.00975720886972557
col: ps_reg_03, dtype: float64, missing rate: 0.18108261403683265


### 处理特征 `ind`

In [19]:
# 特征 ind 的缺失值非常少, 又是整数, 取中位数即可


def fix_feature_ind(df: pd.DataFrame) -> pd.DataFrame:
    
    _df = df.copy()
    for col in ['ps_ind_02_cat', 'ps_ind_05_cat', 'ps_ind_04_cat']:
        median = _df[col][df[col] != -1].median()
        median = int(median)
        # mean = data_train_catergory_onehot[col].mean()
        # print(data_train[col].value_counts())
        print("col: {0}, median: {1}".format(col, median))
        _df[col][_df[col] == -1] = median

    return _df

In [20]:
data_p0 = fix_feature_ind(data)

col: ps_ind_02_cat, median: 1
col: ps_ind_05_cat, median: 0
col: ps_ind_04_cat, median: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [21]:
# 检查效果

show_feature_with_missing_value(df=data_p0, prefix='ps', placeholder=-1)

col: ps_car_01_cat, dtype: int64, missing rate: 0.00017943210746034348


col: ps_car_02_cat, dtype: int64, missing rate: 6.72030365020013e-06
col: ps_car_03_cat, dtype: int64, missing rate: 0.6909426435524063


col: ps_car_05_cat, dtype: int64, missing rate: 0.4481837707354969
col: ps_car_07_cat, dtype: int64, missing rate: 0.019367915119876778


col: ps_car_09_cat, dtype: int64, missing rate: 0.0009717559078189389
col: ps_car_11, dtype: int64, missing rate: 4.032182190120078e-06


col: ps_car_12, dtype: float64, missing rate: 6.720303650200131e-07
col: ps_car_14, dtype: float64, missing rate: 0.07152083159725489


col: ps_reg_03, dtype: float64, missing rate: 0.18108261403683265


#### 检查特征 `ind` 的数据类型, 如果有浮点类型, 需要归一化

In [22]:
for col, _type in preprocess_utils.get_column_type_pair_list(df=data_p0, prefix='ps_ind'):
    print(col, _type)

ps_ind_01 int64
ps_ind_02_cat int64
ps_ind_03 int64
ps_ind_04_cat int64
ps_ind_05_cat int64
ps_ind_06_bin int64
ps_ind_07_bin int64
ps_ind_08_bin int64
ps_ind_09_bin int64
ps_ind_10_bin int64
ps_ind_11_bin int64
ps_ind_12_bin int64
ps_ind_13_bin int64
ps_ind_14 int64
ps_ind_15 int64
ps_ind_16_bin int64
ps_ind_17_bin int64
ps_ind_18_bin int64


#### 保存

In [23]:
# 步骤0 保存路径
data_p0_path = config.data.path('data_p0.csv')


In [24]:
data_p0.to_csv(data_p0_path, index=False)

In [None]:
data_p0 = pd.read_csv(data_p0_path)

### 处理特征 `car`

#### 处理缺失值

In [25]:
show_feature_with_missing_value(data_p0, prefix='ps_car', placeholder=-1)

col: ps_car_01_cat, dtype: int64, missing rate: 0.00017943210746034348
col: ps_car_02_cat, dtype: int64, missing rate: 6.72030365020013e-06


col: ps_car_03_cat, dtype: int64, missing rate: 0.6909426435524063
col: ps_car_05_cat, dtype: int64, missing rate: 0.4481837707354969


col: ps_car_07_cat, dtype: int64, missing rate: 0.019367915119876778
col: ps_car_09_cat, dtype: int64, missing rate: 0.0009717559078189389


col: ps_car_11, dtype: int64, missing rate: 4.032182190120078e-06
col: ps_car_12, dtype: float64, missing rate: 6.720303650200131e-07


col: ps_car_14, dtype: float64, missing rate: 0.07152083159725489


In [26]:
def fix_feature_car(df: pd.DataFrame) -> pd.DataFrame:
    _df = df.copy()
    
    # 以下几个整形特征的缺失值非常少, 取中位数
    for col in ['ps_car_01_cat', 'ps_car_02_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_11']:
        median = df[col][df[col] != -1].median()
        median = int(median)
        # print(df[col].value_counts())
        print("col: {0}, median: {1}".format(col, median))
        _df[col][_df[col] == -1] = median
    
    # ps_car_03, ps_car_05 的缺失值非常多, 先放着不管, 直接做独热化处理即可
    
    # 以下几个浮点特征的缺失值较少, 取中位数
    for col in ['ps_car_12', 'ps_car_14']:
        mean = df[col][df[col] != -1].mean()
        # print(df[col].describe())
        print("col: {0}, mean: {1}".format(col, mean))
        _df[col][_df[col] == -1] = mean

    return _df

In [27]:
data_p1 = fix_feature_car(data_p0)

col: ps_car_01_cat, median: 7
col: ps_car_02_cat, median: 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


col: ps_car_07_cat, median: 1
col: ps_car_09_cat, median: 2


col: ps_car_11, median: 3
col: ps_car_12, mean: 0.379952832334839


col: ps_car_14, mean: 0.3746795999575552


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [28]:
show_feature_with_missing_value(data_p1, prefix='ps_car', placeholder=-1)

col: ps_car_03_cat, dtype: int64, missing rate: 0.6909426435524063
col: ps_car_05_cat, dtype: int64, missing rate: 0.4481837707354969


#### 归一化

In [29]:
for col, _type in preprocess_utils.get_column_type_pair_list(df=data_p1, prefix='ps_car'):
    print(col, _type)

ps_car_01_cat int64
ps_car_02_cat int64
ps_car_03_cat int64
ps_car_04_cat int64
ps_car_05_cat int64
ps_car_06_cat int64
ps_car_07_cat int64
ps_car_08_cat int64
ps_car_09_cat int64
ps_car_10_cat int64
ps_car_11 int64
ps_car_11_cat int64
ps_car_12 float64
ps_car_13 float64
ps_car_14 float64
ps_car_15 float64


In [30]:
ps_car_type_float = ['ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15']

In [31]:
for col in ps_car_type_float:
    print(data_p1[col].describe())

count    1.488028e+06
mean     3.799528e-01
std      5.835091e-02
min      1.000000e-01
25%      3.162278e-01
50%      3.741657e-01
75%      4.000000e-01
max      1.264911e+00
Name: ps_car_12, dtype: float64
count    1.488028e+06
mean     8.134878e-01
std      2.247024e-01
min      2.506191e-01
25%      6.710052e-01
50%      7.660406e-01
75%      9.061429e-01
max      4.031301e+00
Name: ps_car_13, dtype: float64


count    1.488028e+06
mean     3.746796e-01
std      4.395892e-02
min      1.095445e-01
25%      3.535534e-01
50%      3.746796e-01
75%      3.964846e-01
max      6.363961e-01
Name: ps_car_14, dtype: float64
count    1.488028e+06
mean     3.067355e+00
std      7.299510e-01
min      0.000000e+00
25%      2.828427e+00
50%      3.316625e+00
75%      3.605551e+00
max      3.741657e+00
Name: ps_car_15, dtype: float64


In [32]:
data_p1_scale = data_p1.copy()
for col in ps_car_type_float:
    data_p1_scale[col] = preprocessing.scale(data_p1_scale[col])
    print('col: {0}, mean: {1}, var: {2}'.format(
        col, data_p1_scale[col].mean(), data_p1_scale[col].std())
    )

col: ps_car_12, mean: -1.2610559221490212e-15, var: 1.0000003360153515
col: ps_car_13, mean: 4.3949679306679457e-17, var: 1.000000336015352
col: ps_car_14, mean: 1.7903811992572384e-15, var: 1.0000003360153518
col: ps_car_15, mean: -3.0856456723138053e-16, var: 1.0000003360153518


In [33]:
data_p1 = data_p1_scale

#### 保存

In [3]:
data_p1_path = config.data.path('data_p1.csv')

In [35]:
data_p1.to_csv(data_p1_path, index=False)


In [4]:
data_p1 = pd.read_csv(data_p1_path)

## 处理特征 'calc'

In [5]:
data_p2 = data_p1

### 处理缺失值

In [8]:
show_feature_with_missing_value(data_p2, prefix='ps_calc', placeholder=-1)

### 归一化

In [9]:
for col, _type in preprocess_utils.get_column_type_pair_list(df=data_p2, prefix='ps_calc'):
    print(col, _type)

ps_calc_01 float64
ps_calc_02 float64
ps_calc_03 float64
ps_calc_04 int64
ps_calc_05 int64
ps_calc_06 int64
ps_calc_07 int64
ps_calc_08 int64
ps_calc_09 int64
ps_calc_10 int64
ps_calc_11 int64
ps_calc_12 int64
ps_calc_13 int64
ps_calc_14 int64
ps_calc_15_bin int64
ps_calc_16_bin int64
ps_calc_17_bin int64
ps_calc_18_bin int64
ps_calc_19_bin int64
ps_calc_20_bin int64


In [10]:
ps_calc_type_float = ['ps_calc_01', 'ps_calc_02', 'ps_calc_03']

In [11]:
data_p2_scale = data_p2.copy()
for col in ps_calc_type_float:
    data_p2_scale[col] = preprocessing.scale(data_p2_scale[col])
    print("col: {0}, mean: {1}, var: {2}".format(
        col, data_p2_scale[col].mean(), data_p2_scale[col].std())
    )

col: ps_calc_01, mean: -8.932614742005555e-16, var: 1.000000336015352
col: ps_calc_02, mean: -3.938448038217371e-16, var: 1.0000003360153515
col: ps_calc_03, mean: -4.530770720702927e-16, var: 1.0000003360153522


In [12]:
data_p2 = data_p2_scale
for col in ps_calc_type_float:
    print("col: {0}, mean: {1}, var: {2}".format(
        col, data_p2[col].mean(), data_p2[col].std())
    )

col: ps_calc_01, mean: -8.932614742005555e-16, var: 1.000000336015352
col: ps_calc_02, mean: -3.938448038217371e-16, var: 1.0000003360153515
col: ps_calc_03, mean: -4.530770720702927e-16, var: 1.0000003360153522


### 保存

In [13]:
data_p2_path = config.data.path('data_p2.csv')

In [14]:
data_p2.to_csv(data_p2_path, index=False)

In [None]:
data_p2 = pd.read_csv(data_p2_path)

## 处理特征 `reg`

#### 处理缺失值

In [15]:
show_feature_with_missing_value(data_p2, prefix='ps', placeholder=-1)

col: ps_car_03_cat, dtype: int64, missing rate: 0.6909426435524063
col: ps_car_05_cat, dtype: int64, missing rate: 0.4481837707354969


col: ps_reg_03, dtype: float64, missing rate: 0.18108261403683265


In [16]:
def fix_feature_reg(df: pd.DataFrame) -> pd.DataFrame:
    _df = df.copy()
     
    # TODO 下面的缺失值较多, 目前先取均值, 以后再改为预测填充
    for col in ['ps_reg_03', ]:
        mean = df[col][df[col] != -1].mean()
        print(df[col].describe())
        print("col: {0}, mean: {1}".format(col, mean))
        _df[col][_df[col] == -1] = mean

    return _df

In [17]:
data_p3 = fix_feature_reg(data_p2) 

count    1.488028e+06
mean     5.514848e-01
std      7.938159e-01
min     -1.000000e+00
25%      5.250000e-01
50%      7.211103e-01
75%      1.001561e+00
max      4.423517e+00
Name: ps_reg_03, dtype: float64
col: ps_reg_03, mean: 0.8945559497971071


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


#### 归一化

In [18]:
for col, _type in preprocess_utils.get_column_type_pair_list(df=data_p3, prefix='ps_reg'):
    print(col, _type)

ps_reg_01 float64
ps_reg_02 float64
ps_reg_03 float64


In [19]:
ps_reg_float = ['ps_reg_01', 'ps_reg_02', 'ps_reg_03']

In [20]:
data_p3_scale = data_p3.copy()
for col in ps_reg_float:
    data_p3_scale[col] = preprocessing.scale(data_p3_scale[col])
    print("col: {0}, mean: {1}, var: {2}".format(
        col, data_p3_scale[col].mean(), data_p3_scale[col].std())
    )

col: ps_reg_01, mean: -8.148501656504704e-16, var: 1.0000003360153518
col: ps_reg_02, mean: -7.032473945992196e-16, var: 1.000000336015352
col: ps_reg_03, mean: -3.900844417558951e-16, var: 1.0000003360153522


In [21]:
data_p3 = data_p3_scale
for col in ps_reg_float:
    print("col: {0}, mean: {1}, var: {2}".format(
        col, data_p3[col].mean(), data_p3[col].std())
    )

col: ps_reg_01, mean: -8.148501656504704e-16, var: 1.0000003360153518
col: ps_reg_02, mean: -7.032473945992196e-16, var: 1.000000336015352
col: ps_reg_03, mean: -3.900844417558951e-16, var: 1.0000003360153522


#### 保存

In [3]:
data_p3_path = config.data.path('data_p3.csv')

In [23]:
data_p3.to_csv(data_p3_path, index=False)

In [4]:
data_p3 = pd.read_csv(data_p3_path)

## 大检查

In [5]:
data_processed = data_p3

In [7]:
preprocess_utils.show_feature_with_missing_value(data_processed, prefix='ps')

col: ps_car_03_cat, dtype: int64, missing rate: 0.6909426435524063


col: ps_car_05_cat, dtype: int64, missing rate: 0.4481837707354969


In [8]:
data_processed_columns = preprocess_utils.get_column_type_pair_list(data_processed, prefix='ps')

In [9]:
for col, _type in data_processed_columns:
    print(col, _type)

ps_calc_01 float64
ps_calc_02 float64
ps_calc_03 float64
ps_calc_04 int64
ps_calc_05 int64
ps_calc_06 int64
ps_calc_07 int64
ps_calc_08 int64
ps_calc_09 int64
ps_calc_10 int64
ps_calc_11 int64
ps_calc_12 int64
ps_calc_13 int64
ps_calc_14 int64
ps_calc_15_bin int64
ps_calc_16_bin int64
ps_calc_17_bin int64
ps_calc_18_bin int64
ps_calc_19_bin int64
ps_calc_20_bin int64
ps_car_01_cat int64
ps_car_02_cat int64
ps_car_03_cat int64
ps_car_04_cat int64
ps_car_05_cat int64
ps_car_06_cat int64
ps_car_07_cat int64
ps_car_08_cat int64
ps_car_09_cat int64
ps_car_10_cat int64
ps_car_11 int64
ps_car_11_cat int64
ps_car_12 float64
ps_car_13 float64
ps_car_14 float64
ps_car_15 float64
ps_ind_01 int64
ps_ind_02_cat int64
ps_ind_03 int64
ps_ind_04_cat int64
ps_ind_05_cat int64
ps_ind_06_bin int64
ps_ind_07_bin int64
ps_ind_08_bin int64
ps_ind_09_bin int64
ps_ind_10_bin int64
ps_ind_11_bin int64
ps_ind_12_bin int64
ps_ind_13_bin int64
ps_ind_14 int64
ps_ind_15 int64
ps_ind_16_bin int64
ps_ind_17_bin int6

In [10]:
data_processed_columns_float = list(filter(lambda x: 'float' in x[1], data_processed_columns))

for col, _type in data_processed_columns_float:
    print("col: {0}, mean: {1}, var: {2}".format(
        col, data_processed[col].mean(), data_processed[col].std())
    )


col: ps_calc_01, mean: -8.958018076850356e-16, var: 1.000000336015352
col: ps_calc_02, mean: -4.3841762884218467e-16, var: 1.0000003360153515
col: ps_calc_03, mean: -4.271866808055365e-16, var: 1.000000336015352
col: ps_car_12, mean: -1.2507417861969974e-15, var: 1.0000003360153515
col: ps_car_13, mean: 4.534399768537898e-17, var: 1.000000336015352
col: ps_car_14, mean: 1.7832377050978738e-15, var: 1.0000003360153518
col: ps_car_15, mean: -3.0397573174531806e-16, var: 1.0000003360153515
col: ps_reg_01, mean: -8.429323108050316e-16, var: 1.0000003360153518
col: ps_reg_02, mean: -6.91715617597304e-16, var: 1.000000336015352


col: ps_reg_03, mean: -3.9275847700271606e-16, var: 1.0000003360153522


In [11]:
data_processed_columns_int = list(filter(lambda x: 'float' not in x[1], data_processed_columns))

In [12]:
data_processed_columns_int_name = [x[0] for x in data_processed_columns_int]

## 整数特征独热编码

In [13]:
# category 的列全部独热编码

data_catergory_onehot = pd.get_dummies(
    data=data_processed,
    columns=data_processed_columns_int_name
)

#### 保存

In [32]:
data_catergory_onehot_path = config.data.path('data_catergory_onehot.csv')

In [33]:
data_catergory_onehot.to_csv(data_catergory_onehot_path, index=False)

In [3]:
data_catergory_onehot = pd.read_csv(data_catergory_onehot_path)

## Indexing

In [14]:
data_onehot_feature = preprocess_utils.get_column_type_pair_list(data_catergory_onehot, prefix='ps')

In [15]:
data_onehot_feature_type_int = list(filter(lambda x: 'float' not in x[1], data_onehot_feature))

In [16]:
data_onehot_feature_type_int_name = [x[0] for x in data_onehot_feature_type_int]

In [6]:
for col, _type in data_onehot_feature_type_int:
    print(col, _type)

ps_calc_04_0 int64
ps_calc_04_1 int64
ps_calc_04_2 int64
ps_calc_04_3 int64
ps_calc_04_4 int64
ps_calc_04_5 int64
ps_calc_05_0 int64
ps_calc_05_1 int64
ps_calc_05_2 int64
ps_calc_05_3 int64
ps_calc_05_4 int64
ps_calc_05_5 int64
ps_calc_05_6 int64
ps_calc_06_0 int64
ps_calc_06_1 int64
ps_calc_06_2 int64
ps_calc_06_3 int64
ps_calc_06_4 int64
ps_calc_06_5 int64
ps_calc_06_6 int64
ps_calc_06_7 int64
ps_calc_06_8 int64
ps_calc_06_9 int64
ps_calc_06_10 int64
ps_calc_07_0 int64
ps_calc_07_1 int64
ps_calc_07_2 int64
ps_calc_07_3 int64
ps_calc_07_4 int64
ps_calc_07_5 int64
ps_calc_07_6 int64
ps_calc_07_7 int64
ps_calc_07_8 int64
ps_calc_07_9 int64
ps_calc_08_1 int64
ps_calc_08_2 int64
ps_calc_08_3 int64
ps_calc_08_4 int64
ps_calc_08_5 int64
ps_calc_08_6 int64
ps_calc_08_7 int64
ps_calc_08_8 int64
ps_calc_08_9 int64
ps_calc_08_10 int64
ps_calc_08_11 int64
ps_calc_08_12 int64
ps_calc_09_0 int64
ps_calc_09_1 int64
ps_calc_09_2 int64
ps_calc_09_3 int64
ps_calc_09_4 int64
ps_calc_09_5 int64
ps_calc_

In [17]:
# 整数类型的特征先indexing
feature_all_embedding_offset = 0

data_indexed, feature_all_embedding_length = preprocess_utils.indexing(
    df=data_catergory_onehot,
    columns=data_onehot_feature_type_int_name,
    offset=feature_all_embedding_offset,
    neat=False
)
feature_all_embedding_length = math.ceil(feature_all_embedding_length/100)*100

ps_calc_04_0 1 0 2 [0, 1]
ps_calc_04_1 1 0 2 [0, 1]
ps_calc_04_2 1 0 2 [0, 1]
ps_calc_04_3 1 0 2 [0, 1]


ps_calc_04_4 1 0 2 [0, 1]
ps_calc_04_5 1 0 2 [0, 1]
ps_calc_05_0 1 0 2 [0, 1]
ps_calc_05_1 1 0 2 [0, 1]
ps_calc_05_2 1 0 2 [0, 1]


ps_calc_05_3 1 0 2 [0, 1]
ps_calc_05_4 1 0 2 [0, 1]
ps_calc_05_5 1 0 2 [0, 1]
ps_calc_05_6 1 0 2 [0, 1]
ps_calc_06_0 1 0 2 [0, 1]


ps_calc_06_1 1 0 2 [0, 1]
ps_calc_06_2 1 0 2 [0, 1]
ps_calc_06_3 1 0 2 [0, 1]
ps_calc_06_4 1 0 2 [0, 1]
ps_calc_06_5 1 0 2 [0, 1]


ps_calc_06_6 1 0 2 [0, 1]
ps_calc_06_7 1 0 2 [0, 1]
ps_calc_06_8 1 0 2 [0, 1]
ps_calc_06_9 1 0 2 [0, 1]
ps_calc_06_10 1 0 2 [0, 1]


ps_calc_07_0 1 0 2 [0, 1]
ps_calc_07_1 1 0 2 [0, 1]
ps_calc_07_2 1 0 2 [0, 1]
ps_calc_07_3 1 0 2 [0, 1]
ps_calc_07_4 1 0 2 [0, 1]


ps_calc_07_5 1 0 2 [0, 1]
ps_calc_07_6 1 0 2 [0, 1]
ps_calc_07_7 1 0 2 [0, 1]
ps_calc_07_8 1 0 2 [0, 1]
ps_calc_07_9 1 0 2 [0, 1]
ps_calc_08_1 1 0 2 [0, 1]


ps_calc_08_2 1 0 2 [0, 1]
ps_calc_08_3 1 0 2 [0, 1]
ps_calc_08_4 1 0 2 [0, 1]
ps_calc_08_5 1 0 2 [0, 1]
ps_calc_08_6 1 0 2 [0, 1]
ps_calc_08_7 1 0 2 [0, 1]


ps_calc_08_8 1 0 2 [0, 1]
ps_calc_08_9 1 0 2 [0, 1]
ps_calc_08_10 1 0 2 [0, 1]
ps_calc_08_11 1 0 2 [0, 1]
ps_calc_08_12 1 0 2 [0, 1]


ps_calc_09_0 1 0 2 [0, 1]
ps_calc_09_1 1 0 2 [0, 1]
ps_calc_09_2 1 0 2 [0, 1]
ps_calc_09_3 1 0 2 [0, 1]
ps_calc_09_4 1 0 2 [0, 1]


ps_calc_09_5 1 0 2 [0, 1]
ps_calc_09_6 1 0 2 [0, 1]
ps_calc_09_7 1 0 2 [0, 1]
ps_calc_10_0 1 0 2 [0, 1]
ps_calc_10_1 1 0 2 [0, 1]
ps_calc_10_2 1 0 2 [0, 1]


ps_calc_10_3 1 0 2 [0, 1]
ps_calc_10_4 1 0 2 [0, 1]
ps_calc_10_5 1 0 2 [0, 1]
ps_calc_10_6 1 0 2 [0, 1]
ps_calc_10_7 1 0 2 [0, 1]


ps_calc_10_8 1 0 2 [0, 1]
ps_calc_10_9 1 0 2 [0, 1]
ps_calc_10_10 1 0 2 [0, 1]
ps_calc_10_11 1 0 2 [0, 1]
ps_calc_10_12 1 0 2 [0, 1]


ps_calc_10_13 1 0 2 [0, 1]
ps_calc_10_14 1 0 2 [0, 1]
ps_calc_10_15 1 0 2 [0, 1]
ps_calc_10_16 1 0 2 [0, 1]
ps_calc_10_17 1 0 2 [0, 1]
ps_calc_10_18 1 0 2 [0, 1]


ps_calc_10_19 1 0 2 [0, 1]
ps_calc_10_20 1 0 2 [0, 1]
ps_calc_10_21 1 0 2 [0, 1]
ps_calc_10_22 1 0 2 [0, 1]
ps_calc_10_23 1 0 2 [0, 1]


ps_calc_10_24 1 0 2 [0, 1]
ps_calc_10_25 1 0 2 [0, 1]
ps_calc_11_0 1 0 2 [0, 1]
ps_calc_11_1 1 0 2 [0, 1]
ps_calc_11_2 1 0 2 [0, 1]


ps_calc_11_3 1 0 2 [0, 1]
ps_calc_11_4 1 0 2 [0, 1]
ps_calc_11_5 1 0 2 [0, 1]
ps_calc_11_6 1 0 2 [0, 1]
ps_calc_11_7 1 0 2 [0, 1]


ps_calc_11_8 1 0 2 [0, 1]
ps_calc_11_9 1 0 2 [0, 1]
ps_calc_11_10 1 0 2 [0, 1]
ps_calc_11_11 1 0 2 [0, 1]
ps_calc_11_12 1 0 2 [0, 1]


ps_calc_11_13 1 0 2 [0, 1]
ps_calc_11_14 1 0 2 [0, 1]
ps_calc_11_15 1 0 2 [0, 1]
ps_calc_11_16 1 0 2 [0, 1]
ps_calc_11_17 1 0 2 [0, 1]


ps_calc_11_18 1 0 2 [0, 1]
ps_calc_11_19 1 0 2 [0, 1]
ps_calc_11_20 1 0 2 [0, 1]
ps_calc_12_0 1 0 2 [0, 1]
ps_calc_12_1 1 0 2 [0, 1]


ps_calc_12_2 1 0 2 [0, 1]
ps_calc_12_3 1 0 2 [0, 1]
ps_calc_12_4 1 0 2 [0, 1]
ps_calc_12_5 1 0 2 [0, 1]
ps_calc_12_6 1 0 2 [0, 1]


ps_calc_12_7 1 0 2 [0, 1]
ps_calc_12_8 1 0 2 [0, 1]
ps_calc_12_9 1 0 2 [0, 1]
ps_calc_12_10 1 0 2 [0, 1]
ps_calc_12_11 1 0 2 [0, 1]


ps_calc_13_0 1 0 2 [0, 1]
ps_calc_13_1 1 0 2 [0, 1]
ps_calc_13_2 1 0 2 [0, 1]
ps_calc_13_3 1 0 2 [0, 1]
ps_calc_13_4 1 0 2 [0, 1]


ps_calc_13_5 1 0 2 [0, 1]
ps_calc_13_6 1 0 2 [0, 1]
ps_calc_13_7 1 0 2 [0, 1]
ps_calc_13_8 1 0 2 [0, 1]
ps_calc_13_9 1 0 2 [0, 1]


ps_calc_13_10 1 0 2 [0, 1]
ps_calc_13_11 1 0 2 [0, 1]
ps_calc_13_12 1 0 2 [0, 1]
ps_calc_13_13 1 0 2 [0, 1]
ps_calc_13_14 1 0 2 [0, 1]


ps_calc_13_15 1 0 2 [0, 1]


ps_calc_14_0 1 0 2 [0, 1]
ps_calc_14_1 1 0 2 [0, 1]


ps_calc_14_2 1 0 2 [0, 1]
ps_calc_14_3 1 0 2 [0, 1]


ps_calc_14_4 1 0 2 [0, 1]
ps_calc_14_5 1 0 2 [0, 1]


ps_calc_14_6 1 0 2 [0, 1]
ps_calc_14_7 1 0 2 [0, 1]


ps_calc_14_8 1 0 2 [0, 1]
ps_calc_14_9 1 0 2 [0, 1]


ps_calc_14_10 1 0 2 [0, 1]
ps_calc_14_11 1 0 2 [0, 1]


ps_calc_14_12 1 0 2 [0, 1]
ps_calc_14_13 1 0 2 [0, 1]


ps_calc_14_14 1 0 2 [0, 1]
ps_calc_14_15 1 0 2 [0, 1]


ps_calc_14_16 1 0 2 [0, 1]
ps_calc_14_17 1 0 2 [0, 1]


ps_calc_14_18 1 0 2 [0, 1]
ps_calc_14_19 1 0 2 [0, 1]


ps_calc_14_20 1 0 2 [0, 1]
ps_calc_14_21 1 0 2 [0, 1]


ps_calc_14_22 1 0 2 [0, 1]
ps_calc_14_23 1 0 2 [0, 1]


ps_calc_14_28 1 0 2 [0, 1]
ps_calc_15_bin_0 1 0 2 [0, 1]


ps_calc_15_bin_1 1 0 2 [0, 1]
ps_calc_16_bin_0 1 0 2 [0, 1]


ps_calc_16_bin_1 1 0 2 [0, 1]
ps_calc_17_bin_0 1 0 2 [0, 1]


ps_calc_17_bin_1 1 0 2 [0, 1]
ps_calc_18_bin_0 1 0 2 [0, 1]


ps_calc_18_bin_1 1 0 2 [0, 1]
ps_calc_19_bin_0 1 0 2 [0, 1]


ps_calc_19_bin_1 1 0 2 [0, 1]
ps_calc_20_bin_0 1 0 2 [0, 1]


ps_calc_20_bin_1 1 0 2 [0, 1]
ps_car_01_cat_0 1 0 2 [0, 1]


ps_car_01_cat_1 1 0 2 [0, 1]
ps_car_01_cat_2 1 0 2 [0, 1]


ps_car_01_cat_3 1 0 2 [0, 1]
ps_car_01_cat_4 1 0 2 [0, 1]


ps_car_01_cat_5 1 0 2 [0, 1]
ps_car_01_cat_6 1 0 2 [0, 1]


ps_car_01_cat_7 1 0 2 [0, 1]
ps_car_01_cat_8 1 0 2 [0, 1]


ps_car_01_cat_9 1 0 2 [0, 1]
ps_car_01_cat_10 1 0 2 [0, 1]


ps_car_01_cat_11 1 0 2 [0, 1]
ps_car_02_cat_0 1 0 2 [0, 1]


ps_car_02_cat_1 1 0 2 [0, 1]
ps_car_03_cat_-1 1 0 2 [0, 1]


ps_car_03_cat_0 1 0 2 [0, 1]
ps_car_03_cat_1 1 0 2 [0, 1]


ps_car_04_cat_0 1 0 2 [0, 1]
ps_car_04_cat_1 1 0 2 [0, 1]


ps_car_04_cat_2 1 0 2 [0, 1]
ps_car_04_cat_3 1 0 2 [0, 1]


ps_car_04_cat_4 1 0 2 [0, 1]
ps_car_04_cat_5 1 0 2 [0, 1]


ps_car_04_cat_6 1 0 2 [0, 1]
ps_car_04_cat_7 1 0 2 [0, 1]


ps_car_04_cat_8 1 0 2 [0, 1]
ps_car_04_cat_9 1 0 2 [0, 1]


ps_car_05_cat_-1 1 0 2 [0, 1]
ps_car_05_cat_0 1 0 2 [0, 1]


ps_car_05_cat_1 1 0 2 [0, 1]
ps_car_06_cat_0 1 0 2 [0, 1]


ps_car_06_cat_1 1 0 2 [0, 1]
ps_car_06_cat_2 1 0 2 [0, 1]


ps_car_06_cat_3 1 0 2 [0, 1]
ps_car_06_cat_4 1 0 2 [0, 1]


ps_car_06_cat_5 1 0 2 [0, 1]
ps_car_06_cat_6 1 0 2 [0, 1]


ps_car_06_cat_7 1 0 2 [0, 1]
ps_car_06_cat_8 1 0 2 [0, 1]


ps_car_06_cat_9 1 0 2 [0, 1]
ps_car_06_cat_10 1 0 2 [0, 1]


ps_car_06_cat_11 1 0 2 [0, 1]
ps_car_06_cat_12 1 0 2 [0, 1]


ps_car_06_cat_13 1 0 2 [0, 1]
ps_car_06_cat_14 1 0 2 [0, 1]


ps_car_06_cat_15 1 0 2 [0, 1]
ps_car_06_cat_16 1 0 2 [0, 1]


ps_car_06_cat_17 1 0 2 [0, 1]
ps_car_07_cat_0 1 0 2 [0, 1]


ps_car_07_cat_1 1 0 2 [0, 1]
ps_car_08_cat_0 1 0 2 [0, 1]


ps_car_08_cat_1 1 0 2 [0, 1]
ps_car_09_cat_0 1 0 2 [0, 1]


ps_car_09_cat_1 1 0 2 [0, 1]
ps_car_09_cat_2 1 0 2 [0, 1]


ps_car_09_cat_3 1 0 2 [0, 1]
ps_car_09_cat_4 1 0 2 [0, 1]


ps_car_10_cat_0 1 0 2 [0, 1]
ps_car_10_cat_1 1 0 2 [0, 1]


ps_car_10_cat_2 1 0 2 [0, 1]
ps_car_11_0 1 0 2 [0, 1]


ps_car_11_1 1 0 2 [0, 1]
ps_car_11_2 1 0 2 [0, 1]


ps_car_11_3 1 0 2 [0, 1]
ps_car_11_cat_1 1 0 2 [0, 1]


ps_car_11_cat_2 1 0 2 [0, 1]
ps_car_11_cat_3 1 0 2 [0, 1]


ps_car_11_cat_4 1 0 2 [0, 1]
ps_car_11_cat_5 1 0 2 [0, 1]


ps_car_11_cat_6 1 0 2 [0, 1]
ps_car_11_cat_7 1 0 2 [0, 1]


ps_car_11_cat_8 1 0 2 [0, 1]
ps_car_11_cat_9 1 0 2 [0, 1]


ps_car_11_cat_10 1 0 2 [0, 1]
ps_car_11_cat_11 1 0 2 [0, 1]


ps_car_11_cat_12 1 0 2 [0, 1]
ps_car_11_cat_13 1 0 2 [0, 1]


ps_car_11_cat_14 1 0 2 [0, 1]
ps_car_11_cat_15 1 0 2 [0, 1]


ps_car_11_cat_16 1 0 2 [0, 1]
ps_car_11_cat_17 1 0 2 [0, 1]


ps_car_11_cat_18 1 0 2 [0, 1]
ps_car_11_cat_19 1 0 2 [0, 1]


ps_car_11_cat_20 1 0 2 [0, 1]
ps_car_11_cat_21 1 0 2 [0, 1]


ps_car_11_cat_22 1 0 2 [0, 1]
ps_car_11_cat_23 1 0 2 [0, 1]


ps_car_11_cat_24 1 0 2 [0, 1]
ps_car_11_cat_25 1 0 2 [0, 1]


ps_car_11_cat_26 1 0 2 [0, 1]
ps_car_11_cat_27 1 0 2 [0, 1]


ps_car_11_cat_28 1 0 2 [0, 1]
ps_car_11_cat_29 1 0 2 [0, 1]


ps_car_11_cat_30 1 0 2 [0, 1]
ps_car_11_cat_31 1 0 2 [0, 1]


ps_car_11_cat_32 1 0 2 [0, 1]
ps_car_11_cat_33 1 0 2 [0, 1]


ps_car_11_cat_34 1 0 2 [0, 1]
ps_car_11_cat_35 1 0 2 [0, 1]


ps_car_11_cat_36 1 0 2 [0, 1]
ps_car_11_cat_37 1 0 2 [0, 1]


ps_car_11_cat_38 1 0 2 [0, 1]
ps_car_11_cat_39 1 0 2 [0, 1]


ps_car_11_cat_40 1 0 2 [0, 1]
ps_car_11_cat_41 1 0 2 [0, 1]


ps_car_11_cat_42 1 0 2 [0, 1]
ps_car_11_cat_43 1 0 2 [0, 1]


ps_car_11_cat_44 1 0 2 [0, 1]
ps_car_11_cat_45 1 0 2 [0, 1]


ps_car_11_cat_46 1 0 2 [0, 1]
ps_car_11_cat_47 1 0 2 [0, 1]


ps_car_11_cat_48 1 0 2 [0, 1]
ps_car_11_cat_49 1 0 2 [0, 1]


ps_car_11_cat_50 1 0 2 [0, 1]
ps_car_11_cat_51 1 0 2 [0, 1]


ps_car_11_cat_52 1 0 2 [0, 1]
ps_car_11_cat_53 1 0 2 [0, 1]


ps_car_11_cat_54 1 0 2 [0, 1]
ps_car_11_cat_55 1 0 2 [0, 1]


ps_car_11_cat_56 1 0 2 [0, 1]
ps_car_11_cat_57 1 0 2 [0, 1]


ps_car_11_cat_58 1 0 2 [0, 1]
ps_car_11_cat_59 1 0 2 [0, 1]


ps_car_11_cat_60 1 0 2 [0, 1]
ps_car_11_cat_61 1 0 2 [0, 1]


ps_car_11_cat_62 1 0 2 [0, 1]
ps_car_11_cat_63 1 0 2 [0, 1]


ps_car_11_cat_64 1 0 2 [0, 1]
ps_car_11_cat_65 1 0 2 [0, 1]


ps_car_11_cat_66 1 0 2 [0, 1]
ps_car_11_cat_67 1 0 2 [0, 1]


ps_car_11_cat_68 1 0 2 [0, 1]
ps_car_11_cat_69 1 0 2 [0, 1]


ps_car_11_cat_70 1 0 2 [0, 1]
ps_car_11_cat_71 1 0 2 [0, 1]


ps_car_11_cat_72 1 0 2 [0, 1]
ps_car_11_cat_73 1 0 2 [0, 1]


ps_car_11_cat_74 1 0 2 [0, 1]
ps_car_11_cat_75 1 0 2 [0, 1]


ps_car_11_cat_76 1 0 2 [0, 1]
ps_car_11_cat_77 1 0 2 [0, 1]


ps_car_11_cat_78 1 0 2 [0, 1]
ps_car_11_cat_79 1 0 2 [0, 1]


ps_car_11_cat_80 1 0 2 [0, 1]
ps_car_11_cat_81 1 0 2 [0, 1]


ps_car_11_cat_82 1 0 2 [0, 1]
ps_car_11_cat_83 1 0 2 [0, 1]


ps_car_11_cat_84 1 0 2 [0, 1]
ps_car_11_cat_85 1 0 2 [0, 1]


ps_car_11_cat_86 1 0 2 [0, 1]
ps_car_11_cat_87 1 0 2 [0, 1]
ps_car_11_cat_88 1 0 2 [0, 1]


ps_car_11_cat_89 1 0 2 [0, 1]
ps_car_11_cat_90 1 0 2 [0, 1]
ps_car_11_cat_91 1 0 2 [0, 1]


ps_car_11_cat_92 1 0 2 [0, 1]
ps_car_11_cat_93 1 0 2 [0, 1]
ps_car_11_cat_94 1 0 2 [0, 1]


ps_car_11_cat_95 1 0 2 [0, 1]
ps_car_11_cat_96 1 0 2 [0, 1]
ps_car_11_cat_97 1 0 2 [0, 1]


ps_car_11_cat_98 1 0 2 [0, 1]
ps_car_11_cat_99 1 0 2 [0, 1]
ps_car_11_cat_100 1 0 2 [0, 1]


ps_car_11_cat_101 1 0 2 [0, 1]
ps_car_11_cat_102 1 0 2 [0, 1]
ps_car_11_cat_103 1 0 2 [0, 1]


ps_car_11_cat_104 1 0 2 [0, 1]
ps_ind_01_0 1 0 2 [0, 1]


ps_ind_01_1 1 0 2 [0, 1]
ps_ind_01_2 1 0 2 [0, 1]
ps_ind_01_3 1 0 2 [0, 1]


ps_ind_01_4 1 0 2 [0, 1]
ps_ind_01_5 1 0 2 [0, 1]
ps_ind_01_6 1 0 2 [0, 1]


ps_ind_01_7 1 0 2 [0, 1]
ps_ind_02_cat_1 1 0 2 [0, 1]
ps_ind_02_cat_2 1 0 2 [0, 1]


ps_ind_02_cat_3 1 0 2 [0, 1]
ps_ind_02_cat_4 1 0 2 [0, 1]
ps_ind_03_0 1 0 2 [0, 1]


ps_ind_03_1 1 0 2 [0, 1]
ps_ind_03_2 1 0 2 [0, 1]
ps_ind_03_3 1 0 2 [0, 1]


ps_ind_03_4 1 0 2 [0, 1]
ps_ind_03_5 1 0 2 [0, 1]
ps_ind_03_6 1 0 2 [0, 1]


ps_ind_03_7 1 0 2 [0, 1]
ps_ind_03_8 1 0 2 [0, 1]
ps_ind_03_9 1 0 2 [0, 1]


ps_ind_03_10 1 0 2 [0, 1]
ps_ind_03_11 1 0 2 [0, 1]
ps_ind_04_cat_0 1 0 2 [0, 1]


ps_ind_04_cat_1 1 0 2 [0, 1]
ps_ind_05_cat_0 1 0 2 [0, 1]
ps_ind_05_cat_1 1 0 2 [0, 1]


ps_ind_05_cat_2 1 0 2 [0, 1]
ps_ind_05_cat_3 1 0 2 [0, 1]
ps_ind_05_cat_4 1 0 2 [0, 1]


ps_ind_05_cat_5 1 0 2 [0, 1]
ps_ind_05_cat_6 1 0 2 [0, 1]
ps_ind_06_bin_0 1 0 2 [0, 1]


ps_ind_06_bin_1 1 0 2 [0, 1]
ps_ind_07_bin_0 1 0 2 [0, 1]
ps_ind_07_bin_1 1 0 2 [0, 1]


ps_ind_08_bin_0 1 0 2 [0, 1]
ps_ind_08_bin_1 1 0 2 [0, 1]
ps_ind_09_bin_0 1 0 2 [0, 1]


ps_ind_09_bin_1 1 0 2 [0, 1]
ps_ind_10_bin_0 1 0 2 [0, 1]
ps_ind_10_bin_1 1 0 2 [0, 1]


ps_ind_11_bin_0 1 0 2 [0, 1]
ps_ind_11_bin_1 1 0 2 [0, 1]
ps_ind_12_bin_0 1 0 2 [0, 1]


ps_ind_12_bin_1 1 0 2 [0, 1]
ps_ind_13_bin_0 1 0 2 [0, 1]
ps_ind_13_bin_1 1 0 2 [0, 1]


ps_ind_14_0 1 0 2 [0, 1]
ps_ind_14_1 1 0 2 [0, 1]
ps_ind_14_2 1 0 2 [0, 1]


ps_ind_14_3 1 0 2 [0, 1]
ps_ind_14_4 1 0 2 [0, 1]
ps_ind_15_0 1 0 2 [0, 1]


ps_ind_15_1 1 0 2 [0, 1]
ps_ind_15_2 1 0 2 [0, 1]
ps_ind_15_3 1 0 2 [0, 1]


ps_ind_15_4 1 0 2 [0, 1]
ps_ind_15_5 1 0 2 [0, 1]
ps_ind_15_6 1 0 2 [0, 1]


ps_ind_15_7 1 0 2 [0, 1]
ps_ind_15_8 1 0 2 [0, 1]
ps_ind_15_9 1 0 2 [0, 1]


ps_ind_15_10 1 0 2 [0, 1]
ps_ind_15_11 1 0 2 [0, 1]
ps_ind_15_12 1 0 2 [0, 1]


ps_ind_15_13 1 0 2 [0, 1]
ps_ind_16_bin_0 1 0 2 [0, 1]
ps_ind_16_bin_1 1 0 2 [0, 1]


ps_ind_17_bin_0 1 0 2 [0, 1]
ps_ind_17_bin_1 1 0 2 [0, 1]
ps_ind_18_bin_0 1 0 2 [0, 1]


ps_ind_18_bin_1 1 0 2 [0, 1]


In [18]:
# 保存indexing后的offset
config.parameter.put('embedding.index.offset', feature_all_embedding_offset)
config.parameter.put('embedding.index.length', feature_all_embedding_length)

#### 保存

In [20]:
data_indexed_path = config.data.path('data_indexed.csv')
data_indexed_clip_path = config.data.path('data_indexed_clip.csv')

data_indexed_train_path = config.data.path('data_indexed_train.csv')
data_indexed_train_clip_path = config.data.path('data_indexed_train_clip.csv')

data_indexed_test_path = config.data.path('data_indexed_test.csv')
data_indexed_test_clip_path = config.data.path('data_indexed_test_clip.csv')

In [14]:
data_indexed.to_csv(data_indexed_path, index=False)

In [28]:
data_indexed\
    .groupby('target').apply(lambda x: x.sample(frac=0.01))\
    .to_csv(data_indexed_clip_path, index=False)

In [None]:
data_indexing = pd.read_csv(data_indexing_path)

In [21]:
data_indexed[data_indexed['set'] == 'train'].to_csv(data_indexed_train_path, index=False)

In [25]:
data_indexed[data_indexed['set'] == 'train']\
    .groupby('target').apply(lambda x: x.sample(frac=0.05))\
    .to_csv(data_indexed_train_clip_path, index=False)

In [23]:
data_indexed[data_indexed['set'] == 'test'].to_csv(data_indexed_test_path, index=False)

In [27]:
data_indexed[data_indexed['set'] == 'test']\
    .groupby('target').apply(lambda x: x.sample(frac=0.05))\
    .to_csv(data_indexed_test_clip_path, index=False)

In [91]:
# 计算下相关性
cor = data_train_catergory_onehot.corr()

In [104]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support



In [None]:
data_fill_reg_0s = data_train_catergory_onehot_ps_reg_03_intact.sample(n=1000)

x_fill_reg_03 = \
    data_fill_reg_0s[preprocessed_data_column_name_list_feature_all]
y_fill_reg_03 = \
    data_fill_reg_0s['ps_reg_03']

x_fill_reg_03_train, x_fill_reg_03_test, y_fill_reg_03_train, y_fill_reg_03_test = train_test_split(
    x_fill_reg_03.values, y_fill_reg_03.values, train_size=0.8)



In [106]:
import xgboost as xgb

print("#"*120)
print("构建模型")
xgb_model = xgb.XGBClassifier(
    objective= 'reg:logistic',
    max_depth=5,
    min_child_weight=1,
    nthread=8,
)


########################################################################################################################
构建模型


In [113]:
rng = np.random.RandomState(31337)
kf = KFold(n_splits=4, shuffle=True, random_state=rng)

for i, (train_index, test_index) in enumerate(kf.split(x_fill_reg_03_train, y_fill_reg_03_train)):
    xgb_model.fit(x_fill_reg_03_train[train_index], y_fill_reg_03_train[train_index])

    _y = xgb_model.predict(x_fill_reg_03_train[test_index])
    y = y_fill_reg_03_train[test_index]

    confuse = confusion_matrix(y, _y)
    print("第{0}次, CV混淆矩阵 = ".format(i))
    print(confuse)

    acc = accuracy_score(xgb_model.predict(x_fill_reg_03_test), y_fill_reg_03_test)

    print("第{0}次, 测试集准确率 = ".format(i))
    print(acc)