In [1]:
import pandas as pd
from setup import config
import preprocesss.utils as preprocess_utils
import numpy as np
import math

In [2]:
feature_all_config = config.cast('feature.all')

## 加载训练数据和测试数据

In [4]:
data_train_path = config.data.path('train.csv')
data_test_path = config.data.path('test.csv')

In [5]:
data_train = pd.read_csv(data_train_path)
data_test = pd.read_csv(data_test_path)

## 合并数据

In [6]:
data_train['set'] = 'train'
data_test['set'] = 'test'

In [7]:
data = pd.concat([data_train, data_test], axis=0)

In [8]:
print(data.head(10))

   id  ps_calc_01  ps_calc_02  ps_calc_03  ps_calc_04  ps_calc_05  ps_calc_06  \
0   7         0.6         0.5         0.2           3           1          10   
1   9         0.3         0.1         0.3           2           1           9   
2  13         0.5         0.7         0.1           2           2           9   
3  16         0.6         0.9         0.1           2           4           7   
4  17         0.4         0.6         0.0           2           2           6   
5  19         0.7         0.8         0.4           3           1           8   
6  20         0.2         0.6         0.5           2           2           8   
7  22         0.1         0.5         0.1           1           2           7   
8  26         0.9         0.8         0.6           3           1           7   
9  28         0.7         0.8         0.8           2           2           8   

   ps_calc_07  ps_calc_08  ps_calc_09   ...    ps_ind_14  ps_ind_15  \
0           1          10           1

In [9]:
data_path = config.data.path('data.csv')

In [10]:
data.to_csv = data.to_csv(data_path, index=False)

In [None]:
data = pd.read_csv(data_path)

## 处理特征

In [11]:
data_original_column_name_list = list(data.columns)

In [12]:
# 获取特征列

data_feature_list = preprocess_utils.get_column_name_list_by_prefix(
    df=data,
    prefix='ps'
)
data_feature_list = sorted(data_feature_list, reverse=False)

In [13]:
print(data_feature_list)

['ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11', 'ps_car_11_cat', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03']


In [14]:
# 检查下存在缺失参数的列

feature_list_with_missing_value = preprocess_utils.get_column_name_list_with_missing_value(
    df=data,
    feature=data_feature_list,
    placeholder=-1,
)
feature_list_with_missing_value = \
    sorted(feature_list_with_missing_value, reverse=False)


In [15]:
print(feature_list_with_missing_value)

['ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_11', 'ps_car_12', 'ps_car_14', 'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_reg_03']


In [37]:
# 查看存在缺失参数的特征的类型和缺失率
data_train_sample_number = len(data_train)
for col in data_raw_column_name_list_with_missing_value:
    missing_row_num = len(list(filter(lambda x: x, data_train[col] == -1)))
    print("col: {0}, dtype: {1}, missing rate: {2}".format(col, data_train[col].dtype, missing_row_num/data_train_sample_number))



col: ps_car_01_cat, dtype: int64, missing rate: 0.0001797678810239041
col: ps_car_02_cat, dtype: int64, missing rate: 8.400368272145051e-06
col: ps_car_03_cat, dtype: int64, missing rate: 0.6908983689844963
col: ps_car_05_cat, dtype: int64, missing rate: 0.4478253126617071


col: ps_car_07_cat, dtype: int64, missing rate: 0.019302366215734897
col: ps_car_09_cat, dtype: int64, missing rate: 0.0009559619093701067
col: ps_car_11, dtype: int64, missing rate: 8.400368272145051e-06
col: ps_car_12, dtype: float64, missing rate: 1.68007365442901e-06


col: ps_car_14, dtype: float64, missing rate: 0.07160473915176441
col: ps_ind_02_cat, dtype: int64, missing rate: 0.0003628959093566662
col: ps_ind_04_cat, dtype: int64, missing rate: 0.00013944611331760784


col: ps_ind_05_cat, dtype: int64, missing rate: 0.00975954785857812
col: ps_reg_03, dtype: float64, missing rate: 0.18106489788512328


In [16]:
def show_feature_with_missing_value(df: pd.DataFrame, prefix: str, placeholder: int = -1):
    sample_number = len(df)
    column_name_list_with_missing_value = preprocess_utils.get_column_name_list_with_missing_value(
        df=df,
        feature=preprocess_utils.get_column_name_list_by_prefix(df=df, prefix=prefix),
        placeholder=placeholder,
    )
    column_name_list_with_missing_value = sorted(column_name_list_with_missing_value, reverse=False)
    
    for col in column_name_list_with_missing_value:
        missing_row_num = len(list(filter(lambda x: x, df[col] == placeholder)))
        print("col: {0}, dtype: {1}, missing rate: {2}".format(col, df[col].dtype, missing_row_num/sample_number))

In [17]:
show_feature_with_missing_value(data, prefix='ps', placeholder=-1)

col: ps_car_01_cat, dtype: int64, missing rate: 0.00017943210746034348
col: ps_car_02_cat, dtype: int64, missing rate: 6.72030365020013e-06


col: ps_car_03_cat, dtype: int64, missing rate: 0.6909426435524063
col: ps_car_05_cat, dtype: int64, missing rate: 0.4481837707354969


col: ps_car_07_cat, dtype: int64, missing rate: 0.019367915119876778
col: ps_car_09_cat, dtype: int64, missing rate: 0.0009717559078189389


col: ps_car_11, dtype: int64, missing rate: 4.032182190120078e-06
col: ps_car_12, dtype: float64, missing rate: 6.720303650200131e-07


col: ps_car_14, dtype: float64, missing rate: 0.07152083159725489
col: ps_ind_02_cat, dtype: int64, missing rate: 0.00035147188090546685


col: ps_ind_04_cat, dtype: int64, missing rate: 0.00015322292322456297
col: ps_ind_05_cat, dtype: int64, missing rate: 0.00975720886972557


col: ps_reg_03, dtype: float64, missing rate: 0.18108261403683265


### 处理特征 `ind`

In [20]:
# 特征 ind 的缺失值非常少, 又是整数, 取中位数即可


def fix_feature_ind(df: pd.DataFrame) -> pd.DataFrame:
    
    _df = df.copy()
    for col in ['ps_ind_02_cat', 'ps_ind_05_cat', 'ps_ind_04_cat']:
        median = _df[col][df[col] != -1].median()
        median = int(median)
        # mean = data_train_catergory_onehot[col].mean()
        # print(data_train[col].value_counts())
        print("col: {0}, median: {1}".format(col, median))
        _df[col][_df[col] == -1] = median

    return _df

In [21]:
data_p0 = fix_feature_ind(data)

col: ps_ind_02_cat, median: 1
col: ps_ind_05_cat, median: 0
col: ps_ind_04_cat, median: 0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [22]:
# 检查效果

show_feature_with_missing_value(df=data_p0, prefix='ps', placeholder=-1)

col: ps_car_01_cat, dtype: int64, missing rate: 0.00017943210746034348
col: ps_car_02_cat, dtype: int64, missing rate: 6.72030365020013e-06


col: ps_car_03_cat, dtype: int64, missing rate: 0.6909426435524063
col: ps_car_05_cat, dtype: int64, missing rate: 0.4481837707354969


col: ps_car_07_cat, dtype: int64, missing rate: 0.019367915119876778
col: ps_car_09_cat, dtype: int64, missing rate: 0.0009717559078189389


col: ps_car_11, dtype: int64, missing rate: 4.032182190120078e-06
col: ps_car_12, dtype: float64, missing rate: 6.720303650200131e-07


col: ps_car_14, dtype: float64, missing rate: 0.07152083159725489
col: ps_reg_03, dtype: float64, missing rate: 0.18108261403683265


In [23]:
# 步骤0 保存路径
data_p0_path = config.data.path('data_p0.csv')


In [24]:
data_p0.to_csv(data_p0_path, index=False)

In [None]:
data_p0 = pd.read_csv(data_p0_path)

## 处理特征 `car`

In [25]:
show_feature_with_missing_value(data_p0, prefix='ps', placeholder=-1)

col: ps_car_01_cat, dtype: int64, missing rate: 0.00017943210746034348
col: ps_car_02_cat, dtype: int64, missing rate: 6.72030365020013e-06


col: ps_car_03_cat, dtype: int64, missing rate: 0.6909426435524063
col: ps_car_05_cat, dtype: int64, missing rate: 0.4481837707354969


col: ps_car_07_cat, dtype: int64, missing rate: 0.019367915119876778
col: ps_car_09_cat, dtype: int64, missing rate: 0.0009717559078189389


col: ps_car_11, dtype: int64, missing rate: 4.032182190120078e-06
col: ps_car_12, dtype: float64, missing rate: 6.720303650200131e-07


col: ps_car_14, dtype: float64, missing rate: 0.07152083159725489
col: ps_reg_03, dtype: float64, missing rate: 0.18108261403683265


In [28]:
def fix_feature_car(df: pd.DataFrame) -> pd.DataFrame:
    _df = df.copy()
    
    # 一下几个整形特征的缺失值非常少, 取中位数
    for col in ['ps_car_01_cat', 'ps_car_02_cat', 'ps_car_07', 'ps_car_11']:
        median = _df[col][df[col] != -1].median()
        median = int(median)
        print(df[col].value_counts())
        print("col: {0}, median: {1}".format(col, median))
        # _df[col][_df[col] == -1] = median

    return _df

In [27]:
data_p1 = fix_feature_car(data_p0)

IndexingError: Unalignable boolean Series key provided

In [None]:
show_feature_with_missing_value(data_p1, prefix='ps', placeholder=-1)

In [None]:
data_train_catergory_onehot_pa = config.data.path('data_train_catergory_onehot.csv')

In [None]:
data_train_catergory_onehot.to_csv(data_train_catergory_onehot_path, index=False)

In [None]:
data_train_catergory_onehot = pd.read_csv(data_train_catergory_onehot_path)

In [21]:
unpreprocessed_data_column_name_list_feature_all = preprocess_utils.get_column_name_list_by_prefix(
    df=data_train_catergory_onehot,
    prefix='ps'
)

In [22]:
unpreprocessed_data_column_name_list_with_missing_value = preprocess_utils.get_column_name_list_with_missing_value(
    df=data_train_catergory_onehot,
    feature=unpreprocessed_data_column_name_list_feature_all,
    placeholder=-1,
)

In [23]:
print(unpreprocessed_data_column_name_list_with_missing_value)

['ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_14']


In [39]:
data_train_sample_number = len(data_train)

for col in unpreprocessed_data_column_name_list_with_missing_value:
    missing_row_num = len(list(filter(lambda x: x, data_train_catergory_onehot[col] == -1)))
    print("col: {0}, missing rate: {1}".format(col, missing_row_num/data_train_sample_number))



col: ps_reg_03, missing rate: 0.18106489788512328
col: ps_car_11, missing rate: 8.400368272145051e-06


col: ps_car_12, missing rate: 1.68007365442901e-06
col: ps_car_14, missing rate: 0.07160473915176441


In [48]:
# 'ps_car_11', 'ps_car_12', 'ps_car_14' 缺失较少, 又是浮点数, 使用均值代替
for col in ['ps_car_11', 'ps_car_12', 'ps_car_14']:
    mean = data_train_catergory_onehot[col][data_train_catergory_onehot[col] != -1].mean()
    # mean = data_train_catergory_onehot[col].mean()
    print("col: {0}, mean: {1}".format(col, mean))
    data_train_catergory_onehot[col][data_train_catergory_onehot[col] == -1] = mean

print(preprocess_utils.get_column_name_list_with_missing_value(
    df=data_train_catergory_onehot,
    feature=unpreprocessed_data_column_name_list_feature_all,
    placeholder=-1,
))

col: ps_car_11, mean: 2.3460997602514753
col: ps_car_12, mean: 0.3799471335076438
col: ps_car_14, mean: 0.37469063873812564


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


['ps_reg_03']


In [3]:
data_train_preprocessing_save_0_path = config.data.path('train_preprocessing_save_0.csv')

In [109]:
data_train_catergory_onehot.to_csv(data_train_preprocessing_save_0_path, index=False)

In [4]:
data_train_catergory_onehot = pd.read_csv(data_train_preprocessing_save_0_path)

In [5]:
# 'ps_reg_03' 的缺失较多 

print(data_train_catergory_onehot['ps_reg_03'].head(10))

0    0.718070
1    0.766078
2   -1.000000
3    0.580948
4    0.840759
5    2.332649
6    0.617454
7    0.607248
8    0.901388
9    2.316652
Name: ps_reg_03, dtype: float64


In [None]:
# category 的列全部独热编码

data_train_catergory_onehot = pd.get_dummies(
    data=data_train,
    columns=[col for col in data_train_original_column_name_list if '_cat' in col]
)


In [7]:
# 获取各个列的类型
def get_column_type_pair_list(df: pd.DataFrame) -> list:
    dtype_info = df.dtypes
    return list(zip(list(dtype_info.index), [col.name for col in dtype_info]))


column_name_list_feature_type_int = [
    x[0]
    for x in filter(
        lambda x: 'float' not in x[1] and 'id' not in x[0] and 'target' not in x[0], 
        get_column_type_pair_list(data_train_catergory_onehot)
    )
]

column_name_list_feature_type_float = [
    x[0]
    for x in filter(
        lambda x: 'float' in x[1] and 'id' not in x[0] and 'target' not in x[0], 
        get_column_type_pair_list(data_train_catergory_onehot)
    )
]

print(column_name_list_feature_type_int)
print(column_name_list_feature_type_float)


['ps_ind_01', 'ps_ind_03', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin', 'ps_ind_02_cat_-1', 'ps_ind_02_cat_1', 'ps_ind_02_cat_2', 'ps_ind_02_cat_3', 'ps_ind_02_cat_4', 'ps_ind_04_cat_-1', 'ps_ind_04_cat_0', 'ps_ind_04_cat_1', 'ps_ind_05_cat_-1', 'ps_ind_05_cat_0', 'ps_ind_05_cat_1', 'ps_ind_05_cat_2', 'ps_ind_05_cat_3', 'ps_ind_05_cat_4', 'ps_ind_05_cat_5', 'ps_ind_05_cat_6', 'ps_car_01_cat_-1', 'ps_car_01_cat_0', 'ps_car_01_cat_1', 'ps_car_01_cat_2', 'ps_car_01_cat_3', 'ps_car_01_cat_4', 'ps_car_01_cat_5', 'ps_car_01_cat_6', 'ps_car_01_cat_7', 'ps_car_01_cat_8', 

In [7]:
# 整数类型的特征先indexing
feature_all_embedding_offset = 0

data_train_catergory_onehot_int_indexing, feature_all_embedding_length = preprocess_utils.indexing(
    df=data_train_catergory_onehot,
    columns=sorted(column_name_list_feature_type_int, reverse=True),
    offset=feature_all_embedding_offset,
    neat=False
)
feature_all_embedding_length = math.ceil(feature_all_embedding_length/100)*100

ps_ind_18_bin 1 0 2 [0, 1]
ps_ind_17_bin 1 0 2 [0, 1]
ps_ind_16_bin 1 0 2 [0, 1]
ps_ind_15 13 0 14 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
ps_ind_14 4 0 5 [0, 1, 2, 3, 4]
ps_ind_13_bin 1 0 2 [0, 1]
ps_ind_12_bin 1 0 2 [0, 1]
ps_ind_11_bin 1 0 2 [0, 1]
ps_ind_10_bin 1 0 2 [0, 1]
ps_ind_09_bin 1 0 2 [0, 1]
ps_ind_08_bin 1 0 2 [0, 1]
ps_ind_07_bin 1 0 2 [0, 1]
ps_ind_06_bin 1 0 2 [0, 1]
ps_ind_05_cat_6 1 0 2 [0, 1]


ps_ind_05_cat_5 1 0 2 [0, 1]
ps_ind_05_cat_4 1 0 2 [0, 1]
ps_ind_05_cat_3 1 0 2 [0, 1]
ps_ind_05_cat_2 1 0 2 [0, 1]
ps_ind_05_cat_1 1 0 2 [0, 1]
ps_ind_05_cat_0 1 0 2 [0, 1]
ps_ind_05_cat_-1 1 0 2 [0, 1]
ps_ind_04_cat_1 1 0 2 [0, 1]
ps_ind_04_cat_0 1 0 2 [0, 1]
ps_ind_04_cat_-1 1 0 2 [0, 1]
ps_ind_03 11 0 12 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
ps_ind_02_cat_4 1 0 2 [0, 1]
ps_ind_02_cat_3 1 0 2 [0, 1]
ps_ind_02_cat_2 1 0 2 [0, 1]
ps_ind_02_cat_1 1 0 2 [0, 1]
ps_ind_02_cat_-1 1 0 2 [0, 1]
ps_ind_01 7 0 8 [0, 1, 2, 3, 4, 5, 6, 7]
ps_car_11_cat_99 1 0 2 [0, 1]


ps_car_11_cat_98 1 0 2 [0, 1]
ps_car_11_cat_97 1 0 2 [0, 1]
ps_car_11_cat_96 1 0 2 [0, 1]
ps_car_11_cat_95 1 0 2 [0, 1]
ps_car_11_cat_94 1 0 2 [0, 1]
ps_car_11_cat_93 1 0 2 [0, 1]
ps_car_11_cat_92 1 0 2 [0, 1]
ps_car_11_cat_91 1 0 2 [0, 1]
ps_car_11_cat_90 1 0 2 [0, 1]
ps_car_11_cat_9 1 0 2 [0, 1]
ps_car_11_cat_89 1 0 2 [0, 1]
ps_car_11_cat_88 1 0 2 [0, 1]
ps_car_11_cat_87 1 0 2 [0, 1]
ps_car_11_cat_86 1 0 2 [0, 1]
ps_car_11_cat_85 1 0 2 [0, 1]
ps_car_11_cat_84 1 0 2 [0, 1]
ps_car_11_cat_83 1 0 2 [0, 1]
ps_car_11_cat_82 1 0 2 [0, 1]


ps_car_11_cat_81 1 0 2 [0, 1]
ps_car_11_cat_80 1 0 2 [0, 1]
ps_car_11_cat_8 1 0 2 [0, 1]
ps_car_11_cat_79 1 0 2 [0, 1]
ps_car_11_cat_78 1 0 2 [0, 1]
ps_car_11_cat_77 1 0 2 [0, 1]
ps_car_11_cat_76 1 0 2 [0, 1]
ps_car_11_cat_75 1 0 2 [0, 1]
ps_car_11_cat_74 1 0 2 [0, 1]
ps_car_11_cat_73 1 0 2 [0, 1]
ps_car_11_cat_72 1 0 2 [0, 1]
ps_car_11_cat_71 1 0 2 [0, 1]
ps_car_11_cat_70 1 0 2 [0, 1]
ps_car_11_cat_7 1 0 2 [0, 1]
ps_car_11_cat_69 1 0 2 [0, 1]
ps_car_11_cat_68 1 0 2 [0, 1]
ps_car_11_cat_67 1 0 2 [0, 1]


ps_car_11_cat_66 1 0 2 [0, 1]
ps_car_11_cat_65 1 0 2 [0, 1]
ps_car_11_cat_64 1 0 2 [0, 1]
ps_car_11_cat_63 1 0 2 [0, 1]
ps_car_11_cat_62 1 0 2 [0, 1]
ps_car_11_cat_61 1 0 2 [0, 1]
ps_car_11_cat_60 1 0 2 [0, 1]
ps_car_11_cat_6 1 0 2 [0, 1]
ps_car_11_cat_59 1 0 2 [0, 1]
ps_car_11_cat_58 1 0 2 [0, 1]
ps_car_11_cat_57 1 0 2 [0, 1]
ps_car_11_cat_56 1 0 2 [0, 1]
ps_car_11_cat_55 1 0 2 [0, 1]
ps_car_11_cat_54 1 0 2 [0, 1]
ps_car_11_cat_53 1 0 2 [0, 1]
ps_car_11_cat_52 1 0 2 [0, 1]
ps_car_11_cat_51 1 0 2 [0, 1]
ps_car_11_cat_50 1 0 2 [0, 1]


ps_car_11_cat_5 1 0 2 [0, 1]
ps_car_11_cat_49 1 0 2 [0, 1]
ps_car_11_cat_48 1 0 2 [0, 1]
ps_car_11_cat_47 1 0 2 [0, 1]
ps_car_11_cat_46 1 0 2 [0, 1]
ps_car_11_cat_45 1 0 2 [0, 1]
ps_car_11_cat_44 1 0 2 [0, 1]
ps_car_11_cat_43 1 0 2 [0, 1]
ps_car_11_cat_42 1 0 2 [0, 1]
ps_car_11_cat_41 1 0 2 [0, 1]
ps_car_11_cat_40 1 0 2 [0, 1]
ps_car_11_cat_4 1 0 2 [0, 1]
ps_car_11_cat_39 1 0 2 [0, 1]
ps_car_11_cat_38 1 0 2 [0, 1]
ps_car_11_cat_37 1 0 2 [0, 1]
ps_car_11_cat_36 1 0 2 [0, 1]
ps_car_11_cat_35 1 0 2 [0, 1]
ps_car_11_cat_34 1 0 2 [0, 1]


ps_car_11_cat_33 1 0 2 [0, 1]
ps_car_11_cat_32 1 0 2 [0, 1]
ps_car_11_cat_31 1 0 2 [0, 1]
ps_car_11_cat_30 1 0 2 [0, 1]
ps_car_11_cat_3 1 0 2 [0, 1]
ps_car_11_cat_29 1 0 2 [0, 1]
ps_car_11_cat_28 1 0 2 [0, 1]
ps_car_11_cat_27 1 0 2 [0, 1]
ps_car_11_cat_26 1 0 2 [0, 1]
ps_car_11_cat_25 1 0 2 [0, 1]
ps_car_11_cat_24 1 0 2 [0, 1]
ps_car_11_cat_23 1 0 2 [0, 1]
ps_car_11_cat_22 1 0 2 [0, 1]
ps_car_11_cat_21 1 0 2 [0, 1]
ps_car_11_cat_20 1 0 2 [0, 1]
ps_car_11_cat_2 1 0 2 [0, 1]
ps_car_11_cat_19 1 0 2 [0, 1]


ps_car_11_cat_18 1 0 2 [0, 1]
ps_car_11_cat_17 1 0 2 [0, 1]
ps_car_11_cat_16 1 0 2 [0, 1]
ps_car_11_cat_15 1 0 2 [0, 1]
ps_car_11_cat_14 1 0 2 [0, 1]
ps_car_11_cat_13 1 0 2 [0, 1]
ps_car_11_cat_12 1 0 2 [0, 1]
ps_car_11_cat_11 1 0 2 [0, 1]
ps_car_11_cat_104 1 0 2 [0, 1]
ps_car_11_cat_103 1 0 2 [0, 1]
ps_car_11_cat_102 1 0 2 [0, 1]
ps_car_11_cat_101 1 0 2 [0, 1]
ps_car_11_cat_100 1 0 2 [0, 1]
ps_car_11_cat_10 1 0 2 [0, 1]
ps_car_11_cat_1 1 0 2 [0, 1]
ps_car_10_cat_2 1 0 2 [0, 1]
ps_car_10_cat_1 1 0 2 [0, 1]
ps_car_10_cat_0 1 0 2 [0, 1]


ps_car_09_cat_4 1 0 2 [0, 1]
ps_car_09_cat_3 1 0 2 [0, 1]
ps_car_09_cat_2 1 0 2 [0, 1]
ps_car_09_cat_1 1 0 2 [0, 1]
ps_car_09_cat_0 1 0 2 [0, 1]
ps_car_09_cat_-1 1 0 2 [0, 1]
ps_car_08_cat_1 1 0 2 [0, 1]
ps_car_08_cat_0 1 0 2 [0, 1]
ps_car_07_cat_1 1 0 2 [0, 1]
ps_car_07_cat_0 1 0 2 [0, 1]
ps_car_07_cat_-1 1 0 2 [0, 1]
ps_car_06_cat_9 1 0 2 [0, 1]
ps_car_06_cat_8 1 0 2 [0, 1]
ps_car_06_cat_7 1 0 2 [0, 1]
ps_car_06_cat_6 1 0 2 [0, 1]
ps_car_06_cat_5 1 0 2 [0, 1]
ps_car_06_cat_4 1 0 2 [0, 1]
ps_car_06_cat_3 1 0 2 [0, 1]


ps_car_06_cat_2 1 0 2 [0, 1]
ps_car_06_cat_17 1 0 2 [0, 1]
ps_car_06_cat_16 1 0 2 [0, 1]
ps_car_06_cat_15 1 0 2 [0, 1]
ps_car_06_cat_14 1 0 2 [0, 1]
ps_car_06_cat_13 1 0 2 [0, 1]
ps_car_06_cat_12 1 0 2 [0, 1]
ps_car_06_cat_11 1 0 2 [0, 1]
ps_car_06_cat_10 1 0 2 [0, 1]
ps_car_06_cat_1 1 0 2 [0, 1]
ps_car_06_cat_0 1 0 2 [0, 1]
ps_car_05_cat_1 1 0 2 [0, 1]
ps_car_05_cat_0 1 0 2 [0, 1]
ps_car_05_cat_-1 1 0 2 [0, 1]
ps_car_04_cat_9 1 0 2 [0, 1]
ps_car_04_cat_8 1 0 2 [0, 1]
ps_car_04_cat_7 1 0 2 [0, 1]
ps_car_04_cat_6 1 0 2 [0, 1]


ps_car_04_cat_5 1 0 2 [0, 1]
ps_car_04_cat_4 1 0 2 [0, 1]
ps_car_04_cat_3 1 0 2 [0, 1]
ps_car_04_cat_2 1 0 2 [0, 1]
ps_car_04_cat_1 1 0 2 [0, 1]
ps_car_04_cat_0 1 0 2 [0, 1]
ps_car_03_cat_1 1 0 2 [0, 1]
ps_car_03_cat_0 1 0 2 [0, 1]
ps_car_03_cat_-1 1 0 2 [0, 1]
ps_car_02_cat_1 1 0 2 [0, 1]
ps_car_02_cat_0 1 0 2 [0, 1]
ps_car_02_cat_-1 1 0 2 [0, 1]
ps_car_01_cat_9 1 0 2 [0, 1]
ps_car_01_cat_8 1 0 2 [0, 1]
ps_car_01_cat_7 1 0 2 [0, 1]
ps_car_01_cat_6 1 0 2 [0, 1]
ps_car_01_cat_5 1 0 2 [0, 1]
ps_car_01_cat_4 1 0 2 [0, 1]


ps_car_01_cat_3 1 0 2 [0, 1]
ps_car_01_cat_2 1 0 2 [0, 1]
ps_car_01_cat_11 1 0 2 [0, 1]
ps_car_01_cat_10 1 0 2 [0, 1]
ps_car_01_cat_1 1 0 2 [0, 1]
ps_car_01_cat_0 1 0 2 [0, 1]
ps_car_01_cat_-1 1 0 2 [0, 1]
ps_calc_20_bin 1 0 2 [0, 1]
ps_calc_19_bin 1 0 2 [0, 1]
ps_calc_18_bin 1 0 2 [0, 1]
ps_calc_17_bin 1 0 2 [0, 1]
ps_calc_16_bin 1 0 2 [0, 1]
ps_calc_15_bin 1 0 2 [0, 1]
ps_calc_14 23 0 24 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
ps_calc_13 13 0 14 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
ps_calc_12 10 0 11 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
ps_calc_11 19 0 20 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
ps_calc_10 25 0 26 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]


ps_calc_09 7 0 8 [0, 1, 2, 3, 4, 5, 6, 7]
ps_calc_08 12 2 11 [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
ps_calc_07 9 0 10 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
ps_calc_06 10 0 11 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
ps_calc_05 6 0 7 [0, 1, 2, 3, 4, 5, 6]
ps_calc_04 5 0 6 [0, 1, 2, 3, 4, 5]


In [10]:
# 保存indexing后的offset
feature_all_config.parameter.put('embedding.index.offset', feature_all_embedding_offset)
feature_all_config.parameter.put('embedding.index.length', feature_all_embedding_length)

In [11]:
data_train_preprocessing_save_1_path = config.data.path('train_preprocessing_save_1.csv')
data_train_preprocessing_save_1_clip_path = config.data.path('train_preprocessing_save_1_clip.csv')


In [12]:
data_train_catergory_onehot_int_indexing.to_csv(data_train_preprocessing_save_1_path, index=False)

In [13]:
data_train_catergory_onehot_int_indexing\
    .groupby('target').apply(lambda x: x.sample(frac=0.005))\
    .to_csv(data_train_preprocessing_save_1_clip_path, index=False)

In [None]:
data_train_catergory_onehot_int_indexing = pd.read_csv(data_train_preprocessing_save_1_path)

In [91]:
# 计算下相关性
cor = data_train_catergory_onehot.corr()

In [96]:
print(cor[abs(cor['ps_reg_03']) > 0.05]['ps_reg_03'].sort_values())

ps_car_01_cat_7     -0.176180
ps_car_02_cat_1     -0.135702
ps_car_06_cat_0     -0.108981
ps_ind_06_bin       -0.096757
ps_car_01_cat_5     -0.092467
ps_car_04_cat_2     -0.086243
ps_ind_02_cat_1     -0.081510
ps_car_01_cat_4     -0.078400
ps_car_09_cat_0     -0.072855
ps_car_06_cat_1     -0.071510
ps_car_11_cat_32    -0.065386
ps_car_11_cat_87    -0.065127
ps_ind_04_cat_0     -0.064709
ps_car_11_cat_65    -0.060460
ps_car_11_cat_1     -0.060075
ps_car_01_cat_6     -0.058145
ps_car_11_cat_100   -0.053164
ps_car_06_cat_17     0.051723
ps_car_04_cat_9      0.055411
ps_car_07_cat_-1     0.057029
ps_ind_17_bin        0.059237
ps_car_01_cat_10     0.060911
ps_car_03_cat_1      0.062367
ps_car_06_cat_9      0.062469
ps_ind_04_cat_1      0.065224
ps_car_11_cat_28     0.066642
ps_ind_02_cat_2      0.068514
ps_car_09_cat_1      0.071275
ps_car_11            0.080365
ps_car_12            0.082621
ps_car_11_cat_104    0.084215
ps_car_06_cat_10     0.085541
ps_car_13            0.099415
ps_car_04_

In [98]:
data_train_catergory_onehot_ps_reg_03_intact = data_train_catergory_onehot[data_train_catergory_onehot['ps_reg_03'] != -1]
data_train_catergory_onehot_ps_reg_03_missing = data_train_catergory_onehot[data_train_catergory_onehot['ps_reg_03'] == -1]

In [101]:
print(len(data_train_catergory_onehot_ps_reg_03_intact))
print(len(data_train_catergory_onehot_ps_reg_03_missing))

487440
107772


In [102]:
preprocessed_data_column_name_list_feature_all = preprocess_utils.get_column_name_list_by_prefix(
    df=data_train_catergory_onehot,
    prefix='ps'
)

In [103]:
print(preprocessed_data_column_name_list_feature_all)

['ps_ind_01', 'ps_ind_03', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin', 'ps_ind_02_cat_-1', 'ps_ind_02_cat_1', 'ps_ind_02_cat_2', 'ps_ind_02_cat_3', 'ps_ind_02_cat_4', 'ps_ind_04_cat_-1', 'ps_ind_04_cat_0', 'ps_ind_04_cat_1', 'ps_ind_05_cat_-1', 'ps_ind_05_cat_0', 'ps_ind_05_cat_1', 'ps_ind_05_cat_2', 'ps_ind_05_cat_3', 'ps_ind_05_cat_4', 'ps_ind_05_cat_5', 'ps_ind_05_cat_6', 'ps_car_01_cat_-1', 'ps_car_01_cat_0', 'ps_ca

In [104]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support



In [None]:
data_fill_reg_0s = data_train_catergory_onehot_ps_reg_03_intact.sample(n=1000)

x_fill_reg_03 = \
    data_fill_reg_0s[preprocessed_data_column_name_list_feature_all]
y_fill_reg_03 = \
    data_fill_reg_0s['ps_reg_03']

x_fill_reg_03_train, x_fill_reg_03_test, y_fill_reg_03_train, y_fill_reg_03_test = train_test_split(
    x_fill_reg_03.values, y_fill_reg_03.values, train_size=0.8)



In [106]:
import xgboost as xgb

print("#"*120)
print("构建模型")
xgb_model = xgb.XGBClassifier(
    objective= 'reg:logistic',
    max_depth=5,
    min_child_weight=1,
    nthread=8,
)


########################################################################################################################
构建模型


In [113]:
rng = np.random.RandomState(31337)
kf = KFold(n_splits=4, shuffle=True, random_state=rng)

for i, (train_index, test_index) in enumerate(kf.split(x_fill_reg_03_train, y_fill_reg_03_train)):
    xgb_model.fit(x_fill_reg_03_train[train_index], y_fill_reg_03_train[train_index])

    _y = xgb_model.predict(x_fill_reg_03_train[test_index])
    y = y_fill_reg_03_train[test_index]

    confuse = confusion_matrix(y, _y)
    print("第{0}次, CV混淆矩阵 = ".format(i))
    print(confuse)

    acc = accuracy_score(xgb_model.predict(x_fill_reg_03_test), y_fill_reg_03_test)

    print("第{0}次, 测试集准确率 = ".format(i))
    print(acc)