In [1]:
# 导入库
import time
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion

In [2]:
# 基本状态查看
def stats_summary(df):
    '''
    查看数据集的记录数、维度数、前2条数据、描述性统计和数据类型
    :param df: 数据框
    :return: 无
    '''
    print('Data Overview:')
    print('Records: {0}\tDimension：{1}'.format(df.shape[0], df.shape[1]))  # 打印数据形状
    print('-' * 30)
    print('The first two rows:')
    print(df.head(2))    # 输出前2条数据
    print('-' * 30)
    print('Data Desc:')
    print(df.describe())  # 输出数据描述性统计信息
    print('-' * 30)
    print('Data Dtypes:')
    print(df.dtypes)      # 输出数据类型
    print('-' * 30)


# 缺失值查看
def na_summary(df):
    '''
    查看数据集的缺失列、行数量
    :param df: 数据框
    :return: 无
    '''
    na_cols = df.isnull().any(axis=0)    # 判断每一列是否具有缺失值
    print('NA Cols:')
    print(na_cols)  
    print('-' * 30)
    print('Valid records for each cols:') # 查看每一列有效值（非NA）的记录数
    print(df.count())                  
    print('-' * 30)
    print('Total number of NA lines is: {0}'.format(df.isnull().any(axis=1).sum()))  # 查看具有缺失值的行的记录数
    print('-' * 30)


# 缺失值替换
def na_replace(df):
    '''
    将数据集中的NA值使用自定义方法替换
    :param df: 数据框
    :return: NA值替换后的数据框
    '''
    na_rules = {'age': df['age'].mean(),
                'total_pageviews': df['total_pageviews'].mean(),
                'edu': df['edu'].median(),
                'edu_ages': df['edu_ages'].median(),
                'user_level': df['user_level'].median(),
                'industry': df['user_level'].median(),
                'act_level': df['act_level'].median(),
                'sex': df['sex'].median(),
                'red_money': df['red_money'].mean(),
                'region': df['region'].median()
                }  # 字典：定义各个列数据转换方法
    df = df.fillna(na_rules)  # 使用指定方法填充缺失值
    print('Check NA exists:')
    print((df.isnull().any().sum()))  # 查找是否还有缺失值
    print(('-' * 30))
    return df


# 样本均衡审查
def label_summary(df,labels,samples):
    '''
    查看每个类的样本量分布
    :param df: 数据框
    :param labels: 类别列名
    :param samples: 其他有效列名(无NA值)
    :return: 无
    '''
    print('Labels samples distribution:')
    print(df[samples].groupby(df[labels]).count())  
    print('-' * 30)


# 基于pipe的特征组合及模型训练
def pipeline_model(X, y=None, project_pipeline=None, train=True):
    '''
    建立一个包含特征组合以及模型训练的复合pipeline，实现基于管道的特征筛选、组合与模型训练一体化
    :param X: 特征集
    :param y: 预测目标集，默认为None
    :param project_pipeline: pipeline对象，训练阶段获取，测试阶段应用，默认为None
    :param train: 所处阶段，默认为True
    :return: 训练阶段返回pipeline对象，预测阶段返回预测值
    '''
    if train:  # 如果是训练阶段
        # 建立pipeline中用到的模型对象
        model_etc = ExtraTreesClassifier()  # ExtraTree分类，用于EFE的模型对象
        model_rfe = RFE(model_etc)  # 使用RFE方法提取重要特征
        model_lda = LinearDiscriminantAnalysis()  # LDA模型对象
        model_adaboost = AdaBoostClassifier()  # AdaBoost分类对象

        # 构建带有嵌套的pipeline
        project_pipeline = Pipeline([
            ('feature_union', FeatureUnion(  # 组合特征pipeline
                transformer_list=[
                    ('model_rfe', model_rfe),  # 通过RFE中提取特征
                    ('model_lda', model_lda),  # 通过LDA提取特征
                ],
                transformer_weights={  # 建立不同特征模型的权重
                    'model_rfe': 1,    # RFE模型权重
                    'model_lda': 0.8,  # LDA模型权重
                },
            )),
            ('model_adaboost', model_adaboost),  # adaboost模型对象
        ])

        # 设置参数值
        project_pipeline.set_params(
            feature_union__model_rfe__estimator__n_estimators=20)  # ExtraTreesClassifier中n_estimators值
        project_pipeline.set_params(
            feature_union__model_rfe__estimator__n_jobs=-1)        # ExtraTreesClassifier中n_jobs值
        project_pipeline.set_params(
            feature_union__model_rfe__n_features_to_select=20)     # RFE中n_features_to_select值
        project_pipeline.set_params(feature_union__model_lda__n_components=1)  # LDA中n_components值
        project_pipeline.set_params(feature_union__n_jobs=-1)                  # FeatureUnion中n_jobs值
        # project_pipeline.get_params()  # 打印pipeline参数详情

        # pipeline交叉检验
        num = 4  # 交叉检验次数
        cv = StratifiedKFold(num)  # 设置交叉检验
        score_list = list()  # 建立空列表，用于存放交叉检验得分
        time_list = list()  # 建立空列表，用于存储时间
        n_estimators_range = [50, 100, 150]  # 设置pipeline中adaboost的n_estimators值域
        for parameter in n_estimators_range:  # 遍历每个参数值
            t1 = time.time()  # 记录交叉检验开始的时间
            print(('set parameters: %s' % parameter))  # 打印当前模型使用的参数
            project_pipeline.set_params(model_adaboost__n_estimators=parameter)  # AdaBoostClassifier中n_estimators值
            score_tmp = cross_val_score(project_pipeline, X, y, scoring='accuracy',cv=cv)  # 使用交叉检验计算得分
            t2 = time.time()  # 记录交叉检验结束时间
            time_list.append(t2 - t1)  # 计算交叉检验时间并追加到列表
            score_list.append(score_tmp)  # 将得分追加到列表

        # 组合交叉检验得分和详情数据
        data = np.hstack((np.array([n_estimators_range, time_list]).T, np.array(score_list)))  
        cols = ['n_estimators', 'time']
        cols.extend([''.join(['score', str(i)]) for i in range(num)])
        score_pd = pd.DataFrame(data, columns=cols)  
        score_pd['score_mean'] = score_pd.iloc[:, 2:].mean(axis=1)   # 计算得分均值
        score_pd['score_std'] = score_pd.iloc[:, 2:-2].std(axis=1)  # 计算得分标准差
        print('pipeline score details:')
        print((score_pd.round(4)))  # 输出交叉检验指标数据，只保留4位小数
        print(('-' * 30))

        # 将最优参数设置到模型中，并训练pipeline
        best_estimators=score_pd['n_estimators'][score_pd['score_mean']==score_pd['score_mean'].max()].values[0].astype(int)
        print(best_estimators)
        project_pipeline.set_params(model_adaboost__n_estimators=best_estimators)  # 设置最优参数值
        project_pipeline.fit(X, y)  # 训练pipeline模型
        return project_pipeline    # 返回训练过的pipeline模型对象
    else:
        return project_pipeline.predict(X), project_pipeline.predict_proba(X)  # 返回预测值及概率

In [3]:
# 数据应用

# 读取训练数据集
raw_data = pd.read_excel('order.xlsx', sheet_name=0)  
X = raw_data.drop('response', axis=1)  
y = raw_data['response']  

# 数据审查和预处理
stats_summary(raw_data)  # 基本状态查看
na_summary(raw_data)     # 缺失值审查
X_t = na_replace(X)      # 替换缺失值 

Data Overview:
Records: 39999	Dimension：114
------------------------------
The first two rows:
    age  total_pageviews  edu  edu_ages  user_level  industry  value_level  \
0  39.0          77516.0  1.0      13.0         1.0       1.0            1   
1  50.0          83311.0  1.0      13.0         2.0       2.0            2   

   act_level  sex  blue_money  ...  label_92  label_93  label_94  label_95  \
0        1.0  1.0        2174  ...         0         1         0         1   
1        1.0  1.0           0  ...         1         1         0         0   

   label_96  label_97  label_98  label_99  label_100  response  
0         1         0         0         0          1         0  
1         1         0         1         0          1         0  

[2 rows x 114 columns]
------------------------------
Data Desc:
                age  total_pageviews           edu      edu_ages  \
count  39998.000000     3.999800e+04  39998.000000  39998.000000   
mean      38.589654     1.895136e+05  

In [4]:
# 样本均衡审查
label_summary(raw_data,'response','label_100') 

Labels samples distribution:
response
0    30415
1     9584
Name: label_100, dtype: int64
------------------------------


In [5]:
# 分类模型训练，获得最佳分类模型参数信息
project_pipeline = pipeline_model(X_t, y) 

set parameters: 50
set parameters: 100
set parameters: 150
pipeline score details:
   n_estimators      time  score0  score1  score2  score3  score_mean  \
0          50.0  337.4907  0.9110  0.9134  0.8876  0.7940      0.8765   
1         100.0  328.9727  0.9138  0.9155  0.8893  0.7970      0.8789   
2         150.0  340.2234  0.9146  0.9164  0.8922  0.7986      0.8804   

   score_std  
0     0.0143  
1     0.0147  
2     0.0135  
------------------------------
150


In [6]:
# 读取测试数据集
new_data = pd.read_excel('order.xlsx', sheet_name=1)  
final_reponse = new_data['final_response']           
new_X = new_data.drop('final_response', axis=1)  

# 数据审查和预处理
stats_summary(new_data)  # 基本状态查看
na_summary(new_data)  # 缺失值审查
new_X_t = na_replace(new_X)  # 替换缺失值

# 分类模型应用
new_X_final, new_X_pro = pipeline_model(new_X_t, project_pipeline=project_pipeline,train=False) 

Data Overview:
Records: 8843	Dimension：114
------------------------------
The first two rows:
   age  total_pageviews  edu  edu_ages  user_level  industry  value_level  \
0   61           243019   10         1         2.0       7.0            2   
1   33           215596    4         5         2.0       7.0            2   

   act_level  sex  blue_money  ...  label_92  label_93  label_94  label_95  \
0          1    1           0  ...         0         1         0         0   
1          5    1           0  ...         0         0         0         0   

   label_96  label_97  label_98  label_99  label_100  final_response  
0         1         1         0         0          0               0  
1         1         1         1         1          0               0  

[2 rows x 114 columns]
------------------------------
Data Desc:
               age  total_pageviews          edu     edu_ages   user_level  \
count  8843.000000     8.843000e+03  8843.000000  8843.000000  8841.000000   
mean

In [7]:
# 输出预测值以及预测概率
predict_labels = pd.DataFrame(new_X_final, columns=['labels'])  # 获得预测标签
predict_labels_pro = pd.DataFrame(new_X_pro, columns=['pro1', 'pro2'])  # 获得预测概率
predict_pd = pd.concat((new_data, predict_labels, predict_labels_pro), axis=1)  # 将预测标签、预测数据和原始数据合并
print('Predict info:')
print(predict_pd.head(2))  
print('-' * 60)

# 后续--与实际效果的比较
print('final accuracy: {0}'.format(accuracy_score(final_reponse, predict_labels)))

# 将预测结果写入Excel
writer = pd.ExcelWriter('order_predict_result.xlsx')  
predict_pd.to_excel(writer, 'Sheet1')  
writer.save()  

Predict info:
   age  total_pageviews  edu  edu_ages  user_level  industry  value_level  \
0   61           243019   10         1         2.0       7.0            2   
1   33           215596    4         5         2.0       7.0            2   

   act_level  sex  blue_money  ...  label_95  label_96  label_97  label_98  \
0          1    1           0  ...         0         1         1         0   
1          5    1           0  ...         0         1         1         1   

   label_99  label_100  final_response  labels      pro1      pro2  
0         0          0               0       0  0.508058  0.491942  
1         1          0               0       0  0.506187  0.493813  

[2 rows x 117 columns]
------------------------------------------------------------
final accuracy: 0.8318443966979532
