In [1]:
# 导入库
import copy  

import numpy as np  
import pandas as pd  
from datetime import datetime
from imblearn.over_sampling import SMOTE  
from sklearn.ensemble import AdaBoostClassifier,VotingClassifier,GradientBoostingClassifier,RandomForestClassifier,BaggingClassifier  
from sklearn.model_selection import StratifiedKFold, cross_val_score  
from sklearn.preprocessing import LabelEncoder  

In [2]:
# 数据审查和预处理函数


# 基本状态查看
def stats_summary(df):
    '''
    查看数据集的记录数、维度数、前2条数据、描述性统计和数据类型
    :param df: 数据框
    :return: 无
    '''
    print('Data Overview:')
    print('Records: {0}\tDimension：{1}'.format(df.shape[0], df.shape[1]))  # 打印数据形状
    print('-' * 30)
    print('The first two rows:')
    print(df.head(2))    # 输出前2条数据
    print('-' * 30)
    print('Data Desc:')
    print(df.describe())  # 输出数据描述性统计信息
    print('-' * 30)
    print('Data Dtypes:')
    print(df.dtypes)      # 输出数据类型
    print('-' * 60)

    
# 缺失值查看
def na_summary(df):
    '''
    查看数据集的缺失列、行数量
    :param df: 数据框
    :return: 无
    '''
    na_cols = df.isnull().any(axis=0)    # 判断每一列是否具有缺失值
    print('NA Cols:')
    print(na_cols)  
    print('-' * 30)
    print('Valid records for each cols:') # 查看每一列有效值（非NA）的记录数
    print(df.count())                  
    print('-' * 30)
    print('Total number of NA lines is: {0}'.format(df.isnull().any(axis=1).sum()))  # 查看具有缺失值的行的记录数
    print('-' * 60)


# 字符串分类转数值分类
def label_encoder(data, model_list=None, train=True):
    '''
    将特征中的字符串分类转换为数值分类
    :param data: 输入数据集
    :param model_list: LabelEncoder对象列表，在训练阶段产生
    :param train: 是否为训练阶段
    :return: 训练阶段产生训练后的LabelEncoder对象列表和转换后的数据，预测阶段产生转换后的数据
    '''
    convert_cols = ['cat', 'attribution', 'pro_id', 'pro_brand', 'order_source', 'pay_type','use_id', 'city']  # 定义要转换的列
    value_list = []  # 存放转换后的数据
    if train:
        model_list = []  # 存放每个特征转换的实例对象
        model_label_encoder = LabelEncoder()
        for i in convert_cols:
            model_label_encoder.fit(data[i])
            value_list.append(model_label_encoder.transform(data[i]))
            model_list.append(copy.copy(model_label_encoder))
            # print(list(model_label_encoder.classes_))
        convert_matrix = np.array(value_list).T
        return model_list, convert_matrix
    else:
        for ind, j in enumerate(convert_cols):
            # print(list(model_list[ind].classes_))
            value_list.append(model_list[ind].transform(data[j]))
        convert_matrix = np.array(value_list).T
        return convert_matrix


# 时间属性拓展
def datetime2int(date_series,time_series):
    '''
    将日期和时间数据拓展出其他属性，例如星期几、周几、小时、分钟等。
    :param date_series: 日期列
    :param time_series: 时间列
    :return: 拓展后的属性矩阵
    '''
    date_set = [datetime.strptime(dates, '%Y-%m-%d') for dates in date_series]  # 将data_series转换为特定日期格式
    weekday_data = [data.weekday() for data in date_set]  # 周几
    day_data = [data.day for data in date_set]            # 当月几号
    month_data = [data.month for data in date_set]        # 月份

    time_set = [datetime.strptime(times, '%H:%M:%S') for times in time_series]  # 将time_series转换为特定时间格式
    second_data = [data.second for data in time_set]  # 秒
    minute_data = [data.minute for data in time_set]  # 分钟
    hour_data = [data.hour for data in time_set]      # 小时

    final_set = [weekday_data, day_data, month_data, second_data, minute_data, hour_data]  # 将属性列表批量组合
    final_matrix = np.array(final_set).T  # 转换为矩阵并转置
    return final_matrix


# 样本均衡审查
def label_summary(df,labels,samples):
    '''
    查看每个类的样本量分布
    :param df: 数据框
    :param labels: 类别列名
    :param samples: 其他有效列名(无NA值)
    :return: 无
    '''
    print('Labels samples distribution:')
    print(df[samples].groupby(df[labels]).count())  
    print('-' * 60)


# 样本均衡
def sample_balance(X, y):
    '''
    使用SMOTE方法对不均衡样本做过抽样处理
    :param X: 输入特征变量X
    :param y: 目标变量y
    :return: 均衡后的X和y
    '''
    model_smote = SMOTE()  # 建立SMOTE模型对象
    x_smote_resampled, y_smote_resampled = model_smote.fit_resample(X, y)  # 输入数据并作过抽样处理
    return x_smote_resampled, y_smote_resampled

In [3]:
# 读取数据集
# 定义特殊字段数据格式
dtypes = {'order_id': object,
          'pro_id': object,
          'use_id': object}
raw_data = pd.read_table('abnormal_orders.txt', delimiter=',', dtype=dtypes)  

# 数据审查
stats_summary(raw_data)  # 基本状态查看
na_summary(raw_data)  # 缺失值审查

Data Overview:
Records: 134190	Dimension：14
------------------------------
The first two rows:
     order_id  order_date order_time     cat attribution      pro_id  \
0  4277880103  2013-10-17   13:09:16     NaN          GO  8000001215   
1  4283851335  2013-09-23   14:09:49  手机摄影数码         POP  8002042497   

  pro_brand  total_money  total_quantity order_source pay_type      use_id  \
0       NaN       1000.0            1000         游戏站点     当当支付  murongchun   
1        三星     766000.0             200           主站     合并支付   dakehu_zy   

  city  abnormal_label  
0  北京市               0  
1  上海市               1  
------------------------------
Data Desc:
         total_money  total_quantity  abnormal_label
count  134189.000000   134190.000000   134190.000000
mean      660.111987        1.195588        0.212065
std      2901.208639        3.230545        0.408772
min         0.500000        1.000000        0.000000
25%        29.000000        1.000000        0.000000
50%        98.4000

In [4]:
label_summary(raw_data,'abnormal_label','use_id')  # 类样本分布审查

Labels samples distribution:
abnormal_label
0    105733
1     28457
Name: use_id, dtype: int64
------------------------------------------------------------


In [5]:
# 数据预处理
drop_na_set = raw_data.dropna()  # 丢弃带有NA值的数据行
X_raw, y_raw = drop_na_set.iloc[:, 1:-1], drop_na_set.iloc[:, -1]  # 分割输入变量X和y
model_list, convert_matrix = label_encoder(X_raw)  # 字符串分类转整数型分类
datetime2int_data = datetime2int(X_raw['order_date'],X_raw['order_time'])  # 拓展日期时间属性
combine_set = np.hstack((convert_matrix, datetime2int_data))  # 合并转换后的分类和拓展后的日期数据集
constant_set = X_raw[['total_money', 'total_quantity']]  # 原始连续数据变量
X_combine = np.hstack((combine_set, constant_set))  # 再次合并数据集
X, y = sample_balance(X_combine, y_raw)  # 样本均衡处理

In [6]:
# 组合分类模型交叉检验
model_rf = RandomForestClassifier(max_features=0.8, random_state=0)  # 随机森林分类模型对象
model_adaC = AdaBoostClassifier(random_state=0)  # Adaboost分类模型对象
model_BagC = BaggingClassifier(random_state=0)  # Bagging分类模型对象
model_gdbc = GradientBoostingClassifier(max_features=0.8, random_state=0)  # GradientBoosting分类模型对象
estimators = [('randomforest', model_rf), ('adaboost', model_adaC),
              ('bagging', model_BagC), ('gradientboosting', model_gdbc)]  # 建立组合评估器列表
model_vot = VotingClassifier(estimators=estimators, voting='soft', weights=[0.9, 1.2, 1.1, 1.1],n_jobs=-1)  # 建立组合评估模型
cv = StratifiedKFold(5)  # 设置交叉检验方法
cv_score = cross_val_score(model_vot, X, y, cv=cv)  # 交叉检验
print('Cross val scores:{}'.format(cv_score))    # 打印每次交叉检验得分
print('Mean scores is: %.2f' % cv_score.mean())  # 打印平均交叉检验得分
model_vot.fit(X, y)  # 模型训练

Cross val scores:[0.42571367 0.85707449 0.91730277 0.8800938  0.80416368]
Mean scores is: 0.78


VotingClassifier(estimators=[('randomforest',
                              RandomForestClassifier(max_features=0.8,
                                                     random_state=0)),
                             ('adaboost', AdaBoostClassifier(random_state=0)),
                             ('bagging', BaggingClassifier(random_state=0)),
                             ('gradientboosting',
                              GradientBoostingClassifier(max_features=0.8,
                                                         random_state=0))],
                 n_jobs=-1, voting='soft', weights=[0.9, 1.2, 1.1, 1.1])

In [7]:
# 新数据集做预测
X_raw_data = pd.read_csv('new_abnormal_orders.csv', dtype=dtypes)  # 读取要预测的数据集
X_raw_new = X_raw_data.iloc[:, 1:]  # 丢弃订单ID列
convert_matrix_new = label_encoder(X_raw_new, model_list, False)  # 字符串分类转整数型分类
datetime2int_data_new = datetime2int(X_raw_new['order_date'],X_raw_new['order_time'])  # 日期时间转换
combine_set_new = np.hstack((convert_matrix_new, datetime2int_data_new))  # 合并转换后的分类和拓展后的日期数据集
constant_set_new = X_raw_new[['total_money', 'total_quantity']]  # 原始连续数据变量
X_combine_new = np.hstack((combine_set_new, constant_set_new))  # 再次合并数据集
y_predict = model_vot.predict(X_combine_new)  # 预测结果
print('Predicted labels:{}'.format(y_predict)) # 打印预测值

Predicted labels:[1 0 0 0 0 0 0]
