In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn import ensemble
from sklearn import preprocessing
import sklearn.tree as tree
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score, roc_auc_score
import shap

In [None]:
# Load the preprocessed training table.
df = pd.read_csv('./process_data/train.csv')
# Sort by ' timedelta' in descending order (original author's note: this puts
# publication time from earliest to latest). The column key used here starts
# with a space. The split below is positional, so the first 70% of rows in
# this order become the training set and the remainder the test set.
df = df.sort_values(by=' timedelta', ascending=False)

# NOTE(review): `0:2` selects TWO columns as the target, while the metrics cell
# calls roc_curve / precision_score / recall_score with their binary defaults.
# The CSV schema is not visible here -- confirm which single column is the
# label (and whether the other one is an id column) before trusting results.
y = df.iloc[:, 0:2]
# Features: every column from position 2 up to, but not including, the last.
# NOTE(review): presumably the last column is intentionally excluded -- confirm.
X = df.iloc[:, 2:-1]

# Positional 70/30 split; open-ended slices replace the old `shape[0]+1` bound.
train_size = int(0.7 * df.shape[0])
X_train = X.iloc[:train_size, :]
X_test = X.iloc[train_size:, :]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]
assert len(X_train) + len(X_test) == len(df)

# Standardise with statistics learned from the training rows ONLY and apply the
# same transform to the test rows. Scaling each split independently (as before)
# maps the same raw value to different scaled values in train and test, so the
# split thresholds learned by the model no longer line up on the test data.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Tune a random forest with an exhaustive grid search on the training split,
# then predict hard class labels for the test split with the refit best model.
param_grid = {
    # 'criterion':['entropy','gini'],
    'max_depth': [5, 6, 7, 8],    # maximum depth of each tree in the forest
    'n_estimators': [10, 20, 50, 100, 200, 400],  # number of trees in the forest
    # 'max_features':[0.3,0.4,0.5],
    #  # fraction of features considered at each split
    'min_samples_split': [4, 8, 12, 16]  # minimum samples a node needs to be split further
}

# Fixed seed: without it the bootstrap samples and feature subsets differ on
# every run, so the selected hyper-parameters and all downstream metrics
# change between Restart & Run All executions.
rfc = ensemble.RandomForestClassifier(random_state=42)
# 4-fold cross-validation, model selection by ROC AUC.
rfc_cv = GridSearchCV(estimator=rfc, param_grid=param_grid,
                      scoring='roc_auc', cv=4)
rfc_cv.fit(X_train, y_train)
test_est = rfc_cv.predict(X_test)

In [None]:
# Report test-set metrics for the tuned random forest.
#
# ROC / AUC are ranking metrics and need a continuous score. Feeding them the
# hard 0/1 predictions collapses the curve to a single operating point and is
# inconsistent with the 'roc_auc' scorer used during the grid search, so the
# predicted probability of the second class (rfc_cv.classes_[1]) is used here.
# NOTE(review): assumes a binary target whose positive class is the larger
# label -- confirm against the label encoding.
test_proba = rfc_cv.predict_proba(X_test)[:, 1]

print('随机森林精确度:')
print(metrics.classification_report(y_test, test_est))
print('随机森林 AUC:')
fpr_test, tpr_test, th_test = metrics.roc_curve(y_test, test_proba)
print('AUC = %.4f' % metrics.auc(fpr_test, tpr_test))
# Threshold-dependent metrics keep using the hard label predictions.
print('accuracy     ' + str(accuracy_score(y_test, test_est)))
print('precision    ' + str(precision_score(y_test, test_est)))
print('recall       ' + str(recall_score(y_test, test_est)))
print('f1_score     ' + str(f1_score(y_test, test_est)))
print('AUC          ' + str(roc_auc_score(y_test, test_proba)))