-
Notifications
You must be signed in to change notification settings - Fork 4
/
rulefit-smote-kfold-multi-run.py
134 lines (122 loc) · 6.06 KB
/
rulefit-smote-kfold-multi-run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""
This is an implementation of the RuleFit algorithm for predicting bio datasets with k-fold cross-validation and SMOTE.
Derived from: Christoph Molnar
Source: https://github.com/christophM/rulefit
Date: 2019-01-03
"""
import os
import time
import numpy as np
import pandas as pd
from multiprocessing import current_process, Pool
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from rulefit import RuleFit
from rulefit import utils
from imblearn.over_sampling import SMOTE
__author__ = 'Min'
def train_job(train_idx, test_idx, t_size, rf_mode, m_rules, r_seed, X, y, feas, n_samples):
    """Fit a RuleFit model on one fold and score it on the real test samples.

    Runs inside a multiprocessing.Pool worker.  SMOTE appends synthetic
    samples after the originals, so any index >= n_samples is synthetic and
    is dropped from the test fold before evaluation.

    Returns a (true_labels, predicted_labels) pair for later aggregation.
    """
    worker = current_process()
    print('process counter:', worker._identity[0], 'pid:', os.getpid())
    # Build the estimator and fit it on this fold's training split.
    model = RuleFit(tree_size=t_size, rfmode=rf_mode,
                    max_rules=m_rules, random_state=r_seed)
    print("\nTree generator:{0}, \n\nMax rules:{1}, Tree size:{2}, Random state:{3}".format(
        model.tree_generator, model.max_rules, model.tree_size, model.random_state))
    model.fit(X[train_idx], y[train_idx], feas)
    # Keep only original (non-SMOTE) samples in the test fold.
    real_idx = test_idx[test_idx < n_samples]
    fold_x = X[real_idx]
    fold_y = y[real_idx]
    fold_size = len(real_idx)
    predictions = model.predict(fold_x)
    # Accuracy on the real test samples only.
    fold_acc = accuracy_score(fold_y, predictions)
    print("\nTest Accuracy:", "{:.6f}".format(fold_acc), "Test Size:", fold_size)
    print("\n=========================================================================")
    # Hand the labels back so the parent process can pool fold statistics.
    return fold_y, predictions
if __name__ == "__main__":
    start_time = time.time()
    parser = ArgumentParser(description="This a implementation of the rulefit algorithm for predict bio datasets with kfold cross-validation and SMOTE.",
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument("-t", "--treesize", type=int,
                        help="Number of terminal nodes in generated trees. If exp_rand_tree_size=True, this will be the mean number of terminal nodes.", default=4)
    parser.add_argument("-m", "--maxrules", type=int,
                        help="approximate total number of rules generated for fitting. Note that actual number of rules will usually be lower than this due to duplicates.", default=2000)
    parser.add_argument("--rfmode", type=str,
                        help="'regress' for regression or 'classify' for binary classification.", default='classify')
    parser.add_argument("-k", "--kfolds", type=int,
                        help="Number of folds. Must be at least 2.", default=10)
    parser.add_argument("-r", "--randomseed", type=int,
                        help="pseudo-random number generator state used for shuffling.", default=None)
    parser.add_argument("--datapath", type=str,
                        help="The path of dataset.", required=True)
    args = parser.parse_args()

    # Load the dataset: first column is the class label, the rest are features.
    df = pd.read_csv(args.datapath)
    X = df.iloc[:, 1:].values
    y = df.iloc[:, 0].values

    # Over-sample minority classes with SMOTE.  Synthetic samples are
    # appended after the originals, so indices < X.shape[0] remain real.
    # NOTE(review): resampling BEFORE the k-fold split lets synthetic points
    # derived from test samples leak into training folds; train_job only
    # filters fake points out of the *test* side.  Consider applying SMOTE
    # inside each fold instead — confirm with the experiment owner.
    sm = SMOTE(k_neighbors=2)
    # imbalanced-learn renamed fit_sample -> fit_resample and removed the
    # old alias in 0.8; support both so the script runs on either version.
    if hasattr(sm, "fit_resample"):
        x_resampled, y_resampled = sm.fit_resample(X, y)
    else:
        x_resampled, y_resampled = sm.fit_sample(X, y)

    # Standardize features (scaler fitted on the resampled data).
    scaler = StandardScaler().fit(x_resampled)
    x_resampled = scaler.transform(x_resampled)

    # Feature names (everything but the label column).
    features = df.columns[1:]
    print("\nDataset shape: ", df.shape, " Number of features: ", features.size)

    # Per-class sample counts before over-sampling.
    num_categories = np.unique(y).size
    sum_y = np.asarray(np.unique(y.astype(int), return_counts=True))
    df_sum_y = pd.DataFrame(sum_y.T, columns=['Class', 'Sum'], index=None)
    print('\n', df_sum_y)

    # Per-class sample counts after over-sampling.
    np_resampled_y = np.asarray(np.unique(y_resampled.astype(int), return_counts=True))
    df_resampled_y = pd.DataFrame(np_resampled_y.T, columns=['Class', 'Sum'])
    print("\nNumber of samples after over sampleing:\n\n{0}".format(df_resampled_y))

    # Build the k-fold train/test index generator over the resampled data
    # (KFold.split only uses the length of its first argument here).
    rs = KFold(n_splits=args.kfolds, shuffle=True, random_state=args.randomseed)
    resampled_index_set = rs.split(y_resampled)

    # Accumulators for the true / predicted labels across folds.
    # (np.int was removed in NumPy 1.24; the builtin int is equivalent.)
    test_cache = pred_cache = np.array([], dtype=int)

    print("\nTraining Start...")
    # Train the folds in parallel, one worker process per fold.
    pool = Pool(processes=args.kfolds)
    res = []
    for train_index, test_index in resampled_index_set:
        result = pool.apply_async(
            train_job, args=(train_index, test_index),
            kwds=dict(t_size=args.treesize, rf_mode=args.rfmode,
                      m_rules=args.maxrules, r_seed=args.randomseed,
                      X=x_resampled, y=y_resampled, feas=features,
                      n_samples=X.shape[0]))
        res.append(result)
    pool.close()
    pool.join()

    # Fetch results only after the pool has been closed and joined, so the
    # blocking .get() calls cannot stall later submissions
    # (https://jingsam.github.io/2015/12/31/multiprocessing.html).
    for async_result in res:
        fold_true, fold_pred = async_result.get()
        # Real labels from this fold's test split.
        test_cache = np.concatenate((test_cache, np.array(fold_true)))
        # Corresponding predicted labels.
        pred_cache = np.concatenate((pred_cache, np.array(fold_pred)))

    # Aggregated evaluation over all folds.
    print("\n=== Model parameters ===")
    if num_categories > 2:
        utils.model_evaluation(num_categories, test_cache, pred_cache)
    else:
        utils.bi_model_evaluation(test_cache, pred_cache)

    end_time = time.time()  # wall-clock end of the whole run
    print("\n[Finished in: {0:.6f} mins = {1:.6f} seconds]".format(
        ((end_time - start_time) / 60), (end_time - start_time)))