In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the merged GSE25066 table; every column except `group` is kept as a feature.
data = pd.read_csv('GSE25066_merge.csv')
# `pop` removes the label column from `data` in place and returns it.
y = np.array(data.pop("group"))
# Disabled: reshape the labels into a column vector.
# y = np.expand_dims(y, axis=1)

In [None]:
# Feature matrix of the original data (the `group` column was already popped from `data`).
origin_X = np.array(data)

In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

# Baseline: random forest with 100 trees. The seed is fixed so the
# cross-validation scores are reproducible across kernel restarts.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Shuffled 3-fold cross-validation.
k_fold = KFold(n_splits=3, random_state=42, shuffle=True)

# Score obtained on the held-out part of each fold.
scores = []

# The folds must be drawn from `origin_X`, the matrix that is indexed below.
# No variable named `X` exists at this point on a fresh kernel.
for train_indices, test_indices in tqdm(k_fold.split(origin_X), total=k_fold.get_n_splits()):
    X_train, X_test = origin_X[train_indices], origin_X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    # Refit on this fold's training part ...
    rf_classifier.fit(X_train, y_train)

    # ... and score on the part that was held out.
    score = rf_classifier.score(X_test, y_test)
    scores.append(score)

# Report the per-fold scores and their mean.
print("Cross-validation scores: {}".format(scores))
print("Average score: {}".format(sum(scores) / len(scores)))

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


# 计算每个指标的平均值

def train(X, y, n_splits=8, test=0.2, random_state=42, n_estimators=200):
    """Cross-validate a random forest on a training split and score it on a held-out split.

    The data are split once (stratified on ``y``) into a training and a test
    part. The forest is evaluated with shuffled K-fold cross-validation on the
    training part, then fitted on the whole training part and scored on the
    test part.

    Parameters
    ----------
    X : array-like with a ``.shape`` attribute
        Feature matrix, one row per sample.
    y : array-like
        Labels. The 'recall', 'precision' and 'f1' scorers are used with their
        defaults, so binary labels are assumed -- TODO confirm for new datasets.
    n_splits : int
        Number of cross-validation folds.
    test : float
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int
        Seed used for the split, the fold shuffling and the forest.
    n_estimators : int
        Number of trees in the forest.

    Returns
    -------
    (score_train, score_test) : tuple of dict
        Both dicts use the keys 'Accuracy', 'Recall', 'Precision', 'F1' and
        'Auc'; values are rounded to 3 decimals. ``score_train`` holds the mean
        cross-validation scores, ``score_test`` the held-out scores.
    """
    # Local import: the notebook's import cell does not bring this name in.
    from sklearn.model_selection import cross_validate

    rf_classifier = RandomForestClassifier(random_state=random_state, n_estimators=n_estimators)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    print(f'n_splits:{n_splits},test_data:{test},random_state:{random_state},n_estimators:{n_estimators}')

    # Kept from the original: also seeds NumPy's global generator.
    np.random.seed(random_state)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test, stratify=y, random_state=random_state)

    # Display name -> scikit-learn scorer name.
    scoring = {
        'Accuracy': 'accuracy',
        'Recall': 'recall',
        'Precision': 'precision',
        'F1': 'f1',
        'Auc': 'roc_auc'
    }
    score_train = {}
    score_test = {}
    print(X.shape,X_train.shape,X_test.shape)

    # One multi-metric pass fits each fold once instead of once per metric.
    cv_results = cross_validate(rf_classifier, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    for metric_name in scoring:
        avg_metric_score = cv_results[f'test_{metric_name}'].mean()
        score_train[metric_name] = round(avg_metric_score, 3)
        print(f'validation: {metric_name}: {avg_metric_score:.3f}')

    # Cross-validation works on clones, so `rf_classifier` is still unfitted here.
    rf_classifier.fit(X_train, y_train)
    y_pred = rf_classifier.predict(X_test)
    # ROC AUC needs a continuous score; hard labels give only one threshold.
    # Column 1 is the probability of the class with the greater label.
    y_score = rf_classifier.predict_proba(X_test)[:, 1]

    score_test['Accuracy'] = round(accuracy_score(y_test, y_pred), 3)
    score_test['Recall'] = round(recall_score(y_test, y_pred), 3)
    score_test['Precision'] = round(precision_score(y_test, y_pred), 3)
    score_test['F1'] = round(f1_score(y_test, y_pred), 3)
    score_test['Auc'] = round(roc_auc_score(y_test, y_score), 3)
    print('***********test*************')
    print(score_test)
    return score_train, score_test

# Origin_score

In [None]:
# Held-out scores on the original data: 8 folds, 10% test split, 100 trees.
# The cross-validation scores (first return value) are discarded.
_,origin_score = train(origin_X, y, 8, test=0.1, n_estimators=100)
origin_score

In [None]:
from sklearn.datasets import make_classification
from collections import Counter
# Load the GAN-generated data set and split off its `group` label column.
gan_df_X = pd.read_csv('gan_GSE25066.csv')
gan_df_y = np.array(gan_df_X.pop("group"))
# Compare the class counts of the original labels with those of the GAN data set.
print(f'数据集中各类别样本的数量：{Counter(y)}')
print(f'生成对抗网络生成之后各类别样本的数量：{Counter(gan_df_y)}')

In [None]:
# Feature matrix of the GAN data set (labels were already popped into `gan_df_y`).
gan_X  = np.array(gan_df_X)
# Displays both full arrays as the cell output.
gan_X, gan_df_y

# Gan_score

In [None]:
# Same settings as the original-data run, applied to the GAN data set.
# NOTE(review): the test split is drawn from the GAN table itself, so presumably
# it contains generated samples -- confirm this is the intended evaluation.
_, gan_score = train(gan_X, gan_df_y, 8, test=0.1, n_estimators=100, random_state=42)
gan_score

# 读取pathways特征

In [None]:
# Load the pathway-score table and split it into labels and a feature matrix.
ptways = pd.read_csv('GSE25066_Pathways.csv')
# `pop` removes the `group` column from `ptways` in place.
ptways_y = np.array(ptways.pop("group"))
ptways_X = np.array(ptways)

In [None]:
# Class counts of the pathway-score labels.
Counter(ptways_y)

In [None]:
# Run on all pathway features with 8 folds.
# NOTE(review): this call falls back to `train`'s defaults (test=0.2, n_estimators=200),
# unlike the other runs, which pass test=0.1 and n_estimators=100.
train(ptways_X, ptways_y, 8)

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination driven by a logistic-regression estimator:
# one feature is dropped per step until 245 remain.
estimator = LogisticRegression()
selector = RFE(estimator=estimator, n_features_to_select=245, step=1)

# Fit the selector on the pathway data, then keep only the selected columns.
# NOTE(review): the selector is fitted on all rows, before `train` splits off
# its test set -- confirm that this is acceptable for the reported scores.
selector.fit(ptways_X, ptways_y)
X_selected = selector.transform(ptways_X)

In [None]:
# Disabled alternative to the RFE selection: variance-threshold feature selection.
# from sklearn.feature_selection import VarianceThreshold

# # Create a variance-threshold feature selector
# selector = VarianceThreshold(threshold=0.070)

# # Apply the selector to the features
# X_selected = selector.fit_transform(ptways_X)

In [None]:
# Shape of the feature matrix after selection.
X_selected.shape

# Gan_pathifier_score

In [None]:
# Held-out scores on the selected pathway features, same settings as the other runs.
_, gan_pathifier_score = train(X_selected, ptways_y, n_splits=8, test=0.1, n_estimators=100, random_state=42)

In [None]:
# Show the three held-out score dicts side by side.
origin_score,gan_score, gan_pathifier_score 

In [None]:
# 绘制性能指标对比图像
import matplotlib.pyplot as plt
# Tick labels of the metric groups and legend labels of the three runs.
X = ['Accuracy', 'Recall', 'Precision', 'F1', 'AUC']
X_labels = ['ORIGIN_RF', 'GAN_RF', 'GAN_PATHIFIER_RF']

# One list of bar heights per run, in the same metric order as `X`.
# The score dicts spell the last key 'Auc'.
ORIGIN_RF_score_bar, GAN_RF_score_bar, GAN_PATHIFIER_RF_score_bar = (
    [run_scores[key] for key in ('Accuracy', 'Recall', 'Precision', 'F1', 'Auc')]
    for run_scores in (origin_score, gan_score, gan_pathifier_score)
)

def plot_preference(X, X_labels, origin_score, gan_score, gan_pathifier_score, save_fig='GSE25066_Classification.png', title='GSE25066 Classification'):
    """Draw a grouped bar chart comparing three runs, save it and show it.

    Parameters
    ----------
    X : sequence of str
        Metric names, one bar group per entry; also used as x tick labels.
    X_labels : sequence of str
        Legend labels for the three runs, in the order original, GAN,
        GAN + pathways.
    origin_score, gan_score, gan_pathifier_score : dict or sequence
        Scores of each run. A dict is looked up by metric name ignoring case
        (so the label 'AUC' finds the key 'Auc'); a sequence is taken as the
        bar heights already ordered like ``X``.
    save_fig : str
        Path the figure is written to.
    title : str
        Figure title.

    Raises
    ------
    KeyError
        If a score dict has no entry for one of the names in ``X``.
    """
    def _bar_values(scores):
        # Read heights from the argument, not from notebook-level globals.
        if isinstance(scores, dict):
            by_name = {str(name).lower(): value for name, value in scores.items()}
            return [by_name[str(metric).lower()] for metric in X]
        return list(scores)

    runs = [_bar_values(origin_score), _bar_values(gan_score), _bar_values(gan_pathifier_score)]

    plt.figure(figsize=(13,8))
    bar_width = 0.25
    # Left-most bar centre of every metric group.
    index = np.arange(len(X))

    for position, values in enumerate(runs):
        centres = index + position * bar_width
        plt.bar(centres, values, bar_width, label=X_labels[position])
        # Print each value just above its bar.
        for x_pos, value in zip(centres, values):
            plt.text(x_pos, value, str(value), ha='center', va='bottom')

    plt.title(title)
    plt.xlabel('Performance')
    # NOTE(review): the label says '%', but the scores produced by `train` are
    # fractions rounded to 3 decimals -- confirm the intended unit.
    plt.ylabel('Score(%)')
    plt.legend(fontsize=7, loc='upper left')

    # Bars are centred at index, index + w and index + 2w, so the middle of
    # each three-bar group is index + w.
    plt.xticks(index + bar_width, X)
    plt.savefig(save_fig)
    plt.show()

   

In [None]:
# Draw the comparison chart for the three runs; the output path uses the function's default.
plot_preference(X=X, 
                X_labels=X_labels,
                origin_score=origin_score,
                gan_score=gan_score,
                gan_pathifier_score=gan_pathifier_score,
                title='GSE25066 Classification'
               )

# 使用其他分类器对比

In [None]:
from sklearn.svm import SVC

def train_model(X, y, n_splits=8, test=0.2, random_state=42, n_estimators=200, model='rf'):
    """Cross-validate the chosen classifier and score it on a held-out split.

    The data are split once (stratified on ``y``) into a training and a test
    part. The classifier is evaluated with shuffled K-fold cross-validation on
    the training part, then fitted on the whole training part and scored on
    the test part.

    Parameters
    ----------
    X : array-like with a ``.shape`` attribute
        Feature matrix, one row per sample.
    y : array-like
        Labels. The 'recall', 'precision' and 'f1' scorers are used with their
        defaults, so binary labels are assumed -- TODO confirm for new datasets.
    n_splits : int
        Number of cross-validation folds.
    test : float
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int
        Seed used for the split, the fold shuffling and the classifier.
    n_estimators : int
        Number of trees; only used when ``model`` is 'rf'.
    model : str
        Key understood by ``select_model`` ('rf' or 'svm').

    Returns
    -------
    (score_train, score_test) : tuple of dict
        Both dicts use the keys 'Accuracy', 'Recall', 'Precision', 'F1' and
        'Auc'; values are rounded to 3 decimals.

    Raises
    ------
    ValueError
        If ``model`` is not a known key.
    """
    # Local import: the notebook's import cells do not bring this name in.
    from sklearn.model_selection import cross_validate

    # The `model` argument now selects the estimator (it used to be ignored).
    classifier = select_model(model, random_state=random_state, n_estimators=n_estimators)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    print(f'n_splits:{n_splits},test_data:{test},random_state:{random_state},n_estimators:{n_estimators}')

    # Kept from the original: also seeds NumPy's global generator.
    np.random.seed(random_state)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test, stratify=y, random_state=random_state)

    # Display name -> scikit-learn scorer name.
    scoring = {
        'Accuracy': 'accuracy',
        'Recall': 'recall',
        'Precision': 'precision',
        'F1': 'f1',
        'Auc': 'roc_auc'
    }
    score_train = {}
    score_test = {}
    print(X.shape,X_train.shape,X_test.shape)

    # One multi-metric pass fits each fold once instead of once per metric.
    cv_results = cross_validate(classifier, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)
    for metric_name in scoring:
        avg_metric_score = cv_results[f'test_{metric_name}'].mean()
        score_train[metric_name] = round(avg_metric_score, 3)
        print(f'validation: {metric_name}: {avg_metric_score:.3f}')

    # Cross-validation works on clones, so `classifier` is still unfitted here.
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    # ROC AUC needs a continuous score; hard labels give only one threshold.
    y_score = _positive_class_scores(classifier, X_test)

    score_test['Accuracy'] = round(accuracy_score(y_test, y_pred), 3)
    score_test['Recall'] = round(recall_score(y_test, y_pred), 3)
    score_test['Precision'] = round(precision_score(y_test, y_pred), 3)
    score_test['F1'] = round(f1_score(y_test, y_pred), 3)
    score_test['Auc'] = round(roc_auc_score(y_test, y_score), 3)
    print('***********test*************')
    print(score_test)
    return score_train, score_test


def _positive_class_scores(estimator, X):
    """Return a continuous score for the class with the greater label."""
    if hasattr(estimator, 'predict_proba'):
        return estimator.predict_proba(X)[:, 1]
    # Estimators without probability output (e.g. a default SVC) expose
    # signed decision values instead.
    return estimator.decision_function(X)


def select_model(model, random_state=None, n_estimators=100):
    """Build the unfitted classifier registered under ``model``.

    Parameters
    ----------
    model : str
        'rf' for a random forest or 'svm' for a support vector classifier.
    random_state : int or None
        Seed forwarded to the classifier.
    n_estimators : int
        Number of trees; only used for 'rf'.

    Returns
    -------
    A new, unfitted estimator.

    Raises
    ------
    ValueError
        If ``model`` is not a known key.
    """
    # Factories, so that only the requested estimator is constructed.
    models = {
        "rf": lambda: RandomForestClassifier(random_state=random_state, n_estimators=n_estimators),
        "svm": lambda: SVC(random_state=random_state),
    }
    try:
        factory = models[model]
    except KeyError:
        raise ValueError(f"unknown model {model!r}; expected one of {sorted(models)}") from None
    return factory()

# 使用pycaret训练

## 读取原始数据

In [2]:
import pandas as pd
import numpy as np

In [23]:
# Shared settings passed to every PyCaret `setup` call below.
# Value passed as `train_size`.
train_size = 0.9
# Value passed as `fold`.
fold = 8

In [24]:
# Original data as a DataFrame; the `group` column stays in place because `setup` takes it as the target by name.
origin_data = pd.read_csv('GSE25066_merge.csv')

In [25]:
from pycaret.classification import ClassificationExperiment
# Experiment 1: original data, `group` as target, K-fold strategy with the shared settings.
s = ClassificationExperiment()
# The 'seed' config value is set before `setup`; `session_id` below uses the same value.
s.set_config('seed', 42)
s.setup(origin_data,
        target = 'group', 
        session_id = 42, 
        train_size=train_size,
        fold_strategy='kfold',
        fold=fold,
       )

Unnamed: 0,Description,Value
0,Session id,42
1,Target,group
2,Target type,Binary
3,Original data shape,"(488, 13237)"
4,Transformed data shape,"(488, 13237)"
5,Transformed train set shape,"(439, 13237)"
6,Transformed test set shape,"(49, 13237)"
7,Numeric features,13236
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x7faea4173f40>

In [26]:
# Compare the available models on the original data; the table below is this call's output.
s.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.8134,0.0,0.9423,0.8419,0.8881,0.2988,0.338,0.425
lr,Logistic Regression,0.8088,0.8088,0.9366,0.841,0.885,0.2889,0.324,0.8138
lda,Linear Discriminant Analysis,0.8043,0.7912,0.939,0.8354,0.8831,0.263,0.2906,0.435
rf,Random Forest Classifier,0.7997,0.733,0.9883,0.8043,0.8862,0.0531,0.0817,0.6362
lightgbm,Light Gradient Boosting Machine,0.7997,0.7281,0.9775,0.8105,0.8853,0.0997,0.1432,0.3375
knn,K Neighbors Classifier,0.7996,0.6925,0.9514,0.8245,0.8823,0.2052,0.2374,0.74
et,Extra Trees Classifier,0.7974,0.7579,0.9914,0.8014,0.8854,0.037,0.0679,0.4725
dummy,Dummy Classifier,0.7974,0.5,1.0,0.7974,0.8863,0.0,0.0,0.3038
gbc,Gradient Boosting Classifier,0.7792,0.7147,0.9432,0.8109,0.871,0.076,0.0803,0.33
svm,SVM - Linear Kernel,0.7657,0.0,0.8581,0.858,0.8474,0.2542,0.2704,0.4425


## 使用gan生成的平衡数据

In [27]:
# GAN data set as a DataFrame, with the `group` target column kept in place.
gan_data = pd.read_csv('gan_GSE25066.csv')

In [28]:
# Experiment 2: same configuration as the first experiment, on the GAN data set.
# NOTE(review): the train/test split is made on the GAN table itself, so presumably
# the test set contains generated samples -- confirm this is the intended evaluation.
s2 = ClassificationExperiment()
s2.set_config('seed', 42)
s2.setup(gan_data, 
         target = 'group', 
         session_id = 42, 
         train_size=train_size, 
         fold_strategy='kfold',
         fold=fold,
        )

Unnamed: 0,Description,Value
0,Session id,42
1,Target,group
2,Target type,Binary
3,Original data shape,"(778, 13237)"
4,Transformed data shape,"(778, 13237)"
5,Transformed train set shape,"(700, 13237)"
6,Transformed test set shape,"(78, 13237)"
7,Numeric features,13236
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x7fae48b09610>

In [29]:
# Compare the available models on the GAN data set; the table below is this call's output.
s2.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.9272,0.0,0.9197,0.9328,0.9254,0.8507,0.8526,0.375
rf,Random Forest Classifier,0.8743,0.9263,0.9925,0.8013,0.8856,0.7428,0.766,0.54
lr,Logistic Regression,0.8729,0.9403,0.9313,0.8286,0.8765,0.7385,0.7454,0.4325
et,Extra Trees Classifier,0.8714,0.9319,0.9972,0.7941,0.8834,0.7363,0.7626,0.4162
nb,Naive Bayes,0.87,0.8676,1.0,0.7908,0.8823,0.7338,0.7618,0.4138
knn,K Neighbors Classifier,0.8686,0.913,0.9495,0.8146,0.8762,0.7308,0.7416,0.3638
lightgbm,Light Gradient Boosting Machine,0.8685,0.9255,0.96,0.8115,0.8783,0.7309,0.7444,0.3113
svm,SVM - Linear Kernel,0.8643,0.0,0.9135,0.8327,0.8683,0.7225,0.7322,0.265
ada,Ada Boost Classifier,0.857,0.9201,0.9067,0.8255,0.8621,0.707,0.715,0.4588
gbc,Gradient Boosting Classifier,0.8542,0.9302,0.9464,0.7968,0.8646,0.7008,0.7143,0.6075


## 读取pathways score

In [30]:
# Pathway-score table as a DataFrame, with the `group` target column kept in place.
ptways_data = pd.read_csv('GSE25066_Pathways.csv')
# Displays the frame as the cell output.
ptways_data

Unnamed: 0,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,...,P287,P288,P289,P290,P291,P292,P293,P294,P295,group
0,0.652611,0.495346,0.047954,0.662963,0.735513,0.555973,0.378495,0.541849,0.584928,0.649398,...,0.609205,0.509882,0.623106,0.574223,0.496415,0.532876,0.722280,0.553822,0.608116,0
1,0.842995,0.037106,0.000000,0.626206,0.679602,0.169469,0.941549,0.444544,0.000000,0.788208,...,0.419986,0.517451,0.355912,0.332875,0.257972,0.404462,0.743766,0.536958,0.104767,0
2,0.408753,0.229237,0.258692,0.571939,0.337781,0.277547,0.259577,0.304155,0.295918,0.354925,...,0.346002,0.490969,0.371533,0.324349,0.282662,0.267778,0.586543,0.392367,0.330188,0
3,0.399076,0.296493,0.002233,0.878496,0.642641,0.485244,0.820809,0.531869,0.467797,0.702102,...,0.537548,0.542883,0.529179,0.509160,0.372515,0.504179,0.602727,0.458319,0.477012,0
4,0.619888,0.344163,0.145156,0.629054,0.792862,0.612424,0.645237,0.636430,0.640469,0.425681,...,0.584482,0.533999,0.633018,0.576897,0.570664,0.510094,0.766075,0.555534,0.622362,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
773,0.780953,0.472283,0.402083,0.848834,0.841829,0.335453,0.816556,0.250886,0.529502,0.699910,...,0.458262,0.412346,0.328081,0.238987,0.345733,0.475420,0.578152,0.440174,0.335495,1
774,0.376542,0.664727,0.188929,0.467122,0.288284,0.247069,0.840098,0.378218,0.441010,0.370800,...,0.663944,0.380639,0.451076,0.377037,0.242163,0.799175,0.436973,0.358878,0.381521,1
775,0.383277,0.373971,0.360171,0.355983,0.295145,0.381857,0.260191,0.185358,0.350350,0.417225,...,0.372857,0.452868,0.373583,0.340856,0.348174,0.471055,0.583116,0.444745,0.367293,1
776,0.546009,0.306282,0.212256,0.686410,0.569083,0.040985,0.489773,0.509462,0.235780,0.510976,...,0.728129,0.388307,0.783554,0.735152,0.128658,0.437308,0.232475,0.511149,0.796648,1


In [31]:
# Experiment 3: same configuration as the other experiments, on the pathway-score table.
s3 = ClassificationExperiment()
s3.set_config('seed', 42)
s3.setup(ptways_data, 
         target = 'group', 
         session_id = 42, 
         train_size=train_size, 
         fold_strategy='kfold',
         fold=fold
        )

Unnamed: 0,Description,Value
0,Session id,42
1,Target,group
2,Target type,Binary
3,Original data shape,"(778, 296)"
4,Transformed data shape,"(778, 296)"
5,Transformed train set shape,"(700, 296)"
6,Transformed test set shape,"(78, 296)"
7,Numeric features,295
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x7faea4173340>

In [32]:
# Compare the available models on the pathway scores.
# NOTE(review): `cross_validation=False` is passed only here; the two earlier
# comparisons use the default. Per the PyCaret documentation this switches the
# metrics to the hold-out set, so this table is presumably not directly
# comparable with the other two -- confirm before reporting them together.
s3.compare_models(cross_validation=False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9744,0.998,0.9487,1.0,0.9737,0.9487,0.95,0.07
et,Extra Trees Classifier,0.9744,1.0,0.9487,1.0,0.9737,0.9487,0.95,0.07
lightgbm,Light Gradient Boosting Machine,0.9744,0.9934,0.9487,1.0,0.9737,0.9487,0.95,0.05
gbc,Gradient Boosting Classifier,0.9615,0.9961,0.9744,0.95,0.962,0.9231,0.9234,0.04
dt,Decision Tree Classifier,0.8974,0.8974,0.8974,0.8974,0.8974,0.7949,0.7949,0.04
knn,K Neighbors Classifier,0.8846,0.9707,0.7949,0.9688,0.8732,0.7692,0.7819,0.03
ada,Ada Boost Classifier,0.8846,0.9474,0.8974,0.875,0.8861,0.7692,0.7695,0.08
lr,Logistic Regression,0.859,0.9435,0.8205,0.8889,0.8533,0.7179,0.7201,0.04
ridge,Ridge Classifier,0.8333,0.8333,0.7436,0.9062,0.8169,0.6667,0.6777,0.04
nb,Naive Bayes,0.7692,0.8189,0.8205,0.7442,0.7805,0.5385,0.5413,0.04
