In [6]:
import os
from tqdm import tqdm
import pickle
import numpy as np
import pandas as pd

from sklearn.base import ClassifierMixin
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.calibration import CalibratedClassifierCV

from skopt.space import Real, Categorical
from skopt import BayesSearchCV
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import roc_curve, confusion_matrix, precision_score, accuracy_score, f1_score, matthews_corrcoef, auc

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_curve, confusion_matrix, precision_score, accuracy_score, f1_score, matthews_corrcoef, auc,precision_recall_curve
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold
def get_evaluation(label: list, pred: list, pro_cutoff: float = None):
    """Compute binary-classification metrics from positive-class scores.

    Args:
        label: Ground-truth labels, encoded as 0 (negative) / 1 (positive).
        pred: Scores or probabilities for the positive class, aligned with
            ``label``.
        pro_cutoff: Decision threshold applied to ``pred``. When ``None`` the
            threshold maximising ``tpr - fpr`` (Youden's J) on the ROC curve
            of this same data is used.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``f1_score``, ``mmc``
        (Matthews correlation coefficient), ``rocAUC``, ``prAUC``,
        ``specificity``, ``sensitivity`` and ``pro_cutoff``.
    """
    fpr, tpr, thresholds = roc_curve(label, pred)
    if pro_cutoff is None:
        pro_cutoff = thresholds[np.argmax(tpr - fpr)]
    pred_l = [1 if score >= pro_cutoff else 0 for score in pred]

    # Pin the label order so the matrix is always 2x2, even when only one
    # class occurs in `label`/`pred_l`; otherwise ravel() yields fewer than
    # four cells and the tn/fp/fn/tp lookup below breaks.
    tn, fp, fn, tp = confusion_matrix(label, pred_l, labels=[0, 1]).ravel()

    # Area under the precision-recall curve.
    precision, recall, _ = precision_recall_curve(label, pred)
    pr_auc = auc(recall, precision)

    negatives = tn + fp
    positives = tp + fn

    evaluation = {
        "accuracy": accuracy_score(label, pred_l),
        "precision": precision_score(label, pred_l),
        "f1_score": f1_score(label, pred_l),
        "mmc": matthews_corrcoef(label, pred_l),
        "rocAUC": auc(fpr, tpr),
        "prAUC": pr_auc,
        # Undefined (NaN) when the corresponding class is absent.
        "specificity": tn / negatives if negatives else float("nan"),
        "sensitivity": tp / positives if positives else float("nan"),
        'pro_cutoff': pro_cutoff
    }
    return evaluation

def plot_roc_curve(target, pred, path_to_: str):
    """Draw the ROC curve for ``pred`` against ``target`` and save it to disk.

    Args:
        target: Ground-truth binary labels.
        pred: Scores or probabilities for the positive class.
        path_to_: Output file path; the image format follows its extension.

    The figure is always closed before returning, so calling this function
    repeatedly (e.g. once per bootstrap iteration) does not accumulate open
    figures in memory or leave an empty figure for the notebook to render.
    """
    fpr, tpr, _ = roc_curve(target, pred)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(19.2, 10.8))
    try:
        ax.plot(fpr, tpr, color='red', lw=2,
                label='ROC curve (area = %0.2f)' % roc_auc)
        # Diagonal reference line: performance of a random classifier.
        ax.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver operating characteristic (ROC) curve')
        ax.legend(loc="lower right")

        fig.savefig(str(path_to_))
    finally:
        # plt.clf() only clears the figure; close() actually releases it.
        plt.close(fig)

class MyOptimitzer:
    """Bayesian hyper-parameter search for one classifier plus a result summary.

    Typical use is ``MyOptimitzer(name, cls, space).find_best(X, y, val).get_summary(dir)``.
    """

    def __init__(self, classifier_name: str, classifier_class: ClassifierMixin, classifier_param_dict: dict) -> None:
        """Store the classifier description; nothing is fitted here.

        Args:
            classifier_name: Label used in file names and in the summary.
            classifier_class: Estimator class; instantiated with no arguments.
            classifier_param_dict: Search space passed to ``BayesSearchCV``
                as ``search_spaces``.
        """
        self.classifier_name = classifier_name
        self.classifier_class = classifier_class
        self.classifier_param_dict = classifier_param_dict

        # Fitted search object; a BayesSearchCV despite the attribute name.
        self.grid_search: BayesSearchCV = None
        # [predict_proba output on the validation set, validation labels]
        self.best_predicted_pair = None

    def find_best(self, X, y, validation: tuple):
        """Run the search on ``(X, y)`` and score the refit model on ``validation``.

        Args:
            X: Training features.
            y: Training labels.
            validation: ``(features, labels)`` pair used for the final
                probability predictions.

        Returns:
            self, to allow method chaining.
        """
        self.grid_search = BayesSearchCV(
            self.classifier_class(),
            search_spaces=self.classifier_param_dict,
            cv=RepeatedStratifiedKFold(
                n_splits=5,  # originally 5
                n_repeats=2,
                random_state=42
            ),
            scoring='roc_auc',
            # Controls parallelism: a positive value is the number of workers,
            # a negative value leaves some CPUs free (joblib convention).
            n_jobs=-4,
            refit=True
        )
        self.grid_search.fit(X, y)
        validation_features, validation_labels = validation[0], validation[1]
        self.best_predicted_pair = [
            self.grid_search.predict_proba(X=validation_features),
            validation_labels
        ]
        return self

    def get_summary(self, path_to_dir: str = None):
        """Persist the fitted search (optional) and return its evaluation.

        Args:
            path_to_dir: Directory receiving ``<name>.pkl`` (pickled search
                object) and ``<name>.pdf`` (ROC curve). When ``None`` nothing
                is written and ``Model_Path`` is reported as ``"-"``.

        Returns:
            pandas.Series with ``Classifier_Name``, ``Optimitied_Param``,
            ``Model_Path`` and every metric produced by ``get_evaluation``.

        Raises:
            RuntimeError: if ``find_best`` has not been called yet.
        """
        if self.grid_search is None or self.best_predicted_pair is None:
            raise RuntimeError("find_best() must be called before get_summary()")

        probabilities, labels = self.best_predicted_pair
        positive_scores = probabilities[:, 1]

        if path_to_dir is not None:
            # Only touch the filesystem when a target directory was given;
            # os.makedirs(None) would raise.
            os.makedirs(path_to_dir, exist_ok=True)
            model_path = f"{path_to_dir}/{self.classifier_name}.pkl"
            with open(model_path, "wb") as f:
                pickle.dump(self.grid_search, f)
            plot_roc_curve(
                target=labels,
                pred=positive_scores,
                path_to_=f"{path_to_dir}/{self.classifier_name}.pdf"
            )
        else:
            model_path = "-"

        return pd.Series({
            "Classifier_Name": self.classifier_name,
            "Optimitied_Param": self.grid_search.best_params_,
            "Model_Path": model_path
        } | get_evaluation(
            label=labels,
            pred=positive_scores,
        ))
# Classifiers to evaluate: display name, estimator class and search space.
find_space = [
    {
        "name": "LogisticRegression",
        "class": LogisticRegression,
        "param": [{
            "penalty": ['l2', ],
            "C": [0.1, 1.0, ],
            "solver": ['lbfgs',],
            'class_weight': ['balanced',],
            'max_iter': [1000, ]
        }, ],
        "Bayes": False
    },
]
a = 1  # run identifier; selects the output sub-directory below

# Load data: every column except the last is a feature, '拔管成功' is the target.
f = pd.read_excel('/mnt/md0/Public/T3_T4/feature.xlsx')
feature = f.iloc[0:, 0:-1]
feature_ = feature.astype("float").values
target = f.loc[0:, '拔管成功']
target_ = target.values

model_path_to_save = f'BT/{a}'
os.makedirs(model_path_to_save, exist_ok=True)

result_list = []

# Bootstrapping loop
n_iterations = 100
# Seeded generator so the resampled indices are identical on every re-run.
rng = np.random.default_rng(42)
n_samples = len(feature_)
metric_columns = [
    "accuracy", "precision", "f1_score", "mmc", "rocAUC", "specificity", "sensitivity", "pro_cutoff", "prAUC"]

for model_index in tqdm(range(len(find_space))):
    model_spec = find_space[model_index]
    model_results = []

    for i in range(n_iterations):
        # Draw a bootstrap sample; rows never drawn form the out-of-bag test set.
        train_id = rng.choice(n_samples, size=n_samples, replace=True)
        test_id = np.setdiff1d(np.arange(n_samples), train_id)

        # Train on the bootstrap sample and validate on the out-of-bag rows.
        result = MyOptimitzer(
            model_spec["name"],
            model_spec["class"],
            model_spec["param"],
        ).find_best(
            X=feature_[train_id],
            y=target_[train_id],
            validation=(feature_[test_id], target_[test_id])
        ).get_summary(
            path_to_dir=f"{model_path_to_save}/{i}"
        )

        model_results.append(result)

    # Merge the per-iteration results of the current model (one row each).
    bootstrapped_results = pd.concat(model_results, axis=1).T

    bootstrapped_results.loc[:, ["Classifier_Name", "Optimitied_Param", "Model_Path"]].to_csv(
        f"{model_path_to_save}/{model_spec['name']}_Param.csv"
    )
    fivecross_result_splited = bootstrapped_results.loc[:, metric_columns]
    fivecross_result_splited.to_csv(
        f"{model_path_to_save}/{model_spec['name']}_Bootstrapping.csv"
    )

    # Average over the iterations actually run instead of a hard-coded 100,
    # so the summary stays correct when n_iterations changes.
    series = fivecross_result_splited.astype(float).mean(axis=0)
    series.name = model_spec["name"]
    result_list.append(series)

pd.concat(
    result_list, axis=1,
).T.to_csv(
    f"{model_path_to_save}/_results.csv",
    index=True
)






[A[A[A


100%|██████████| 1/1 [00:38<00:00, 38.08s/it]


<Figure size 1920x1080 with 0 Axes>

In [13]:
from Bio import SeqIO
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, confusion_matrix, precision_score, accuracy_score, f1_score, matthews_corrcoef, recall_score,auc,precision_recall_curve

# Column layout of the result table: every metric name maps to an empty
# placeholder; only the keys (and their order) are used below.
data = dict.fromkeys(
    (
        'rocAUC', 'prAUC', 'MCC', 'F1',
        'Precision', 'Accuracy', 'Sensitivity',
        'Specificity', 'FPR', 'Recall', 'pro_cutoff',
        'Youden',
    ),
    '',
)
# Empty frame carrying just those columns.
df = pd.DataFrame(columns=list(data))

predict_result_list = []


def calculate_fpr(y_true, y_pred):
    """Return the false-positive rate ``fp / (fp + tn)``.

    Args:
        y_true: Ground-truth labels, encoded as 0 (negative) / 1 (positive).
        y_pred: Predicted labels with the same encoding.

    Returns:
        The false-positive rate, or NaN when ``y_true`` holds no negatives.
    """
    # Pin the label order so the matrix is always 2x2; without it a single
    # observed class gives a 1x1 matrix and the four-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = fp + tn
    if negatives == 0:
        return float("nan")
    return fp / negatives

n_iterations = 100
f = pd.read_excel('/mnt/md0/Public/T3_T4/feature.xlsx')
# Only the first column is used as the (single) predictor.
feature = f.iloc[0:, 0:1]
feature_ = feature.astype("float").values
target = f.loc[0:, '拔管成功']
target_ = target.values

# Seeded generator so the bootstrap samples are reproducible across re-runs.
rng = np.random.default_rng(42)
n_samples = len(feature_)

pro_cutoff_ = []  # never filled; kept only so the name stays defined
evaluation_rows = []

# Scan candidate cutoffs expressed in feature units (1..50).
for p in range(1, 51):
    # Fresh accumulators for every cutoff: each output row must average only
    # the bootstrap iterations evaluated at *this* p, not all earlier ones.
    rocauc = []
    prAUC = []
    MCC = []
    F1 = []
    Precision = []
    Accuracy = []
    Sensitivity = []
    Specificity = []
    FPR_ = []
    Recall = []
    Youden = []

    for _ in range(n_iterations):
        # Draw a bootstrap sample and fit the model on it.
        train_id = rng.choice(n_samples, size=n_samples, replace=True)
        feat = feature_[train_id]
        tar = target_[train_id]
        model = LogisticRegression()
        model.fit(feat, tar)

        # Metrics are computed on the full data set, not on held-out rows.
        pred = model.predict_proba(feature_)[:, 1]
        fpr, tpr, thresholds = roc_curve(target_, pred)

        # Translate the feature-space cutoff p into a probability cutoff
        # through the fitted logistic function.
        coef = model.coef_[0][0]
        intercept = model.intercept_[0]
        logit = intercept + coef * p  # compute the logit value
        pro_cutoff = 1 / (1 + np.exp(-logit))
        pred_l = [1 if score >= pro_cutoff else 0 for score in pred]

        tn, fp, fn, tp = confusion_matrix(target_, pred_l).ravel()
        sensitivity = tp / (tp + fn)
        specificity = tn / (tn + fp)

        # Area under the precision-recall curve.
        precision, recall, _ = precision_recall_curve(target_, pred)

        rocauc.append(auc(fpr, tpr))
        prAUC.append(auc(recall, precision))
        MCC.append(matthews_corrcoef(target_, pred_l))
        F1.append(f1_score(target_, pred_l))
        Precision.append(precision_score(target_, pred_l, zero_division=1))
        Accuracy.append(accuracy_score(target_, pred_l))
        Sensitivity.append(sensitivity)
        Specificity.append(specificity)
        FPR_.append(fp / (fp + tn))
        Recall.append(recall_score(target_, pred_l))
        Youden.append(sensitivity + specificity - 1)

    evaluation = {
        "rocAUC": np.mean(rocauc),
        "prAUC": np.mean(prAUC),
        "MCC": np.mean(MCC),
        "F1": np.mean(F1),
        "Precision": np.mean(Precision),
        "Accuracy": np.mean(Accuracy),
        "Sensitivity": np.mean(Sensitivity),
        "Specificity": np.mean(Specificity),
        "FPR": np.mean(FPR_),
        "Recall": np.mean(Recall),
        'pro_cutoff': p,
        'Youden': np.mean(Youden)
    }
    evaluation_rows.append(evaluation)

# Build the table once instead of concatenating inside the loop.
df = pd.DataFrame(evaluation_rows)

df.to_excel('boot_logistics3.xlsx', index=False)

In [7]:
import pandas as pd
import numpy as np
n_iterations = 100
f = pd.read_excel('/mnt/md0/Public/T3_T4/feature.xlsx')
# Only the first column is used as the (single) predictor.
feature = f.iloc[0:, 0:1]
feature_ = feature.astype("float").values
target = f.loc[0:, '拔管成功']
target_ = target.values

# Seeded generator so the draws are reproducible across re-runs.
rng = np.random.default_rng(42)
n_samples = len(feature_)
# `size` must be an integer: len(...) * 0.8 is a float and is rejected by
# the sampler, so truncate it explicitly.
n_train = int(n_samples * 0.8)

for i in range(n_iterations):
    # Draw a bootstrap sample of 80% of the rows (with replacement).
    train_id = rng.choice(n_samples, size=n_train, replace=True)
    # Rows that were never drawn form the hold-out set.
    test_id = np.setdiff1d(np.arange(n_samples), train_id)
    feat = feature_[test_id]
    print(feat)

[[24.]
 [27.]
 [24.]
 [20.]
 [20.]
 [26.]
 [25.]
 [26.]
 [27.]
 [24.]
 [21.]
 [19.]
 [23.]
 [26.]
 [26.]
 [24.]
 [29.]
 [31.]
 [30.]
 [34.]
 [36.]
 [35.]
 [35.]
 [35.]
 [30.]
 [34.]
 [35.]
 [30.]
 [31.]
 [27.]
 [36.]
 [29.]]
[[25.]
 [26.]
 [28.]
 [27.]
 [20.]
 [21.]
 [24.]
 [19.]
 [26.]
 [23.]
 [25.]
 [25.]
 [29.]
 [30.]
 [30.]
 [31.]
 [35.]
 [36.]
 [36.]
 [35.]
 [35.]
 [34.]
 [33.]
 [30.]
 [28.]
 [31.]
 [29.]
 [27.]
 [34.]
 [36.]
 [33.]
 [32.]]
[[24.]
 [25.]
 [23.]
 [27.]
 [20.]
 [24.]
 [26.]
 [25.]
 [20.]
 [19.]
 [23.]
 [25.]
 [29.]
 [30.]
 [30.]
 [34.]
 [33.]
 [27.]
 [35.]
 [34.]
 [33.]
 [36.]
 [35.]
 [33.]
 [28.]
 [32.]
 [30.]
 [28.]
 [27.]
 [28.]
 [36.]
 [33.]
 [29.]
 [33.]]
[[24.]
 [28.]
 [27.]
 [26.]
 [20.]
 [28.]
 [24.]
 [26.]
 [27.]
 [19.]
 [23.]
 [25.]
 [26.]
 [24.]
 [28.]
 [29.]
 [29.]
 [31.]
 [35.]
 [35.]
 [35.]
 [36.]
 [33.]
 [36.]
 [35.]
 [35.]
 [36.]
 [28.]
 [34.]
 [32.]
 [31.]
 [27.]
 [28.]
 [36.]
 [33.]
 [34.]]
[[25.]
 [28.]
 [24.]
 [20.]
 [24.]
 [24.]
 [27.]
 [24.]
 [