In [4]:

import os
from tqdm import tqdm
import pickle
import numpy as np
import pandas as pd

from sklearn.base import ClassifierMixin
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.calibration import CalibratedClassifierCV

from skopt.space import Real, Categorical
from skopt import BayesSearchCV
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import roc_curve, confusion_matrix, precision_score, accuracy_score, f1_score, matthews_corrcoef, auc

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_curve, confusion_matrix, precision_score, accuracy_score, f1_score, matthews_corrcoef, auc,precision_recall_curve
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold
def get_evaluation(label: list, pred: list, pro_cutoff: float = None):
    """Compute binary-classification metrics for scores against true labels.

    Parameters
    ----------
    label
        True class labels. They are expected to be encoded as 0/1, because
        the thresholded predictions produced below are 0/1.
    pred
        Score (e.g. predicted probability) of the positive class for each
        sample, in the same order as ``label``.
    pro_cutoff
        Decision threshold applied to ``pred``. When ``None`` the threshold
        on the ROC curve that maximises ``tpr - fpr`` (Youden's J statistic)
        is used.

    Returns
    -------
    dict
        Keys: ``accuracy``, ``precision``, ``f1_score``, ``mmc`` (Matthews
        correlation coefficient), ``rocAUC``, ``prAUC``, ``specificity``,
        ``sensitivity`` and ``pro_cutoff`` (the threshold actually applied).
        ``specificity`` / ``sensitivity`` are NaN when the corresponding
        class has no samples.
    """
    fpr, tpr, thresholds = roc_curve(label, pred)
    if pro_cutoff is None:
        pro_cutoff = thresholds[np.argmax(tpr - fpr)]

    # Hard 0/1 predictions at the chosen threshold.
    pred_l = (np.asarray(pred) >= pro_cutoff).astype(int)

    # labels=[0, 1] forces a 2x2 matrix even when only one class is present;
    # without it ravel() can yield fewer than four counts.
    tn, fp, fn, tp = confusion_matrix(label, pred_l, labels=[0, 1]).ravel()
    negatives = tn + fp
    positives = tp + fn

    # Area under the precision-recall curve (added after the other metrics).
    precision, recall, _ = precision_recall_curve(label, pred)
    pr_auc = auc(recall, precision)

    return {
        "accuracy": accuracy_score(label, pred_l),
        "precision": precision_score(label, pred_l),
        "f1_score": f1_score(label, pred_l),
        # Key spelling kept as-is: the result tables select the column "mmc".
        "mmc": matthews_corrcoef(label, pred_l),
        "rocAUC": auc(fpr, tpr),
        "prAUC": pr_auc,
        "specificity": tn / negatives if negatives else float("nan"),
        "sensitivity": tp / positives if positives else float("nan"),
        'pro_cutoff': pro_cutoff
    }

def plot_roc_curve(target, pred, path_to_: str):
    """Draw the ROC curve for ``pred`` against ``target`` and save it to disk.

    Parameters
    ----------
    target
        True binary labels.
    pred
        Scores of the positive class, aligned with ``target``.
    path_to_
        Output file path handed to ``savefig``; the file format follows from
        its extension.

    The figure is created explicitly and always closed afterwards, so
    repeated calls do not accumulate open figures.
    """
    fpr, tpr, _ = roc_curve(target, pred)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots(figsize=(19.2, 10.8))
    try:
        ax.plot(fpr, tpr, color='red', lw=2,
                label='ROC curve (area = %0.2f)' % roc_auc)
        # Diagonal reference line of a chance-level classifier.
        ax.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver operating characteristic (ROC) curve')
        ax.legend(loc="lower right")

        fig.savefig(f"{path_to_}")
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

class MyOptimitzer:
    """Tune one classifier with ``BayesSearchCV`` and summarise hold-out results.

    Typical use is the fluent chain
    ``MyOptimitzer(name, cls, space).find_best(X, y, validation).get_summary(dir)``.
    """

    def __init__(
        self,
        classifier_name: str,
        classifier_class: "type[ClassifierMixin]",
        classifier_param_dict: "dict | list[dict]",
    ) -> None:
        """Store the model description; nothing is fitted yet.

        Parameters
        ----------
        classifier_name
            Label used in the summary and as the stem of the output files.
        classifier_class
            Estimator class; it is instantiated with no arguments in
            ``find_best``.
        classifier_param_dict
            Search space forwarded unchanged to ``BayesSearchCV`` as
            ``search_spaces``.
        """
        self.classifier_name = classifier_name
        self.classifier_class = classifier_class
        self.classifier_param_dict = classifier_param_dict

        # Fitted BayesSearchCV instance; None until find_best() has run.
        self.grid_search = None
        # [predict_proba output on the validation set, validation labels];
        # None until find_best() has run.
        self.best_predicted_pair = None

    def find_best(self, X, y, validation: tuple) -> "MyOptimitzer":
        """Run the hyper-parameter search on ``(X, y)`` and score ``validation``.

        Parameters
        ----------
        X, y
            Training features and labels used for the cross-validated search.
        validation
            ``(features, labels)`` of the held-out set; the refitted best
            model's ``predict_proba`` output on it is stored for
            ``get_summary``.

        Returns
        -------
        MyOptimitzer
            ``self``, to allow method chaining.
        """
        # NOTE(review): no random_state is passed to BayesSearchCV itself, so
        # the sampled candidates may differ between runs - confirm if
        # reproducibility is required.
        self.grid_search = BayesSearchCV(
            self.classifier_class(),
            search_spaces=self.classifier_param_dict,
            cv=RepeatedStratifiedKFold(
                n_splits=5,  # originally 5
                n_repeats=2,
                random_state=42
            ),
            scoring='roc_auc',
            # Worker count: a positive value is the number of workers to use;
            # a negative value counts back from the number of CPUs
            # (-1 = all of them, so -4 leaves three idle).
            n_jobs=-4,
            refit=True
        )
        self.grid_search.fit(X, y)

        validation_features, validation_labels = validation[0], validation[1]
        self.best_predicted_pair = [
            self.grid_search.predict_proba(X=validation_features),
            validation_labels,
        ]
        return self

    def get_summary(self, path_to_dir: str = None) -> pd.Series:
        """Persist the fitted search and return a one-row summary.

        Parameters
        ----------
        path_to_dir
            Directory (created if missing) that receives
            ``<classifier_name>.pkl`` (the pickled search object) and
            ``<classifier_name>.pdf`` (the ROC plot). When ``None`` nothing is
            written to disk and ``Model_Path`` is reported as ``"-"``.

        Returns
        -------
        pandas.Series
            ``Classifier_Name``, ``Optimitied_Param``, ``Model_Path`` followed
            by every metric returned by ``get_evaluation`` for the validation
            set.

        Raises
        ------
        RuntimeError
            If ``find_best`` has not been called yet.
        """
        # Fail early, before anything is written to disk.
        if self.grid_search is None or self.best_predicted_pair is None:
            raise RuntimeError(
                "find_best() must be called before get_summary()"
            )

        probabilities, labels = self.best_predicted_pair
        # Column 1 of predict_proba is used as the positive-class score.
        positive_scores = probabilities[:, 1]

        model_path = "-"
        if path_to_dir is not None:
            os.makedirs(path_to_dir, exist_ok=True)
            model_path = f"{path_to_dir}/{self.classifier_name}.pkl"
            with open(model_path, "wb") as model_file:
                pickle.dump(self.grid_search, model_file)

            plot_roc_curve(
                target=labels,
                pred=positive_scores,
                path_to_=f"{path_to_dir}/{self.classifier_name}.pdf"
            )

        return pd.Series({
            "Classifier_Name": self.classifier_name,
            # Plain dict so the value prints/serialises uniformly.
            "Optimitied_Param": dict(self.grid_search.best_params_),
            "Model_Path": model_path
        } | get_evaluation(
            label=labels,
            pred=positive_scores,
        ))
# Search space of the only model that is currently enabled.  It is handed to
# BayesSearchCV (through MyOptimitzer) as one entry of ``search_spaces``.
# NOTE(review): the recorded run reports tuned values such as C=0.987, so the
# two-number list [0.1, 1.0] appears to be treated as a continuous range
# rather than two discrete candidates - confirm against the skopt docs.
_LOGISTIC_REGRESSION_SPACE = {
    "penalty": ['l2'],
    "C": [0.1, 1.0],
    "solver": ['lbfgs'],
    'class_weight': ['balanced'],
    'max_iter': [1000],
}

# One dict per model to tune: display name, estimator class and search space.
# The "Bayes" flag is not read anywhere in the visible code.
find_space = [
    # Disabled: broader LogisticRegression grid kept for reference.
    # {
    #     "name": "LogisticRegression",
    #     "class": LogisticRegression,
    #     "param": [{
    #         # "penalty": ['none'],
    #         "solver": ['newton-cg', 'lbfgs', 'sag', 'saga'],
    #         'class_weight': [None, 'balanced'],
    #         'max_iter': [100, 500, 1000]
    #     }, {
    #         "penalty": ['l1', ],
    #         "C": [0.001, 0.01, 0.1, 1, 10, 100],
    #         "solver": ['liblinear', 'saga'],
    #         'class_weight': [None, 'balanced'],
    #         'max_iter': [100, 500, 1000]
    #     }, {
    #         "penalty": ['l2', ],
    #         "C": [0.001, 0.01, 0.1, 1, 10, 100],
    #         "solver": ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    #         'class_weight': [None, 'balanced'],
    #         'max_iter': [100, 500, 1000]
    #     }, {
    #         "penalty": ['elasticnet', ],
    #         'l1_ratio': [0.1 * i for i in range(0, 11, 1)],
    #         "C": [0.001, 0.01, 0.1, 1, 10, 100],
    #         "solver": ['saga'],
    #         'class_weight': [None, 'balanced'],
    #         'max_iter': [100, 500, 1000]
    #     }, ]
    # },
    {
        "name": "LogisticRegression",
        "class": LogisticRegression,
        "param": [_LOGISTIC_REGRESSION_SPACE],
        "Bayes": False,
    },
]
# Name of the feature subset; selects the new_LR/<a>/ input and output folder.
a = 'all'

f = pd.read_csv(f'new_LR/{a}/feature.csv')
# Take every column except the label as a feature.  Selecting by name (rather
# than "all but the last column") keeps the label out of the feature matrix
# even if it is not stored as the final column.
fea = f.drop(columns='label')
feature_ = fea.astype("float").values
la = f['label']
target_ = la.values

model_path_to_save = f'new_LR/{a}'
os.makedirs(model_path_to_save, exist_ok=True)

# Outer 5-fold split used to estimate generalisation; the hyper-parameter
# search inside MyOptimitzer.find_best only ever sees the training part.
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

result_list = []
for model_spec in tqdm(find_space):
    fold_summaries = []
    for Kfold_id, (train_id, test_id) in enumerate(outer_cv.split(feature_, target_)):
        optimizer = MyOptimitzer(
            model_spec["name"],
            model_spec["class"],
            model_spec["param"],
        ).find_best(
            X=feature_[train_id],
            y=target_[train_id],
            validation=(feature_[test_id], target_[test_id])
        )
        # Each fold writes its model and ROC plot into its own sub-directory.
        fold_summaries.append(
            optimizer.get_summary(
                path_to_dir=f"{model_path_to_save}/{Kfold_id}"
            )
        )

    # One row per outer fold, one column per summary field.
    fivecross_result = pd.concat(fold_summaries, axis=1).T

    print(fivecross_result)

    fivecross_result.loc[:, ["Classifier_Name", "Optimitied_Param", "Model_Path"]].to_csv(
        f"{model_path_to_save}/{model_spec['name']}_Param.csv"
    )
    fivecross_result_splited = fivecross_result.loc[:, [
        "accuracy", "precision", "f1_score", "mmc", "rocAUC", "specificity", "sensitivity", "pro_cutoff","prAUC"]]
    fivecross_result_splited.to_csv(
        f"{model_path_to_save}/{model_spec['name']}_5Fold.csv"
    )

#     series = fivecross_result_splited.sum(axis=0) / 5
#     series.name = find_space[model_index]["name"]
#     result_list.append(series)

# pd.concat(
#                 result_list, axis=1,
#             ).T.to_csv(
#                 f"{model_path_to_save}/5fold_results.csv",
#                 index=True
#             )


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

      Classifier_Name                                   Optimitied_Param  \
0  LogisticRegression  {'C': 0.9874014708529704, 'class_weight': 'bal...   
1  LogisticRegression  {'C': 0.9999630509169221, 'class_weight': 'bal...   
2  LogisticRegression  {'C': 0.9998849207732251, 'class_weight': 'bal...   
3  LogisticRegression  {'C': 0.7051754124441058, 'class_weight': 'bal...   
4  LogisticRegression  {'C': 0.21542237038187795, 'class_weight': 'ba...   

                            Model_Path  accuracy precision  f1_score  \
0  new_LR/all/0/LogisticRegression.pkl  0.666667  0.571429  0.727273   
1  new_LR/all/1/LogisticRegression.pkl  0.555556  0.428571       0.6   
2  new_LR/all/2/LogisticRegression.pkl  0.888889      0.75  0.857143   
3  new_LR/all/3/LogisticRegression.pkl      0.75       1.0       0.5   
4  new_LR/all/4/LogisticRegression.pkl     0.875      0.75  0.857143   

        mmc    rocAUC     prAUC specificity sensitivity pro_cutoff  
0  0.478091      0.65  0.625595         0




<Figure size 1920x1080 with 0 Axes>

<Figure size 1920x1080 with 0 Axes>

<Figure size 1920x1080 with 0 Axes>

<Figure size 1920x1080 with 0 Axes>

<Figure size 1920x1080 with 0 Axes>

In [None]:
# import pandas as pd
# from sklearn.linear_model import Lasso
# from sklearn.feature_selection import SelectFromModel
# import os
# # 创建示例数据
# f = pd.read_excel('/mnt/md0/Public/T3_T4/0806/clinical.xlsx')

# X = f.iloc[:, 1:16]  

# y = f.iloc[:,19]
# # 创建并拟合 Lasso 回归模型
# lasso = Lasso(alpha=0.1)  # 调整 alpha 参数来控制正则化强度
# lasso.fit(X, y)

# sfm = SelectFromModel(lasso)
# X_selected = sfm.transform(X)

# # 获取选择的特征索引
# selected_feature_indices = sfm.get_support(indices=True)

# selected_features = X.iloc[:, selected_feature_indices]
# selected_features['label'] = y.values  # 将标签数据转换为数组形式
# # 获取选择的特征的系数
# selected_feature_contributions = lasso.coef_[selected_feature_indices]
# os.makedirs('/mnt/md0/Public/T3_T4/new/2', exist_ok=True)
# selected_features.to_csv('/mnt/md0/Public/T3_T4/new/2/feature.csv')
# # 打印选择的特征的贡献度
# print("Selected Feature Contributions:")
# for i, idx in enumerate(selected_feature_indices):
#     print(f"Feature {X.columns[idx]}: {selected_feature_contributions[i]}")

In [None]:
# from sklearn.model_selection import StratifiedKFold
# import os
# from tqdm import tqdm
# import pickle
# import numpy as np
# import pandas as pd
# import os
# import typing
# from datetime import datetime

# from sklearn.base import ClassifierMixin


# from sklearn.model_selection import GridSearchCV, StratifiedKFold,StratifiedShuffleSplit
# from sklearn.model_selection._search import BaseSearchCV
# from skopt import BayesSearchCV

# from sklearn.base import ClassifierMixin
# from sklearn.svm import SVC
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
# from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.neural_network import MLPClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
# from sklearn.calibration import CalibratedClassifierCV
# from skopt.space import Real, Categorical
# from skopt import BayesSearchCV
# from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
# from sklearn.metrics import roc_curve, confusion_matrix, precision_score, accuracy_score, f1_score, matthews_corrcoef, auc,precision_recall_curve
# import matplotlib.pyplot as plt
# from sklearn.metrics import roc_curve, auc
# import warnings
# warnings.filterwarnings("ignore")


# def get_evaluation(label: list, pred: list, pro_cutoff: float = None):
#     fpr, tpr, thresholds = roc_curve(label, pred)
#     if pro_cutoff is None:
#         best_one_optimal_idx = np.argmax(tpr - fpr)
#         pro_cutoff = thresholds[best_one_optimal_idx]
#     pred_l = [1 if i >= pro_cutoff else 0 for i in pred]
#     #后面新增的计算prAUC
#     confusion_matrix_1d = confusion_matrix(label, pred_l).ravel()
#     confusion_dict = {N: n for N, n in zip(['tn', 'fp', 'fn', 'tp'], list(
#         confusion_matrix_1d * 2 / np.sum(confusion_matrix_1d)))}
    
#     precision, recall, _ = precision_recall_curve(label, pred)
#     pr_auc = auc(recall, precision)
    
#     evaluation = {
#         "accuracy": accuracy_score(label, pred_l),
#         "precision": precision_score(label, pred_l),
#         "f1_score": f1_score(label, pred_l),
#         "mmc": matthews_corrcoef(label, pred_l),
#         "rocAUC": auc(fpr, tpr),
#         "prAUC": pr_auc,
#         "specificity": confusion_dict['tn'] / (confusion_dict['tn'] + confusion_dict['fp']),
#         "sensitivity": confusion_dict['tp'] / (confusion_dict['tp'] + confusion_dict['fn']),
#         'pro_cutoff': pro_cutoff
#     }
#     return evaluation

# def plot_roc_curve(target, pred, path_to_: str):
#     fpr, tpr, thresholds = roc_curve(target, pred)
#     roc_auc = auc(fpr, tpr)

#     plt.figure(figsize=(19.2, 10.8))
#     plt.plot(fpr, tpr, color='red', lw=2,
#              label='ROC curve (area = %0.2f)' % roc_auc)
#     plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
#     plt.xlim([0.0, 1.0])
#     plt.ylim([0.0, 1.05])
#     plt.xlabel('False Positive Rate')
#     plt.ylabel('True Positive Rate')
#     plt.title('Receiver operating characteristic (ROC) curve')
#     plt.legend(loc="lower right")

#     plt.savefig(f"{path_to_}")
#     plt.clf()
# class MyOptimitzer:
#     def __init__(self, classifier_name: str, classifier_class: ClassifierMixin, classifier_param_dict: dict) -> None:
#         self.classifier_name = classifier_name
#         self.classifier_class = classifier_class
#         self.classifier_param_dict = classifier_param_dict

#         self.grid_search: BaseSearchCV = None
#         self.train_best_predicted_pair = None
#         self.train_best_5C_predicted_pair = None
#         self.best_predicted_pair = None
#         self.best_5C_predicted_pair = None
#         self.start_to_train_time = datetime.now()
#         self.end_of_train_time = None
#         pass

#     def find_best(
#         self,
#         X: np.ndarray,
#         y: np.ndarray,
#         validation: tuple,
#         search_method: typing.Literal["GridSearchCV", "BayesSearchCV"],
#         n_jobs: int = 20
#     ):

        

#         if search_method == "GridSearchCV":
#             self.grid_search = GridSearchCV(
#                 self.classifier_class(),
#                 param_grid=self.classifier_param_dict,
#                 cv=StratifiedKFold(
#                     n_splits=5,
#                     shuffle=True,
#                     random_state=42
#                 ),
#                 scoring='roc_auc',
#                 n_jobs=n_jobs,
#                 refit=True
#             )
#         elif search_method == "BayesSearchCV":
#             self.grid_search = BayesSearchCV(
#                 self.classifier_class(),
#                 search_spaces=self.classifier_param_dict,
#                 cv=StratifiedKFold(
#                     n_splits=5,
#                     shuffle=True,
#                     random_state=42
#                 ),
#                 scoring='roc_auc',
#                 n_jobs=n_jobs,
#                 n_points=n_jobs,
#                 n_iter=5,
#                 refit=True
#             )
        
#         y_origin = y
#         full_X = np.concatenate([
#             X, validation[0]
#         ])
#         full_y = np.concatenate([
#             y_origin, validation[1]
#         ])

#         self.grid_search.fit(full_X, full_y)
#         self.best_predicted_pair = [
#             np.nan_to_num(self.grid_search.predict_proba(
#                 X=validation[0]
#             ), nan=0.0),
#             validation[1]
#         ]
#         self.train_best_predicted_pair = [
#             np.nan_to_num(self.grid_search.predict_proba(
#                 X=X
#             ), nan=0.0),
#             y
#         ]

#         # 5倍交叉验证
        
#         # 跑模型
#         self.best_5C_predicted_pair = []
#         self.train_best_5C_predicted_pair = []
#         for Kfold_id, (train_id, test_id) in enumerate(
#             StratifiedKFold(
#                 n_splits=5,
#                 shuffle=True,
#                 random_state=42
#             ).split(full_X, full_y)
#         ):
            

#             # 定义模型并加载参数
#             fiveC_model = self.classifier_class(
#                 **self.grid_search.best_params_,
#             )
#             y_to_train = full_y[train_id].copy()
#             if self.classifier_name == "LabelPropagation":
#                 y_to_train[
#                     np.random.choice(
#                         a=np.arange(y_to_train.shape[0]),
#                         size=max(int(y_to_train.shape[0] * 0.25), 1)
#                     )
#                 ] = -1

            
#             fiveC_model.fit(
#                 full_X[train_id],
#                 y_to_train
#             )

#             # 预测并记录
#             self.best_5C_predicted_pair.append([
#                 np.nan_to_num(fiveC_model.predict_proba(
#                     X=full_X[test_id]
#                 ), nan=0.0),
#                 full_y[test_id]
#             ])
#             self.train_best_5C_predicted_pair.append([
#                 np.nan_to_num(fiveC_model.predict_proba(
#                     X=full_X[train_id]
#                 ), nan=0.0),
#                 y_to_train
#             ])

#         return self

#     def get_summary(self, path_to_dir: str = None):
#         os.makedirs(path_to_dir, exist_ok=True)
#         model_path = "-"
        

#         model_path = f"{path_to_dir}/{self.classifier_name}.pkl"
#         if path_to_dir is not None:
#             with open(model_path, "bw+") as f:
#                 pickle.dump(
#                     self.grid_search, f
#                 )
            
#         training_testing_performance = get_evaluation(
#             label=self.best_predicted_pair[1],
#             pred=self.best_predicted_pair[0][:, 1],
#         )

#         # 计算5C中的平均表现
#         FiveFold_result = {}
#         for keys in training_testing_performance.keys():
#             value_list = []
#             for item in self.best_5C_predicted_pair:

#                 item_performance = get_evaluation(
#                     label=item[1],
#                     pred=item[0][:, 1],
#                 )
#                 value_list.append(item_performance[keys])

#             if keys == "pro_cutoff":
#                 FiveFold_result[keys] = value_list
#             else:
#                 FiveFold_result[keys] = sum(value_list) / len(value_list)

#         self.end_of_train_time = datetime.now()

#         return pd.Series({
#                         "Classifier_Name": self.classifier_name,
#                         "Optimitied_Param": dict(self.grid_search.best_params_),
#                         "Model_Path": model_path
#                     } | FiveFold_result
#                         )
# find_space = [ { "name": "LogisticRegression", 
#                 "class": LogisticRegression,
#                 "param": [{
#                 "penalty": ['l2', ],
#                 "C": [0.1, 1.0, ],
#                 "solver": ['lbfgs',],
#                 'class_weight': ['balanced',],
#                 'max_iter': [1000, ]
#                 }, ],
#                 "Bayes": False
#                 },
#             ]
# a = 'all'
# f = pd.read_csv(f'new/{a}/feature.csv')
# feature = f.iloc[0:,0:-1]
# feature_ = feature.astype("float").values
# target = f.loc[0:,'label']
# target_ = target.values
# model_path_to_save = f'new/{a}'
# os.makedirs(model_path_to_save, exist_ok=True)
# result_list = []
# for model_index in tqdm(range(len(find_space))):
#     fivecross_result = pd.concat([
#         MyOptimitzer(
#             find_space[model_index]["name"],
#             find_space[model_index]["class"],
#             find_space[model_index]["param"],
#         ).find_best(
#             X=feature_[train_id],
#             y=target_[train_id],
#             search_method = "BayesSearchCV",
#             validation=(feature_[test_id], target_[test_id])
#         ).get_summary(
#             path_to_dir=f"{model_path_to_save}/{Kfold_id}"
#         )
#         for Kfold_id, (train_id, test_id) in enumerate(StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(feature_, target_))
#     ], axis=1).T

#     print(fivecross_result)

#     fivecross_result.loc[:, ["Classifier_Name", "Optimitied_Param", "Model_Path"]].to_csv(
#         f"{model_path_to_save}/{find_space[model_index]['name']}_Param.csv"
#     )
#     fivecross_result_splited = fivecross_result.loc[:, [
#         "accuracy", "precision", "f1_score", "mmc", "rocAUC", "specificity", "sensitivity", "pro_cutoff","prAUC"]]
#     fivecross_result_splited.to_csv(
#         f"{model_path_to_save}/{find_space[model_index]['name']}_5Fold.csv"
#     )

#     series = fivecross_result_splited
#     series.name = find_space[model_index]["name"]
#     result_list.append(series)

