In [None]:
%matplotlib inline
import numpy as np
#import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#import missingno as msno

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ["OMP_NUM_THREADS"] = '9' 

#---------------------------------------------------
import sklearn
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

#---------------------------
from sklearn.model_selection import GridSearchCV

In [None]:

def get_model(name):
    """Return an unfitted ``GridSearchCV`` for the classifier selected by ``name``.

    Supported keys are "SVC", "DT", "LR", "NB", "KNN", "RF" and "XB". Every
    search shares the same settings: 3-fold cross-validation, accuracy
    scoring, ``refit=True`` and ``n_jobs=-1``.

    Args:
        name: short model key (see the list above).

    Returns:
        The configured, not-yet-fitted ``GridSearchCV`` instance.

    Raises:
        ValueError: if ``name`` is not a supported key. (Previously an unknown
            key fell through every branch and failed with an
            ``UnboundLocalError`` on the ``return`` line.)
    """
    if name == "SVC":
        estimator = svm.SVC()
        param_grid = {
            "kernel": ['linear', "poly"],
            "C": [0.01, 0.1, 1],
        }
    elif name == "DT":
        estimator = tree.DecisionTreeClassifier()
        param_grid = {
            "criterion": ['entropy', "gini"],
            "max_depth": [2, 10, 50, 100, 200, 500],
            # An integer min_samples_split must be >= 2; the former value 1 is
            # rejected by scikit-learn, so every candidate using it failed to fit.
            "min_samples_split": [2, 5, 10, 20],
            "min_samples_leaf": [1, 5, 10, 20],
            "min_impurity_decrease": [0.001, 0.01],
        }
    elif name == "LR":
        estimator = linear_model.LogisticRegression()
        # NOTE(review): not every penalty/solver pair below is supported by
        # LogisticRegression (e.g. "l1" with "lbfgs"); those candidates
        # presumably fail to fit and are scored with GridSearchCV's
        # error_score -- confirm and prune the grid if that is unwanted.
        param_grid = {
            "penalty": [None, 'l2', "l1"],
            "C": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "solver": ["newton-cg", "lbfgs", "liblinear", "sag"],
            "class_weight": ["balanced", None],
        }
    elif name == "NB":
        estimator = GaussianNB()
        # Nothing to tune: the search only cross-validates the default model.
        param_grid = {}
    elif name == "KNN":
        estimator = KNeighborsClassifier()
        param_grid = {
            "n_neighbors": [3, 5, 10, 12],
        }
    elif name == "RF":
        estimator = RandomForestClassifier()
        param_grid = {
            "n_estimators": [10, 15, 20, 25, 30],
            "criterion": ["entropy", "gini"],
            "max_depth": [8, 12, 16, 20],
            "min_samples_split": [2, 3, 4, 5],
            "min_samples_leaf": [1, 2, 3, 4, 5],
        }
    elif name == "XB":
        estimator = XGBClassifier()
        param_grid = {
            "n_estimators": [5, 10, 15, 20],
            "max_depth": [6, 12, 18, 24, 30],
            "min_child_weight": [0.5, 0.6, 0.7],
            "scale_pos_weight": [2.4, 2.5],
        }
    else:
        raise ValueError(
            "Unknown model name %r; expected one of "
            "'SVC', 'DT', 'LR', 'NB', 'KNN', 'RF', 'XB'" % (name,)
        )

    return GridSearchCV(
        estimator,
        param_grid,
        refit=True,
        cv=3,
        scoring="accuracy",
        n_jobs=-1,
    )

In [None]:

def get_train_test(df,fold,seed):
    """Shuffle ``df`` and split it into ``fold`` cross-validation train/test pairs.

    The rows are shuffled with ``random_state=seed`` and cut into ``fold``
    contiguous chunks. Pair ``k`` holds out chunk ``fold - 1 - k`` as its test
    set and concatenates the remaining chunks, in order, as its training set,
    so ``test == [chunks[-1], ..., chunks[0]]``.

    When the row count is not a multiple of ``fold`` the first
    ``nums % fold`` chunks receive one extra row, so every row appears in
    exactly one test set. (Previously the remainder rows ended up in an
    extra chunk that was never used, and only ``fold`` values 3 and 5 were
    handled.)

    Args:
        df: DataFrame to split.
        fold: number of folds, an integer >= 2.
        seed: random state passed to ``DataFrame.sample`` for the shuffle.

    Returns:
        ``(train, test)``: two lists of ``fold`` DataFrames each, where
        ``train[k]`` and ``test[k]`` form one train/test pair.

    Raises:
        ValueError: if ``fold`` is smaller than 2 or exceeds the row count.
    """
    nums = df.shape[0]
    if fold < 2:
        raise ValueError("fold must be at least 2, got %r" % (fold,))
    if nums < fold:
        raise ValueError(
            "cannot split %d rows into %d folds" % (nums, fold)
        )

    chunk_size = nums // fold
    print("样本总数：",nums)
    print("划分后每折样本数：",chunk_size)

    df_shuffled = df.sample(frac=1, random_state=seed)

    # Cut the shuffled frame into `fold` contiguous chunks, spreading the
    # remainder over the leading chunks so that no row is dropped.
    extra = nums % fold
    chunks = []
    start = 0
    for k in range(fold):
        stop = start + chunk_size + (1 if k < extra else 0)
        chunks.append(df_shuffled.iloc[start:stop])
        start = stop

    # Hold out the last chunk first, then walk backwards.
    train = []
    test = []
    for held_out in range(fold - 1, -1, -1):
        test.append(chunks[held_out])
        train.append(
            pd.concat([chunk for k, chunk in enumerate(chunks) if k != held_out])
        )
    return train,test

def get_data(data,name):
    """Split a DataFrame into a feature matrix and a target column.

    Args:
        data: DataFrame holding the feature columns and the target column.
        name: label of the target column.

    Returns:
        ``(X, y)`` where ``X`` is the ``.values`` array of every column except
        ``name`` (original column order kept) and ``y`` is the ``name`` column
        of ``data``.

    Raises:
        ValueError: if ``name`` is not among the columns.
    """
    feature_cols = data.columns.tolist()
    # list.remove drops the target label and raises if it is absent.
    feature_cols.remove(name)
    target = data[name]
    matrix = data[feature_cols].values
    return matrix, target




def train_model(data_num,target_name,**dic):
    """Fit a grid-searched model on one CV fold and return its held-out data.

    Reads the module-level ``train`` and ``test`` lists (as produced by
    ``get_train_test``), so those must be assigned before this is called.

    Args:
        data_num: index of the fold to use in ``train`` / ``test``.
        target_name: label of the target column in the fold DataFrames.
        **dic: must contain ``name``, the model key passed to ``get_model``.

    Returns:
        ``(clf, X, y)``: the fitted search object plus the feature matrix and
        target of ``test[data_num]``, ready for evaluation.
    """
    clf = get_model(name=dic["name"])
    # Fit on the training split of this fold. The original called
    # clf.fit(X, y) before X and y were assigned, which raised
    # UnboundLocalError and never touched the training data.
    X_train, y_train = get_data(train[data_num], target_name)
    clf.fit(X_train, y_train)
    X, y = get_data(test[data_num], target_name)
    return clf, X, y


def train_model_(df1,df2,target_name,**dic):
    """Fit a grid-searched model on ``df1`` and return ``df2`` for evaluation.

    Args:
        df1: DataFrame the model is fitted on.
        df2: DataFrame returned, split into features and target, for scoring.
        target_name: label of the target column in both DataFrames.
        **dic: must contain ``name``, the model key passed to ``get_model``.

    Returns:
        ``(clf, X, y)``: the fitted search object plus the feature matrix and
        target extracted from ``df2``.
    """
    clf = get_model(name=dic["name"])
    # Fit on df1. The original called clf.fit(X, y) before X and y were
    # assigned, which raised UnboundLocalError and left df1 unused.
    X_train, y_train = get_data(df1, target_name)
    clf.fit(X_train, y_train)
    X, y = get_data(df2, target_name)
    return clf, X, y

In [None]:
# Selects which train/test workbook pair is loaded (see DATASET_PATHS).
flag = 3

# flag -> (training workbook, test workbook)
DATASET_PATHS = {
    1: ("./lasso/1_train_liux.xlsx", "./lasso/2_test_liux.xlsx"),
    2: ("./lasso/3_train_subx.xlsx", "./lasso/4_test_subx.xlsx"),
    3: ("./lasso/5_train_liux_subx.xlsx", "./lasso/6_test_liux_subx.xlsx"),
}

# Fail loudly on an unknown flag instead of silently leaving df/df1/df2
# undefined (or stale from an earlier run of this cell).
if flag not in DATASET_PATHS:
    raise ValueError(
        "Unknown flag %r; expected one of %s" % (flag, sorted(DATASET_PATHS))
    )

train_path, test_path = DATASET_PATHS[flag]

# df is the frame split into CV folds; df1 is an independent copy of the same
# training workbook used for the train-on-all / external-test runs. The
# workbook is read once and copied rather than parsed twice.
df = pd.read_excel(train_path)
df1 = df.copy()
df2 = pd.read_excel(test_path)

# Kept for backward compatibility: the original left `fpath` pointing at the
# last workbook it read, i.e. the test workbook.
fpath = test_path

In [None]:
# Create the figure/axes up front: the first model loop of the following cell
# passes `ax` to RocCurveDisplay.from_estimator without creating a figure of
# its own, so `ax` must already exist when that cell runs.
fig, ax = plt.subplots()
import warnings
# `auc` is used by the later cells to integrate the averaged ROC curve.
from sklearn.metrics import roc_curve, auc

In [None]:
warnings.filterwarnings("ignore")


def _mean_roc_auc(displays, fpr_grid):
    """Return the AUC of the mean ROC curve of ``displays`` on ``fpr_grid``.

    Each display's TPR is interpolated onto ``fpr_grid``; the curves are
    averaged point-wise and pinned to (0, 0) and (1, 1) before integrating.
    """
    tprs = []
    for viz in displays:
        interp_tpr = np.interp(fpr_grid, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    return auc(fpr_grid, mean_tpr)


n_folds = 3
n_external_runs = 3  # repeated fits on the same df1/df2 pair
target_name = "随访>=3"
model_names = ["RF", "XB"]  # also available: "SVC", "LR", "KNN", "NB", "DT"
mean_fpr = np.linspace(0, 1, 100)

# One entry per seed; each entry lists the mean AUC per model in model_names.
tp1 = []  # cross-validation on df
tp2 = []  # fit on df1, evaluate on df2
for s in range(0,50):
    print("----------------------------seed:",s)
    train,test = get_train_test(df=df,fold=n_folds,seed=s)

    # ---- cross-validated AUC ------------------------------------------------
    auc_record = []
    for model_name in model_names:
        displays = []
        for fold_idx in range(n_folds):
            clf0,X0,y0 = train_model(data_num=fold_idx,target_name=target_name,name=model_name)
            # Drawn onto the pre-existing `ax` from the earlier cell; only the
            # fpr/tpr arrays of the display are used for the AUC below.
            displays.append(
                RocCurveDisplay.from_estimator(clf0, X0, y0, name='clf'+str(fold_idx),
                                               ax=ax, alpha=0.8, linewidth=0.6)
            )
        mean_auc = _mean_roc_auc(displays, mean_fpr)
        auc_record.append(mean_auc)
        print(model_name,round(mean_auc,2))

    # ---- external-test AUC --------------------------------------------------
    auc_record_ = []
    print("##############")
    for model_name in model_names:
        # Separate names for this figure: rebinding the shared `ax` here made
        # the next seed's cross-validation curves land on this model's plot.
        ext_fig, ext_ax = plt.subplots()
        ext_ax.plot([0,1],[0,1],linestyle='--',color='r',linewidth=0.8)  # chance diagonal

        displays = []
        for run_idx in range(n_external_runs):
            clf0,X0,y0 = train_model_(df1=df1,df2=df2,target_name=target_name,name=model_name)
            displays.append(
                RocCurveDisplay.from_estimator(clf0, X0, y0, name='clf'+str(run_idx),
                                               ax=ext_ax, alpha=0.8, linewidth=0.6)
            )
        mean_auc = _mean_roc_auc(displays, mean_fpr)
        auc_record_.append(mean_auc)
        print(model_name,round(mean_auc,2))

        # Render and release the figure right away; otherwise one figure per
        # model per seed stays open until the whole sweep has finished.
        plt.show()
        plt.close(ext_fig)

    tp1.append(auc_record)
    tp2.append(auc_record_)
    # The blocking input() pause that used to follow every seed was removed so
    # the sweep can run unattended (e.g. under Restart & Run All).

In [None]:
####################for img

def get_model(name):
    """Return an unfitted ``GridSearchCV`` for the classifier selected by ``name``.

    This is the search-space variant the notebook labels "for img"; defining
    it replaces any ``get_model`` defined earlier in the kernel.

    Supported keys are "SVC", "DT", "LR", "NB", "KNN", "RF" and "XB". Every
    search shares the same settings: 3-fold cross-validation, accuracy
    scoring, ``refit=True`` and ``n_jobs=-1``.

    Args:
        name: short model key (see the list above).

    Returns:
        The configured, not-yet-fitted ``GridSearchCV`` instance.

    Raises:
        ValueError: if ``name`` is not a supported key. (Previously an unknown
            key fell through every branch and failed with an
            ``UnboundLocalError`` on the ``return`` line.)
    """
    if name == "SVC":
        estimator = svm.SVC()
        param_grid = {
            "kernel": ['linear', "poly"],
            "C": [0.01, 0.1, 1],
        }
    elif name == "DT":
        estimator = tree.DecisionTreeClassifier()
        param_grid = {
            "criterion": ['entropy', "gini"],
            "max_depth": [2, 5, 10, 20, 30, 40],
            # An integer min_samples_split must be >= 2; the former value 1 is
            # rejected by scikit-learn, so every candidate using it failed to fit.
            "min_samples_split": [2, 3, 5, 8, 10, 20],
            "min_samples_leaf": [1, 3, 5, 8, 10, 20],
        }
    elif name == "LR":
        estimator = linear_model.LogisticRegression()
        # NOTE(review): not every penalty/solver pair below is supported by
        # LogisticRegression (e.g. "l1" with "lbfgs"); those candidates
        # presumably fail to fit and are scored with GridSearchCV's
        # error_score -- confirm and prune the grid if that is unwanted.
        param_grid = {
            "penalty": [None, 'l2', "l1"],
            "C": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "solver": ["newton-cg", "lbfgs", "liblinear", "sag"],
            "class_weight": ["balanced", None],
        }
    elif name == "NB":
        estimator = GaussianNB()
        # Nothing to tune: the search only cross-validates the default model.
        param_grid = {}
    elif name == "KNN":
        estimator = KNeighborsClassifier()
        param_grid = {
            "n_neighbors": [2, 4, 6, 8, 10, 15, 20, 25],
        }
    elif name == "RF":
        estimator = RandomForestClassifier()
        param_grid = {
            "n_estimators": [4, 8, 12, 16, 32, 64],
            "criterion": ["entropy", "gini"],
            "max_depth": [2, 4, 6, 8],
            "min_samples_split": [2, 3, 4, 5],
            "min_samples_leaf": [1, 2, 3, 4, 5],
        }
    elif name == "XB":
        estimator = XGBClassifier()
        param_grid = {
            "n_estimators": [1, 2, 5, 8, 15, 20, 50, 80],
            "max_depth": [1, 2, 3, 4, 5, 6],
            "min_child_weight": [0.2, 0.4, 0.6, 0.8],
        }
    else:
        raise ValueError(
            "Unknown model name %r; expected one of "
            "'SVC', 'DT', 'LR', 'NB', 'KNN', 'RF', 'XB'" % (name,)
        )

    return GridSearchCV(
        estimator,
        param_grid,
        refit=True,
        cv=3,
        scoring="accuracy",
        n_jobs=-1,
    )

In [None]:
###########for sub

def get_model(name):
    """Return an unfitted ``GridSearchCV`` for the classifier selected by ``name``.

    This is the search-space variant the notebook labels "for sub"; defining
    it replaces any ``get_model`` defined earlier in the kernel.

    Supported keys are "SVC", "LR", "DT", "NB", "KNN", "RF" and "XB". Every
    search shares the same settings: 3-fold cross-validation, accuracy
    scoring, ``refit=True`` and ``n_jobs=-1``.

    Args:
        name: short model key (see the list above).

    Returns:
        The configured, not-yet-fitted ``GridSearchCV`` instance.

    Raises:
        ValueError: if ``name`` is not a supported key. (Previously an unknown
            key fell through every branch and failed with an
            ``UnboundLocalError`` on the ``return`` line.)
    """
    if name == "SVC":
        estimator = svm.SVC()
        param_grid = {
            "kernel": ['linear'],
            "C": [5, 10],
        }
    elif name == "LR":
        estimator = linear_model.LogisticRegression()
        param_grid = {
            "C": [10, 20, 30],
        }
    elif name == "DT":
        estimator = tree.DecisionTreeClassifier()
        param_grid = {
            "criterion": ['entropy', "gini"],
            # An integer min_samples_split must be >= 2; the former value 1 is
            # rejected by scikit-learn, so every candidate using it failed to fit.
            "min_samples_split": [2, 5, 10, 20],
            "min_impurity_decrease": [0.001, 0.01],
        }
    elif name == "NB":
        estimator = GaussianNB()
        # Nothing to tune: the search only cross-validates the default model.
        param_grid = {}
    elif name == "KNN":
        estimator = KNeighborsClassifier()
        param_grid = {
            "n_neighbors": [3, 5, 10, 12],
        }
    elif name == "RF":
        estimator = RandomForestClassifier()
        param_grid = {
            "n_estimators": [10, 15, 20, 25, 30],
            "criterion": ["entropy", "gini"],
            "max_depth": [8, 12, 16, 20],
        }
    elif name == "XB":
        estimator = XGBClassifier()
        param_grid = {
            "n_estimators": [5, 20],
            "max_depth": [6, 12, 18, 24, 30],
        }
    else:
        raise ValueError(
            "Unknown model name %r; expected one of "
            "'SVC', 'LR', 'DT', 'NB', 'KNN', 'RF', 'XB'" % (name,)
        )

    return GridSearchCV(
        estimator,
        param_grid,
        refit=True,
        cv=3,
        scoring="accuracy",
        n_jobs=-1,
    )

In [None]:
import warnings
from sklearn.metrics import roc_curve, auc
warnings.filterwarnings("ignore")

# Per-seed AUC records: tp1 collects the cross-validation results, tp2 the
# results of fitting on df1 and evaluating on df2.
tp1, tp2 = [], []
for s in [1]:
    print("----------------------------seed:",s)
    train,test = get_train_test(df=df,fold=3,seed=s)

    # ---- Part 1: 3-fold cross-validation ROC, one figure per model ---------
    auc_record = []
    for model_name in ["XB"]:  # full list: "SVC", "LR", "KNN", "NB", "DT", "RF", "XB"
        name = model_name
        fold = 3
        target_name = "随访>=3"

        fig, ax = plt.subplots()
        ax.plot([0,1],[0,1],linestyle='--',color='r',linewidth=0.8)  # chance diagonal

        mean_fpr = np.linspace(0,1, 100)
        tprs, aucs = [], []
        for k in range(3):
            clf0,X0,y0 = train_model(data_num=k,target_name=target_name,name=model_name)
            viz = RocCurveDisplay.from_estimator(clf0, X0, y0, name='clf'+str(k), ax=ax, alpha=0.8, linewidth=0.6)

            # Resample this fold's curve onto the shared FPR grid, pinned at the origin.
            fold_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
            fold_tpr[0] = 0.0
            tprs.append(fold_tpr)
            aucs.append(viz.roc_auc)

        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)

        auc_record.append(mean_auc)
        print(name,round(mean_auc,2))

        # Mean curve plus a band of +/- 1.96 standard deviations of the fold TPRs.
        std_auc = np.std(aucs)
        mean_label = r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc)
        ax.plot(mean_fpr, mean_tpr, color='b', label=mean_label, lw=2, alpha=.8)

        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + 1.96*std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - 1.96*std_tpr, 0)
        ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                        label="95% CI")

        ax.legend(loc="lower right",fontsize=8)

        plt.title('ROC Curve Comparison')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.show()

    # ---- Part 2: fit on df1, ROC on df2, one figure per model --------------
    auc_record_ = []
    print("##############")
    for model_name in ["XB"]:  # full list: "SVC", "LR", "KNN", "NB", "DT", "RF", "XB"
        name = model_name
        fold = 3
        target_name = "随访>=3"

        fig, ax = plt.subplots()
        ax.plot([0,1],[0,1],linestyle='--',color='r',linewidth=0.8)  # chance diagonal

        mean_fpr = np.linspace(0, 1, 100)
        tprs, aucs = [], []
        # A single run only, so no mean curve, band or custom legend is drawn here.
        for k in range(1):
            clf0,X0,y0 = train_model_(df1=df1,df2=df2,target_name=target_name,name=model_name)
            viz = RocCurveDisplay.from_estimator(clf0, X0, y0, name='clf'+str(k), ax=ax, alpha=1, linewidth=1, color='b')

            run_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
            run_tpr[0] = 0.0
            tprs.append(run_tpr)
            aucs.append(viz.roc_auc)

        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)

        auc_record_.append(mean_auc)
        print(name,round(mean_auc,2))

        plt.title('ROC Curve Comparison')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.show()

    tp1.append(auc_record)
    tp2.append(auc_record_)

In [None]:
import warnings
from sklearn.metrics import roc_curve, auc
warnings.filterwarnings("ignore")

# Per-seed AUC records; in this cell only tp2 (fit on df1, evaluate on df2)
# receives values, tp1 gets an empty list per seed.
tp1, tp2 = [], []
for s in [1]:
    print("----------------------------seed:",s)
    train,test = get_train_test(df=df,fold=3,seed=s)

    auc_record = []  # stays empty: no cross-validation is run in this cell

    # One shared figure so that every model's ROC curve is overlaid on it.
    fig, ax = plt.subplots()
    ax.plot([0,1],[0,1],linestyle='--',color='r',linewidth=0.8)  # chance diagonal

    auc_record_ = []
    print("##############")
    for model_name in ["SVC", "LR", "KNN", "NB", "DT", "RF", "XB"]:
        name = model_name
        fold = 3
        target_name = "随访>=3"

        mean_fpr = np.linspace(0, 1, 100)
        tprs, aucs = [], []
        for j in range(1):
            clf0,X0,y0 = train_model_(df1=df1,df2=df2,target_name=target_name,name=model_name)
            # The curve is labelled with the model key itself.
            viz = RocCurveDisplay.from_estimator(clf0, X0, y0, name=str(model_name), ax=ax, alpha=1, linewidth=0.6)

            run_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
            run_tpr[0] = 0.0
            tprs.append(run_tpr)
            aucs.append(viz.roc_auc)

        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)

        auc_record_.append(mean_auc)
        print(name,round(mean_auc,2))

    plt.show()

    tp1.append(auc_record)
    tp2.append(auc_record_)