In [None]:
%matplotlib inline
import numpy as np
#import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#import missingno as msno

import os
import warnings
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ["OMP_NUM_THREADS"] = '9' 

#---------------------------------------------------
import sklearn
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier

#---------------------------
from sklearn.model_selection import GridSearchCV

In [None]:

def get_model(name):
    """Build an unfitted ``GridSearchCV`` wrapper for the requested classifier.

    Parameters
    ----------
    name : str
        Model key. Supported values: ``"SVC"``, ``"DT"``, ``"LR"``, ``"NB"``,
        ``"KNN"``, ``"RF"``, ``"XB"``.

    Returns
    -------
    GridSearchCV
        Search object configured with ``refit=True``, ``cv=3``,
        ``scoring="accuracy"`` and ``n_jobs=-1`` over the model's grid.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported keys (previously this surfaced
        as an ``UnboundLocalError`` on the ``return`` line).
    """
    if name == "SVC":
        estimator = svm.SVC()
        param_grid = {
            "kernel": ['linear', "poly"],
            "C": [0.01, 0.1, 1],
        }
    elif name == "DT":
        estimator = tree.DecisionTreeClassifier()
        param_grid = {
            "criterion": ['entropy', "gini"],
            "max_depth": [2, 10, 50, 100, 200, 500],
            # An integer min_samples_split must be >= 2; the former value 1 is
            # rejected by scikit-learn, so every candidate using it failed.
            "min_samples_split": [2, 5, 10, 20],
            "min_samples_leaf": [1, 5, 10, 20],
            "min_impurity_decrease": [0.001, 0.01],
        }
    elif name == "LR":
        estimator = linear_model.LogisticRegression()
        # NOTE(review): not every penalty/solver pair in this grid is supported
        # by LogisticRegression (e.g. "l1" with "lbfgs"); such candidates are
        # expected to fail and be scored NaN by GridSearchCV -- confirm this is
        # intended rather than pruning the grid.
        param_grid = {
            "penalty": [None, 'l2', "l1"],
            "C": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "solver": ["newton-cg", "lbfgs", "liblinear", "sag"],
            "class_weight": ["balanced", None],
        }
    elif name == "NB":
        estimator = GaussianNB()
        # Empty grid: a single candidate with default hyper-parameters.
        param_grid = {}
    elif name == "KNN":
        estimator = KNeighborsClassifier()
        param_grid = {
            "n_neighbors": [3, 5, 10, 12],
        }
    elif name == "RF":
        estimator = RandomForestClassifier()
        param_grid = {
            "n_estimators": [10, 15, 20, 25, 30],
            "criterion": ["entropy", "gini"],
            "max_depth": [8, 12, 16, 20],
            "min_samples_split": [2, 3, 4, 5],
            "min_samples_leaf": [1, 2, 3, 4, 5],
        }
    elif name == "XB":
        estimator = XGBClassifier()
        param_grid = {
            "n_estimators": [5, 10, 15, 20],
            "max_depth": [6, 12, 18, 24, 30],
            "min_child_weight": [0.5, 0.6, 0.7],
            "scale_pos_weight": [2.4, 2.5],
        }
    else:
        raise ValueError(
            "Unknown model name %r; expected one of "
            "'SVC', 'DT', 'LR', 'NB', 'KNN', 'RF', 'XB'" % (name,)
        )

    # The search settings are shared by every model, so they live in one place.
    return GridSearchCV(
        estimator,
        param_grid,
        refit=True,
        cv=3,
        scoring="accuracy",
        n_jobs=-1,
    )

In [None]:

def get_train_test(df,fold,seed):
    """Shuffle ``df`` and split it into ``fold`` train/test pairs.

    The rows are shuffled with ``random_state=seed`` and cut into ``fold``
    consecutive chunks. Pair ``i`` holds out chunk ``fold - 1 - i`` as the test
    set and concatenates the remaining chunks (in ascending chunk order) as
    the training set, which matches the ordering of the former hard-coded
    3-fold and 5-fold layouts.

    When the row count is not divisible by ``fold`` the first
    ``nums % fold`` chunks receive one extra row, so every row is used exactly
    once as test data (previously the remainder rows were silently dropped).

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set to split.
    fold : int
        Number of folds; must satisfy ``2 <= fold <= len(df)``.
    seed : int
        Random state passed to ``DataFrame.sample`` for the shuffle.

    Returns
    -------
    (list[pandas.DataFrame], list[pandas.DataFrame])
        ``train`` and ``test`` lists, each of length ``fold``.

    Raises
    ------
    ValueError
        If ``fold`` is smaller than 2 or larger than the number of rows.
    """
    nums = df.shape[0]
    if fold < 2 or fold > nums:
        raise ValueError(
            "fold must satisfy 2 <= fold <= number of rows "
            "(got fold=%r for %d rows)" % (fold, nums)
        )

    chunk_size = nums // fold
    remainder = nums % fold
    print("样本总数：",nums)
    print("划分后每折样本数：",chunk_size)

    df_shuffled = df.sample(frac=1, random_state=seed)

    # Build exactly `fold` contiguous chunks; leading chunks absorb the remainder.
    chunks = []
    start = 0
    for k in range(fold):
        stop = start + chunk_size + (1 if k < remainder else 0)
        chunks.append(df_shuffled.iloc[start:stop])
        start = stop

    train = []
    test = []
    # Hold out the last chunk first, then walk backwards to the first chunk.
    for held_out in reversed(range(fold)):
        train.append(pd.concat([c for j, c in enumerate(chunks) if j != held_out]))
        test.append(chunks[held_out])
    return train,test

def get_data(data,name):
    """Split a frame into a feature matrix and a target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the target column and the feature columns.
    name : str
        Label of the target column.

    Returns
    -------
    (numpy.ndarray, pandas.Series)
        ``X`` holds the values of every column except ``name`` (original
        column order); ``y`` is the ``name`` column itself.
    """
    feature_cols = data.columns.tolist()
    # list.remove raises ValueError when the target column is absent.
    feature_cols.remove(name)
    target = data[name]
    matrix = data[feature_cols].values
    return matrix, target




def train_model(data_num,target_name,**dic):
    """Fit a grid-searched model on one CV training split and return its test split.

    Reads the module-level ``train`` and ``test`` lists (as produced by
    ``get_train_test``), so those must be assigned before calling.

    Parameters
    ----------
    data_num : int
        Index of the train/test pair to use.
    target_name : str
        Label of the target column.
    **dic
        Must contain ``name``, the model key forwarded to ``get_model``.

    Returns
    -------
    (GridSearchCV, numpy.ndarray, pandas.Series)
        The fitted search object plus the held-out features and targets.
    """
    clf = get_model(name=dic["name"])
    # Load the training split first: fitting previously referenced X/y before
    # they were assigned, which raised UnboundLocalError.
    X_train,y_train = get_data(train[data_num],target_name)
    clf.fit(X_train, y_train)
    X,y = get_data(test[data_num],target_name)
    return clf,X,y


def train_model_(df1,df2,target_name,**dic):
    """Fit a grid-searched model on ``df1`` and return ``df2`` as evaluation data.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Training table (features plus the target column).
    df2 : pandas.DataFrame
        Evaluation table with the same columns.
    target_name : str
        Label of the target column.
    **dic
        Must contain ``name``, the model key forwarded to ``get_model``.

    Returns
    -------
    (GridSearchCV, numpy.ndarray, pandas.Series)
        The fitted search object plus the features and targets of ``df2``.
    """
    clf = get_model(name=dic["name"])
    # df1 was previously never used and fit() referenced X/y before they were
    # assigned (UnboundLocalError); train on df1 explicitly.
    X_train,y_train = get_data(df1,target_name)
    clf.fit(X_train, y_train)
    X,y = get_data(df2,target_name)
    return clf,X,y 

In [None]:

# Input workbooks, relative to the notebook's working directory.
TRAIN_PATH = "./lasso_/8_train_score_med_xb.xlsx"
TEST_PATH = "./lasso_/9_test_score_med_xb.xlsx"

# df  : training table, re-split into CV folds for every seed.
# df1 : the same training table, used to fit the models scored on df2.
# df2 : separate test table.
df = pd.read_excel(TRAIN_PATH)
# The training workbook used to be parsed twice; an independent copy yields
# the same frame without the second read.
df1 = df.copy()
df2 = pd.read_excel(TEST_PATH)

# Kept so `fpath` ends with the same value as before, in case later cells read it.
fpath = TEST_PATH

In [None]:
# Sanity-check the loaded tables. A bare `df` only renders when it is the
# last expression of the cell, so the preview goes after the print.
print(df.shape,df1.shape,df2.shape)
df.head()

In [None]:
warnings.filterwarnings("ignore")

MODEL_NAMES = ["SVC", "LR", "KNN", "NB", "DT", "RF", "XB"]
N_SEEDS = 50
# Set to True to wait for <Enter> after every seed (the former unconditional
# input() call made an unattended "Run All" hang).
PAUSE_BETWEEN_SEEDS = False

# Lower-case names kept from the original cell in case later cells read them.
target_name = "随访>=3"
fold = 3


def _mean_roc_auc(fit_and_holdout, n_runs, title=None, show=False):
    """Average the ROC curves of ``n_runs`` fitted models and return the mean-curve AUC.

    Parameters
    ----------
    fit_and_holdout : callable
        Called with the run index; must return ``(fitted_clf, X, y)`` where
        ``X``/``y`` are the evaluation data for that run.
    n_runs : int
        Number of models to fit and average over.
    title : str, optional
        Title for the ROC figure.
    show : bool, default False
        Render the figure before releasing it; otherwise it is closed unseen.

    Returns
    -------
    float
        Area under the mean TPR curve, interpolated on 100 evenly spaced FPR
        points with the end points pinned to (0, 0) and (1, 1).
    """
    fig, ax = plt.subplots()
    ax.plot([0,1],[0,1],linestyle='--',color='r',linewidth=0.8)
    if title is not None:
        ax.set_title(title)

    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    for run in range(n_runs):
        clf0,X0,y0 = fit_and_holdout(run)
        viz = RocCurveDisplay.from_estimator(clf0, X0, y0, name='clf'+str(run), ax=ax, alpha=0.8, linewidth=0.6)

        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0

    if show:
        plt.show()
    # Release the figure: the full run creates hundreds of them.
    plt.close(fig)

    # `auc` was never imported on its own; use it through the imported module.
    return metrics.auc(mean_fpr, mean_tpr)


tp1 = []  # per seed: mean CV AUC of each model, in MODEL_NAMES order
tp2 = []  # per seed: mean AUC of each model trained on df1 and scored on df2
for s in range(N_SEEDS):
    print("----------------------------seed:",s)
    # train/test must be module-level: train_model reads them as globals.
    train,test = get_train_test(df=df,fold=fold,seed=s)

    # --- cross-validation on the training table ---
    auc_record = []
    for model_name in MODEL_NAMES:
        mean_auc = _mean_roc_auc(
            lambda k, m=model_name: train_model(data_num=k,target_name=target_name,name=m),
            n_runs=fold,
        )
        auc_record.append(mean_auc)
        print(model_name,round(mean_auc,2))

    # --- train on df1, evaluate on df2 (three refits per model) ---
    auc_record_ = []
    print("##############")
    for model_name in MODEL_NAMES:
        mean_auc = _mean_roc_auc(
            lambda k, m=model_name: train_model_(df1=df1,df2=df2,target_name=target_name,name=m),
            n_runs=3,
            title="%s (seed %d)" % (model_name, s),
            show=True,
        )
        auc_record_.append(mean_auc)
        print(model_name,round(mean_auc,2))

    tp1.append(auc_record)
    tp2.append(auc_record_)

    if PAUSE_BETWEEN_SEEDS:
        input()