In [1]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.base import clone
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

%matplotlib inline

# 1. 读取数据

In [2]:
def load_ICU_data(data_path, label_path):
    """Read a feature CSV and a label CSV into NumPy arrays.

    Parameters
    ----------
    data_path : str
        Path of the feature file; must end with ".csv".
    label_path : str
        Path of the label file; must end with ".csv".

    Returns
    -------
    (data, label) : tuple of np.ndarray, or None
        The two files converted from DataFrames to arrays. If a path does
        not end with ".csv", a message is printed for the first offending
        path and None is returned instead of a tuple.
    """
    # Checked in this order so that only the first problem is reported.
    extension_checks = (
        (data_path, "数据需要csv格式的文件！"),
        (label_path, "标签需要csv格式的文件！"),
    )
    for path, complaint in extension_checks:
        if not path.endswith(".csv"):
            print(complaint)
            return None

    features = np.array(pd.read_csv(data_path))
    targets = np.array(pd.read_csv(label_path))
    return features, targets

# 2. SVM

In [3]:
# Dataset 1: train / test features and labels.
train1_data, train1_label = load_ICU_data(
    "./data1forEx1to4/train1_icu_data.csv",
    "./data1forEx1to4/train1_icu_label.csv",
)
test1_data, test1_label = load_ICU_data(
    "./data1forEx1to4/test1_icu_data.csv",
    "./data1forEx1to4/test1_icu_label.csv",
)

# Dataset 2: train / test features and labels.
train2_data, train2_label = load_ICU_data(
    "./data1forEx1to4/train2_icu_data.csv",
    "./data1forEx1to4/train2_icu_label.csv",
)
test2_data, test2_label = load_ICU_data(
    "./data1forEx1to4/test2_icu_data.csv",
    "./data1forEx1to4/test2_icu_label.csv",
)

# Stack both datasets row-wise (np.concatenate joins along axis 0 by default).
train_data = np.concatenate((train1_data, train2_data))
train_label = np.concatenate((train1_label, train2_label))
test_data = np.concatenate((test1_data, test2_data))
test_label = np.concatenate((test1_label, test2_label))

In [4]:
def K_fold_divide(X, Y, K=5, seed=None):
    """Build K stratified cross-validation folds from features X and labels Y.

    For every distinct label, the rows carrying that label are shuffled and
    cut into K consecutive validation chunks whose sizes differ by at most
    one row. Chunk i of every class forms the validation part of fold i and
    the remaining rows of that class go to its training part.

    Parameters
    ----------
    X : np.ndarray
        2-D feature matrix with one row per sample.
    Y : np.ndarray
        Labels; reshaped internally to one entry per row of X.
    K : int, default 5
        Number of folds, at least 1.
    seed : int or None, default None
        Seed of a private ``np.random.RandomState`` used for shuffling.
        None keeps drawing from NumPy's global random state.

    Returns
    -------
    list
        K entries of ``[train_data, train_label, valid_data, valid_label]``.
        Labels are returned as 1-D arrays. Features and labels travel
        through one concatenated array, so both carry its common dtype.
        An empty list is returned when Y contains no labels.

    Raises
    ------
    ValueError
        If K is smaller than 1.

    Notes
    -----
    A class with fewer than K rows cannot contribute a validation row to
    every fold; the folds beyond its row count get none from that class.
    """
    if K < 1:
        raise ValueError("K must be a positive integer, got %r" % (K,))

    labels = Y.reshape((-1, 1))
    rng = np.random if seed is None else np.random.RandomState(seed)

    # pieces[i][j] collects the per-class fragments of slot j of fold i
    # (0: train data, 1: train labels, 2: valid data, 3: valid labels).
    pieces = [[[], [], [], []] for _ in range(K)]

    for item in np.unique(labels):
        mask = labels[:, 0] == item
        # Keep each label glued to its row while the rows are shuffled.
        data = np.concatenate([X[mask], labels[mask]], axis=1)
        rng.shuffle(data)

        label_col = data.shape[1] - 1
        # Spread the remainder over the first `extra` folds so that no
        # fold is starved while another one receives a full extra chunk.
        base, extra = divmod(data.shape[0], K)
        start = 0
        for i in range(K):
            stop = start + base + (1 if i < extra else 0)
            valid = data[start:stop]
            train = np.concatenate([data[:start], data[stop:]], axis=0)
            pieces[i][0].append(train[:, :label_col])
            pieces[i][1].append(train[:, label_col])
            pieces[i][2].append(valid[:, :label_col])
            pieces[i][3].append(valid[:, label_col])
            start = stop

    if not pieces[0][0]:
        # No labels at all: nothing to split.
        return []
    return [[np.concatenate(slot, axis=0) for slot in fold] for fold in pieces]

In [5]:
# Standardise the features and build the K-fold cross-validation splits.
n_folds = 5
RANDOM_SEED = 0

# Statistics come from the training split only and are reused for the test
# split, so no information from the test data enters the preprocessing.
mean = np.mean(train1_data, axis=0, keepdims=True)
std = np.std(train1_data, axis=0, keepdims=True)

# The 1e-12 offset keeps the division finite for zero-variance columns.
train1_X = (train1_data - mean) / (std + 1e-12)
test1_X = (test1_data - mean) / (std + 1e-12)

# K_fold_divide shuffles through NumPy's global random state; seeding it here
# makes the folds (and every table below) identical on each Restart & Run All.
np.random.seed(RANDOM_SEED)
folds = K_fold_divide(train1_X, train1_label, K = n_folds)

In [6]:
def _binary_metrics(y_true, y_pred):
    """Return (accuracy, sensitivity, specificity) for binary predictions.

    The confusion matrix is unpacked as tn, fp, fn, tp, which requires it to
    be 2x2.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    accuracy = (tn + tp) / (tn + fp + fn + tp)
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    return accuracy, sensitivity, specificity


def train_and_test(classifier = None):
    """Fit a classifier on every cross-validation fold and print a metrics table.

    For each fold the classifier is trained on the fold's training part and
    scored (accuracy, sensitivity, specificity) on the training part, the
    validation part and the held-out test set. One table row is printed per
    fold, followed by a row with the column means.

    Parameters
    ----------
    classifier : estimator or None, default None
        Unfitted scikit-learn estimator used as a template; None means a
        default ``SVC()``. The template itself is never fitted: every fold
        trains its own clone.

    Returns
    -------
    None
        Results are printed only.

    Notes
    -----
    Reads the notebook-level names ``folds``, ``n_folds``, ``test1_X`` and
    ``test1_label``; the cell defining them must have been run first.
    """
    if classifier is None:
        # Created per call so that separate calls never share one estimator.
        classifier = SVC()

    text_row = "|%-9s|" + "%-16s|" * 9
    fold_row = "|%-9d|" + "%-16f|" * 9
    mean_row = "|%-9s|" + "%-16f|" * 9

    print(text_row % ("Fold",
                      "train accuracy", "train sensi", "train speci",
                      "valid accuracy", "valid sensi", "valid speci",
                      "test accuracy", "test sensi", "test speci"))
    print(text_row % (("-" * 9,) + ("-" * 16,) * 9))

    per_fold = []
    for i in range(n_folds):
        train_data, train_label, valid_data, valid_label = folds[i]

        # A fresh, unfitted copy per fold: refitting one shared object would
        # leave every stored reference pointing at the last fold's model.
        model = clone(classifier)
        model.fit(train_data, train_label.reshape(-1))

        metrics = (
            _binary_metrics(train_label.reshape(-1), model.predict(train_data))
            + _binary_metrics(valid_label.reshape(-1), model.predict(valid_data))
            + _binary_metrics(test1_label.reshape(-1), model.predict(test1_X))
        )
        per_fold.append(metrics)
        print(fold_row % ((i + 1,) + metrics))

    # Column-wise means over the folds, in the same order as the table columns.
    averages = tuple(np.mean([row[k] for row in per_fold]) for k in range(9))
    print(mean_row % (("Average",) + averages))

### 线性核

In [7]:
# Linear-kernel SVMs for increasing regularisation parameter C.
# Each title states the C value actually passed to SVC (the smallest run
# uses 1e-12, not 0), matching how the RBF experiments are labelled.
linear_runs = (
    ('kernel = "linear", C = 1e-12', SVC(kernel = "linear", C = 1e-12)),
    ('kernel = "linear", C = 1', SVC(kernel = "linear", C = 1)),
    ('kernel = "linear", C = 10', SVC(kernel = "linear", C = 10)),
)
for linear_title, linear_svc in linear_runs:
    print(linear_title)
    train_and_test(linear_svc)
    print()

kernel = "linear", C = 0
|Fold     |train accuracy  |train sensi     |train speci     |valid accuracy  |valid sensi     |valid speci     |test accuracy   |test sensi      |test speci      |
|---------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|
|1        |0.510000        |1.000000        |0.000000        |0.510000        |1.000000        |0.000000        |0.498633        |1.000000        |0.000000        |
|2        |0.510000        |1.000000        |0.000000        |0.510000        |1.000000        |0.000000        |0.498633        |1.000000        |0.000000        |
|3        |0.510000        |1.000000        |0.000000        |0.510000        |1.000000        |0.000000        |0.498633        |1.000000        |0.000000        |
|4        |0.510000        |1.000000        |0.000000        |0.510000        |1.000000        |0.000000        |0.498633        |1.000000        |0.0

### 高斯核

In [8]:
# Gaussian (RBF) kernel SVMs: two gamma settings at C = 1, then a tiny C.
rbf_runs = (
    ('kernel = "rbf", gamma = "scale", C = 1',
     SVC(kernel = "rbf", gamma = "scale", C = 1)),
    ('kernel = "rbf", gamma = "auto", C = 1',
     SVC(kernel = "rbf", gamma = "auto", C = 1)),
    ('kernel = "rbf", gamma = "scale", C = 1e-12',
     SVC(kernel = "rbf", gamma = "scale", C = 1e-12)),
)
for rbf_title, rbf_svc in rbf_runs:
    print(rbf_title)
    train_and_test(rbf_svc)
    print()

kernel = "rbf", gamma = "scale", C = 1
|Fold     |train accuracy  |train sensi     |train speci     |valid accuracy  |valid sensi     |valid speci     |test accuracy   |test sensi      |test speci      |
|---------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|
|1        |0.881750        |0.887255        |0.876020        |0.794000        |0.792157        |0.795918        |0.791249        |0.793419        |0.789091        |
|2        |0.882750        |0.880882        |0.884694        |0.803000        |0.813725        |0.791837        |0.782133        |0.784278        |0.780000        |
|3        |0.878000        |0.872549        |0.883673        |0.800000        |0.805882        |0.793878        |0.784868        |0.780622        |0.789091        |
|4        |0.883250        |0.876961        |0.889796        |0.774000        |0.774510        |0.773469        |0.790337        |0.7861

### 多项式核

In [9]:
# Polynomial-kernel SVMs over several (coef0, degree) combinations.
poly_runs = (
    ('kernel = "poly", coef0 = 0, degree = 2',
     SVC(kernel = "poly", coef0 = 0, degree = 2)),
    ('kernel = "poly", coef0 = 0, degree = 3',
     SVC(kernel = "poly", coef0 = 0, degree = 3)),
    ('kernel = "poly", coef0 = 1, degree = 3',
     SVC(kernel = "poly", coef0 = 1, degree = 3)),
    ('kernel = "poly", coef0 = 1, degree = 4',
     SVC(kernel = "poly", coef0 = 1, degree = 4)),
)
for poly_title, poly_svc in poly_runs:
    print(poly_title)
    train_and_test(poly_svc)
    print()

kernel = "poly", coef0 = 0, degree = 2
|Fold     |train accuracy  |train sensi     |train speci     |valid accuracy  |valid sensi     |valid speci     |test accuracy   |test sensi      |test speci      |
|---------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|----------------|
|1        |0.848500        |0.832843        |0.864796        |0.722000        |0.664706        |0.781633        |0.738377        |0.687386        |0.789091        |
|2        |0.844500        |0.817647        |0.872449        |0.721000        |0.652941        |0.791837        |0.735643        |0.689214        |0.781818        |
|3        |0.845250        |0.811765        |0.880102        |0.736000        |0.694118        |0.779592        |0.723792        |0.658135        |0.789091        |
|4        |0.842250        |0.807843        |0.878061        |0.743000        |0.719608        |0.767347        |0.723792        |0.6727