# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score,recall_score,accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn import metrics
def _load_indexed_csv(csv_path):
    """Read ``csv_path`` into a DataFrame, using its first column as the row index."""
    return pd.read_csv(csv_path, index_col=0)


# Each path constant is declared right next to the table that is loaded from it.
file_dir = './All_features_RFS.csv'
radiomics_data = _load_indexed_csv(file_dir)

slide_dir = './pathlogics_features.csv'
patho_data = _load_indexed_csv(slide_dir)

combined_dir = './feature_pool.csv'
combined_data = _load_indexed_csv(combined_dir)

external_dir = './external_feature_pool.csv'
external_combined_data = _load_indexed_csv(external_dir)

external_radiomics_dir = './external_radiomics.csv'
external_radiomics_data = _load_indexed_csv(external_radiomics_dir)

external_pathomics_dir = './external_pathomics.csv'
external_pathomics_data = _load_indexed_csv(external_pathomics_dir)

# In both combined tables the first column is used as the label and every
# remaining column as a feature.
y = combined_data.iloc[:, 0]
x = combined_data.iloc[:, 1:]

y_external = external_combined_data.iloc[:, 0]
x_external = external_combined_data.iloc[:, 1:]

# Stratified 70/30 split of the internal table. The seed is fixed so that the
# split, and therefore everything derived from it, is reproducible between
# runs; without it every execution produced a different partition.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, stratify=y, random_state=0
)

# Standardise every split with the statistics learned from the training split
# only, and keep the results as DataFrames with the original labels.
scaler = StandardScaler()
x_train_ss = pd.DataFrame(
    scaler.fit_transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)
x_test_ss = pd.DataFrame(
    scaler.transform(x_test),
    columns=x_test.columns,
    index=x_test.index,
)
x_external_ss = pd.DataFrame(
    scaler.transform(x_external),
    columns=x_external.columns,
    index=x_external.index,
)

# Fit the variance filter (default threshold) on the training split and apply
# the same column mask to the other two splits.
selector_var = VarianceThreshold()
x_train_var = selector_var.fit_transform(x_train_ss)
x_test_var = selector_var.transform(x_test_ss)
x_external_var = selector_var.transform(x_external_ss)

# Names of the columns that survived the filter, computed once and reused.
selected_columns = x.columns[selector_var.get_support()]
X_train = pd.DataFrame(x_train_var, columns=selected_columns, index=x_train.index)
X_test = pd.DataFrame(x_test_var, columns=selected_columns, index=x_test.index)
X_external = pd.DataFrame(x_external_var, columns=selected_columns, index=x_external.index)

# Rank the features by the importance a decision tree fitted on all of them
# assigns to each one (most important first).
model_all_features = DecisionTreeClassifier(random_state=0)
model_all_features.fit(X_train, y_train)
importances = pd.Series(model_all_features.feature_importances_, index=X_train.columns)
features = list(importances.sort_values(ascending=False).index)

# Baseline for the feature-addition loop: an RBF SVC trained on the single
# top-ranked feature, scored by ROC AUC on the held-out split.
model_one_feature = SVC(kernel='rbf', class_weight='balanced', random_state=0, probability=True)
model_one_feature.fit(X_train[features[0]].to_frame(), y_train)
y_pred_test = model_one_feature.predict_proba(X_test[features[0]].to_frame())[:, 1]
auc_score_first = roc_auc_score(y_test, y_pred_test)
print(auc_score_first)

print('doing recursive feature addition')

# Minimum gain in ROC AUC a candidate feature must bring to be kept.
MIN_AUC_GAIN = 0.02

# Walk through the ranked features; a candidate is kept only if adding it to
# the current set raises the ROC AUC on the held-out split by at least
# MIN_AUC_GAIN over the best score seen so far.
# NOTE(review): the selection is scored on X_test, so metrics later reported
# on that same split are optimistically biased - confirm this is intended.
features_to_keep = [features[0]]
for feature in features[1:]:
    candidate_features = features_to_keep + [feature]

    model_int = SVC(kernel='rbf', class_weight='balanced', random_state=0, probability=True)
    model_int.fit(X_train[candidate_features], y_train)

    y_pred_test = model_int.predict_proba(X_test[candidate_features])[:, 1]
    auc_score_int = roc_auc_score(y_test, y_pred_test)

    diff_auc = auc_score_int - auc_score_first
    if diff_auc >= MIN_AUC_GAIN:
        # The improved score becomes the new reference for later candidates.
        auc_score_first = auc_score_int
        features_to_keep.append(feature)


print('Features kept:',features_to_keep)

# Final classifier: an RBF SVC trained on the selected features only.
final_model = SVC(kernel='rbf', class_weight='balanced', random_state=0, probability=True)
final_model.fit(X_train[features_to_keep], y_train)

# 5-fold cross-validated accuracy (the default score for a classifier).
# The folds are restricted to the selected features so that the figures
# describe the same model that is fitted above; previously every column was
# passed in, which evaluated a different model.
print('cross validation accuracy in the training dataset:', cross_val_score(final_model, X_train[features_to_keep], y_train, cv=5).mean())
print('cross validation accuracy in the validation dataset:', cross_val_score(final_model, X_test[features_to_keep], y_test, cv=5).mean())


# Class predictions of the final model on each split, using only the
# selected features.
y_train_pred, y_test_pred, y_external_pred = (
    final_model.predict(split[features_to_keep])
    for split in (X_train, X_test, X_external)
)

# Compute accuracy
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
external_accuracy = accuracy_score(y_external, y_external_pred)

print(f"Accuracy in the training datatset: {train_accuracy:.2f}")
print(f"Accuracy in the test datatset: {test_accuracy:.2f}")
print(f"Accuracy in the external datatset: {external_accuracy:.2f}")

# Compute AUC
# The ROC AUC is computed from the SVC decision-function values of each split.
train_scores = final_model.decision_function(X_train[features_to_keep])
test_scores = final_model.decision_function(X_test[features_to_keep])
external_scores = final_model.decision_function(X_external[features_to_keep])

auc_train = roc_auc_score(y_train, train_scores)
auc_test = roc_auc_score(y_test, test_scores)
auc_external = roc_auc_score(y_external, external_scores)

print('AUC in the training datatset:', auc_train)
print('AUC in the validation datatset:', auc_test)
print('AUC in the external validation datatset:', auc_external)

# Compute sensitivity
# Sensitivity is reported as recall_score with its default settings, applied
# to fresh predictions of the final model on each split.
result_train, result_test, result_external = (
    final_model.predict(split[features_to_keep])
    for split in (X_train, X_test, X_external)
)

recall_train = recall_score(y_train, result_train)
recall_test = recall_score(y_test, result_test)
recall_external = recall_score(y_external, result_external)

print('Sensitivity in the training dataset:', recall_train)
print('Sensitivity in the validation dataset:', recall_test)
print('Sensitivity in the external validation dataset:', recall_external)


# Compute specificity
# Specificity is the true-negative rate TN / (TN + FP), i.e. the recall of the
# negative class (label 0). The previous precision_score(..., pos_label=0)
# returned TN / (TN + FN), which is the negative predictive value instead.
train_specificity = recall_score(y_train, y_train_pred, pos_label=0)
test_specificity = recall_score(y_test, y_test_pred, pos_label=0)
external_specificity = recall_score(y_external, y_external_pred, pos_label=0)
print(f"train Specificity: {train_specificity}")
print(f"test Specificity: {test_specificity}")
print(f"external Specificity: {external_specificity}")

