In [41]:
import xgboost as xgb
from sklearn.model_selection import RepeatedKFold, cross_validate, train_test_split
import pandas as pd
from sklearn.metrics import RocCurveDisplay, ConfusionMatrixDisplay, roc_auc_score, f1_score

# Per-run accumulators: each evaluation below appends one array of per-fold
# values (fit time, score time, AUC, F1, accuracy) to the matching list.
fittime, scoretime, testAUC, testf1, testacc = [], [], [], [], []


# Load the "fukuoka" sheet, using the "ID" column as the row index.
# NOTE(review): the path is hard-coded to one user's home directory; consider
# moving it to a configurable constant so the notebook runs elsewhere.
data = pd.read_excel("~/projects/Sano/sanoproject_ENO.xlsx", sheet_name="fukuoka", index_col="ID")

# Drop rows whose target is 3. The explicit .copy() makes `data` an independent
# frame, so the column assignments below cannot raise SettingWithCopyWarning
# or write into a temporary slice.
data = data[data["target"] != 3].copy()

# Binary label: 1 where target < 3, otherwise 0
# (presumably malignant vs. benign, going by the column name -- confirm).
data["maligvsbenign"] = (data["target"] < 3).astype("int64")

# Encode Sex as 1 for "F" and 0 for anything else (including missing values).
data["Sex"] = (data["Sex"] == "F").astype("int64")

# Raw column name -> short display name for the six model features.
# Keeping selection and renaming in one mapping prevents the two from drifting.
feature_names = {
    'intbronch': "IB/B",
    'plasmacellinfil': "PLC",
    'eosinoinfil': "Eo",
    'lymphoidagg': "Ly",
    'fibroelastosis': "FE",
    'op': "OP",
}

# Feature matrix (six renamed columns, in the order above) and binary target.
X = data[list(feature_names)].rename(columns=feature_names)
y = data["maligvsbenign"]

def make_classifier():
    """Return a fresh, unfitted XGBClassifier with this cell's fixed hyper-parameters.

    A new estimator is built for every run so that no fitted state is shared
    between feature configurations.
    """
    return xgb.XGBClassifier(
        gamma=0.0001,
        learning_rate=1,
        max_depth=3,
        min_child_weight=1,
        n_estimators=1000,
        objective="binary:logistic",
        reg_alpha=1,
        reg_lambda=1,
        subsample=1,
        tree_method="hist",
    )


# Short display names of the six features, in the order they are ablated.
features = ["IB/B", "PLC", "Eo", "Ly", "FE", "OP"]

# Metrics collected by cross_validate; each key K is reported as "test_K".
scorers = {"AUC": "roc_auc", "f1": "f1", "Accuracy": "accuracy"}

# Full six-column feature matrix built above; every run starts from it.
X_all = X

# The first run keeps every feature (None = nothing excluded); each following
# run drops exactly one feature. The baseline and the ablations share this one
# code path so they cannot drift apart.
for excluded in [None, *features]:
    print("No exclusions" if excluded is None else f"Excluding {excluded}")
    X = X_all if excluded is None else X_all.drop(columns=[excluded])

    # 80/20 hold-out split, stratified on the original multi-class `target`
    # (not the binary label), with the same random_state for every run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, stratify=data["target"], random_state=184
    )

    clf = make_classifier()

    # 10-fold cross-validation on the training portion only.
    # NOTE(review): RepeatedKFold does not stratify folds; confirm that is
    # intended given the hold-out split above is stratified.
    scores = cross_validate(
        clf,
        X_train, y_train,
        scoring=scorers,
        cv=RepeatedKFold(n_splits=10, n_repeats=1, random_state=84),
        n_jobs=-1,
    )

    # Keep the per-fold arrays; they are turned into DataFrames after the loop.
    fittime.append(scores["fit_time"])
    scoretime.append(scores["score_time"])
    testAUC.append(scores["test_AUC"])
    testf1.append(scores["test_f1"])
    testacc.append(scores["test_Accuracy"])

    print(f"Tuned AUC: {scores['test_AUC'].mean():.3f}, F1: {scores['test_f1'].mean():.3f}, Accuracy: {scores['test_Accuracy'].mean():.3f}")

    # Refit on the whole training portion and score on the held-out test set.
    trainedclf = clf.fit(X_train, y_train)
    # `preds` is not used in this cell; kept at module level because the
    # optional diagnostics below (and possibly later cells) read it.
    preds = trainedclf.predict(X_test)

    print(f"Test Set Accuracy: {trainedclf.score(X_test,y_test):.3f}")

    # Optional diagnostics. These were originally written for the full model
    # only, so guard them with `if excluded is None:` when re-enabling.
    # cm = ConfusionMatrixDisplay.from_predictions(y_test, preds).ax_.set_title("XGB")
    # xgb.plot_importance(trainedclf, importance_type="gain")
    # RocCurveDisplay.from_estimator(trainedclf, X_test, y_test)
    # xgb.to_graphviz(trainedclf, num_trees=3)
    
# One label per evaluation run, in the order the runs were appended:
# "all" for the full feature set, then "-<feature>" for each ablation.
# Deriving the labels from `features` keeps them in step with the loop above.
run_labels = ["all"] + [f"-{name}" for name in features]

# Rebind each accumulator from a list of per-fold arrays to a DataFrame.
# After the transpose, each column is one run and each row is one CV fold.
fittime = pd.DataFrame(fittime, index=run_labels).T
scoretime = pd.DataFrame(scoretime, index=run_labels).T
testAUC = pd.DataFrame(testAUC, index=run_labels).T
testf1 = pd.DataFrame(testf1, index=run_labels).T
testacc = pd.DataFrame(testacc, index=run_labels).T

No exclusions
Tuned AUC: 0.740, F1: 0.841, Accuracy: 0.780
Test Set Accuracy: 0.745
Excluding IB/B
Tuned AUC: 0.711, F1: 0.778, Accuracy: 0.700
Test Set Accuracy: 0.706
Excluding PLC
Tuned AUC: 0.750, F1: 0.838, Accuracy: 0.775
Test Set Accuracy: 0.725
Excluding Eo
Tuned AUC: 0.746, F1: 0.838, Accuracy: 0.775
Test Set Accuracy: 0.745
Excluding Ly
Tuned AUC: 0.742, F1: 0.838, Accuracy: 0.775
Test Set Accuracy: 0.725
Excluding FE
Tuned AUC: 0.725, F1: 0.829, Accuracy: 0.760
Test Set Accuracy: 0.765
Excluding OP
Tuned AUC: 0.742, F1: 0.829, Accuracy: 0.760
Test Set Accuracy: 0.647


In [42]:
# Display per-fold cross-validation accuracy: one column per feature
# configuration ("all" plus one per excluded feature), one row per CV fold.
testacc

Unnamed: 0,all,-IB/B,-PLC,-Eo,-Ly,-FE,-OP
0,0.8,0.65,0.8,0.8,0.75,0.8,0.8
1,0.8,0.7,0.8,0.8,0.8,0.75,0.75
2,0.8,0.8,0.8,0.8,0.8,0.85,0.75
3,0.8,0.65,0.8,0.75,0.8,0.75,0.8
4,0.7,0.65,0.7,0.7,0.7,0.7,0.65
5,0.85,0.7,0.8,0.85,0.9,0.9,0.9
6,0.85,0.8,0.85,0.85,0.85,0.8,0.85
7,0.8,0.65,0.8,0.8,0.8,0.75,0.8
8,0.7,0.65,0.7,0.7,0.65,0.65,0.7
9,0.7,0.75,0.7,0.7,0.7,0.65,0.6
