In [165]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import sklearn.metrics as metrics
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from xgboost import cv
import xgboost as xgb

In [2]:
df = pd.read_csv("/users/edfarber/scratch/dataset/multimodalpulmonaryembolismdataset/merged_pe_data.csv")

In [3]:
df = df.drop("pe_type", axis=1)

In [4]:
train_df = df.loc[df["split"] == "train"]

In [181]:
val_df = df.loc[df["split"] == "val"]
test_df = df.loc[df["split"] == "test"]

In [6]:
x_train = train_df.drop(["idx", "label", "split"], axis=1).to_numpy()

In [7]:
y_train = train_df["label"].to_numpy()

In [8]:
scaler = StandardScaler()

In [9]:
x_train = scaler.fit_transform(x_train)

In [143]:
model = LogisticRegression(random_state=42, solver="liblinear", penalty="l1", max_iter=150, C=0.055)

In [150]:
model.fit(x_train, y_train)

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,0.055
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,150


In [151]:
scores = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc')
print(f"CV AUC = {scores.mean():.3f} ± {scores.std():.3f}")

CV AUC = 0.925 ± 0.015


In [152]:
x_val = val_df.drop(["idx", "label", "split"], axis=1).to_numpy()
x_val = scaler.transform(x_val)

In [153]:
y_val = val_df["label"].to_numpy()

In [154]:
y_prob = model.predict_proba(x_val)[:, 1]

In [155]:
print(f"Validation AUROC: {metrics.roc_auc_score(y_val, y_prob):.3f}")

Validation AUROC: 0.910


In [183]:
model = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=123,
    eval_metric='auc',
    early_stopping_rounds=10,
    alpha=10,
    objective="binary:logistic",
    colsample_bytree=0.3
)

In [184]:
model.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

[0]	validation_0-auc:0.72778
[1]	validation_0-auc:0.88785
[2]	validation_0-auc:0.88535
[3]	validation_0-auc:0.89434
[4]	validation_0-auc:0.88377
[5]	validation_0-auc:0.88126
[6]	validation_0-auc:0.88519
[7]	validation_0-auc:0.89003
[8]	validation_0-auc:0.89412
[9]	validation_0-auc:0.89586
[10]	validation_0-auc:0.89336
[11]	validation_0-auc:0.89281
[12]	validation_0-auc:0.89259
[13]	validation_0-auc:0.89357
[14]	validation_0-auc:0.89444
[15]	validation_0-auc:0.89336
[16]	validation_0-auc:0.89455
[17]	validation_0-auc:0.89673
[18]	validation_0-auc:0.89630
[19]	validation_0-auc:0.89804
[20]	validation_0-auc:0.89657
[21]	validation_0-auc:0.89809
[22]	validation_0-auc:0.89668
[23]	validation_0-auc:0.89853
[24]	validation_0-auc:0.89858
[25]	validation_0-auc:0.89880
[26]	validation_0-auc:0.89815
[27]	validation_0-auc:0.89858
[28]	validation_0-auc:0.89880
[29]	validation_0-auc:0.89815
[30]	validation_0-auc:0.89739
[31]	validation_0-auc:0.89924
[32]	validation_0-auc:0.89826
[33]	validation_0-au

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.3
,device,
,early_stopping_rounds,10
,enable_categorical,False


In [185]:
x_test = scaler.transform(test_df.drop(["idx", "label", "split"], axis=1).to_numpy())
y_test = test_df["label"].to_numpy()
y_prob = model.predict_proba(x_test)[:, 1]
print(f"Test AUROC: {metrics.roc_auc_score(y_test, y_prob):.3f}")

Test AUROC: 0.939


In [186]:
y_prob_val = model.predict_proba(x_val)[:, 1]

In [187]:
print(f"Test AUROC: {metrics.roc_auc_score(y_val, y_prob_val):.3f}")

Test AUROC: 0.917


In [189]:
test_df = df.loc[df["split"] == "test"]
exported_df = test_df[["idx", "label"]].copy()
exported_df["text_preds"] = y_prob_test
exported_df.to_csv("./test_proba_xgb.csv", index=False)

In [190]:
val_df = df.loc[df["split"] == "val"]
exported_val = val_df[["idx", "label"]].copy()
exported_val["text_preds"] = y_prob_val
exported_val.to_csv("./validation_proba_xgb.csv", index=False)