Import Libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Workbook holding the original records, with missing values left as they are.
original_workbook = "Prediction model for culture_v4_FINAL_no imputations.xlsx"
# Workbook holding the imputed records (the file name indicates 1 + 20 sets).
imputed_workbook = "FINAL_MII_Prediction model for culture_23-9-25_1+20 sets.xlsx"

df_original = pd.read_excel(original_workbook)
df_imputed = pd.read_excel(imputed_workbook)

# Sanity check: report (rows, columns) for each loaded frame.
for shape_label, loaded_frame in (
    ("Original data shape:", df_original),
    ("Imputed data shape:", df_imputed),
):
    print(shape_label, loaded_frame.shape)

Original data shape: (3629, 70)
Imputed data shape: (76209, 69)


Explore Missing Values

In [None]:
# Boolean mask of missing cells, computed once and reused for both summaries.
null_mask = df_original.isnull()

# Absolute number of missing cells per column; only affected columns are shown.
missing_summary = null_mask.sum()
has_missing = missing_summary > 0
print(missing_summary[has_missing])

# The same information as a percentage of rows, rounded to two decimals.
# Filtering happens after rounding, so a column whose share rounds to 0.00
# is not listed here even if it has a few missing cells.
missing_percent = (null_mask.mean() * 100).round(2)
has_missing_percent = missing_percent > 0
print(missing_percent[has_missing_percent])


ga                        1
sex                       2
enr_oth_cli            2622
enr_crp_val            1205
enr_pct_val            3418
enr_tlc_val             751
enr_anc_val            1997
enr_cult_date             1
enr_org                2840
enr_org_oth            3421
enr_contam             2416
enr_con_re             3542
enr_org_sn_ampic       3084
enr_org_sn_penic       3086
enr_org_sn_cloxa       3086
enr_org_sn_cefaz       3087
enr_org_sn_cefot       3087
enr_org_sn_cefta       3086
enr_org_sn_cefop       3087
enr_org_sn_pipera      3089
enr_org_sn_pipera_t    3087
enr_org_sn_genta       3086
enr_org_sn_amika       3085
enr_org_sn_netil       3088
enr_org_sn_eryth       3090
enr_org_sn_clind       3090
enr_org_sn_cipro       3086
enr_org_sn_imipe       3087
enr_org_sn_merop       3087
enr_org_sn_vanco       3088
enr_org_sn_teico       3087
enr_org_sn_linez       3089
enr_org_sn_colist      3087
dtype: int64
ga                      0.03
sex                     0.06
enr_o

In [None]:
# Label-encode the categorical columns of both feature frames, in place.
# Each column is cast to str first, so a missing value becomes the literal
# string "nan" and receives its own integer code. A fresh LabelEncoder is
# fitted per column and per frame, so codes are not shared between the
# original and the imputed dataset.
# NOTE(review): this cell mutates X_original / X_imputed; re-running it
# re-encodes the already-encoded integers as strings — run once per kernel.
for frame, cat_columns in (
    (X_original, categorical_cols),
    (X_imputed, categorical_cols_imp),
):
    for col in cat_columns:
        as_text = frame[col].astype(str)
        frame[col] = LabelEncoder().fit_transform(as_text)

print("Original dataset - X shape after encoding:", X_original.shape)
print("Imputed dataset - X shape after encoding:", X_imputed.shape)


Original dataset - X shape after encoding: (3629, 39)
Imputed dataset - X shape after encoding: (76209, 40)


Handle Missing Values for the Original Dataset: Train/Test Splits for Three Feature Sets


In [None]:
from sklearn.model_selection import train_test_split

# --- Case 1: every encoded feature of the original dataset ---
X_full, y_full = X_original.copy(), y_original.copy()

# 75/25 split with a fixed seed, stratified on y_full.
full_split = train_test_split(
    X_full, y_full, test_size=0.25, random_state=42, stratify=y_full
)
X_train_full, X_test_full, y_train_full, y_test_full = full_split

print("Case 1 - Full dataset shapes:")
print("X_train:", X_train_full.shape, "X_test:", X_test_full.shape)
print("y_train:", y_train_full.shape, "y_test:", y_test_full.shape)
print("-" * 50)

# --- Case 2: drop every feature whose name mentions PCT or ANC ---
# Matching is a case-insensitive substring test on the column name, so any
# column containing "pct" or "anc" anywhere in its name is removed.
drop_pct_anc = [
    col_name
    for col_name in X_original.columns
    if any(token in col_name.lower() for token in ("pct", "anc"))
]
X_no_pct_anc = X_original.drop(columns=drop_pct_anc, errors="ignore").copy()
y_no_pct_anc = y_original.copy()

# 75/25 split with a fixed seed, stratified on y_no_pct_anc.
no_pct_anc_split = train_test_split(
    X_no_pct_anc,
    y_no_pct_anc,
    test_size=0.25,
    random_state=42,
    stratify=y_no_pct_anc,
)
X_train_no_pct_anc, X_test_no_pct_anc, y_train_no_pct_anc, y_test_no_pct_anc = no_pct_anc_split

print("Case 2 - Exclude PCT and ANC shapes:")
print("X_train:", X_train_no_pct_anc.shape, "X_test:", X_test_no_pct_anc.shape)
print("y_train:", y_train_no_pct_anc.shape, "y_test:", y_test_no_pct_anc.shape)
print("-" * 50)

# -----------------------------
# Case 3: Include only CRP and TLC
# -----------------------------
# Columns are selected by a case-insensitive substring match on their names.
# The tokens must match the naming used in the data (e.g. 'enr_crp_val',
# 'enr_tlc_val'); a misspelled token silently selects nothing for that marker.
crp_tlc_tokens = ("crp", "tlc")
available_crp_tlc = [
    c for c in X_original.columns
    if any(token in c.lower() for token in crp_tlc_tokens)
]
print("Available CRP/TLC columns:", available_crp_tlc)

# Fail loudly if either marker has no matching column, instead of training a
# model on a feature set that is not what this case claims to be.
unmatched_tokens = [
    token for token in crp_tlc_tokens
    if not any(token in c.lower() for c in available_crp_tlc)
]
if unmatched_tokens:
    raise KeyError(f"No feature column found for marker(s): {unmatched_tokens}")

X_crp_tlc = X_original[available_crp_tlc].copy()
y_crp_tlc = y_original.copy()

# 75/25 split with a fixed seed, stratified on y_crp_tlc.
X_train_crp_tlc, X_test_crp_tlc, y_train_crp_tlc, y_test_crp_tlc = train_test_split(
    X_crp_tlc, y_crp_tlc, test_size=0.25, random_state=42, stratify=y_crp_tlc
)

print("Case 3 - Only CRP and TLC shapes:")
print("X_train:", X_train_crp_tlc.shape, "X_test:", X_test_crp_tlc.shape)
print("y_train:", y_train_crp_tlc.shape, "y_test:", y_test_crp_tlc.shape)
print("-"*50)


Case 1 - Full dataset shapes:
X_train: (2721, 39) X_test: (908, 39)
y_train: (2721,) y_test: (908,)
--------------------------------------------------
Case 2 - Exclude PCT and ANC shapes:
X_train: (2721, 37) X_test: (908, 37)
y_train: (2721,) y_test: (908,)
--------------------------------------------------
Available CRP/TLC columns: ['enr_crp_val']
Case 3 - Only CRP and TLC shapes:
X_train: (2721, 1) X_test: (908, 1)
y_train: (2721,) y_test: (908,)
--------------------------------------------------


In [None]:
# --- Case 4: imputed dataset ---
# Work on copies so the frames prepared earlier stay untouched.
X_imputed_model = X_imputed.copy()
y_imputed_model = y_imputed.copy()

# One-hot encode any column that is still object/bool typed at this point.
categorical_cols_imputed = X_imputed_model.select_dtypes(include=["object", "bool"]).columns
if not categorical_cols_imputed.empty:
    X_imputed_model = pd.get_dummies(
        X_imputed_model, columns=categorical_cols_imputed, drop_first=True
    )

# 75/25 split with a fixed seed, stratified on y_imputed_model.
# NOTE(review): the imputed file appears to stack 1 + 20 copies of each record
# (76209 rows vs 3629 in the original). A row-wise random split can then place
# copies of the same record in both train and test, inflating test metrics —
# confirm, and consider a group-aware split on the record identifier.
imputed_split = train_test_split(
    X_imputed_model,
    y_imputed_model,
    test_size=0.25,
    random_state=42,
    stratify=y_imputed_model,
)
X_train_imputed, X_test_imputed, y_train_imputed, y_test_imputed = imputed_split

print("Case 4 - Imputed dataset shapes:")
print("X_train:", X_train_imputed.shape, "X_test:", X_test_imputed.shape)
print("y_train:", y_train_imputed.shape, "y_test:", y_test_imputed.shape)
print("-" * 50)


Case 4 - Imputed dataset shapes:
X_train: (57156, 40) X_test: (19053, 40)
y_train: (57156,) y_test: (19053,)
--------------------------------------------------


Import XGBoost and Define the Training Helper

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score

def train_xgb_model(X_train, X_test, y_train, y_test, case_name="Dataset"):
    """Fit an XGBoost binary classifier and print its hold-out metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices used for fitting and for evaluation.
    y_train, y_test : array-like
        Labels matching ``X_train`` / ``X_test``.
    case_name : str, default "Dataset"
        Label used in the printed report header and in error messages.

    Returns
    -------
    xgboost.XGBClassifier
        The fitted classifier.

    Raises
    ------
    ValueError
        If ``y_train`` holds fewer than two distinct non-missing classes.
    """
    # A single-class training target makes the logistic objective fail deep
    # inside XGBoost with an opaque "base_score must be in (0,1)" error, so
    # check up front and report the actual problem.
    n_train_classes = pd.Series(np.ravel(y_train)).nunique(dropna=True)
    if n_train_classes < 2:
        raise ValueError(
            f"{case_name}: y_train must contain at least two classes to fit a "
            f"binary classifier, found {n_train_classes}. Check that the "
            "intended train/test split variables were passed."
        )

    # `use_label_encoder` is intentionally not passed: it is deprecated and
    # ignored (with a warning) by recent XGBoost releases.
    xgb_model = xgb.XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=42,
        n_estimators=200,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8
    )

    # Train
    xgb_model.fit(X_train, y_train)

    # Hard class predictions and the predicted probability of class 1
    y_pred = xgb_model.predict(X_test)
    y_prob = xgb_model.predict_proba(X_test)[:,1]

    # Evaluation on the held-out split
    print(f"\n--- XGBoost Results: {case_name} ---")
    print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
    print("ROC-AUC:", round(roc_auc_score(y_test, y_prob), 4))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    return xgb_model


In [None]:
# Train on the Case 1 split (all encoded features of the original dataset).
# The explicit *_full names avoid depending on generic X_train / y_train
# variables left over in the kernel from other cells.
model_full = train_xgb_model(
    X_train_full, X_test_full, y_train_full, y_test_full,
    case_name="Full Original Dataset",
)


XGBoostError: [21:30:42] /workspace/src/objective/./regression_loss.h:68: Check failed: base_score > 0.0f && base_score < 1.0f: base_score must be in (0,1) for logistic loss, got: 0
Stack trace:
  [bt] (0) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x2a6e7c) [0x7ee4110a6e7c]
  [bt] (1) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0xeda699) [0x7ee411cda699]
  [bt] (2) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x6826d3) [0x7ee4114826d3]
  [bt] (3) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x682a9c) [0x7ee411482a9c]
  [bt] (4) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(+0x68cfeb) [0x7ee41148cfeb]
  [bt] (5) /usr/local/lib/python3.12/dist-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x77) [0x7ee410fb6f57]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7ee4623b6e2e]
  [bt] (7) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7ee4623b3493]
  [bt] (8) /usr/lib/python3.12/lib-dynload/_ctypes.cpython-312-x86_64-linux-gnu.so(+0x98c1) [0x7ee4635da8c1]

