In [1]:
# -- Imports
import pandas as pd
import numpy as np

from recruitment_fairness.data.loader import ClinicalTrialsWebCollector
from recruitment_fairness.data.preprocess import ClinicalTrialPreprocessor
from recruitment_fairness.data.clinicalbert_embedder import ClinicalBERTEmbedder
from recruitment_fairness.models.catboost_net import CatBoostNet
from recruitment_fairness.models.fairness_utils import demographic_parity_difference, equal_opportunity_difference


  from .autonotebook import tqdm as notebook_tqdm


!pip install catboost 

In [2]:
# -- 1. Data Collection
collector = ClinicalTrialsWebCollector("data/raw")
df = collector.search_trials("", 1500)  # Use a moderate number for first run

Fetching all: 100%|██████████| 1500/1500 [00:07<00:00, 205.64it/s]

✅ 1500 trials saved to data\raw\raw_clinical_trials_20250724_223827.csv





In [3]:
# -- 2. Preprocessing (split, clean)
preproc = ClinicalTrialPreprocessor("data/raw", "data/processed")
train, val, test = preproc.preprocess(df)

Splits: train=1080, val=120, test=300


In [4]:

# get structured features + cat indices
X_train_struct, cat_feat_idx = preproc.get_structured_features(train)
X_val_struct, _            = preproc.get_structured_features(val)
X_test_struct, _           = preproc.get_structured_features(test)


In [5]:
print("Structured columns:", X_train_struct.columns.tolist())
print("Cat feature indices:", cat_feat_idx)
print("Cat feature names:", [X_train_struct.columns[i] for i in cat_feat_idx])


Structured columns: ['phase_EARLY_PHASE1', 'phase_NA', 'phase_PHASE1', 'phase_PHASE1|PHASE2', 'phase_PHASE2', 'phase_PHASE2|PHASE3', 'phase_PHASE3', 'phase_PHASE4', 'phase_unknown', 'sponsor_class', 'enrollment_count']
Cat feature indices: [9]
Cat feature names: ['sponsor_class']


In [6]:
# -- 4. ClinicalBERT Embeddings (eligibility/summary text)
texts_train = train["brief_summary"].fillna("").astype(str).tolist()
texts_val = val["brief_summary"].fillna("").astype(str).tolist()
texts_test = test["brief_summary"].fillna("").astype(str).tolist()

embedder = ClinicalBERTEmbedder()
X_train_text = embedder.embed_texts(texts_train, batch_size=16)
X_val_text = embedder.embed_texts(texts_val, batch_size=16)
X_test_text = embedder.embed_texts(texts_test, batch_size=16)

Embedding texts: 100%|██████████| 68/68 [01:52<00:00,  1.65s/it]
Embedding texts: 100%|██████████| 8/8 [00:11<00:00,  1.49s/it]
Embedding texts: 100%|██████████| 19/19 [00:30<00:00,  1.59s/it]


In [7]:
# -- 5. Combine features
X_train = np.hstack([X_train_struct.to_numpy(), X_train_text])
X_val = np.hstack([X_val_struct.to_numpy(), X_val_text])
X_test = np.hstack([X_test_struct.to_numpy(), X_test_text])
y_train = train["is_success"].to_numpy()
y_val = val["is_success"].to_numpy()
y_test = test["is_success"].to_numpy()

In [9]:
# 1) Build a single pandas DataFrame for all features
X_tr_df = X_train_struct.copy()            # your one-hot phases + sponsor_class + enrollment_count
# append ClinicalBERT text embeddings as new numeric columns
for i in range(X_train_text.shape[1]):
    X_tr_df[f"text_{i}"] = X_train_text[:, i]

X_val_df = X_val_struct.copy()
for i in range(X_val_text.shape[1]):
    X_val_df[f"text_{i}"] = X_val_text[:, i]

X_te_df = X_test_struct.copy()
for i in range(X_test_text.shape[1]):
    X_te_df[f"text_{i}"] = X_test_text[:, i]


# 2) Define which **column names** are categorical
#    (sponsor_class is a low-cardinality string)
cat_cols_names = ["sponsor_class"]


# 3) Build CatBoost Pools
from catboost import Pool

train_pool = Pool(
    data=X_tr_df,
    label=y_train,
    cat_features=cat_cols_names
)
val_pool = Pool(
    data=X_val_df,
    label=y_val,
    cat_features=cat_cols_names
)


# 4) Fit via your CatBoostNet wrapper
cb = CatBoostNet(cat_features=cat_cols_names)
# Note: CatBoostNet stores these names and passes them into CatBoostClassifier internally
cb.fit(train_pool, None, val_pool, None)
# (or you can bypass the wrapper and call cb.model.fit(train_pool, eval_set=val_pool))

# 5) Predict
y_pred_proba = cb.predict_proba(X_te_df)  # it will detect you passed a DataFrame


  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train_text[:, i]
  X_tr_df[f"text_{i}"] = X_train

0:	total: 302ms	remaining: 1m 30s
100:	total: 12.9s	remaining: 25.4s
200:	total: 25.8s	remaining: 12.7s
299:	total: 40.5s	remaining: 0us


In [13]:
# -- 6. CatBoost Training (RecruitmentNet)

catboost_model = CatBoostNet(cat_features=cat_feat_idx)
catboost_model.fit(X_train, y_train, X_val, y_val)
y_pred_proba = catboost_model.predict_proba(X_test)


CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=8]="FED": Cannot convert 'FED' to float

In [10]:
# -- 7. Fairness Metrics Example
# Suppose sponsor type is the sensitive attribute
sensitive_attr = test["sponsor_class"].fillna("unknown").to_numpy()
demog_parity = demographic_parity_difference(y_test, (y_pred_proba > 0.5), sensitive_attr)
eq_opp = equal_opportunity_difference(y_test, (y_pred_proba > 0.5), sensitive_attr)
print(f"Demographic parity difference: {demog_parity:.3f}")
print(f"Equal opportunity difference: {eq_opp:.3f}")

Demographic parity difference: 0.062
Equal opportunity difference: 0.122


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [11]:
# -- 8. FairOutcomeNet (Second stage, using recruitment prediction as input)
X_train_2nd = np.hstack([X_train, catboost_model.predict_proba(X_train).reshape(-1, 1)])
X_val_2nd = np.hstack([X_val, catboost_model.predict_proba(X_val).reshape(-1, 1)])
X_test_2nd = np.hstack([X_test, catboost_model.predict_proba(X_test).reshape(-1, 1)])

catboost_fair = CatBoostNet(cat_features=cat_feat_idx)
catboost_fair.fit(X_train_2nd, y_train, X_val_2nd, y_val)
y_pred_fair = catboost_fair.predict_proba(X_test_2nd)


CatBoostError: There is no trained model to use predict_proba(). Use fit() to train model. Then use this method.

In [12]:
# -- 9. Fairness evaluation (as above)
demog_parity_2 = demographic_parity_difference(y_test, (y_pred_fair > 0.5), sensitive_attr)
eq_opp_2 = equal_opportunity_difference(y_test, (y_pred_fair > 0.5), sensitive_attr)
print(f"FairOutcomeNet Demographic parity: {demog_parity_2:.3f}")
print(f"FairOutcomeNet Equal opportunity: {eq_opp_2:.3f}")  

NameError: name 'y_pred_fair' is not defined