Configuration & Path Setup

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



# 1. Dynamic Path Setup
# Start from the notebook's working directory and look for the project root,
# i.e. the closest ancestor directory that contains a 'src' entry.
current_dir = os.getcwd()
project_root = current_dir

while True:
    if os.path.exists(os.path.join(project_root, 'src')):
        break  # found the directory that holds 'src'
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:
        # dirname() no longer changes the path: we hit the filesystem root.
        raise FileNotFoundError("Could not find 'src'. Are you in the project folder?")
    project_root = parent_dir

# Make `src.*` importable, without adding the same entry twice on re-runs.
if project_root not in sys.path:
    sys.path.append(project_root)

DATA_PATH = os.path.join(project_root, 'data')
print(f"Project Root: {project_root}")
print(f"Data Path:    {DATA_PATH}")


In [None]:
# 2. Import Custom Modules

from src.data_loader_tx import ClinicalTrialLoader
from src.preprocessing_XGB import get_pipeline

 Smart Data Loading (Auto-Generation)

In [None]:
CSV_PATH = os.path.join(DATA_PATH, 'project_data.csv')
# Flip to True to rebuild the CSV after changing the loader code (src.data_loader_tx).
FORCE_REGENERATE = False

# The cached CSV is only trusted when it exists and no rebuild was requested.
use_cached_csv = os.path.exists(CSV_PATH) and not FORCE_REGENERATE

if not use_cached_csv:
    print(">>> File not found (or forced regeneration). Triggering ETL pipeline...")
    loader = ClinicalTrialLoader(data_path=DATA_PATH)

    # Load & clean, then add the engineered features, then persist the result.
    df = loader.add_features(loader.load_and_clean())
    loader.save(df, filename='project_data.csv')
else:
    print(f">>> Loading existing dataset from: {CSV_PATH}")
    df = pd.read_csv(CSV_PATH)

print(f"Data Ready. Shape: {df.shape}")

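Temporal Split (Time Travel) <br>
Why: We sort by `start_year` and cut at the 80% row mark, so the model is trained on earlier trials (Past) and evaluated on later ones (Future). Because the cut is made by row rather than by year, trials from the boundary year can appear on both sides of the split.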

In [None]:
# ----------------------------------------------------------------------------------
# TEMPORAL SPLIT & STATISTICS REPORT
# ----------------------------------------------------------------------------------

# Share of the chronologically ordered rows that goes to the training set.
TRAIN_FRACTION = 0.80

# 1. Sort by start year so that earlier trials come first.
#    A stable sort keeps rows with the same start_year in their input order, so the
#    trials of the boundary year are assigned to train/test reproducibly (the default
#    'quicksort' gives no ordering guarantee for ties).
df = df.sort_values('start_year', kind='mergesort').reset_index(drop=True)

# 2. Row-based cut: the first TRAIN_FRACTION of rows is the past, the rest the future.
#    Note: the cut is positional, so the boundary year can be present in both sets.
split_idx = int(len(df) * TRAIN_FRACTION)

train_df = df.iloc[:split_idx].copy()
test_df = df.iloc[split_idx:].copy()

# 3. Define Features & Target
target_col = 'target'

# Columns removed from the feature matrix: the label itself plus identifier/status
# columns. errors='ignore' below tolerates columns that are absent from the frame.
drop_cols = [target_col, 'overall_status', 'nct_id', 'why_stopped', 'start_date_type']

X_train = train_df.drop(columns=drop_cols, errors='ignore')
y_train = train_df[target_col]

X_test = test_df.drop(columns=drop_cols, errors='ignore')
y_test = test_df[target_col]

# ----------------------------------------------------------------------------------
# PRINT STATISTICS
# ----------------------------------------------------------------------------------
def print_stats(name, dataset, y):
    """Print size, class balance and start-year range for one dataset split.

    Parameters
    ----------
    name : str
        Label shown in the header line (e.g. "TRAINING (Past)").
    dataset : pandas.DataFrame
        Rows of the split; must contain a 'start_year' column.
    y : pandas.Series or numpy.ndarray
        Binary labels for the rows of `dataset` (0 = Completed, 1 = Terminated/Withdrawn).

    Returns
    -------
    None
        The report is written to stdout.

    Raises
    ------
    KeyError
        If `dataset` has no 'start_year' column.
    """
    n_total = len(dataset)
    n_good = int((y == 0).sum())  # 0 = Completed
    n_bad = int((y == 1).sum())   # 1 = Terminated/Withdrawn

    # An empty split reports 0.0% instead of raising ZeroDivisionError.
    pct_good = (n_good / n_total) * 100 if n_total else 0.0
    pct_bad = (n_bad / n_total) * 100 if n_total else 0.0

    min_year = dataset['start_year'].min()
    max_year = dataset['start_year'].max()
    # min()/max() return NaN when there is no usable year; int(NaN) would raise.
    if pd.isna(min_year) or pd.isna(max_year):
        period = "n/a"
    else:
        period = f"{int(min_year)} to {int(max_year)}"

    print(f"--- {name} SET ---")
    print(f"Time Period:   {period}")
    print(f"Total Trials:  {n_total}")
    print(f"Good (0):      {n_good} ({pct_good:.1f}%)")
    print(f"Bad (1):       {n_bad} ({pct_bad:.1f}%)")
    print("")

# Report header
banner = "=" * 40
print(banner)
print("DATASET SPLIT STATISTICS")
print(banner)

# One report per view of the data: everything, the training part, the test part.
for split_label, split_frame, split_target in (
    ("OVERALL", df, df[target_col]),
    ("TRAINING (Past)", train_df, y_train),
    ("TESTING (Future)", test_df, y_test),
):
    print_stats(split_label, split_frame, split_target)

print(banner)
train_share = len(train_df) / len(df)
test_share = len(test_df) / len(df)
print(f"Split Ratio: {train_share:.0%} Train / {test_share:.0%} Test")

Model training

In [None]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
import numpy as np


# 1. Calculate Class Weight (Scale Pos Weight)
# Formula: (Count of Negatives) / (Count of Positives)
# This tells XGBoost how much more weight to give the positive (failure) class.
neg_count = np.sum(y_train == 0)
pos_count = np.sum(y_train == 1)
if pos_count == 0:
    # Without positives the ratio is undefined and a classifier cannot be trained.
    raise ValueError("y_train contains no positive (1) samples; cannot compute scale_pos_weight.")
scale_weight = neg_count / pos_count


# 2. Build Pipeline
# The preprocessor from src.preprocessing_XGB is fitted inside the pipeline, so it only
# ever sees the rows passed to fit().
model = Pipeline(steps=[

    ('preprocessor', get_pipeline()),
    ('classifier', XGBClassifier(

        n_estimators=1000,              # Number of boosting rounds (trees)
        learning_rate=0.05,             # Step size (lower = more robust)

        # --- Tree Control ---
        max_depth=6,                    # Tree depth (lower = less overfitting)
        min_child_weight=1,             # Min sum of instance weight required in a child
        gamma=0.1,                      # Min loss reduction to split

        # --- Sampling ---
        subsample=0.8,                  # Train on 80% of rows per tree
        colsample_bytree=0.8,           # Train on 80% of columns per tree
        # Was 'colsample_level', which is not an XGBoost parameter and was ignored.
        colsample_bylevel=0.8,          # Column subsample per tree level

        # --- Imbalance & Regularization ---
        scale_pos_weight=scale_weight,  # Handle Imbalance
        reg_alpha=0,                    # L1 Regularization
        reg_lambda=1,                   # L2 Regularization

        # --- Configuration ---
        objective='binary:logistic',    # Binary Classification
        eval_metric='aucpr',            # Evaluation Metric ('logloss', 'auc', 'aucpr', etc.)
        random_state=42,
        n_jobs=-1                       # Parallel processing
    ))
])


# 3. Fit
print(f"Training XGBoost classifier on {len(X_train)} trials...")

model.fit(X_train, y_train)

print("Training Complete.")


Evaluation & Visualization

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    classification_report,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    PrecisionRecallDisplay
)

# Name shown in the plot legends; the fitted classifier is an XGBClassifier.
MODEL_LABEL = 'XGBoost'

# 1. Get Predictions
y_pred = model.predict(X_test)               # hard labels at the default threshold
y_prob = model.predict_proba(X_test)[:, 1]   # probability of class 1 (failure)

# 2. Calculate Advanced Metrics
roc_score = roc_auc_score(y_test, y_prob)
pr_score = average_precision_score(y_test, y_prob)
# The positive-class prevalence is the PR-AUC of a random ranking.
pr_baseline = y_test.mean()

print("--- MODEL PERFORMANCE METRICS ---")
print(f"ROC-AUC Score:      {roc_score:.4f}  (0.5 = Random, 1.0 = Perfect)")
print(f"PR-AUC Score:       {pr_score:.4f}   (Baseline: {pr_baseline:.4f})")
print("-" * 40)
print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred))

# 3. Visualizations
fig, ax = plt.subplots(1, 3, figsize=(18, 5))

# A. Confusion Matrix (Normalized per true class, i.e. rows sum to 1)
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred,
    normalize='true',
    cmap='Blues',
    display_labels=['Completed', 'Failed'],
    ax=ax[0]
)
ax[0].set_title("Confusion Matrix (Normalized)")

# B. ROC Curve
RocCurveDisplay.from_predictions(y_test, y_prob, ax=ax[1], name=MODEL_LABEL)
ax[1].set_title(f"ROC Curve (AUC = {roc_score:.2f})")
ax[1].plot([0, 1], [0, 1], "k--", label="Chance")
# Rebuild the legend so the reference line added after the display is listed too.
ax[1].legend(loc="lower right")

# C. Precision-Recall Curve
PrecisionRecallDisplay.from_predictions(y_test, y_prob, ax=ax[2], name=MODEL_LABEL)
ax[2].set_title(f"PR Curve (Avg Prec = {pr_score:.2f})")
ax[2].plot([0, 1], [pr_baseline, pr_baseline], "k--", label="Baseline")
ax[2].legend(loc="lower left")

plt.tight_layout()
plt.show()

# 3. Business Rule (Custom Threshold)
# We lower the threshold to catch more failures (High Recall strategy)

#threshold = 0.4
#y_pred_custom = (y_prob >= threshold).astype(int)
#tn, fp, fn, tp = confusion_matrix(y_test, y_pred_custom).ravel()

#recall = tp / (tp + fn)
#precision = tp / (tp + fp)


# 3. Visualizations
#fig, ax = plt.subplots(1, 3, figsize=(18, 5))

# A. Confusion Matrix (Normalized)
#ConfusionMatrixDisplay.from_predictions(
#    y_test, y_pred,
#    normalize='true',
#    cmap='Blues',
#    display_labels=['Completed', 'Failed'],
#    ax=ax[0]
#)
#ax[0].set_title("Confusion Matrix (Normalized)")

# B. ROC Curve
#RocCurveDisplay.from_predictions(y_test, y_prob, ax=ax[1], name='LogReg')
#ax[1].set_title(f"ROC Curve (AUC = {roc_score:.2f})")
#ax[1].plot([0, 1], [0, 1], "k--", label="Chance")

## C. Precision-Recall Curve
#PrecisionRecallDisplay.from_predictions(y_test, y_prob, ax=ax[2], name='LogReg')
#ax[2].set_title(f"PR Curve (Avg Prec = {pr_score:.2f})")
#ax[2].plot([0, 1], [y_test.mean(), y_test.mean()], "k--", label="Baseline")

#plt.tight_layout()
#plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import learning_curve, TimeSeriesSplit
from sklearn.metrics import roc_auc_score

def _year_span(features):
    """Return the 'YYYY-YYYY' range of features['start_year'], or None if it is unavailable."""
    columns = getattr(features, 'columns', None)
    if columns is None or 'start_year' not in columns:
        return None
    first, last = features['start_year'].min(), features['start_year'].max()
    if first != first or last != last:  # NaN is the only value that differs from itself
        return None
    return f"{int(first)}-{int(last)}"


def _take_rows(data, positions):
    """Select rows by integer position from a pandas object or an array-like."""
    return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]


def check_logreg_overfitting_temporal(model, X_train, y_train, X_test, y_test, n_splits=5):
    """Report train-vs-test ROC-AUC and plot an expanding-window learning curve.

    1. Prints the generalization gap: ROC-AUC of the already fitted `model` on the
       training rows minus its ROC-AUC on the test rows.
    2. For every fold of ``TimeSeriesSplit(n_splits)`` over the training rows, fits a
       fresh clone of `model` on the fold's training part (all rows before the
       validation block) and scores it on the validation block that follows, then
       plots both scores against the number of training rows.

    Parameters
    ----------
    model : fitted estimator with ``predict_proba``
        Used as-is for part 1 and cloned (unfitted copies) for part 2.
    X_train, y_train : training features and labels, already in chronological order.
    X_test, y_test : held-out (later) features and labels.
    n_splits : int, default 5
        Number of expanding-window folds.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    # Local import: only this function needs it.
    from sklearn.base import clone

    # --- PART 1: THE NUMBERS (Train vs Future Test) ---
    # Derive the period labels from the data instead of hard-coding years.
    train_span, test_span = _year_span(X_train), _year_span(X_test)
    if train_span and test_span:
        print(f"--- 1. GENERALIZATION GAP ({train_span} vs {test_span}) ---")
    else:
        print("--- 1. GENERALIZATION GAP (Train vs Test) ---")

    y_train_prob = model.predict_proba(X_train)[:, 1]
    y_test_prob = model.predict_proba(X_test)[:, 1]

    train_auc = roc_auc_score(y_train, y_train_prob)
    test_auc = roc_auc_score(y_test, y_test_prob)
    gap = train_auc - test_auc

    print(f"Train AUC (Past):   {train_auc:.4f}")
    print(f"Test AUC (Future):  {test_auc:.4f}")
    print(f"Gap:                {gap:.4f}")

    if gap > 0.10:
        print("Verdict:   ⚠️ HIGH OVERFITTING (Memorizing the past, failing the future)")
    elif gap > 0.05:
        print("Verdict:   ⚠️ MODERATE OVERFITTING")
    else:
        print("Verdict:   ✅ GOOD TEMPORAL GENERALIZATION")

    # --- PART 2: THE VISUAL (Time-Series Learning Curve) ---
    print("\n--- 2. GENERATING TEMPORAL LEARNING CURVE ---")
    print(f"(Using {n_splits}-Split Expanding Window: Train on Year 0-N, Validate on Year N+1)")

    # The folds are iterated explicitly rather than through learning_curve():
    # learning_curve() derives its training sizes from the first CV fold only, which
    # for TimeSeriesSplit is the smallest one, so the window would never expand.
    tscv = TimeSeriesSplit(n_splits=n_splits)

    train_sizes, train_scores, val_scores = [], [], []
    for fit_idx, val_idx in tscv.split(X_train):
        X_fit, y_fit = _take_rows(X_train, fit_idx), _take_rows(y_train, fit_idx)
        X_val, y_val = _take_rows(X_train, val_idx), _take_rows(y_train, val_idx)

        # ROC-AUC is undefined when either side contains a single class.
        if len(np.unique(y_fit)) < 2 or len(np.unique(y_val)) < 2:
            print(f"Skipping fold with {len(fit_idx)} training rows: only one class present.")
            continue

        fold_model = clone(model)  # unfitted copy; the caller's model stays untouched
        fold_model.fit(X_fit, y_fit)

        train_sizes.append(len(fit_idx))
        train_scores.append(roc_auc_score(y_fit, fold_model.predict_proba(X_fit)[:, 1]))
        val_scores.append(roc_auc_score(y_val, fold_model.predict_proba(X_val)[:, 1]))

    if not train_sizes:
        print("No fold contained both classes; skipping the learning curve.")
        return

    # Plot
    plt.figure(figsize=(10, 6))
    plt.plot(train_sizes, train_scores, 'o-', color="blue", label="Training Score (Past)")
    plt.plot(train_sizes, val_scores, 'o-', color="green", label="Validation Score (Future)")

    plt.title("Temporal Learning Curve (Expanding Window)")
    plt.xlabel("Training Examples Used (Chronological)")
    plt.ylabel("ROC-AUC Score")
    plt.legend(loc="best")
    plt.grid(True, alpha=0.3)

    # Add annotation explaining the split
    plt.figtext(0.5, -0.05, "Note: Validation scores are calculated on 'future' folds relative to the training set.",
                ha="center", fontsize=10, style='italic')

    plt.tight_layout()
    plt.show()

# --- EXECUTE ---
# The time-aware validation inside the function relies on row order, so X_train must
# be in chronological order. The split cell sorts df by 'start_year' before slicing,
# and X_train is built from that sorted frame without reordering.
check_logreg_overfitting_temporal(model, X_train, y_train, X_test, y_test)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.calibration import calibration_curve
from sklearn.metrics import roc_auc_score, precision_score, recall_score

def _auc_by_group(frame, column, min_rows):
    """Compute ROC-AUC of 'prob_failure' vs 'target' inside each distinct value of `column`.

    Groups with `min_rows` rows or fewer, or with only one target class (ROC-AUC is
    undefined there), are skipped. Missing values in `column` are ignored.

    Returns a list of (group_value, auc, n_rows) tuples in order of first appearance.
    """
    rows = []
    for value in frame[column].dropna().unique():
        subset = frame[frame[column] == value]
        if len(subset) > min_rows and subset['target'].nunique() > 1:
            auc = roc_auc_score(subset['target'], subset['prob_failure'])
            rows.append((value, auc, len(subset)))
    return rows


def run_pro_analysis(model, X_test, y_test, raw_test_df):
    """
    Runs a 4-angle deep dive on model predictions and draws a 2x2 figure:
    calibration, AUC per start year, risk deciles, and AUC per cohort.

    Requires:
    - model: Trained pipeline/model exposing predict_proba
    - X_test: Features accepted by `model`
    - y_test: Target labels, row-aligned with X_test and raw_test_df
    - raw_test_df: The original dataframe for the test set; 'start_year' is required,
      'phase' and 'sponsor_tier' are used for cohort slices when present

    Returns:
    - analysis_df: copy of raw_test_df (index reset) with 'target', 'prob_failure'
      and 'decile' columns added
    """

    # 1. Get Probabilities
    # Note: We take the probability of Class 1 (Failure)
    y_probs = model.predict_proba(X_test)[:, 1]

    # Create a temporary analysis dataframe (positional alignment, hence the index reset)
    analysis_df = raw_test_df.copy().reset_index(drop=True)
    analysis_df['target'] = np.asarray(y_test)
    analysis_df['prob_failure'] = y_probs

    # Setup Plotting Grid
    fig, axes = plt.subplots(2, 2, figsize=(20, 12))
    plt.subplots_adjust(hspace=0.3)

    # ==============================================================================
    # ANGLE 1: CALIBRATION CURVE (Reliability)
    # ==============================================================================
    prob_true, prob_pred = calibration_curve(y_test, y_probs, n_bins=10)
    ax1 = axes[0, 0]
    ax1.plot(prob_pred, prob_true, marker='o', label='Model', color='blue')
    ax1.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfectly Calibrated')
    ax1.set_xlabel('Mean Predicted Probability')
    ax1.set_ylabel('Fraction of Positives (Actual Failure Rate)')
    ax1.set_title('Angle 1: Calibration Plot (Reliability)')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # ==============================================================================
    # ANGLE 2: VINTAGE ANALYSIS (Temporal Stability)
    # ==============================================================================
    # AUC per start year, in chronological order. Explicit column names keep the
    # frame well-formed even when no year qualifies.
    vintage_rows = sorted(_auc_by_group(analysis_df, 'start_year', min_rows=0), key=lambda r: r[0])
    vintage_df = pd.DataFrame(
        [(int(year), auc, count) for year, auc, count in vintage_rows],
        columns=['Year', 'AUC', 'Count'],
    )

    ax2 = axes[0, 1]
    if vintage_df.empty:
        ax2.text(0.5, 0.5, "No start year contains both outcomes",
                 ha='center', va='center', transform=ax2.transAxes)
    else:
        sns.lineplot(data=vintage_df, x='Year', y='AUC', marker='o', color='green', ax=ax2)
        # Keep the usual 0.5-1.0 window, but widen it if a year scores below chance
        # so that the point is not cut off.
        ax2.set_ylim(min(0.5, vintage_df['AUC'].min() - 0.05), 1.0)
        ax2.set_xticks(vintage_df['Year'])
        # Add count labels
        for row in vintage_df.itertuples(index=False):
            ax2.text(row.Year, row.AUC + 0.01, f"n={row.Count}", ha='center', fontsize=9)
    ax2.set_title('Angle 2: Vintage Analysis (Stability over Time)')
    ax2.set_ylabel('ROC-AUC Score')
    ax2.grid(True, alpha=0.3)

    # ==============================================================================
    # ANGLE 3: RISK DECILE ANALYSIS (Business Impact)
    # ==============================================================================
    # Bin predictions into 10 buckets (Deciles); duplicate bin edges are merged, so
    # fewer than 10 buckets are possible when many predictions are identical.
    analysis_df['decile'] = pd.qcut(analysis_df['prob_failure'], 10, labels=False, duplicates='drop')

    decile_stats = analysis_df.groupby('decile').agg({
        'target': 'mean',          # Actual Failure Rate
        'prob_failure': 'mean'     # Predicted Failure Rate
    }).reset_index()

    ax3 = axes[1, 0]
    width = 0.35
    x = np.arange(len(decile_stats))

    ax3.bar(x - width/2, decile_stats['prob_failure'], width, label='Predicted Risk', color='skyblue', alpha=0.7)
    ax3.bar(x + width/2, decile_stats['target'], width, label='Actual Failure Rate', color='salmon', alpha=0.7)

    ax3.set_xlabel('Risk Decile (0=Lowest Risk, 9=Highest Risk)')
    ax3.set_ylabel('Failure Rate')
    ax3.set_title('Angle 3: Risk Deciles (Lift Chart)')
    ax3.legend()
    ax3.grid(axis='y', alpha=0.3)

    # ==============================================================================
    # ANGLE 4: COHORT SLICING (Phase & Sponsor)
    # ==============================================================================
    # AUC for cohorts with more than 50 trials; smaller cohorts are skipped.
    slices = []
    for label, column in (("Phase", 'phase'), ("Sponsor", 'sponsor_tier')):
        if column not in analysis_df.columns:
            print(f"Column '{column}' not found; skipping {label} slices.")
            continue
        slices.extend(
            {'Slice': f"{label}: {value}", 'AUC': auc}
            for value, auc, _ in _auc_by_group(analysis_df, column, min_rows=50)
        )

    # Explicit columns so sort_values works even when no cohort qualifies.
    slice_df = pd.DataFrame(slices, columns=['Slice', 'AUC']).sort_values('AUC', ascending=False)

    ax4 = axes[1, 1]
    if slice_df.empty:
        ax4.text(0.5, 0.5, "No cohort with more than 50 trials and both outcomes",
                 ha='center', va='center', transform=ax4.transAxes)
    else:
        sns.barplot(data=slice_df, x='AUC', y='Slice', palette='viridis', ax=ax4)
        # Widen the default 0.4-1.0 window if a cohort scores below it.
        ax4.set_xlim(min(0.4, slice_df['AUC'].min() - 0.05), 1.0)
        ax4.axvline(0.5, color='red', linestyle='--', label='Random')
        ax4.legend()  # makes the 'Random' reference line label visible
    ax4.set_title('Angle 4: Cohort Performance (Bias Check)')
    ax4.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    plt.show()

    return analysis_df

# ---------------------------------------------------------
# EXECUTE ANALYSIS
# ---------------------------------------------------------
# run_pro_analysis reads 'start_year', 'phase' and 'sponsor_tier' from its last
# argument, so we pass test_df (the test slice created in the temporal split cell),
# which still carries all original columns.
print("Running Deep Dive Analysis on Baseline Model...")
results_df = run_pro_analysis(model, X_test, y_test, test_df)

In [None]:
# 1. Instantiate the Preprocessor
# A fresh, unfitted preprocessor: the one inside `model` is left untouched.
preprocessor = get_pipeline()

# 2. Fit and Transform X_train
# y_train is passed as well so that target-dependent steps can be fitted.
print("Running preprocessor... this might take a moment due to SVD...")
X_train_transformed = preprocessor.fit_transform(X_train, y_train)

# Sparse output has no dense column layout; densify it before building a DataFrame.
if hasattr(X_train_transformed, "toarray"):
    X_train_transformed = X_train_transformed.toarray()

# 3. Get Feature Names
feature_names = preprocessor.get_feature_names_out()

# 4. Convert to DataFrame for Easy Viewing
df_transformed = pd.DataFrame(
    X_train_transformed,
    columns=feature_names,
    index=X_train.index
)

# --- INSPECTION REPORT ---
print(f"\nOriginal Shape: {X_train.shape}")
print(f"Transformed Shape: {df_transformed.shape}")
print("-" * 50)

# 5. Inspect SVD Columns (identified purely by 'svd' appearing in the column name)
svd_cols = [col for col in df_transformed.columns if 'svd' in col.lower()]
if svd_cols:
    print(f"\n✅ Found {len(svd_cols)} Text SVD Features")
    print(df_transformed[svd_cols[:5]].head(3))
else:
    print("\n❌ Could not identify SVD columns by name.")

# 6. Show Head
print("\nFirst 5 rows of transformed data:")
try:
    display(df_transformed.head())
except NameError:
    # display() only exists inside IPython/Jupyter; fall back to plain text elsewhere.
    # Any other error is a real problem and is allowed to surface.
    print(df_transformed.head())

In [None]:
import pandas as pd
import numpy as np

# 1. Load Data (Assuming 'df' is your loaded dataframe from the loader)
# If df is not in memory, uncomment:
# from src.data_loader_tx import ClinicalTrialLoader
# loader = ClinicalTrialLoader(data_path='data/')
# df = loader.load_and_clean()
# df = loader.add_features(df)

print("=== 1. PHASE MAPPING AUDIT ===")
# One row per distinct (phase, phase_ordinal) pair, ordered by the ordinal code,
# to show how each raw phase label is encoded.
phase_check = (
    df.loc[:, ['phase', 'phase_ordinal']]
    .drop_duplicates()
    .sort_values('phase_ordinal')
)
print(phase_check)

print("\n=== 2. FEATURE DISTRIBUTION AUDIT (Pre-Scaling) ===")
# Range and spread of the numeric features before any scaling is applied.
num_cols = ['competition_broad', 'num_primary_endpoints', 'criteria_len_log', 'start_year']
summary_rows = ['min', 'max', 'mean', 'std']
print(df[num_cols].describe().loc[summary_rows])

print("\n=== 3. MISSING VALUES CHECK ===")
# Count of missing values per column; only columns with at least one are listed.
missing = df.isna().sum()
print(missing.loc[missing > 0])