Configuration & Path Setup

In [None]:
import os
import sys

# 1. Dynamic Path Setup
# Assumes the notebook is launched from the project root, laid out as:
# project_root/
# ├── data/
# ├── src/
# └── 03_baseline_model.ipynb

# The kernel's current working directory is treated as the project root.
PROJECT_ROOT = os.getcwd()

# Data locations, relative to the project root.
DATA_PATH = os.path.join(PROJECT_ROOT, 'data')
CSV_PATH = os.path.join(DATA_PATH, 'project_data.csv')

# Make the project code importable.
#   - PROJECT_ROOT: later cells import with the package prefix
#     (`from src.data_loader import ...`), which resolves against the project
#     root, not against src/ itself.
#   - src/: kept so bare-module imports continue to work as before.
# Only missing entries are appended, so re-running this cell does not pile up
# duplicate sys.path entries.
for _import_dir in (PROJECT_ROOT, os.path.join(PROJECT_ROOT, 'src')):
    if _import_dir not in sys.path:
        sys.path.append(_import_dir)

print(f"Project Root: {PROJECT_ROOT}")
print(f"Data Path:    {DATA_PATH}")

# Fail fast when the data folder is missing. isdir (rather than exists) also
# rejects a stray regular file that happens to be named "data".
if not os.path.isdir(DATA_PATH):
    raise FileNotFoundError(f"Data folder not found at {DATA_PATH}. Please create it and add your raw .txt files.")

 Smart Data Loading (Auto-Generation)

In [None]:
import pandas as pd
from src.data_loader import ClinicalTrialLoader

# Use the cached CSV when it is already on disk; otherwise rebuild the dataset
# through the ETL loader and save it for the next run.
if not os.path.exists(CSV_PATH):
    print("Processed file not found. Triggering ETL pipeline...")
    print("This may take 1-2 minutes.")

    # The loader is pointed at the data folder configured in the setup cell.
    loader = ClinicalTrialLoader(data_path=DATA_PATH)

    # Clean the raw data, then derive features from the cleaned frame.
    df = loader.add_features(loader.load_and_clean())

    # NOTE(review): presumably save() writes into DATA_PATH so the file lands at
    # CSV_PATH and the cache check above hits next time — confirm in the loader.
    loader.save(df, filename='project_data.csv')
    print("ETL Complete. Data loaded.")
else:
    print(f"Loading existing dataset from: {CSV_PATH}")
    df = pd.read_csv(CSV_PATH)

print(f"Data Shape: {df.shape}")

Temporal Split (Time Travel) <br>
Why: We sort trials by `start_year` and hold out the most recent ~20% as the test set, so the model is trained on the Past (Train) and evaluated on the Future (Test).

In [None]:
# 1. Sort chronologically by start year (crucial for a time-based split).
#    pandas places rows with a missing start_year last by default.
df = df.sort_values('start_year').reset_index(drop=True)

# 2. Define Split Point (target: 80% Train / 20% Test)
split_idx = int(len(df) * 0.8)

# A purely positional cut can land in the middle of a year, which would put
# trials with the same start_year on both sides of the split. Snap the cut to
# the nearest start/end of that year so Train is strictly earlier than Test.
if 0 < split_idx < len(df):
    start_years = df['start_year']
    boundary_year = start_years.iloc[split_idx]
    # Missing years are skipped: they cannot be matched to a year boundary.
    if pd.notna(boundary_year) and start_years.iloc[split_idx - 1] == boundary_year:
        in_boundary_year = start_years.eq(boundary_year).fillna(False).to_numpy(dtype=bool)
        year_first = int(in_boundary_year.argmax())          # first row of the boundary year
        year_end = year_first + int(in_boundary_year.sum())  # one past its last row (contiguous after sorting)
        # Only keep cut points that leave both Train and Test non-empty.
        candidates = [pos for pos in (year_first, year_end) if 0 < pos < len(df)]
        if candidates:
            snapped_idx = min(candidates, key=lambda pos: abs(pos - split_idx))
            print(f"Split moved from row {split_idx} to row {snapped_idx} to align with a start_year boundary.")
            split_idx = snapped_idx
        # Otherwise every usable row shares one start_year; no year boundary
        # exists, so the positional cut is kept as-is.

# 3. Split
train_df = df.iloc[:split_idx].copy()
test_df = df.iloc[split_idx:].copy()

# 4. Define Features (X) and Target (y)
#    The label and the identifier/status columns are removed from the features.
target_col = 'target'
drop_cols = [target_col, 'overall_status', 'nct_id']

X_train = train_df.drop(columns=drop_cols)
y_train = train_df[target_col]

X_test = test_df.drop(columns=drop_cols)
y_test = test_df[target_col]

print(f"Train Set: {train_df['start_year'].min()} - {train_df['start_year'].max()} (n={len(train_df)})")
print(f"Test Set:  {test_df['start_year'].min()} - {test_df['start_year'].max()} (n={len(test_df)})")

Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from src.preprocessing import get_pipeline

# Step 1: preprocessing stage, built by the project's get_pipeline() helper.
preprocessor = get_pipeline()

# Step 2: baseline classifier.
# class_weight='balanced' is the key setting here: the original notebook notes
# the dataset is imbalanced (about 19% failures).
baseline_clf = LogisticRegression(
    solver='liblinear',
    C=0.01,
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

# Chain preprocessing and classifier so both are fitted (and later saved) together.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', baseline_clf),
])

# Step 3: fit on the training split.
print("Training Baseline Model...")
model.fit(X_train, y_train)
print("Training Complete.")

Evaluation & Visualization

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, classification_report, ConfusionMatrixDisplay, RocCurveDisplay

# 1. Predict
# Hard labels feed the classification report / confusion matrix; the second
# predict_proba column (taken as the positive-class score) feeds ROC-AUC.
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# 2. Metrics
roc = roc_auc_score(y_test, y_prob)
print(f"ROC-AUC Score: {roc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# 3. Plots: confusion matrix (left) and ROC curve (right)
fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# Confusion Matrix, normalized per true class (each row sums to 1).
# NOTE(review): display_labels assumes target 0 = Completed and 1 = Failed — confirm.
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred,
    normalize='true',
    cmap='Blues',
    display_labels=['Completed', 'Failed'],
    ax=ax[0]
)
ax[0].set_title("Confusion Matrix (Normalized)")

# ROC Curve
RocCurveDisplay.from_predictions(y_test, y_prob, ax=ax[1], name='Baseline')
ax[1].plot([0, 1], [0, 1], "k--", label="Chance")
# RocCurveDisplay builds its legend before the chance line exists, so the
# legend must be rebuilt for the "Chance" label to actually appear.
ax[1].legend(loc="lower right")
ax[1].set_title(f"ROC Curve (AUC={roc:.2f})")

plt.tight_layout()
plt.show()

Save Model <br>
Why: Saves the trained pipeline so the Streamlit app can load it later.

In [None]:

import joblib

# Make sure <project root>/models exists (no error if it is already there),
# then serialise the fitted pipeline into it.
models_dir = os.path.join(PROJECT_ROOT, 'models')
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, 'baseline_pipeline.joblib')
joblib.dump(model, model_path)

print(f"Model saved successfully to: {model_path}")