Configuration & Path Setup

In [13]:
%load_ext autoreload
%autoreload 2

import warnings

# Silence every FutureWarning so that cell and plot outputs stay readable.
# The filter is keyed on the warning category only, so it applies to all
# libraries, not just scikit-learn.
warnings.filterwarnings("ignore", category=FutureWarning)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Identify the project root on your own computer

**Goal**: avoid having to change any path manually when running the code.

In [14]:
import sys
from pathlib import Path

# 1. Locate the project root
# Check the working directory first, then each of its ancestors, and stop at
# the first folder that contains a 'src' entry.
current_dir = Path.cwd()

for project_root in (current_dir, *current_dir.parents):
    if (project_root / 'src').exists():
        break
else:
    # Reached the filesystem root without seeing 'src' anywhere on the way up
    raise FileNotFoundError("Could not find project root containing 'src'")

# 2. Make the project root importable
# Lets 'import src...' resolve regardless of where the notebook was started
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

# 3. Key project folders, derived from the root
DATA_PATH = project_root / "data"
MODELS_PATH = project_root / "models"

# 4. Show the resolved locations for a quick visual check
print(f"Project Root: {project_root}")
print(f"Data Path:    {DATA_PATH}")
print(f"Models Path:  {MODELS_PATH}")

Project Root: /home/delaunan/code/delaunan/clintrialpredict
Data Path:    /home/delaunan/code/delaunan/clintrialpredict/data
Models Path:  /home/delaunan/code/delaunan/clintrialpredict/models


In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Custom Modules/Classes
from src.prep.data_loader import ClinicalTrialLoader
from src.prep.preprocessing import preprocessor

### Loading the source data (project_data.csv)
Delete the copy already present in clintrialpredict/data/ if you want the latest version. <br>
If project_data.csv already exists in clintrialpredict/data/, the existing file is loaded and is not replaced by a newer one. <br>
**Goal**: avoid having to rerun the build of the source file every time.

In [16]:
import pandas as pd
# We don't even need 'import os' if we use pathlib correctly

# 1. Location of the cached dataset (DATA_PATH is a pathlib.Path, so '/' joins)
CSV_PATH = DATA_PATH / 'project_data.csv'
FORCE_REGENERATE = False

# 2. The cached CSV is reused only when it is on disk and no rebuild was requested
cached_csv_usable = CSV_PATH.exists() and not FORCE_REGENERATE

if not cached_csv_usable:
    print(">>> File not found (or forced regeneration). Triggering ETL pipeline...")

    # ClinicalTrialLoader comes from src.prep.data_loader (imported in an earlier cell)
    loader = ClinicalTrialLoader(data_path=DATA_PATH)

    # Step A: load & clean
    df = loader.load_and_clean()

    # Step B: add features (Hierarchy, Competition, Text)
    df = loader.add_features(df)

    # Step C: save through the loader
    loader.save(df, filename='project_data.csv')
else:
    print(f">>> Loading existing dataset from: {CSV_PATH}")
    df = pd.read_csv(CSV_PATH)

print(f"Data Ready. Shape: {df.shape}")

>>> Loading existing dataset from: /home/delaunan/code/delaunan/clintrialpredict/data/project_data.csv
Data Ready. Shape: (88374, 67)


### Audit of features in project_data.csv (run only when you need to dig deeper)
The audit report with all information is saved to clintrialpredict/data/audit_features.txt

In [17]:
from src.prep.audit_utils import run_master_audit

# Run the audit of the features loaded into project_data.csv.
# According to the output recorded for this cell, the report is written to
# audit_features.txt in the data folder.

run_master_audit(DATA_PATH)

Reading /home/delaunan/code/delaunan/clintrialpredict/data/project_data.csv...
[1/9] Checking Dataset Health...
[2/9] Analyzing Categorical Risk Signals...
[3/9] Analyzing Numerical Impact...
[4/9] Generating Preprocessing Strategy...
[5/9] Calculating Correlations...
[6/9] Ranking Categorical Features...
[7/9] Checking Collinearity...
[8/9] Running Business Logic Checks...
[9/9] Writing Documentation...

Done. Audit saved to /home/delaunan/code/delaunan/clintrialpredict/data/audit_features.txt


### final fit, full database

In [18]:
import pandas as pd
import numpy as np

# 1. Columns kept out of the feature matrix, each paired with the reason.
#    Dict insertion order is preserved, so forbidden_cols keeps this order.
exclusion_reasons = {
    'target': 'the binary target itself (0/1)',
    'overall_status': 'text form of the target (e.g. "Terminated") - leakage',
    'nct_id': 'identifier - metadata',
    'start_date': 'date - metadata',
    'start_year': 'date - metadata',
    'why_stopped': 'future info - leakage',
    'scientific_success': 'future info - leakage',
}
forbidden_cols = list(exclusion_reasons)

# 2. Target vector first, then the feature matrix.
#    errors='ignore' means labels absent from df are skipped without an error.
y = df['target']
X = df.drop(labels=forbidden_cols, axis=1, errors='ignore')

print(f"Ready for Pipeline. X Shape: {X.shape}, y Shape: {y.shape}")

Ready for Pipeline. X Shape: (88374, 60), y Shape: (88374,)


In [22]:
import numpy as np

# 1. Class counts: rows labelled 0 and rows labelled 1
is_negative = (y == 0)
is_positive = (y == 1)
num_neg = np.sum(is_negative)
num_pos = np.sum(is_positive)

# 2. Negatives-to-positives ratio.
# When the 1s are the minority the value exceeds 1; it is later handed to the
# classifier as scale_pos_weight.
neg_total, pos_total = float(num_neg), float(num_pos)
ratio = neg_total / pos_total

print(f"Count 0s (Negative): {num_neg}")
print(f"Count 1s (Positive): {num_pos}")
print(f"Calculated scale_pos_weight: {ratio:.4f}")

Count 0s (Negative): 74566
Count 1s (Positive): 13808
Calculated scale_pos_weight: 5.4002


### Model Training (model.fit)

In [23]:
from sklearn.linear_model import LogisticRegression
from src.prep.preprocessing import preprocessor
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# --- 1. Define the Model ---
# Two-step pipeline: the project's preprocessor() followed by an XGBoost
# classifier. The status message names the estimator that is actually built.
print("Initializing XGBoost pipeline...")
model = Pipeline([
    ('preprocessor', preprocessor()),
    ('classifier', XGBClassifier(
        # --- Core Parameters ---
        objective='binary:logistic',
        eval_metric='aucpr',

        # --- AGGRESSIVE REGULARIZATION (The settings that fixed the gap) ---
        n_estimators=300,            # Fixed low number
        learning_rate=0.02,          # Slow learning
        max_depth=3,                 # Shallow trees (CRITICAL)
        reg_lambda=15,               # Heavy L2 penalty
        gamma=1.0,                   # High minimum loss reduction
        min_child_weight=5,          # High minimum child weight

        # --- Standard Parameters ---
        subsample=0.8,
        colsample_bytree=0.8,

        # --- Imbalance Handling ---
        # 'ratio' is the negatives/positives count ratio computed in the
        # previous cell.
        scale_pos_weight=ratio,

        # --- Technical Settings ---
        random_state=42,
        n_jobs=1,
        tree_method='hist'
    ))
])


# --- 2. Fit the Model ---
# Fits on the full X / y built above (no hold-out split in this cell).
print("Fitting XGBoost Model...")
model.fit(X, y)
print("✅ XGBoost Model Trained - FINAL!")

Initializing Logistic Regression...
Fitting XGBoost Model...
✅ XGBoost Model Trained - FINAL!


### Save model (so that model.fit does not have to be re-run next time and you start with the correct weights)
Prerequisite for reusing it: run train_test_split to recover X_train, y_train, X_test, y_test to feed into the pre-trained model.

In [24]:
import joblib

# 1. Define the filename in a variable
MODEL_FILENAME = 'ctp_model.joblib'

# 2. Construct the full path using the pathlib object
# MODELS_PATH is <project root>/models, so the file goes to
# 'CLINTRIALPREDICT/models/', not 'src/models/'
save_path = MODELS_PATH / MODEL_FILENAME

# 3. Make sure the target folder exists before writing.
# Without this, saving fails on a checkout where 'models/' has not been
# created yet; exist_ok=True makes it a no-op when the folder is present.
save_path.parent.mkdir(parents=True, exist_ok=True)

# 4. Save the model
print(f"Saving model to: {save_path} ...")
joblib.dump(model, save_path)

print(f"✅ Model successfully saved as '{MODEL_FILENAME}' in the project root models folder.")

Saving model to: /home/delaunan/code/delaunan/clintrialpredict/models/ctp_model.joblib ...
✅ Model successfully saved as 'ctp_model.joblib' in the project root models folder.


### Import pre-trained model (so that model.fit does not have to be re-run)
Prerequisite: run train_test_split to recover X_train, y_train, X_test, y_test to feed into the pre-trained model.

In [None]:
# Import the loader
from src.models.model_utils import load_model

# Retrieve the model from the file
# NOTE(review): the output recorded for this cell reports a load from
# src/models/logreg_model.joblib, while the save cell writes ctp_model.joblib
# under MODELS_PATH — confirm that load_model resolves the same folder and file.
my_loaded_model = load_model('ctp_model.joblib')

# Optional smoke test, left disabled: predict on the first 5 rows of test data.
# X_test is not defined in the cells visible in this notebook, so create a
# train/test split before enabling these lines.
#print("\nTesting loaded model...")
#sample_probs = my_loaded_model.predict_proba(X_test.head(5))[:, 1]
#print(f"Predictions for first 5 trials: {sample_probs}")

>>> Loading model from: /home/delaunan/code/delaunan/clintrialpredict/src/models/logreg_model.joblib

Testing loaded model...
Predictions for first 5 trials: [0.40835685 0.31814536 0.59581191 0.11837591 0.33292536]
