### Configuration and Path Setup

**Objective:** Define the absolute path to the data directory.
**Rationale:** Ensures the code is portable across different user environments (e.g., local machine vs. cloud).


In [22]:
import os

# Define the absolute path to the data directory
# Update this string if running on a different machine
DATA_PATH = "/home/delaunan/code/delaunan/clintrialpredict/data"

# Verify that the directory exists
if os.path.exists(DATA_PATH):
    print(f"Data Path configured: {DATA_PATH}")
else:
    print(f"Error: Path not found: {DATA_PATH}")

Data Path configured: /home/delaunan/code/delaunan/clintrialpredict/data


### Setup and Data Loading

**Objective:** Load the dataset and enforce chronological order.
**Rationale:** Predictive modeling for future events requires data to be sorted by time to prevent temporal leakage (using future data to predict the past).

In [23]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Construct File Path
file_path = os.path.join(DATA_PATH, 'project_data.csv')

# 2. Load Data
try:
    df = pd.read_csv(file_path, low_memory=False)
    print(f"Data Loaded Successfully: {len(df)} rows.")
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")

# 3. Sort by Time
# Essential for temporal splitting.
if 'df' in locals():
    df = df.sort_values(by='start_year').reset_index(drop=True)
    print(f"Date Range: {df['start_year'].min()} to {df['start_year'].max()}")

Data Loaded Successfully: 105336 rows.
Date Range: 2000.0 to 2024.0


### Temporal Split (Train/Test)

**Objective:** Split the data into Training (Past) and Testing (Future) sets based on an 80/20 ratio.
**Rationale:**
*   **Temporal Split:** Simulates real-world conditions where the model must predict future trials based on historical data.
*   **COVID Check:** Verifies if the training set includes data from 2019+ to ensure the model learns the `covid_exposure` signal.

In [24]:
# 1. Define Split Point (80% Train / 20% Test)
split_idx = int(len(df) * 0.8)

# 2. Separate Features and Target
X = df.drop(columns=['target', 'overall_status'])
y = df['target']

# 3. Perform Temporal Split
X_train = X.iloc[:split_idx]
X_test  = X.iloc[split_idx:]
y_train = y.iloc[:split_idx]
y_test  = y.iloc[split_idx:]

# 4. Extract Years for Post-Hoc Analysis
train_years = X_train['start_year'].values
test_years  = X_test['start_year'].values

# 5. Verification Report
print(f"TRAIN Set: {train_years.min()} - {train_years.max()} (Rows: {len(X_train)})")
print(f"TEST Set:  {test_years.min()} - {test_years.max()} (Rows: {len(X_test)})")

# Check for COVID coverage in Training
if train_years.max() >= 2019:
    print("Status: Training set includes COVID-era data (2019+).")
else:
    print("Status: Training set ends before COVID era.")

TRAIN Set: 2000.0 - 2019.0 (Rows: 84268)
TEST Set:  2019.0 - 2024.0 (Rows: 21068)
Status: Training set includes COVID-era data (2019+).


### Encoding Pipeline Definition

**Objective:** Define the transformation logic for each data type based on the Audit Blueprint.
**Strategies:**
*   **Binary:** `OneHotEncoder` (drop one column).
*   **Nominal (<50):** `OneHotEncoder` (keep all columns).
*   **High Cardinality (>50):** `TargetEncoder` (maps category to failure probability).
*   **Text Tags:** `TfidfVectorizer` (top 100 keywords).
*   **Numeric:** Dropped in this step (handled by separate scaling pipeline).

In [None]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
import re

# --- CONFIGURATION ---
N_TEXT_FEATURES = 100

# 1. Define Feature Groups
CAT_BINARY = ['is_international', 'covid_exposure', 'healthy_volunteers',
              'adult', 'child', 'older_adult', 'includes_us']

CAT_NOMINAL = ['gender', 'agency_class', 'masking', 'intervention_model',
               'primary_purpose', 'therapeutic_area', 'allocation']

CAT_HIGH_CARD = ['therapeutic_subgroup_name', 'best_pathology']

# --- TEXT FEATURE (DISABLED FOR BASELINE) ---
# TEXT_TAGS = 'txt_tags'

# 2. Define Stop Words (DISABLED)
# clinical_stop_words = [
#     'study', 'trial', 'clinical', 'phase', 'group', 'cohort', 'arm',
#     'randomized', 'randomised', 'controlled', 'double', 'blind', 'open', 'label',
#     'safety', 'efficacy', 'comparison', 'evaluation',
#     'patient', 'subject', 'participant', 'volunteer'
# ]
# final_stop_words = list(ENGLISH_STOP_WORDS) + clinical_stop_words

# 3. Define Text Cleaning Logic (DISABLED)
# def clean_text_logic(series):
#     s = series.fillna('').str.lower()
#     s = s.str.replace(r'\d+', '', regex=True)
#     s = s.str.replace(r'(\w{2,})ies\b', r'\1y', regex=True)
#     s = s.str.replace(r'(\w{3,})s\b', r'\1', regex=True)
#     return s

# 4. Define Pipelines
pipe_bin = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='if_binary', dtype=int, handle_unknown='ignore')
)

pipe_nom = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='UNKNOWN'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=int)
)

pipe_high = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='UNKNOWN'),
    TargetEncoder(target_type='binary', smooth=10.0, random_state=42)
)

# Text Pipeline (DISABLED)
# pipe_txt = make_pipeline(
#     FunctionTransformer(clean_text_logic, validate=False, feature_names_out='one-to-one'),
#     TfidfVectorizer(max_features=N_TEXT_FEATURES, stop_words=final_stop_words)
# )

# 5. Assemble ColumnTransformer
encoder_step = ColumnTransformer(
    transformers=[
        ('binary', pipe_bin, CAT_BINARY),
        ('nominal', pipe_nom, CAT_NOMINAL),
        ('high_card', pipe_high, CAT_HIGH_CARD),
        # ('text', pipe_txt, TEXT_TAGS)  <-- DISABLED HERE
    ],
    remainder='drop',
    verbose_feature_names_out=False
)

### Execution and Verification

**Objective:** Fit the pipeline on the Training set and transform both sets.
**Output:** Returns the processed categorical matrices ready for integration with scaled numeric data.

In [32]:
print("Processing Categorical Features...")

# 1. Fit on Train, Transform Train
X_train_cat = encoder_step.fit_transform(X_train, y_train)

# 2. Transform Test (No Fitting)
X_test_cat = encoder_step.transform(X_test)

# 3. Output Verification
print(f"Encoded Train Shape: {X_train_cat.shape}")
print(f"Encoded Test Shape:  {X_test_cat.shape}")

# Verify Feature Names
# This should now work because FunctionTransformer passes the names correctly
new_features = encoder_step.get_feature_names_out()
print(f"Total Features: {len(new_features)}")
print(f"Sample Features: {new_features[:10]}")

Processing Categorical Features...
Encoded Train Shape: (84268, 70)
Encoded Test Shape:  (21068, 70)
Total Features: 70
Sample Features: ['is_international_1' 'covid_exposure_1' 'healthy_volunteers_t' 'adult_t'
 'child_t' 'older_adult_t' 'includes_us_1' 'gender_ALL' 'gender_FEMALE'
 'gender_MALE']


In [None]:
import numpy as np

# 1. Settings
n_samples = 20
feature_names = encoder_step.get_feature_names_out()

# 2. Pick Random Indices
# We use a fixed seed (42) so you see the same rows every time you run this
rng = np.random.RandomState(42)
random_indices = rng.choice(X_train_cat.shape[0], size=n_samples, replace=False)

# 3. Slice the Data
# We grab the specific rows corresponding to the random indices
sample_raw = X_train_cat[random_indices]

# 4. Handle Sparse Matrix
# TF-IDF often creates a "Sparse Matrix" to save memory.
# We must convert it to a standard "Dense" array to put it in a DataFrame.
try:
    sample_raw = sample_raw.toarray()
except AttributeError:
    pass # It is already a standard array

# 5. Create and Display DataFrame
df_sample = pd.DataFrame(sample_raw, columns=feature_names)

print(f"Displaying {n_samples} Random Rows from the Processed Training Set:")
display(df_sample)

# 6. Quick Stats Check
print("\n--- Data Integrity Check ---")
print(f"Min Value: {df_sample.min().min()}")
print(f"Max Value: {df_sample.max().max()}")
print("If Max > 1, it means Target Encoding is working (probabilities) or TF-IDF is working (scores).")
print("If Max == 1, it might be only Binary/One-Hot features.")

Displaying 20 Random Rows from the Processed Training Set:


Unnamed: 0,is_international_1,covid_exposure_1,healthy_volunteers_t,adult_t,child_t,older_adult_t,includes_us_1,gender_ALL,gender_FEMALE,gender_MALE,...,therapeutic_area_Psychiatry,therapeutic_area_Respiratory,therapeutic_area_Stomatognathic,therapeutic_area_Urology (Male),therapeutic_area_Wounds,allocation_NON_RANDOMIZED,allocation_RANDOMIZED,allocation_UNKNOWN,therapeutic_subgroup_name,best_pathology
0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.109728,0.131049
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.124587,0.110498
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.085898,0.078115
3,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.115205,0.122051
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.168837,0.204787
5,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.124587,0.165624
6,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.198203,0.214376
7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.250402,0.278899
8,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.26148,0.267022
9,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.188342,0.189853



--- Data Integrity Check ---
Min Value: 0.0
Max Value: 1.0
If Max > 1, it means Target Encoding is working (probabilities) or TF-IDF is working (scores).
If Max == 1, it might be only Binary/One-Hot features.


In [34]:
# 1. Access the specific 'text' pipeline inside the ColumnTransformer
text_pipeline = encoder_step.named_transformers_['text']

# 2. Access the TfidfVectorizer (it is the last step of that pipeline)
vectorizer = text_pipeline.steps[-1][1]

# 3. Get the words
vocab = vectorizer.get_feature_names_out()

# 4. Print them nicely
print(f"--- TF-IDF VOCABULARY ({len(vocab)} words) ---")
print("These are the words your model is actually 'reading':\n")

# Print in groups of 10 for readability
for i in range(0, len(vocab), 10):
    print(vocab[i:i+10])

print("\n------------------------------------------------")
print("Check this list. If you see words like 'dose' or 'month', add them to 'clinical_stop_words' in Block 3.")

KeyError: 'text'