### Configuration and Path Setup

**Objective:** Define the absolute path to the data directory.
**Rationale:** Keeps the machine-specific path in a single place, so only this cell has to change when the notebook moves between environments (e.g., local machine vs. cloud).


In [None]:
import os

# Define the absolute path to the data directory.
# To run on a different machine, set the CLINTRIAL_DATA_PATH environment
# variable instead of editing this cell; the literal below is only the
# fallback used when that variable is not set.
DATA_PATH = os.environ.get(
    "CLINTRIAL_DATA_PATH",
    "/home/delaunan/code/delaunan/clintrialpredict/data",
)

# Verify that the path exists AND is a directory: os.path.exists() would also
# accept a regular file, which would only fail later when files are looked up
# inside DATA_PATH.
if os.path.isdir(DATA_PATH):
    print(f"Data Path configured: {DATA_PATH}")
else:
    print(f"Error: Path not found: {DATA_PATH}")

### Setup and Data Loading

**Objective:** Load the dataset and enforce chronological order.
**Rationale:** Predictive modeling for future events requires data to be sorted by time to prevent temporal leakage (using future data to predict the past).

In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, TargetEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Construct File Path
file_path = os.path.join(DATA_PATH, 'project_data.csv')

# 2. Load Data
# Fail fast when the file is missing. Every later cell needs `df`, and in a
# notebook a `df` left over from an earlier run would otherwise be reused
# silently, so the error is reported and then re-raised instead of swallowed.
try:
    df = pd.read_csv(file_path, low_memory=False)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    raise
else:
    print(f"Data Loaded Successfully: {len(df)} rows.")

# 3. Sort by Time
# Essential for temporal splitting. A stable sort (mergesort) keeps rows that
# share the same start_year in their original file order, so a positional
# train/test cut that falls inside a year is reproducible.
df = df.sort_values(by='start_year', kind='mergesort').reset_index(drop=True)
print(f"Date Range: {df['start_year'].min()} to {df['start_year'].max()}")

FileNotFoundError: [Errno 2] No such file or directory: 'data/project_data.csv'

### Temporal Split (Train/Test)

**Objective:** Split the data into Training (Past) and Testing (Future) sets based on an 80/20 ratio.
**Rationale:**
*   **Temporal Split:** Simulates real-world conditions where the model must predict future trials based on historical data.
*   **COVID Check:** Reports whether the training set reaches 2019 or later; the model can only learn the `covid_exposure` signal if COVID-era trials are present in the training data.

In [None]:
# 1. Separate Features and Target
# 'overall_status' is dropped together with 'target' so that neither column
# reaches the feature matrix (presumably the label is derived from it --
# confirm against the data-preparation step).
X = df.drop(columns=['target', 'overall_status'])
y = df['target']

# 2. Define Split Point (80% Train / 20% Test)
# The cut is positional, so it is only a temporal cut if df is already
# ordered by 'start_year'.
split_idx = int(len(df) * 0.8)

# 3. Perform Temporal Split: rows before the cut -> train, the rest -> test
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# 4. Extract Years for Post-Hoc Analysis
train_years, test_years = X_train['start_year'].values, X_test['start_year'].values

# 5. Verification Report
print(f"TRAIN Set: {train_years.min()} - {train_years.max()} (Rows: {len(X_train)})")
print(f"TEST Set:  {test_years.min()} - {test_years.max()} (Rows: {len(X_test)})")

# Check for COVID coverage in Training: only the latest training year matters.
print(
    "Status: Training set includes COVID-era data (2019+)."
    if train_years.max() >= 2019
    else "Status: Training set ends before COVID era."
)

### Encoding Pipeline Definition

**Objective:** Define the transformation logic for each data type based on the Audit Blueprint.
**Strategies:**
*   **Binary:** `OneHotEncoder` (drop one column).
*   **Nominal (<50):** `OneHotEncoder` (keep all columns).
*   **High Cardinality (>50):** `TargetEncoder` (maps category to failure probability).
*   **Text Tags:** `TfidfVectorizer` (top 50 keywords).
*   **Numeric:** Dropped in this step (handled by separate scaling pipeline).

In [None]:
# 1. Define Feature Groups
CAT_BINARY = ['is_international', 'covid_exposure', 'healthy_volunteers',
              'adult', 'child', 'older_adult', 'includes_us']

CAT_NOMINAL = ['gender', 'agency_class', 'masking', 'intervention_model',
               'primary_purpose', 'therapeutic_area', 'allocation']

CAT_HIGH_CARD = ['therapeutic_subgroup_name', 'best_pathology']

# Deliberately a bare string (not a list): ColumnTransformer then hands the
# text pipeline a 1-D column, which is the input shape TfidfVectorizer needs.
TEXT_TAGS = 'txt_tags'


def _fill_missing_text(column):
    """Return the text column as a 1-D array of str with missing values set to ''.

    SimpleImputer cannot be used for this step: it only accepts 2-D input,
    while TfidfVectorizer needs a 1-D sequence of documents and rejects NaN.
    """
    flat = np.asarray(column, dtype=object).ravel()
    return pd.Series(flat).fillna('').astype(str).to_numpy()


# 2. Define Pipelines
# Binary: Impute -> OneHot (Drop if binary)
pipe_bin = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='if_binary', dtype=int, handle_unknown='ignore')
)

# Nominal: Impute 'UNKNOWN' -> OneHot (Dense)
pipe_nom = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='UNKNOWN'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype=int)
)

# High Cardinality: Impute 'UNKNOWN' -> Target Encoding (Smooth=10)
pipe_high = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='UNKNOWN'),
    TargetEncoder(target_type='binary', smooth=10.0, random_state=42)
)

# Text: Fill missing with '' (1-D safe) -> TF-IDF (Top 50)
# feature_names_out='one-to-one' is required so that the pipeline, and hence
# the ColumnTransformer, can still answer get_feature_names_out().
pipe_txt = make_pipeline(
    FunctionTransformer(_fill_missing_text, feature_names_out='one-to-one'),
    TfidfVectorizer(max_features=50, stop_words='english')
)

# 3. Assemble ColumnTransformer
encoder_step = ColumnTransformer(
    transformers=[
        ('binary', pipe_bin, CAT_BINARY),
        ('nominal', pipe_nom, CAT_NOMINAL),
        ('high_card', pipe_high, CAT_HIGH_CARD),
        ('text', pipe_txt, TEXT_TAGS)
    ],
    remainder='drop', # Numeric columns are excluded here
    verbose_feature_names_out=False
)

### Execution and Verification

**Objective:** Fit the pipeline on the Training set and transform both sets.
**Output:** Produces `X_train_cat` and `X_test_cat`, the encoded categorical and text feature matrices, ready for integration with scaled numeric data.

In [None]:
print("Processing Categorical Features...")

# 1. Fit on Train, Transform Train
# Kept as a single fit_transform call: TargetEncoder cross-fits inside
# fit_transform, so fit(...) followed by transform(...) on the training rows
# would give different (leakier) encodings.
X_train_cat = encoder_step.fit_transform(X=X_train, y=y_train)

# 2. Transform Test (No Fitting) -- reuses what was learned from Train only
X_test_cat = encoder_step.transform(X=X_test)

# 3. Output Verification
train_shape, test_shape = X_train_cat.shape, X_test_cat.shape
print(f"Encoded Train Shape: {train_shape}")
print(f"Encoded Test Shape:  {test_shape}")

# Verify Feature Names (one name per output column of the encoder)
new_features = encoder_step.get_feature_names_out()
print(f"Total Features: {len(new_features)}")
print(f"Sample Features: {new_features[:10]}")