## **Target Trial Emulation**  
### *By Jyreneah Angel and Nicole Grace Joligon* 

## **IMPORT LIBRARIES**

In [3]:
import os
from tempfile import mkdtemp
import pandas as pd
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# Load the dataset
data_censored = pd.read_csv("data_censored.csv")
print(data_censored.head())

   id  age  treatment  x1        x2  event_time  censoring  censored
0   1   30          1   0 -0.049161    7.809887          0         1
1   2   64          1   0  1.593443    7.522101          0         1
2   3   59          1   1  0.765598   10.867286          0         1
3   4   32          0   0 -0.071377   13.681152          0         1
4   5   75          0   1 -1.716098   22.671353          0         1


## **Define Estimands**

In [6]:
trial_pp = {"estimand": "PP"}
trial_itt = {"estimand": "ITT"}

trial_pp.update({
    "data": data_censored.copy(),  
    "id_col": "id",
    "period_col": "period",
    "treatment_col": "treatment",
    "outcome_col": "outcome",
    "eligible_col": "eligible"
})

trial_itt = trial_pp.copy()
trial_itt["estimand"] = "ITT" 


## **Create Directories**

In [7]:
from tempfile import TemporaryDirectory  

pp_temp_dir = os.path.join(mkdtemp(), "per_protocol")  
os.makedirs(pp_temp_dir, exist_ok=True)  

itt_temp_dir = os.path.join(mkdtemp(), "intention_to_treat")  
os.makedirs(itt_temp_dir, exist_ok=True)  

print(f"Per-Protocol Directory: {pp_temp_dir}")  
print(f"Intention-To-Treat Directory: {itt_temp_dir}")  


Per-Protocol Directory: /tmp/tmpjrcw3vnx/per_protocol
Intention-To-Treat Directory: /tmp/tmpfeqnu_df/intention_to_treat


## **Define Functions**

In [19]:
from sklearn.linear_model import LogisticRegression

def compute_treatment_weights(df, num_formula, denom_formula, treat_col):
    """Compute inverse probability weights for treatment assignment."""
    
    num_model = LogisticRegression()
    num_model.fit(df[num_formula.split(" + ")], df[treat_col])

    denom_model = LogisticRegression()
    denom_model.fit(df[denom_formula.split(" + ")], df[treat_col])

    num_probs = num_model.predict_proba(df[num_formula.split(" + ")])[:, 1]
    denom_probs = denom_model.predict_proba(df[denom_formula.split(" + ")])[:, 1]

    return num_probs / denom_probs

def compute_censoring_weights(df, censor_col, num_formula, denom_formula):
    """Compute inverse probability weights for informative censoring."""
   
    num_model = LogisticRegression()
    num_model.fit(df[num_formula.split(" + ")], 1 - df[censor_col])

    denom_model = LogisticRegression()
    denom_model.fit(df[denom_formula.split(" + ")], 1 - df[censor_col])

    num_probs = num_model.predict_proba(df[num_formula.split(" + ")])[:, 1]
    denom_probs = denom_model.predict_proba(df[denom_formula.split(" + ")])[:, 1]

    return num_probs / denom_probs


## **Expand the Dataset**

In [25]:
def create_trial_dataset(df, id_col, period_col, treat_col, outcome_col):
    """Generate an expanded dataset for sequential trials."""
    
    # Debugging: Print available columns
    print("Available columns:", df.columns.tolist())

    if period_col not in df.columns:
        raise KeyError(f"Column '{period_col}' not found in the dataset. Available columns: {df.columns.tolist()}")

    trials = []
    
    # Loop through each unique period and expand the dataset
    for current_period in sorted(df[period_col].unique()):
        temp_df = df[df[period_col] <= current_period].copy()
        temp_df["trial_stage"] = current_period  # Rename for differentiation
        trials.append(temp_df)

    return pd.concat(trials).reset_index(drop=True)

# Check dataset before expansion
print("First few rows of trial_pp['data']:")
print(trial_pp["data"].head())

pp_trial_expanded = create_trial_dataset(
    df=trial_pp["data"],
    id_col="id",
    period_col="event_time",  # Change this to the correct column
    treat_col="treatment",
    outcome_col="censored"
)


First few rows of trial_pp['data']:
   id  age  treatment  x1        x2  event_time  censoring  censored
0   1   30          1   0 -0.049161    7.809887          0         1
1   2   64          1   0  1.593443    7.522101          0         1
2   3   59          1   1  0.765598   10.867286          0         1
3   4   32          0   0 -0.071377   13.681152          0         1
4   5   75          0   1 -1.716098   22.671353          0         1
Available columns: ['id', 'age', 'treatment', 'x1', 'x2', 'event_time', 'censoring', 'censored']


## **Calculate Weights**

In [33]:
# Ensure trial_pp_expanded has the expected columns
print("Columns in trial_pp_expanded:", pp_trial_.columns)

# Compute switch weights
trial_pp["switch_weights"] = calculate_weights(
    data=trial_pp_expanded,
    numerator_formula="age",
    denominator_formula="age + x1 + x3",
    treatment_col="treatment"
)

# Compute censor weights
trial_pp["censor_weights"] = calculate_censor_weights(
    data=trial_pp_expanded,
    censor_event="censored",
    numerator_formula="x2",
    denominator_formula="x2 + x1"
)

trial_itt["censor_weights"] = calculate_censor_weights(
    data=trial_pp_expanded,
    censor_event="censored",
    numerator_formula="x2",
    denominator_formula="x2 + x1"
)

# Handle potential NaN or Inf values in weights
trial_pp["switch_weights"] = np.nan_to_num(trial_pp["switch_weights"], nan=1.0, posinf=10, neginf=0.1)
trial_pp["censor_weights"] = np.nan_to_num(trial_pp["censor_weights"], nan=1.0, posinf=10, neginf=0.1)
trial_itt["censor_weights"] = np.nan_to_num(trial_itt["censor_weights"], nan=1.0, posinf=10, neginf=0.1)

# Combine weights safely
trial_pp["weights"] = trial_pp["switch_weights"] * trial_pp["censor_weights"]
trial_itt["weights"] = trial_itt["censor_weights"]


<class 'NameError'>: name 'pp_trial_' is not defined