In [None]:
import os
import tempfile
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
import pickle

class TrialSequence:
    """
    Pairs an estimand label with an on-disk working directory.
    """

    def __init__(self, estimand):
        """
        Remember the estimand label; the directory stays unset (None)
        until create_directory() is called.
        """
        self.directory = None  # filled in by create_directory()
        self.estimand = estimand

    def create_directory(self, base_dir=None):
        """
        Ensure a folder named "trial_<estimand in lower case>" exists under
        base_dir and record its path in self.directory.
        When base_dir is None, the system temporary directory is used.
        """
        parent = tempfile.gettempdir() if base_dir is None else base_dir

        # Record the path before touching the filesystem
        self.directory = os.path.join(parent, f"trial_{self.estimand.lower()}")

        # exist_ok=True: an already-existing folder is not an error
        os.makedirs(self.directory, exist_ok=True)
        print(f"Directory created: {self.directory}")

    def __repr__(self):
        """
        Show the estimand and the current directory (None if not yet created).
        """
        return f"TrialSequence(estimand={self.estimand}, directory={self.directory})"
    


# STEP 1 ===============================================

# Build one TrialSequence per estimand, in order:
# "PP" (per-protocol) first, then "ITT" (intention-to-treat).
trial_pp, trial_itt = (TrialSequence(estimand=label) for label in ("PP", "ITT"))

# Give each trial its own folder; with no base_dir argument the
# system temporary directory is used.
trial_pp.create_directory()
trial_itt.create_directory()

# Show both objects, one per line, for verification
print(trial_pp, trial_itt, sep="\n")


# STEP 2 ===============================================

# Read data_censored.csv (the path is relative to the current working directory)
data_censored = pd.read_csv(filepath_or_buffer="data_censored.csv")

# Preview the first five rows (five is also head()'s default)
print(data_censored.head(n=5))

# Define a function to structure the data
def set_data(data, id_col, period_col, treatment_col, outcome_col, eligible_col):
    """
    Bundle a dataset with the names of the columns that play each role.

    data must expose a `columns` attribute supporting `in` (the call sites
    in this file pass a pandas DataFrame). Each *_col argument is the name
    of a column expected to be present in data.columns.

    Returns a dict with the keys "data", "id", "period", "treatment",
    "outcome" and "eligible". "data" refers to the very object passed in
    (no copy is made); the other keys hold the column names.

    Raises ValueError if any of the named columns is absent; the message
    lists both the expected columns and the ones that were not found.
    """
    required_cols = [id_col, period_col, treatment_col, outcome_col, eligible_col]

    # Collect the absent columns so the error pinpoints what to fix
    # instead of only restating everything that was expected.
    missing_cols = [col for col in required_cols if col not in data.columns]
    if missing_cols:
        raise ValueError(
            f"Missing required columns. Expected: {required_cols}; "
            f"not found: {missing_cols}"
        )

    return {
        "data": data,
        "id": id_col,
        "period": period_col,
        "treatment": treatment_col,
        "outcome": outcome_col,
        "eligible": eligible_col,
    }

# Attach the column roles to the dataset for each analysis. Arguments are
# positional, in set_data's parameter order:
# data, id, period, treatment, outcome, eligible.
# NOTE(review): these assignments rebind trial_pp / trial_itt, replacing the
# TrialSequence objects created in STEP 1 with plain dicts — confirm intended.

# Per-protocol
trial_pp = set_data(data_censored, "id", "period", "treatment", "outcome", "eligible")

# Intention-to-treat: same arguments and the same DataFrame object as above
trial_itt = set_data(data_censored, "id", "period", "treatment", "outcome", "eligible")

# Optional: display the ITT result
print(trial_itt)

# STEP 3 ===============================================






Directory created: C:\Users\Katrina\AppData\Local\Temp\trial_pp
Directory created: C:\Users\Katrina\AppData\Local\Temp\trial_itt
TrialSequence(estimand=PP, directory=C:\Users\Katrina\AppData\Local\Temp\trial_pp)
TrialSequence(estimand=ITT, directory=C:\Users\Katrina\AppData\Local\Temp\trial_itt)
   id  period  treatment  x1        x2  x3        x4  age     age_s  outcome  \
0   1       0          1   1  1.146148   0  0.734203   36  0.083333        0   
1   1       1          1   1  0.002200   0  0.734203   37  0.166667        0   
2   1       2          1   0 -0.481762   0  0.734203   38  0.250000        0   
3   1       3          1   0  0.007872   0  0.734203   39  0.333333        0   
4   1       4          1   1  0.216054   0  0.734203   40  0.416667        0   

   censored  eligible  
0         0         1  
1         0         0  
2         0         0  
3         0         0  
4         0         0  
{'data':      id  period  treatment  x1        x2  x3        x4  age     age_s