In [1]:
import numpy as np
import pandas as pd
import random

In [71]:
def simulated_linear(N=1000, observe_time=10, percentcensored=0, randomseed=100):
    """Simulate right-censored survival data with a log-linear hazard.

    Features drawn for every subject:
      * age    - uniform on [-1.73, 1.73], rounded to 1 decimal
      * bmi    - normal(0, 1), rounded to 1 decimal
      * hyp    - Bernoulli(0.20)
      * gender - Bernoulli(0.50); it does not enter the hazard

    Event times are exponential with rate
    ``0.1 * exp(1.0*bmi + 0.7*hyp + 0.4*age)``, rounded to 2 decimals and
    shifted by 0.01 so that no event falls exactly at time 0.

    Parameters
    ----------
    N : int
        Population size.
    observe_time : int or float
        Total observation time; everyone still event-free at this time is
        right-censored.
    percentcensored : int or float, 0 <= value < 1
        Share of subjects that receive a drop-out time, drawn uniformly from
        [observe_time/20, observe_time]. A subject whose event precedes the
        drop-out time is not censored, so the realised early-censoring rate
        is <= percentcensored.
    randomseed : int
        Seed of the numpy generator used for all draws; equal seeds give
        equal data. The global numpy random state is left untouched.

    Returns
    -------
    pandas.DataFrame or None
        Columns: age, bmi, hyp, gender, observe_time, early_censored
        (1 if the subject dropped out before the event and before
        observe_time), time (first of event, drop-out, observe_time) and
        event (1.0 if the event was observed, else 0.0).
        None is returned, after printing the offending argument names, when
        an argument fails validation.
    """
    # Validate the inputs; collect the names of all offending arguments.
    numeric = (int, float)
    checks = [
        ("N", isinstance(N, int)),
        ("observe_time", isinstance(observe_time, numeric)),
        ("percentcensored", isinstance(percentcensored, numeric)
         and not (percentcensored >= 1 or percentcensored < 0)),
        ("randomseed", isinstance(randomseed, int)),
    ]
    invalid = [name for name, ok in checks if not ok]
    if invalid:
        print("Error with input type:", invalid)
        return None

    # The draws below come from numpy, so numpy's generator is the one that
    # must be seeded. A local generator keeps the result reproducible without
    # mutating global state; the modulo maps any int into the accepted range.
    rng = np.random.RandomState(randomseed % (2 ** 32))
    # Kept so that callers relying on the stdlib RNG being seeded still work.
    random.seed(randomseed)

    # 1. Main features.
    df = pd.DataFrame({"age": np.round(rng.uniform(-1.73, 1.73, N), 1),
                       "bmi": np.round(rng.normal(0, 1, N), 1),
                       "hyp": rng.binomial(1, 0.20, N),
                       "gender": rng.binomial(1, 0.5, N)})

    # 2. Event times: exponential with a subject-specific rate.
    linear_predictor = (1.0 * df["bmi"] + df["hyp"] * 0.7 + df["age"] * 0.4).to_numpy()
    scale = 1 / (0.1 * np.exp(linear_predictor))
    event_time = 0.01 + np.round(rng.exponential(scale, N), 2)
    df["observe_time"] = observe_time

    # 3. Drop-outs: a random subset gets a censoring time earlier than
    #    observe_time; everybody else is followed until observe_time.
    cens_time = np.full(N, float(observe_time))
    if percentcensored > 0:
        n_dropouts = int(N * percentcensored)  # np.int was removed from numpy
        # nobody drops out in the first 1/20th of the observation time
        dropout_times = rng.uniform(observe_time / 20, observe_time, n_dropouts)
        dropout_rows = rng.choice(N, n_dropouts, replace=False)
        cens_time[dropout_rows] = dropout_times
    dropped_first = (cens_time < observe_time) & (cens_time < event_time)
    df["early_censored"] = dropped_first.astype("int64")

    # 4. Outcome: time is the first of event, drop-out and end of observation;
    #    the event counts only if it happened no later than that time.
    time = np.minimum(np.minimum(event_time, cens_time), observe_time)
    df["time"] = time
    df["event"] = (event_time <= time).astype(float)

    return df

In [72]:

def simulated_nonlinear(N=1000, observe_time=10, percentcensored=0, randomseed=100):
    """Simulate right-censored survival data with step-wise (non-linear) effects.

    Features drawn for every subject:
      * age    - uniform on [-1.73, 1.73], rounded to 1 decimal
      * bmi    - normal(0, 1), rounded to 1 decimal
      * hyp    - Bernoulli(0.20)
      * gender - Bernoulli(0.50); it does not enter the hazard

    Event times are exponential with rate
    ``0.08 * exp(bmi_beta + 0.7*hyp + 0.2*age + age_beta)`` where
      * bmi_beta is 2 if |bmi| > 1.5, 1 if |bmi| > 1, otherwise 0
      * age_beta is 1 if age >= 1, otherwise 0
    The times are rounded to 2 decimals and shifted by 0.01 so that no event
    falls exactly at time 0.

    Parameters
    ----------
    N : int
        Population size.
    observe_time : int or float
        Total observation time; everyone still event-free at this time is
        right-censored.
    percentcensored : int or float, 0 <= value < 1
        Share of subjects that receive a drop-out time, drawn uniformly from
        [observe_time/20, observe_time]. A subject whose event precedes the
        drop-out time is not censored, so the realised early-censoring rate
        is <= percentcensored.
    randomseed : int
        Seed of the numpy generator used for all draws; equal seeds give
        equal data. The global numpy random state is left untouched.

    Returns
    -------
    pandas.DataFrame or None
        Columns: age, bmi, hyp, gender, observe_time, early_censored
        (1 if the subject dropped out before the event and before
        observe_time), time (first of event, drop-out, observe_time) and
        event (1.0 if the event was observed, else 0.0).
        None is returned, after printing the offending argument names, when
        an argument fails validation.
    """
    # Validate the inputs; collect the names of all offending arguments.
    numeric = (int, float)
    checks = [
        ("N", isinstance(N, int)),
        ("observe_time", isinstance(observe_time, numeric)),
        ("percentcensored", isinstance(percentcensored, numeric)
         and not (percentcensored >= 1 or percentcensored < 0)),
        ("randomseed", isinstance(randomseed, int)),
    ]
    invalid = [name for name, ok in checks if not ok]
    if invalid:
        print("Error with input type:", invalid)
        return None

    # The draws below come from numpy, so numpy's generator is the one that
    # must be seeded. A local generator keeps the result reproducible without
    # mutating global state; the modulo maps any int into the accepted range.
    rng = np.random.RandomState(randomseed % (2 ** 32))
    # Kept so that callers relying on the stdlib RNG being seeded still work.
    random.seed(randomseed)

    # 1. Main features.
    df = pd.DataFrame({"age": np.round(rng.uniform(-1.73, 1.73, N), 1),
                       "bmi": np.round(rng.normal(0, 1, N), 1),
                       "hyp": rng.binomial(1, 0.20, N),
                       "gender": rng.binomial(1, 0.5, N)})

    # 2. Event times.
    age = df["age"].to_numpy()
    hyp = df["hyp"].to_numpy()
    abs_bmi = np.abs(df["bmi"].to_numpy())
    # BMI impact: 2 for very low/high levels, 1 for low/high, 0 for normal range.
    bmi_beta = np.select([abs_bmi > 1.5, abs_bmi > 1], [2, 1], default=0)
    # Extra step effect for age >= 1, on top of a smaller linear age effect.
    age_beta = np.where(age >= 1, 1, 0)

    scale = 1 / (0.08 * np.exp(bmi_beta + hyp * 0.7 + age * 0.2 + age_beta))
    event_time = 0.01 + np.round(rng.exponential(scale, N), 2)
    df["observe_time"] = observe_time

    # 3. Drop-outs: a random subset gets a censoring time earlier than
    #    observe_time; everybody else is followed until observe_time.
    cens_time = np.full(N, float(observe_time))
    if percentcensored > 0:
        n_dropouts = int(N * percentcensored)  # np.int was removed from numpy
        # nobody drops out in the first 1/20th of the observation time
        dropout_times = rng.uniform(observe_time / 20, observe_time, n_dropouts)
        dropout_rows = rng.choice(N, n_dropouts, replace=False)
        cens_time[dropout_rows] = dropout_times
    dropped_first = (cens_time < observe_time) & (cens_time < event_time)
    df["early_censored"] = dropped_first.astype("int64")

    # 4. Outcome: time is the first of event, drop-out and end of observation;
    #    the event counts only if it happened no later than that time.
    time = np.minimum(np.minimum(event_time, cens_time), observe_time)
    df["time"] = time
    df["event"] = (event_time <= time).astype(float)

    return df

In [73]:
def simulated_crossterms(N=1000, observe_time=10, percentcensored=0, randomseed=100):
    """Simulate right-censored survival data with a hypertension x age interaction.

    Features drawn for every subject:
      * age    - uniform on [-1.73, 1.73], rounded to 1 decimal
      * bmi    - normal(0, 1), rounded to 1 decimal
      * hyp    - Bernoulli(0.20)
      * gender - Bernoulli(0.50); it does not enter the hazard

    Event times are exponential with rate
    ``0.07 * exp(bmi_beta + hyp_age_beta + 0.2*age)`` where
      * bmi_beta is 2 if |bmi| > 1.5, 1 if |bmi| > 1, otherwise 0
      * hyp_age_beta is 2 if hyp == 1 and age >= 1, 1 if hyp == 1 and
        age < 1, otherwise 0
    The times are rounded to 2 decimals and shifted by 0.01 so that no event
    falls exactly at time 0.

    Parameters
    ----------
    N : int
        Population size.
    observe_time : int or float
        Total observation time; everyone still event-free at this time is
        right-censored.
    percentcensored : int or float, 0 <= value < 1
        Share of subjects that receive a drop-out time, drawn uniformly from
        [observe_time/20, observe_time]. A subject whose event precedes the
        drop-out time is not censored, so the realised early-censoring rate
        is <= percentcensored.
    randomseed : int
        Seed of the numpy generator used for all draws; equal seeds give
        equal data. The global numpy random state is left untouched.

    Returns
    -------
    pandas.DataFrame or None
        Columns: age, bmi, hyp, gender, observe_time, early_censored
        (1 if the subject dropped out before the event and before
        observe_time), time (first of event, drop-out, observe_time) and
        event (1.0 if the event was observed, else 0.0).
        None is returned, after printing the offending argument names, when
        an argument fails validation.
    """
    # Validate the inputs; collect the names of all offending arguments.
    numeric = (int, float)
    checks = [
        ("N", isinstance(N, int)),
        ("observe_time", isinstance(observe_time, numeric)),
        ("percentcensored", isinstance(percentcensored, numeric)
         and not (percentcensored >= 1 or percentcensored < 0)),
        ("randomseed", isinstance(randomseed, int)),
    ]
    invalid = [name for name, ok in checks if not ok]
    if invalid:
        print("Error with input type:", invalid)
        return None

    # The draws below come from numpy, so numpy's generator is the one that
    # must be seeded. A local generator keeps the result reproducible without
    # mutating global state; the modulo maps any int into the accepted range.
    rng = np.random.RandomState(randomseed % (2 ** 32))
    # Kept so that callers relying on the stdlib RNG being seeded still work.
    random.seed(randomseed)

    # 1. Main features.
    df = pd.DataFrame({"age": np.round(rng.uniform(-1.73, 1.73, N), 1),
                       "bmi": np.round(rng.normal(0, 1, N), 1),
                       "hyp": rng.binomial(1, 0.20, N),
                       "gender": rng.binomial(1, 0.5, N)})

    # 2. Event times.
    age = df["age"].to_numpy()
    hyp = df["hyp"].to_numpy()
    abs_bmi = np.abs(df["bmi"].to_numpy())
    # BMI impact: 2 for very low/high levels, 1 for low/high, 0 for normal range.
    bmi_beta = np.select([abs_bmi > 1.5, abs_bmi > 1], [2, 1], default=0)
    # Hypertension x age interaction, vectorised instead of a per-row lookup:
    # hypertensive subjects get 2 when age >= 1 and 1 otherwise; others get 0.
    hyp_age_beta = np.where(hyp == 1, np.where(age >= 1, 2, 1), 0)

    scale = 1 / (0.07 * np.exp(bmi_beta + hyp_age_beta + age * 0.2))
    event_time = 0.01 + np.round(rng.exponential(scale, N), 2)
    df["observe_time"] = observe_time

    # 3. Drop-outs: a random subset gets a censoring time earlier than
    #    observe_time; everybody else is followed until observe_time.
    cens_time = np.full(N, float(observe_time))
    if percentcensored > 0:
        n_dropouts = int(N * percentcensored)  # np.int was removed from numpy
        # nobody drops out in the first 1/20th of the observation time
        dropout_times = rng.uniform(observe_time / 20, observe_time, n_dropouts)
        dropout_rows = rng.choice(N, n_dropouts, replace=False)
        cens_time[dropout_rows] = dropout_times
    dropped_first = (cens_time < observe_time) & (cens_time < event_time)
    df["early_censored"] = dropped_first.astype("int64")

    # 4. Outcome: time is the first of event, drop-out and end of observation;
    #    the event counts only if it happened no later than that time.
    time = np.minimum(np.minimum(event_time, cens_time), observe_time)
    df["time"] = time
    df["event"] = (event_time <= time).astype(float)

    return df

In [75]:
# Example run of the linear simulator: 1000 subjects observed for 5 time units,
# with 30% of them assigned a random drop-out time; then summarise every column.
df = simulated_linear(1000, observe_time = 5, percentcensored = 0.3, randomseed = 100)
df.describe()

Unnamed: 0,age,bmi,hyp,gender,observe_time,early_censored,time,event
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,-0.0309,0.0028,0.205,0.526,5.0,0.213,3.003691,0.43
std,1.031824,1.006651,0.403904,0.499573,0.0,0.409633,1.839614,0.495323
min,-1.7,-2.8,0.0,0.0,5.0,0.0,0.02,0.0
25%,-1.0,-0.7,0.0,0.0,5.0,0.0,1.15,0.0
50%,0.0,-0.0,0.0,1.0,5.0,0.0,3.056911,0.0
75%,0.9,0.6,0.0,1.0,5.0,0.0,5.0,1.0
max,1.7,3.4,1.0,1.0,5.0,1.0,5.0,1.0
