In [1]:
import pandas as pd
import numpy as np

# Generating Simulated Data

In [2]:
# Load the real feature matrix; its columns define the schema that the
# simulated data has to follow.
df = pd.read_csv("../data/feature_matrices/FULL_KNN.csv")

# Pre-allocate an all-NaN frame with the same columns as the real data and
# one row per simulated patient (100 in total). The values are filled in
# column by column further down.
sim_df = pd.DataFrame(index=pd.RangeIndex(100), columns=df.columns)

In [3]:
# A column counts as binary when every one of its values is 0 or 1.
is_binary = df.isin([0, 1]).all()
binary_cols = df.columns[is_binary.to_numpy()]

# A column counts as integer-valued when every value leaves no remainder
# after division by 1. Binary columns satisfy this test as well.
is_whole = [((df[name] % 1) == 0).all() for name in df.columns]
int_cols = df.columns[is_whole]

In [4]:
# Simulate every feature column independently from its own marginal
# distribution in the real data. Columns are drawn one at a time, so
# relationships between columns are not reproduced.

# Seeded generator so that Restart & Run All yields the same simulated
# cohort every time; change the seed to draw a different cohort.
feature_rng = np.random.default_rng(42)

# One simulated value per pre-allocated row of sim_df, so the sample size
# always matches the frame instead of being repeated as a literal.
n_sim = len(sim_df)

# iterate through our data fields
for col in df.columns:

    # if binary, draw from a Bernoulli with the observed prevalence
    if col in binary_cols:
        simulated = feature_rng.binomial(n=1, p=df[col].mean(), size=n_sim)

    # otherwise, draw from a Gaussian matching the observed mean and std,
    # but take the absolute value as none of our data should be negative
    else:
        simulated = np.abs(
            feature_rng.normal(loc=df[col].mean(), scale=df[col].std(),
                               size=n_sim))

        if col in int_cols:
            # Count-like column: round to the NEAREST whole number before
            # casting. A bare astype(int) truncates toward zero, which
            # shifts every simulated count downward by ~0.5 on average.
            simulated = np.rint(simulated).astype(int)
        else:
            # continuous column: keep two decimals
            simulated = np.round(simulated, decimals=2)

    sim_df[col] = simulated

In [5]:
# Write the simulated feature matrix to disk without the row index, then
# preview the first rows as the cell output.
sim_df.to_csv(
    path_or_buf="../data/simulated_data/simulated_feature_matrix.csv",
    index=False,
)
sim_df.head()

Unnamed: 0,PAT_DEID,DEMO_AGE_AT_DX,DEMO_DEPRESSED,DEMO_AGE_AT_CHE,DEMO_DX_TO_CHE_TIME,DEMO_GENDER_F,DEMO_breast,DEMO_gastrointestinal,DEMO_genitourinary,DEMO_gynecologic,...,LABS_UPH,LABS_URIC,LABS_VD25D3,LABS_VGGQNT,LABS_VGQNT,LABS_WBC,LABS_WBCFLD,LABS_XPTT,LABS_XSPG,LABS_XUPH
0,1533957,57,0,62.41,362,0,1,0,0,0,...,6.42,5.51,39.37,2.03,8.58,339.58,7343.26,38.97,0.99,5.77
1,2380316,36,0,40.54,1679,1,1,0,0,0,...,6.01,5.54,30.39,2.39,7.29,151.09,1985.05,31.3,1.02,5.75
2,3020815,43,0,62.97,511,1,0,0,1,0,...,5.77,6.32,27.15,2.64,6.97,204.11,11021.23,35.46,1.01,6.01
3,3083237,54,0,53.86,1568,0,0,0,0,1,...,6.54,4.84,27.28,1.92,6.96,814.61,14401.12,42.03,1.0,6.15
4,209372,53,1,58.11,265,0,1,0,0,0,...,5.81,5.01,37.41,2.22,7.28,184.84,11311.06,38.22,1.02,6.27


In [6]:
# Generate a simulated binary outcome for every simulated patient.
# p=0.35 comes from the incidence of OP-35 events in our cohort.
# NOTE(review): the original note gave the window as "30 days", but the
# column is named ANY_180 -- confirm which follow-up window 0.35 refers to.

# Seeded generator so the simulated labels are reproducible across runs.
outcome_rng = np.random.default_rng(43)

# Start from an independent one-column frame holding the patient identifier.
sim_outcomes = sim_df[["PAT_DEID"]].copy()

# One independent Bernoulli(0.35) draw per patient; the draw does not depend
# on any simulated feature. The size follows the cohort length rather than
# a repeated literal.
sim_outcomes["ANY_180"] = outcome_rng.binomial(n=1, p=0.35,
                                               size=len(sim_outcomes))

# save the results
sim_outcomes.to_csv("../data/simulated_data/simulated_outcomes.csv",
                    index=False)
sim_outcomes.head()

Unnamed: 0,PAT_DEID,ANY_180
0,1533957,0
1,2380316,1
2,3020815,1
3,3083237,1
4,209372,0
