# Generate Patient Data

In [2]:
import pandas as pd
import os
import numpy as np
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

pd.set_option('display.max_rows',200)
pd.set_option('display.max_columns',200)
pd.options.plotting.backend = 'plotly'

# Data

In [71]:
### patient dimension
df_pat = pd.DataFrame(columns=['patient_id','age_bucket','sex','race','foreign_born','education','poverty'])

N = int(1e6)
low = 100000
df_pat['patient_id'] = np.random.randint(low=low, high=low+N*2, size=N)

In [129]:
### patient encounters

# date: randomly selected visits/encounters from 2016-01-01 to 2021-12-31
# datetime: random time component
# patient_id: randomly generated integer, can be joined on patient dimension table id
# encounter_type: type of encounter in ['Clinic Visit','Surgery','Hospital Encounter','Emergency']
# los: Length of Stay, turn around time between start of encounter and end of encounter
# service: randomly selected from common hospital services ['Cardiology','Oncology','Dermatology','Neurology','Orthopedics','Plastics']
# charges: total charges for services for the encounter

df_enc = pd.DataFrame(columns=['datetime','patient_id','encounter_type','los','service','charges'])
N = int(2e5)

df_enc['date'] = pd.Series(pd.date_range(start='2016-01-01',end='2021-12-31',freq='d')).sample(n=N,replace=True)
df_enc['time'] = pd.Series(pd.date_range(start="00:00",end="23:59:59",freq="1S")).sample(n=N,replace=True).astype(str).str[-8:].values
df_enc['datetime'] = pd.to_datetime(df_enc['date'].astype(str) + ' ' + df_enc['time'])
# (limited) random sample of patients with replacement
pat_list = df_pat['patient_id'].sample(frac=1).head(N).sample(n=N,replace=True).values
df_enc['patient_id'] = pat_list

temp = ['Clinic Visit']*int(N*0.65)+['Surgery']*int(N*0.05)+['Hospital Encounter']*int(N*0.25)+['Emergency']*int(N*0.05)
np.random.shuffle(temp)
df_enc['encounter_type'] = temp

df_enc['los'] = np.random.poisson(lam=7, size=N)

temp = ['Cardiology']*int(N*0.2)+['Oncology']*int(N*0.15)+['Dermatology']*int(N*0.25)+['Neurology']*int(N*0.1)+['Orthopedics']*int(N*0.2)+['Plastics']*int(N*0.1)
np.random.shuffle(temp)
df_enc['service'] = temp

df_enc['charges'] = np.random.normal(loc=3e3,scale=1e3,size=N)

# shuffle patient_id
df_enc.to_parquet('encounter_sample.parquet')
df_enc.head()


Unnamed: 0,datetime,patient_id,encounter_type,los,service,charges,date,time
599,2017-08-22 04:39:24,484357,Clinic Visit,4,Plastics,2585.976848,2017-08-22,04:39:24
284,2016-10-11 19:07:12,1254946,Hospital Encounter,7,Oncology,3274.49041,2016-10-11,19:07:12
1875,2021-02-18 05:07:57,1851659,Hospital Encounter,10,Oncology,3296.385039,2021-02-18,05:07:57
611,2017-09-03 00:20:19,936376,Clinic Visit,5,Orthopedics,3512.701435,2017-09-03,00:20:19
668,2017-10-30 09:40:47,1890254,Clinic Visit,3,Cardiology,2810.532943,2017-10-30,09:40:47


In [131]:
df_enc['los'].plot(kind='hist')

In [70]:
### patient dimension

# https://www.census.gov/quickfacts/fact/table/sandiegocitycalifornia/PST045221
# 1,386,932 population of San Diego County

# patient_id: randomly generated integer.
# age_bucket: age distributions for 4 custom age buckets.
# sex: 49.5% of San Diego county residents are Female.
# race: total percent does not add to 100, but scaling should result in representative distributions.
# foreign_born: 25.6% of San Diego County residents are foreign born.
# education: level of education, if bachelors, then high school is implied.
# poverty: 11.8% of San Diego County residents are in poverty.
cat_cols = []
age_bucket_dist = {'<5 years':0.058,'5 to 18 years':0.196,'>65 years':0.133}
age_bucket_dist['18 to 65 years'] = 1 - np.sum([x for _,x in age_bucket_dist.items()])
cat_cols.extend([x for x,_ in age_bucket_dist.items()])

sex_dist = {'Female':0.495}
sex_dist['Male'] = 1 - np.sum([x for _,x in sex_dist.items()])
cat_cols.extend([x for x,_ in sex_dist.items()])

race_dist = {'Black or African American':0.061,
             'American Indian or Alaska Native':0.005,
             'Asian':0.173,
             'Native Hawaiian and Other Pacific Islander':0.004,
             'Two or More Races':0.078,
             'Hispanic or Latino':0.301
            }
race_dist['White'] = 1 - np.sum([x for _,x in race_dist.items()])
cat_cols.extend([x for x,_ in race_dist.items()])

foreign_dist = {'Foreign Born':0.256}
foreign_dist['Native Born'] = 1 - np.sum([x for _,x in foreign_dist.items()])
cat_cols.extend([x for x,_ in foreign_dist.items()])

education_dist = {"Bachelor's Degree or Higher":0.467}
education_dist['High School Graduate'] = 0.888 - np.sum([x for _,x in education_dist.items()])
education_dist['Not a High School Graduate'] = 1 - np.sum([x for _,x in education_dist.items()])
cat_cols.extend([x for x,_ in education_dist.items()])

poverty_dist = {'Poverty':0.118}
poverty_dist['Not Poverty'] = 1 - np.sum([x for _,x in poverty_dist.items()])
cat_cols.extend([x for x,_ in poverty_dist.items()])

dists = {'age_bucket':age_bucket_dist,'sex':sex_dist,'race':race_dist,'foreign_born':foreign_dist,'education':education_dist,'poverty':poverty_dist}

for col,d in dists.items():
    df_pat[col] = ''
    pvals = list(d.values())
    dist = np.random.multinomial(n=1,pvals=pvals,size=N)
    i = 0
    for cat in list(d.keys()):
        df_pat[cat] = dist[:,i]
        df_pat.loc[df_pat[cat]==1,col] = cat
        i+=1
        
df_pat.to_parquet('patient_sample.parquet')
print(df_pat.shape)
display(df_pat.head(2))
display(df_pat.tail(2))

(1000000, 27)


Unnamed: 0,patient_id,age_bucket,sex,race,foreign_born,education,poverty,<5 years,5 to 18 years,>65 years,18 to 65 years,Female,Male,Black or African American,American Indian or Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Two or More Races,Hispanic or Latino,White,Foreign Born,Native Born,Bachelor's Degree or Higher,High School Graduate,Not a High School Graduate,Poverty,Not Poverty
0,1418224,18 to 65 years,Female,Hispanic or Latino,Native Born,Bachelor's Degree or Higher,Not Poverty,0,0,0,1,1,0,0,0,0,0,0,1,0,0,1,1,0,0,0,1
1,1326247,5 to 18 years,Female,Hispanic or Latino,Native Born,High School Graduate,Not Poverty,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1


Unnamed: 0,patient_id,age_bucket,sex,race,foreign_born,education,poverty,<5 years,5 to 18 years,>65 years,18 to 65 years,Female,Male,Black or African American,American Indian or Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Two or More Races,Hispanic or Latino,White,Foreign Born,Native Born,Bachelor's Degree or Higher,High School Graduate,Not a High School Graduate,Poverty,Not Poverty
999998,1731751,18 to 65 years,Female,Two or More Races,Native Born,High School Graduate,Not Poverty,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1
999999,478302,>65 years,Female,White,Native Born,Bachelor's Degree or Higher,Not Poverty,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,1,0,0,0,1


In [69]:
corr = df_pat[cat_cols].corr()

fig = go.Figure(data=go.Heatmap(z=corr.values,x=corr.columns,y=corr.index))
fig.update_layout(title='Cross Correlation of Patient Demographics')
fig.show()