## 1. Using the Faker Package to Generate Basic Patient Records

### Basic Usage Example

In [85]:
from faker import Faker
import pandas as pd
from tqdm import tqdm
import numpy as np
from collections import OrderedDict
import re
import datetime

In [2]:
fake = Faker()

def generate_patient():
    return {
        "Patient_ID": fake.uuid4(),
        "Name": fake.name(),
        "Age": fake.random_int(min=0, max=100),
        "Gender": fake.random_element(elements=("Male", "Female")),
        "Condition": fake.random_element(elements=("Diabetes", "Hypertension", "Asthma")),
        "Medication": fake.random_element(elements=("Metformin", "Lisinopril", "Albuterol"))
    }

# Generate 100 synthetic patient records
patients = [generate_patient() for _ in range(100)]
df = pd.DataFrame(patients)
df.head()

Unnamed: 0,Patient_ID,Name,Age,Gender,Condition,Medication
0,35522622-a22e-4f55-8171-945d155445c4,Joy Mills,91,Male,Asthma,Lisinopril
1,2c1aefb6-ad55-48ce-9676-d311fe9128c1,Brandon Barber,98,Male,Asthma,Albuterol
2,600e9ccc-f720-47a1-af2d-9237f48fcf86,Brian Collins,64,Female,Diabetes,Lisinopril
3,94071cf3-e144-4bdd-8262-0a2826a37caf,Lisa Grant,10,Female,Diabetes,Metformin
4,a342b4a3-1e38-4ce7-8f8c-2fec81879320,Brianna Pearson,82,Female,Asthma,Albuterol


### Generating Fake Patients Based on Existing Data

In [3]:
def load_data_for_file(filename):
    print(f"Loading data for {filename}")
    df = pd.concat([ # use pd.concat to append/concatenate the data for all states together into a single frame
        pd.read_parquet(f"https://dicbworkshops.s3.amazonaws.com/{output_dir}/parquet/{filename}") # use read_csv to load the data from each output directory
        for output_dir in tqdm(['output_hi', 'output_ma', 'output_tx', 'output_wa']) # loop over each output directory
    ])
    return df

In [4]:
# load in the patients data
patients = load_data_for_file('patients.parquet')

Loading data for patients.parquet


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00,  1.67it/s]


In [5]:
death_rate = len(patients[~patients['DEATHDATE'].isna()]) / len(patients)

In [6]:
gender_distribution = patients['GENDER'].value_counts(normalize=True)

In [7]:
race_distribution = patients["RACE"].value_counts(normalize=True)

In [8]:
ethnicity_distribution = patients["ETHNICITY"].value_counts(normalize=True)

In [37]:
middle_name_rate = len(patients[~patients['MIDDLE'].isna()]) / len(patients)

In [9]:
def get_elements_from_distribution(distribution):
    result = []
    for i in range(len(distribution)):
        result.append((distribution.index[i], float(distribution.iloc[i])))
    return OrderedDict(result)

In [96]:
def generate_patient_v2():
    # birthdate is generated in the range of birthdates observed in the "actual data"
    birthdate = str(fake.date_time_between(
        pd.to_datetime(patients['BIRTHDATE'].min()), 
        pd.to_datetime(patients['BIRTHDATE'].max())
    ).date())
    # deathdate is generated to be between the birthdate and the current date
    deathdate = str(fake.date_time_between(start_date=pd.to_datetime(birthdate)).date()) if np.random.random() < death_rate else None
    # gender, race, and ethnicity are generated to match the individualized frequencies seen in the "real data"
    gender = fake.random_element(elements=get_elements_from_distribution(gender_distribution))
    race = fake.random_element(elements=get_elements_from_distribution(race_distribution))
    ethnicity = fake.random_element(elements=get_elements_from_distribution(ethnicity_distribution))
    # generate a name based on the gender
    if gender == 'M':
        firstname, lastname = fake.first_name_male(), fake.last_name()
    else:
        firstname, lastname = fake.first_name_female(), fake.last_name()
    middlename = fake.first_name_nonbinary() if np.random.random() < middle_name_rate else None
    return {
        "Id": fake.uuid4(),
        "BIRTHDATE": birthdate,
        "DEATHDATE": deathdate,
        "SSN": fake.ssn(),
        "DRIVERS": "S999" + str(fake.random_number(digits=5, fix_len=True)) if birthdate < '2009-03-01' else None,
        "PASSPORT": "X" + str(fake.random_number(digits=8, fix_len=True)) + "X",
        "FIRST": firstname,
        "MIDDLE": middlename,
        "LAST": lastname,
        "GENDER": gender,
        "RACE": race,
        "ETHNICITY": ethnicity
    }

In [100]:
synthetic = pd.DataFrame([generate_patient_v2() for _ in range(len(patients))])

In [101]:
synthetic

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,FIRST,MIDDLE,LAST,GENDER,RACE,ETHNICITY
0,80bf1b4c-8f68-4db4-8ec9-0c5e28e0d010,1942-04-09,,606-06-5385,S99997266,X48416074X,Julia,Robert,Tyler,F,white,nonhispanic
1,81283041-0f05-4129-b3e8-ffc81d0bf485,1992-08-22,,308-26-4065,S99927054,X55771397X,Anna,Mark,Watson,F,white,nonhispanic
2,94074fa5-7437-4e61-ac8f-7f419bf9adc1,2016-07-30,2016-09-07,203-83-9438,,X32275140X,Joyce,Willie,Daugherty,F,white,hispanic
3,a221943e-34bf-4a27-b255-ed1933d882ef,1954-11-12,,875-25-1156,S99979635,X90996425X,Lisa,Brianna,Alvarez,F,white,nonhispanic
4,ffde02d9-d00c-4f5b-b2e8-12b2a011f630,1981-01-18,,707-31-0819,S99975604,X52694733X,Adrian,Jose,Bush,M,white,nonhispanic
...,...,...,...,...,...,...,...,...,...,...,...,...
4574,97c3d15e-7a4a-40d3-a5f6-5a86e39f8d46,1969-09-02,,277-33-5741,S99950463,X94027751X,Rachel,David,Gutierrez,F,white,nonhispanic
4575,5d03947c-4bb7-47e1-bcae-f34aff11fc89,1987-04-14,,731-99-3581,S99953548,X87215502X,Peter,Angela,Hood,M,white,hispanic
4576,56583d92-acbe-4efa-90e0-0062187cdeb8,1934-10-09,,834-83-5341,S99941153,X95204964X,Jaime,Herbert,Mendez,F,white,nonhispanic
4577,50ccfee9-500d-4998-8cd6-311ecf2d32a4,2002-10-29,,578-60-0541,S99996792,X84193108X,Tammy,Kara,Hill,F,hawaiian,hispanic


In [102]:
patients[synthetic.columns]

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,FIRST,MIDDLE,LAST,GENDER,RACE,ETHNICITY
0,561a0548-fb96-1077-045b-32b2f53f4a86,1984-09-23,,999-28-3895,S99966167,X67980688X,Bert917,,Blick895,M,white,nonhispanic
1,8f8229e6-00be-a033-bb16-42781f9d208a,1983-05-25,,999-89-6788,S99965515,X33191293X,Augusta206,,Hilll811,F,asian,nonhispanic
2,e1418fe5-3ca2-2d35-9cd0-88ec20bce2d6,1976-02-16,,999-30-7651,S99987999,X48455510X,Claudia969,Julia241,Guevara385,F,white,hispanic
3,1b88c0c7-2c6c-7062-c96e-0f9199704237,1985-09-13,,999-79-1205,S99980586,X23697754X,Sofia418,María Eugenia578,Calvillo618,F,white,hispanic
4,f977e4f6-6491-43c7-274d-0629e5a1d2c8,1976-12-23,,999-97-1212,S99953694,X63637742X,Cristobal567,Antonio44,Alcaraz418,M,white,hispanic
...,...,...,...,...,...,...,...,...,...,...,...,...
1139,968da040-7683-9382-b5e4-f777eaf5f59c,1980-03-06,,999-14-6843,S99939996,X82579576X,Long300,Lee268,Smith67,M,white,nonhispanic
1140,0ca9f247-0197-d381-a6e5-e3b5c53430ef,1966-04-05,,999-18-5196,S99979996,X36794173X,Hung902,Quincy153,Haag279,M,native,nonhispanic
1141,3147f01c-bcf2-1aab-aa0d-3cb1b19bc7bf,1975-06-08,,999-26-4329,S99931354,X44734902X,Pat3,,Kerluke267,M,white,nonhispanic
1142,3df4eaa0-3234-0118-df9e-5cbe4659744e,1960-06-20,,999-17-5062,S99980900,X8568568X,Nathaniel596,Wendell199,Gaylord332,M,white,hispanic
