In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os
import scipy as sp
from tqdm import tqdm

# 1. Investigating Chronic Kidney Disease (CKD) in the Synthea Data

## Step 1. Loading of Relevant Data
Data for 4,000 synthetic patients, divided across four states, was generated using the [Synthea tool](https://synthea.mitre.org/). \
This data is split into four separate directories:
```
output_hi/ # patients in Hawaii
output_ma/ # patients in Massachusetts
output_tx/ # patients in Texas
output_wa/ # patients in Washington
```



In the introductory primer, we worked exclusively with the patient demographics data stored in `patients.csv`. \
For the subsequent investigation, we will be relying on two new files:
- `conditions.csv`: Stores SNOMED CT coded patient conditions and diagnoses
- `observations.csv`: Includes vital signs and LOINC coded lab results for all patients

Below is a convenience function that loads one of these tables for all states and concatenates the results
into a single data frame (the cells below read the Parquet copies, `conditions.parquet` and `observations.parquet`). \
It uses an operation that we haven't seen previously, `pd.concat()`, which concatenates DataFrames
together row-wise.

In [None]:
def load_data_for_file(filename,
                       output_dirs=('output_hi', 'output_ma', 'output_tx', 'output_wa'),
                       base_url="https://dicbworkshops.s3.amazonaws.com"):
    """Load one table for every state directory and stack the results row-wise.

    Parameters
    ----------
    filename : str
        Name of the parquet file to read from each directory,
        e.g. ``'conditions.parquet'``.
    output_dirs : iterable of str, optional
        Per-state directories to read from. Defaults to the four workshop
        states (Hawaii, Massachusetts, Texas, Washington).
    base_url : str, optional
        Location that contains the per-state directories.

    Returns
    -------
    pandas.DataFrame
        The per-state frames concatenated in the order of ``output_dirs``.
        Each frame keeps its own row index, so index labels may repeat
        across states.
    """
    print(f"Loading data for {filename}")
    # read the parquet file from each output directory (tqdm shows progress per state)
    state_frames = [
        pd.read_parquet(f"{base_url}/{output_dir}/parquet/{filename}")
        for output_dir in tqdm(list(output_dirs))
    ]
    # use pd.concat to append the data for all states together into a single frame
    return pd.concat(state_frames)

In [None]:
# Fetch both tables with the helper: conditions first, then observations
conditions, observations = (
    load_data_for_file(parquet_name)
    for parquet_name in ('conditions.parquet', 'observations.parquet')
)

For this next investigation, we are interested in patients who have been diagnosed with Chronic Kidney Disease  \
In order to do this, we filter the conditions table to get all diagnoses of chronic kidney disease \
We can use the SNOMED CT codes for Stages 1 - 4 Chronic Kidney Disease
- 431855005	Chronic kidney disease stage 1 (disorder)
- 431856006	Chronic kidney disease stage 2 (disorder)
- 433144002	Chronic kidney disease stage 3 (disorder)
- 431857002	Chronic kidney disease stage 4 (disorder)


In [None]:
# Preview the conditions table; its CODE column is what the CKD filter below uses
conditions

In [None]:
# Keep only the condition rows whose code is one of the four CKD stage codes
ckd = conditions.loc[
    conditions['CODE'].isin({
        431855005,  # Chronic kidney disease stage 1
        431856006,  # Chronic kidney disease stage 2
        433144002,  # Chronic kidney disease stage 3
        431857002,  # Chronic kidney disease stage 4
    })
]

### Quick Exercise: How many patients in our data set have at least one CKD diagnosis?

Now we will analyze the glomerular filtration rate (GFR) in patients with various stages of CKD. \
We can use the LOINC code for Glomerular filtration rate/1.73: `33914-3`. \
We also make sure we pull the normalized values, with units = `mL/min/{1.73_m2}`.

In [None]:
# Preview the observations table; the DESCRIPTION, CODE and UNITS columns are used in the next cells
observations

In [None]:
# Case-insensitive search of the descriptions for GFR-related observations.
# na=False counts a missing DESCRIPTION as "no match"; without it the mask
# would contain NaN and the boolean indexing would raise.
observations[observations['DESCRIPTION'].str.contains('glomerular', case=False, na=False)]

In [None]:
# GFR labs: rows with the GFR LOINC code that are reported in the normalized units
gfr_labs = observations[
    (observations['CODE'] == "33914-3")
    & (observations['UNITS'] == "mL/min/{1.73_m2}")
]

Now we want to label the GFR labs based on whether the patient had a CKD diagnosis at the time the lab was taken \
and which stage of CKD the patient was diagnosed with. We will use the following convention:
- 0 for no CKD diagnosis
- 1 - 4 for stages 1 - 4 respectively

In order to do this correctly, we need to be careful to label each lab based on the dates of the patient's CKD diagnoses.

In [None]:
# SNOMED CT codes of the four CKD stages used in this analysis
STAGE1 = 431855005
STAGE2 = 431856006
STAGE3 = 433144002
STAGE4 = 431857002

# For each stage code, map patient -> earliest START among that patient's diagnoses of the stage
ckd_start_dates = {
    STAGE1: {},
    STAGE2: {},
    STAGE3: {},
    STAGE4: {}
}
# Iterate over just the three needed columns instead of building a Series per row with iterrows()
for code, patient, start in zip(ckd['CODE'], ckd['PATIENT'], ckd['START']):
    earliest_by_patient = ckd_start_dates[code]
    # Record the first date seen for a patient, then only replace it with a strictly earlier one
    # (an explicit membership check avoids relying on a far-future sentinel string)
    if patient not in earliest_by_patient or start < earliest_by_patient[patient]:
        earliest_by_patient[patient] = start

In [None]:
# Add a SIMPLE_DATE column to the GFR labs holding only the calendar date of DATE (time dropped)
lab_dates = pd.to_datetime(gfr_labs['DATE']).dt.date
gfr_labs_simp = gfr_labs.assign(SIMPLE_DATE=lab_dates)


In [None]:
# Inspect the result: the GFR labs now carry the added SIMPLE_DATE column
gfr_labs_simp

In [None]:
# Label every GFR lab with the most advanced CKD stage diagnosed on or before the lab date
# (0 = no CKD diagnosis at the time of the lab).
# Stages are listed from most to least advanced so the first match is the highest stage.
stages_most_advanced_first = ((4, STAGE4), (3, STAGE3), (2, STAGE2), (1, STAGE1))

labels = []
for patient, lab_day in zip(gfr_labs_simp['PATIENT'], gfr_labs_simp['SIMPLE_DATE']):
    # Compare as text: str() of the date gives 'YYYY-MM-DD'
    # (assumes the stored diagnosis START values use the same text format -- TODO confirm)
    lab_date = str(lab_day)
    label = 0
    for stage, code in stages_most_advanced_first:
        diagnosed_on = ckd_start_dates[code].get(patient)
        # a patient without this stage has no entry, so nothing to compare against
        if diagnosed_on is not None and diagnosed_on <= lab_date:
            label = stage
            break
    labels.append(label)

# attach the labels to the GFR labs (one label per row, in row order)
gfr_labs_labeled = gfr_labs_simp.assign(
    LABEL=labels
)

Now we will plot the distributions of GFR in the different cohorts using the Seaborn package \
and display summary statistics for those distributions

In [None]:
# One density curve of GFR per label.
# The frame is passed as data= because the first positional parameter of
# kdeplot is not the dataset in every seaborn release.
ax = sns.kdeplot(
    data=gfr_labs_labeled[['VALUE', 'LABEL']].astype({'VALUE': float}),
    x='VALUE',
    hue='LABEL',
    common_norm=False  # scale each label's density independently of the others
)
ax.set(
    title='GFR distribution by CKD label',
    xlabel='GFR (mL/min/{1.73_m2})'
)
plt.show()

In [None]:
# Box plot of GFR for each label.
# data= is passed by keyword because older seaborn releases do not accept
# the dataset as a positional argument to boxplot.
ax = sns.boxplot(
    data=gfr_labs_labeled.astype({'VALUE': float}),
    x='LABEL',
    y='VALUE'
)
ax.set(
    title='GFR by CKD label',
    xlabel='CKD label (0 = no CKD diagnosis, 1-4 = stage)',
    ylabel='GFR (mL/min/{1.73_m2})'
)
plt.show()

# Discussion Points
1. What is the story with the negative outliers in the Non-CKD group (label 0)? How could we investigate this further for these specific patients?
2. Why do we see positive GFR outliers in the Stage 3 and Stage 4 groups?