# Introduction
State notebook purpose here

### Imports
Import libraries and write settings here.

In [333]:
# Data manipulation
import cufflinks as cf
import altair as alt
from IPython import get_ipython
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import numpy as np

# Options for pandas
# Cap how many columns / rows pandas renders when a frame is displayed.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Display all cell outputs
# (every bare expression in a cell is rendered, not only the last one)
InteractiveShell.ast_node_interactivity = 'all'

ipython = get_ipython()

# autoreload extension
# Guarded so that re-running this cell does not load the extension twice.
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# Visualizations
# NOTE(review): two renderers are enabled back to back; the later call
# ('html') presumably supersedes 'notebook' — confirm whether the first
# call is still needed.
alt.renderers.enable('notebook')
alt.renderers.enable('html')

# Configure cufflinks for offline plotting with the 'white' theme.
cf.go_offline(connected=True)
cf.set_config_file(theme='white')

RendererRegistry.enable('notebook')

RendererRegistry.enable('html')

# Data Wrangling

In [334]:
# Reading in data
# All input tables live in one folder; keep its name in a single place so the
# notebook can be pointed at another location by editing one line.
DATA_DIR = "dataScienceTask"

demographics = pd.read_csv(f"{DATA_DIR}/T_demo.csv")
dbp = pd.read_csv(f"{DATA_DIR}/T_DBP.csv")
sbp = pd.read_csv(f"{DATA_DIR}/T_SBP.csv")
creatinine = pd.read_csv(f"{DATA_DIR}/T_creatinine.csv")
glucose = pd.read_csv(f"{DATA_DIR}/T_glucose.csv")
meds = pd.read_csv(f"{DATA_DIR}/T_meds.csv")
hgb = pd.read_csv(f"{DATA_DIR}/T_HGB.csv")
ldl = pd.read_csv(f"{DATA_DIR}/T_ldl.csv")
stage = pd.read_csv(f"{DATA_DIR}/T_stage.csv")

In [335]:
# Aggregating data into 1 dataframe
# Output column name -> source table. Each source table is used through its
# `id`, `time` and `value` columns. The insertion order below fixes the column
# order of the merged frame.
measurement_tables = {
    'value_dbp': dbp,
    'value_sbp': sbp,
    'value_creatinine': creatinine,
    'value_glucose': glucose,
    'value_hgb': hgb,
    'value_ldl': ldl,
}

# Collect one frame per patient and concatenate once at the end.
# (Growing a frame with DataFrame.append inside the loop copies the whole
# frame on every iteration, and the method was removed in pandas 2.0.)
patient_frames = []

for i in demographics['id']:
    temp = None
    for column, table in measurement_tables.items():
        # This patient's readings, with `value` renamed to its final column name
        readings = table.loc[table['id'] == i, ['time', 'value']].rename(
            columns={'value': column})
        # Outer join on `time`: keep a row for every time at which at least
        # one of the measurements was recorded.
        temp = readings if temp is None else temp.merge(
            readings, on='time', how='outer')
    temp = temp.sort_values(by=['time'])
    temp['id'] = i
    patient_frames.append(temp)

df = pd.concat(patient_frames)

# Attach the static demographic attributes to every measurement row.
df = df.merge(demographics[['id', 'gender', 'race']],
              left_on='id', right_on='id')

df

Unnamed: 0,time,value_dbp,value_sbp,value_creatinine,value_glucose,value_hgb,value_ldl,id,gender,race
0,0,95.32,134.11,1.29,6.24,13.51,161.49,0,Male,Unknown
1,98,83.98,133.75,,,,,0,Male,Unknown
2,107,,,1.15,7.20,13.39,111.39,0,Male,Unknown
3,279,65.97,125.08,,,,,0,Male,Unknown
4,286,,,1.44,7.01,12.84,,0,Male,Unknown
...,...,...,...,...,...,...,...,...,...,...
4756,621,72.52,113.38,,,,,299,Male,Asian
4757,652,,,0.82,,,,299,Male,Asian
4758,659,,,,6.20,14.43,79.02,299,Male,Asian
4759,1069,,,,,14.90,,299,Male,Asian


In [336]:
# Brief look at the data
# Summary statistics for the numeric columns of the merged measurement frame.
df.describe()

Unnamed: 0,time,value_dbp,value_sbp,value_creatinine,value_glucose,value_hgb,value_ldl,id
count,4761.0,1821.0,1809.0,1439.0,1556.0,2025.0,1261.0,4761.0
mean,462.41924,79.518583,134.155368,1.328999,6.694833,13.825185,87.558795,148.060912
std,334.49964,10.178676,14.765817,0.357971,1.599058,1.644946,28.156104,88.343602
min,0.0,44.95,91.99,0.24,2.89,8.82,26.1,0.0
25%,196.0,72.83,124.63,1.08,5.62,12.67,67.29,70.0
50%,414.0,79.05,133.44,1.29,6.36,13.92,83.96,147.0
75%,640.0,86.17,143.16,1.53,7.44,14.98,105.14,227.0
max,1429.0,112.93,211.09,3.02,16.61,19.0,198.59,299.0


In [337]:
# Combine the demographics table with the stage table, keyed on patient id.
# The outer join keeps ids that appear in only one of the two tables.
demo_stage = pd.merge(demographics, stage, on='id', how='outer')

In [338]:
# Placeholder for the per-patient summary table assembled in the following cell.
aggregate = pd.DataFrame()

In [339]:
# Cut-offs used to flag a reading as out of range.
# NOTE(review): units and clinical meaning of these limits are not stated in
# this notebook — confirm against the data dictionary.
DBP_HIGH = 85
SBP_HIGH = 129
HGB_LOW = 9
LDL_HIGH = 160
GLUCOSE_LOW, GLUCOSE_HIGH = 3.8, 6.9
CREATININE_HIGH_MALE = 1.2
CREATININE_HIGH_OTHER = 1.0  # applied to every gender value other than 'Male'


def _flagged_share(values, flagged):
    """Return the share of non-null entries of `values` selected by `flagged`.

    Parameters
    ----------
    values : pd.Series
        Readings of one measurement for one patient (may contain NaN).
    flagged : pd.Series of bool
        Mask aligned with `values`; True marks an out-of-range reading.

    Returns
    -------
    float
        count(flagged non-null readings) / count(non-null readings).
    """
    return values[flagged].count() / values.count()


# One summary record per patient. Records are collected in a list and turned
# into a frame once: DataFrame.append was removed in pandas 2.0, and rebuilding
# the list on every run keeps this cell idempotent.
# The per-patient results are stored only inside each record, so the source
# tables (dbp, sbp, hgb, ldl, glucose, creatinine) are never rebound to scalars
# and the earlier merge cell can still be re-run.
patient_rows = []
for i in demographics['id']:
    readings = df[df['id'] == i]
    gender = demographics[demographics['id'] == i]['gender'].item()
    creatinine_limit = (
        CREATININE_HIGH_MALE if gender == 'Male' else CREATININE_HIGH_OTHER)
    glucose_values = readings['value_glucose']

    patient_rows.append({
        'id': i,
        'gender': gender,
        'dbp': _flagged_share(readings['value_dbp'],
                              readings['value_dbp'] > DBP_HIGH),
        'sbp': _flagged_share(readings['value_sbp'],
                              readings['value_sbp'] > SBP_HIGH),
        'hgb': _flagged_share(readings['value_hgb'],
                              readings['value_hgb'] < HGB_LOW),
        'ldl': _flagged_share(readings['value_ldl'],
                              readings['value_ldl'] > LDL_HIGH),
        # Glucose is flagged on either side of the [low, high] band.
        'glucose': _flagged_share(
            glucose_values,
            (glucose_values > GLUCOSE_HIGH) | (glucose_values < GLUCOSE_LOW)),
        'creatinine': _flagged_share(
            readings['value_creatinine'],
            readings['value_creatinine'] > creatinine_limit),
        # Number of distinct drug names recorded for this patient
        'medications': len(meds[meds['id'] == i]['drug'].unique()),
    })

aggregate = pd.DataFrame(patient_rows)

# Add race, age and the Stage_Progress outcome for each patient.
aggregate = aggregate.merge(
    demo_stage[['race', 'age', 'Stage_Progress', 'id']], left_on='id', right_on='id', how='inner')

In [349]:
# looking at some calculated factors in ckd
# First the summary statistics of the per-patient table, then the table itself.
aggregate.describe()
aggregate

Unnamed: 0,id,dbp,sbp,hgb,ldl,glucose,creatinine,medications,age
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,149.5,0.300401,0.621056,0.000417,0.012778,0.363845,0.706853,2.036667,70.383333
std,86.746758,0.287829,0.312236,0.007217,0.07874,0.38217,0.364706,1.186034,9.247058
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0
25%,74.75,0.0,0.364583,0.0,0.0,0.0,0.4,1.0,64.0
50%,149.5,0.25,0.666667,0.0,0.0,0.25,0.857143,2.0,71.0
75%,224.25,0.5,0.875,0.0,0.0,0.75,1.0,3.0,78.0
max,299.0,1.0,1.0,0.125,0.666667,1.0,1.0,5.0,86.0


Unnamed: 0,id,gender,dbp,sbp,hgb,ldl,glucose,creatinine,medications,race,age,Stage_Progress
0,0,Male,0.375000,0.875000,0.0,0.333333,0.333333,0.666667,3,Unknown,70,True
1,1,Female,0.285714,1.000000,0.0,0.000000,1.000000,1.000000,1,White,71,False
2,2,Female,0.750000,1.000000,0.0,0.000000,0.666667,1.000000,1,White,57,True
3,3,Male,0.000000,0.714286,0.0,0.000000,0.333333,0.200000,1,White,74,False
4,4,Female,0.857143,0.857143,0.0,0.000000,1.000000,0.833333,3,White,50,True
...,...,...,...,...,...,...,...,...,...,...,...,...
295,295,Female,0.250000,0.875000,0.0,0.000000,0.142857,1.000000,0,White,79,True
296,296,Female,0.714286,0.571429,0.0,0.000000,1.000000,1.000000,2,White,66,False
297,297,Male,0.000000,0.375000,0.0,0.000000,1.000000,0.666667,5,Unknown,80,True
298,298,Female,0.000000,0.500000,0.0,0.000000,0.000000,0.600000,2,Asian,72,False


In [341]:
# The medication table proved the most difficult to organise because of the
# number of medications, the variations in dosage, and the dates.
# Show summary statistics of its numeric columns, then the distinct drug names.
meds.describe()
meds['drug'].unique()

Unnamed: 0,id,daily_dosage,start_day,end_day
count,2181.0,2181.0,2181.0,2181.0
mean,144.054562,276.653141,291.25447,370.996332
std,87.194538,478.411077,191.087155,190.575803
min,0.0,2.0,-78.0,-4.0
25%,68.0,20.0,126.0,208.0
50%,140.0,50.0,290.0,377.0
75%,222.0,320.0,448.0,534.0
max,299.0,2550.0,691.0,698.0


array(['atorvastatin', 'losartan', 'metformin', 'rosuvastatin',
       'pravastatin', 'carvedilol', 'valsartan', 'metoprolol',
       'simvastatin', 'bisoprolol', 'atenolol', 'lovastatin',
       'olmesartan', 'irbesartan', 'canagliflozin', 'dapagliflozin',
       'telmisartan', 'labetalol', 'nebivolol', 'pitavastatin',
       'propranolol'], dtype=object)

Looking at the different medications, there are several major groups: statins (atorvastatin, rosuvastatin, pravastatin, simvastatin, lovastatin, pitavastatin), angiotensin receptor blockers (losartan, valsartan, olmesartan, irbesartan, telmisartan), a biguanide antihyperglycemic agent (metformin), beta blockers (carvedilol, metoprolol, bisoprolol, atenolol, labetalol, nebivolol, propranolol) and sodium-glucose cotransporter-2 (SGLT2) inhibitors (canagliflozin, dapagliflozin). Statins reduce cholesterol, angiotensin receptor blockers and beta blockers reduce blood pressure, and the biguanide antihyperglycemic agent and the SGLT2 inhibitors reduce blood glucose levels.

In [342]:
# Spot check: number of distinct drug names recorded for patient id 299.
len(meds[meds['id'] == 299]['drug'].unique())

2

In [343]:
# Spot check: every medication row recorded for patient id 299.
meds[meds['id'] == 299]

Unnamed: 0,id,drug,daily_dosage,start_day,end_day
2170,299,atorvastatin,10.0,117,147
2171,299,atorvastatin,10.0,507,537
2172,299,atorvastatin,10.0,529,559
2173,299,atorvastatin,10.0,578,608
2174,299,losartan,100.0,131,161
2175,299,losartan,100.0,143,173
2176,299,losartan,100.0,144,174
2177,299,losartan,100.0,461,491
2178,299,losartan,100.0,497,527
2179,299,losartan,100.0,529,559


# Basic Data Visualization of the Demographics
Looking at basic demographics of the patients

In [344]:
# Binned age histogram of patients, one panel per race, with bars coloured by
# the Stage_Progress flag.
(
    alt.Chart(demo_stage)
    .mark_bar()
    .encode(
        x=alt.X("age:Q", bin=True),
        y=alt.Y('count()', title='Number of Patients'),
        column='race',
        color='Stage_Progress',
    )
    .properties(width=150, height=120)
)

In [345]:
# Binned age histogram of patients, one panel per gender, with bars coloured
# by the Stage_Progress flag.
(
    alt.Chart(demo_stage)
    .mark_bar()
    .encode(
        x=alt.X("age:Q", bin=True),
        y=alt.Y("count()", title='Number of Patients'),
        column='gender',
        color='Stage_Progress',
    )
    .properties(width=300, height=150)
)

Looking at the demographics data, there are more white patients than patients of the other races. In terms of gender, there are more female patients across all age bins, but the difference is not as large as the differences between races.

In [346]:
# Share of patients per age bin, split by the Stage_Progress flag.
# stack="normalize" rescales every bar to 100%, so the y axis is formatted
# as a percentage. (The axis title previously read 'Precentage'.)
alt.Chart(demo_stage, title='Percentage of CKD patients').mark_bar().encode(
    alt.X("age:Q", bin=True),
    alt.Y('count()', stack="normalize", axis=alt.Axis(
        format=".0%"), title='Percentage of Patients'),
    color='Stage_Progress'
).properties(
    width=300,
    height=150
)

Just looking at outcomes (Stage_Progress), there are more patients who do not have chronic kidney disease (CKD) compared to patients who have CKD. 

In [347]:
# Two age histograms coloured by race, built from one shared chart spec:
# rows with Stage_Progress == True go to the "CKD Positive" panel and rows
# with Stage_Progress == 0 to the "CKD Negative" panel. Both panels use the
# same fixed y domain so their bar heights are directly comparable.
positive, negative = [
    alt.Chart(
        demo_stage.loc[demo_stage['Stage_Progress'] == flag],
        title=panel_title,
    )
    .mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    .encode(
        x=alt.X('age:Q', bin=True),
        y=alt.Y(
            'count():Q',
            scale=alt.Scale(domain=(0, 46)),
            title='Number of Patients',
        ),
        color='race:N',
    )
    for flag, panel_title in ((True, "CKD Positive"), (0, "CKD Negative"))
]

# Horizontal concatenation: positive panel on the left, negative on the right.
positive | negative

In [348]:
# Two age histograms coloured by race, one per gender value, built from one
# shared chart spec. Both panels use the same fixed y domain so their bar
# heights are directly comparable.
male, female = [
    alt.Chart(
        demo_stage.loc[demo_stage['gender'] == gender_label],
        title=gender_label,
    )
    .mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    .encode(
        x=alt.X('age:Q', bin=True),
        y=alt.Y(
            'count():Q',
            scale=alt.Scale(domain=(0, 46)),
            title='Number of Patients',
        ),
        color='race:N',
    )
    for gender_label in ("Male", "Female")
]

# Horizontal concatenation: male panel on the left, female on the right.
male | female

# Analysis/Modeling
Do work here

# Results
Show graphs and stats here

# Conclusions and Next Steps
Summarize findings here