# Introduction
State notebook purpose here

### Imports
Import libraries and write settings here.

In [None]:
# Data manipulation
import cufflinks as cf
import altair as alt
from IPython import get_ipython
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import numpy as np
#from sklearnex import patch_sklearn

# Additional packages
from sklearn import preprocessing
import lazypredict

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Display all cell outputs
InteractiveShell.ast_node_interactivity = 'all'

ipython = get_ipython()

# autoreload extension
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

%autoreload 2

# Visualizations
alt.renderers.enable('notebook')
alt.renderers.enable('html')

cf.go_offline(connected=True)
cf.set_config_file(theme='white')

# Data Wrangling

## Basic Data Manipulation

In [None]:
# Reading in data
# Every raw table is loaded from a CSV in the `dataScienceTask` folder. The
# order of the paths matches the order of the names on the left-hand side.
(demographics,
 dbp,
 sbp,
 creatinine,
 glucose,
 meds,
 hgb,
 ldl,
 stage) = [
    pd.read_csv(csv_path)
    for csv_path in (
        "dataScienceTask/T_demo.csv",        # -> demographics
        "dataScienceTask/T_DBP.csv",         # -> dbp
        "dataScienceTask/T_SBP.csv",         # -> sbp
        "dataScienceTask/T_creatinine.csv",  # -> creatinine
        "dataScienceTask/T_glucose.csv",     # -> glucose
        "dataScienceTask/T_meds.csv",        # -> meds
        "dataScienceTask/T_HGB.csv",         # -> hgb
        "dataScienceTask/T_ldl.csv",         # -> ldl
        "dataScienceTask/T_stage.csv",       # -> stage
    )
]

In [None]:
# Aggregating data into 1 dataframe
#
# For every patient the six measurement tables are outer-joined on `time`, so
# a row exists for each time point at which at least one measurement was
# recorded; measurements missing at that time point are NaN.


def _patient_timeline(patient_id):
    """Build the merged measurement timeline for a single patient.

    Parameters
    ----------
    patient_id
        Value of the `id` column identifying the patient.

    Returns
    -------
    pandas.DataFrame
        Columns `time`, `value_dbp`, `value_sbp`, `value_creatinine`,
        `value_glucose`, `value_hgb`, `value_ldl` and `id`, sorted by `time`.
        The frame is empty when the patient has no readings in any table.
    """
    def readings(table):
        # Rows of one measurement table that belong to this patient
        return table.loc[table['id'] == patient_id, ['time', 'value']]

    timeline = readings(dbp).merge(
        readings(sbp), on='time', how='outer', suffixes=('_dbp', '_sbp'))

    # The remaining tables all expose a generic `value` column; give it its
    # final name before joining so each measurement keeps its own column.
    for column, table in (('value_creatinine', creatinine),
                          ('value_glucose', glucose),
                          ('value_hgb', hgb),
                          ('value_ldl', ldl)):
        timeline = timeline.merge(
            readings(table).rename(columns={'value': column}),
            on='time', how='outer')

    timeline = timeline.sort_values(by=['time'])
    timeline['id'] = patient_id
    return timeline


# Collect the per-patient frames and concatenate once. `DataFrame.append` was
# removed in pandas 2.0, and appending inside the loop re-copied the growing
# frame on every iteration.
timelines = [_patient_timeline(patient_id) for patient_id in demographics['id']]
# Patients without any readings contribute no rows; dropping their empty
# frames keeps them from influencing the dtypes of the concatenated result.
timelines = [timeline for timeline in timelines if not timeline.empty]

df = pd.concat(timelines).merge(demographics[['id', 'gender', 'race']], on='id')

df

In [None]:
# Brief look at the data: summary statistics of the merged measurement frame
# (by default `describe` reports on the numeric columns only)
df.describe()

In [None]:
# Combine demographics with the disease stage progression label, one join on
# the patient `id`. The outer join keeps ids present in only one of the tables.
demo_stage = pd.merge(demographics, stage, on='id', how='outer')

In [None]:
# The medication table proved the most difficult to organise because of the
# number of medications, the variations in dosage, and the dates.
# Show summary statistics first, then the distinct drug names.
meds.describe()
meds['drug'].unique()

Looking at the different medications, there are several major groups: statins (atorvastatin, rosuvastatin, pravastatin, simvastatin, lovastatin, pitavastatin), angiotensin receptor blockers (losartan, valsartan, olmesartan, irbesartan, telmisartan), a biguanide antihyperglycemic agent (metformin), beta blockers (carvedilol, metoprolol, bisoprolol, atenolol, labetalol, nebivolol, propranolol), and a sodium-glucose cotransporter-2 (SGLT2) inhibitor (canagliflozin). Statins reduce cholesterol, angiotensin receptor blockers and beta blockers reduce blood pressure, and the biguanide antihyperglycemic agent and the SGLT2 inhibitor reduce blood glucose levels. As a simple initial feature-engineering step, only the number of distinct medications per patient is used. This count is expected to correlate strongly with the number of CKD risk factors the patient has.

In [None]:
# Start from an empty frame; `aggregate` ends up holding one row of engineered
# features per patient.
aggregate = pd.DataFrame()

In [None]:
# Engineer one feature row per patient: for every measurement type, the share
# of that patient's non-missing readings that fall outside the cut-off used
# here, plus the number of distinct drugs in the medication table.
#
# Notes on the implementation:
# * Loop locals are deliberately NOT named dbp/sbp/hgb/ldl/glucose/creatinine,
#   so the raw measurement tables of the same names are not overwritten and
#   the aggregation cell above can still be re-run.
# * Rows are collected in a list and turned into a frame once
#   (`DataFrame.append` was removed in pandas 2.0). Because `aggregate` is
#   rebuilt from scratch, re-running this cell cannot duplicate rows.


def _abnormal_share(values, flagged):
    """Return the fraction of non-missing readings that are flagged.

    Parameters
    ----------
    values : pandas.Series
        Readings of one measurement for one patient (may contain NaN).
    flagged : pandas.Series
        Boolean Series aligned with `values`, True where a reading is outside
        the cut-off. Comparisons against NaN are False, so missing readings
        are never flagged.

    Returns
    -------
    float
        flagged count / non-missing count, or NaN when the patient has no
        non-missing reading (avoids a 0/0 division).
    """
    measured = values.count()
    if measured == 0:
        return np.nan
    return flagged.sum() / measured


feature_columns = ['id', 'gender', 'dbp', 'sbp', 'hgb', 'ldl',
                   'glucose', 'creatinine', 'medications']
feature_rows = []

for patient_id in demographics['id']:
    # Select this patient's rows once instead of re-filtering `df` per feature
    readings = df[df['id'] == patient_id]
    # `.item()` raises unless exactly one demographics row matches the id
    gender = demographics.loc[demographics['id'] == patient_id, 'gender'].item()
    # The creatinine cut-off depends on gender: 1.2 for 'Male', 1.0 otherwise
    creatinine_limit = 1.2 if gender == 'Male' else 1.0
    glucose_values = readings['value_glucose']

    feature_rows.append({
        'id': patient_id,
        'gender': gender,
        'dbp': _abnormal_share(readings['value_dbp'],
                               readings['value_dbp'] > 85),
        'sbp': _abnormal_share(readings['value_sbp'],
                               readings['value_sbp'] > 129),
        'hgb': _abnormal_share(readings['value_hgb'],
                               readings['value_hgb'] < 9),
        'ldl': _abnormal_share(readings['value_ldl'],
                               readings['value_ldl'] > 160),
        # Glucose is flagged on either side of the 3.8 - 6.9 band
        'glucose': _abnormal_share(
            glucose_values, (glucose_values > 6.9) | (glucose_values < 3.8)),
        'creatinine': _abnormal_share(
            readings['value_creatinine'],
            readings['value_creatinine'] > creatinine_limit),
        'medications': len(meds.loc[meds['id'] == patient_id, 'drug'].unique()),
    })

# Passing `columns` fixes the column order; later cells address the columns of
# this frame by position.
aggregate = pd.DataFrame(feature_rows, columns=feature_columns)

aggregate = aggregate.merge(
    demo_stage[['race', 'age', 'Stage_Progress', 'id']], on='id', how='inner')

In [None]:
# Looking at the engineered per-patient CKD features: summary statistics
# first, then the frame itself
aggregate.describe()
aggregate

## Data Manipulation for Machine Learning

In [None]:
# Dealing with categorical data
# Convert the feature frame to a NumPy array; the cells below address its
# columns by position (1 = gender, -3 = race, -1 = Stage_Progress), following
# the column order of `aggregate`.
# NOTE(review): the array is presumably of object dtype because the frame
# mixes strings and numbers - confirm before feeding it to a model.
data_array = np.array(aggregate)
data_array

In [None]:
# Replace the gender labels (column 1) with ordinal codes.
# The encoder expects a 2-D input, hence the single-column reshape; the
# encoded result is flattened again before it is written back in place.
enc = preprocessing.OrdinalEncoder()
gender_column = data_array[:, 1].reshape(-1, 1)
enc.fit(gender_column)
data_array[:, 1] = enc.transform(gender_column).reshape(-1)
data_array
enc.get_params()

In [None]:
# Replace the stage progress labels (last column) with ordinal codes.
# `enc` is re-fitted here, so from this point on it describes the stage
# progress categories rather than the gender categories.
stage_column = data_array[:, -1].reshape(-1, 1)
enc.fit(stage_column)
data_array[:, -1] = enc.transform(stage_column).reshape(-1)
data_array
enc.get_params()

In [None]:
# Do one hot encoding for race (categorical data).
# Race sits in the third column from the end of `data_array`.
# Change "Unknown" into np.nan so unknown race is treated as a missing value
data_array[:, -3] = np.where(data_array[:, -3] == 'Unknown',
                             np.nan, data_array[:, -3])
# Do one hot encoding for race; `race` holds one indicator column per category
enc2 = preprocessing.OneHotEncoder()
enc2.fit(data_array[:, -3].reshape(-1, 1))
race = enc2.transform(data_array[:, -3].reshape(-1, 1)).toarray()
# Show the generated indicator column names. `get_feature_names` was
# deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# `get_feature_names_out`; fall back to the old name on older releases.
# Kept as a single top-level expression so the notebook still displays it.
(enc2.get_feature_names_out() if hasattr(enc2, 'get_feature_names_out')
 else enc2.get_feature_names())

In [None]:
# Swap the single race column (third from the end) for its one-hot indicator
# columns: drop the original column, then insert the indicators in front of
# the last two remaining columns.
data_array_final = np.insert(
    np.delete(data_array, -3, axis=1),
    [-2],
    race,
    axis=1,
)
data_array_final

In [None]:
# Display the final feature array again
data_array_final

In [None]:
# Column names for `data_array_final`, in positional order: the engineered
# features, the one-hot race indicators, then age and the target.
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which previously fused 'x0_nan' and 'age' into the
# single name 'x0_nanage' and left the list one name short.
col_names = ['id',
             'gender',
             'dbp',
             'sbp',
             'hgb',
             'ldl',
             'glucose',
             'creatinine',
             'medications',
             'x0_Asian', 'x0_Black', 'x0_Hispanic', 'x0_White', 'x0_nan',
             'age',
             'Stage_Progress']

# Basic Data Visualization of the Demographics
Looking at basic demographics of the patients

In [None]:
# Binned age histogram, one panel per race, bars coloured by stage progress
alt.Chart(demo_stage).mark_bar().encode(
    x=alt.X("age:Q", bin=True),
    y=alt.Y('count()', title='Number of Patients'),
    column=alt.Column('race'),
    color=alt.Color('Stage_Progress'),
).properties(width=150, height=120)

In [None]:
# Binned age histogram, one panel per gender, bars coloured by stage progress
alt.Chart(demo_stage).mark_bar().encode(
    x=alt.X("age:Q", bin=True),
    y=alt.Y("count()", title='Number of Patients'),
    column=alt.Column('gender'),
    color=alt.Color('Stage_Progress'),
).properties(width=300, height=150)

Looking at the demographics data, there are more white patients than patients of any other race. In terms of gender, there are more female patients across all age bins, but the difference is not as large as the differences between races.

In [None]:
# Share of each stage-progress outcome within every age bin.
# stack="normalize" rescales each bar to the same height, so the y axis shows
# a proportion (formatted as a percentage) instead of a raw patient count.
# The axis title previously read 'Precentage'; the typo is fixed here.
alt.Chart(demo_stage, title='Percentage of CKD patients').mark_bar().encode(
    alt.X("age:Q", bin=True),
    alt.Y('count()', stack="normalize", axis=alt.Axis(
        format=".0%"), title='Percentage of Patients'),
    color='Stage_Progress'
).properties(
    width=300,
    height=150
)

Just looking at outcomes (Stage_Progress), there are more patients who do not have chronic kidney disease (CKD) than patients who have CKD, i.e. the dataset is imbalanced.

In [None]:
def _age_histogram_by_race(patients, chart_title):
    """Binned age histogram of `patients`, bars coloured by race.

    The y scale is pinned to 0-46 so charts built with this helper share the
    same axis and can be compared side by side.
    """
    bars = alt.Chart(patients, title=chart_title).mark_bar(
        cornerRadiusTopLeft=3,
        cornerRadiusTopRight=3,
    )
    return bars.encode(
        x=alt.X('age:Q', bin=True),
        y=alt.Y('count():Q',
                scale=alt.Scale(domain=(0, 46)),
                title='Number of Patients'),
        color='race:N',
    )


# Split the patients on the stage-progress flag and chart each group
positive = _age_histogram_by_race(
    demo_stage.loc[demo_stage['Stage_Progress'] == True], "CKD Positive")
negative = _age_histogram_by_race(
    demo_stage.loc[demo_stage['Stage_Progress'] == 0], "CKD Negative")

# `|` places the two charts next to each other
positive | negative

In [None]:
# Same age histogram (bars coloured by race, y scale pinned to 0-46), built
# once per gender; the gender label doubles as the chart title.
male, female = (
    alt.Chart(demo_stage.loc[demo_stage['gender'] == gender_label],
              title=gender_label)
    .mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
    .encode(
        x=alt.X('age:Q', bin=True),
        y=alt.Y('count():Q',
                scale=alt.Scale(domain=(0, 46)),
                title='Number of Patients'),
        color='race:N',
    )
    for gender_label in ('Male', 'Female')
)

# `|` places the two charts next to each other
male | female

# Analysis/Modeling
Do work here

# Results
Show graphs and stats here

# Conclusions and Next Steps
Summarize findings here