# Import and Clean Data

The data must be imported from a CSV file, cleaned to the standards of the question being asked, and adapted into a neural network input.


## References

 * [Andrew Long](https://github.com/andrewwlong/diabetes_readmission/blob/master/diabetes_project.ipynb)
 * [Sheng Weng](https://github.com/swengzju/Predicting-Diabetes-Patient-Readmission/blob/master/Predicting%20Diabetes%20Patient%20Readmission.ipynb)
 * [Bose, et al.](https://github.com/Yawhoong/ISS-Diabetes-Readmission/blob/master/Identification%20of%20Critical%20Risk%20Factors%20leading%20to%20short-term%20readmission%20of%20Diabetic%20Patients.pdf)


In [1]:
# Import modules and data
import pandas as pd
import numpy as np

df = pd.read_csv("diabetic_data.csv")
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [2]:
# Determine whether the diabetes diagnosis was in the top 3
# Drop the diagnoses, they are too scattered
import re
DIABETES_REGEX = re.compile("250")
isDiabetes = lambda s: DIABETES_REGEX.match(s)
df["primary_diag"] = (df["diag_1"].str.contains("^250") |
                      df["diag_2"].str.contains("^250") |
                      df["diag_3"].str.contains("^250")).astype("int")
df.drop(["diag_1", "diag_2", "diag_3"], axis=1, inplace=True)
df.groupby("primary_diag").size()

primary_diag
0    63742
1    38024
dtype: int64

In [3]:
# Remove columns...
#  * encounter_id, patient_nbr (identification)
#  * weight, payer_code, medical_specialty (too sparse)
#  * diag_1, diag_2, diag_3 (too specific)
df.drop(["encounter_id", "patient_nbr", "weight", "payer_code", "medical_specialty"], axis=1, inplace=True)
df.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,primary_diag
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,No,NO,1
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,Up,No,No,No,No,No,Ch,Yes,>30,1
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,No,No,Yes,NO,1
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,Up,No,No,No,No,No,Ch,Yes,NO,1
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,Steady,No,No,No,No,No,Ch,Yes,NO,1


In [4]:
# Filter by...
#  * Must have a gender
df = df[df["gender"] != "Unknown/Invalid"]

#  * Must not be discharged by death or hospice
df = df.loc[~df["discharge_disposition_id"].isin([11, 13, 14, 19, 20, 21])]

df.shape

(99340, 43)

In [5]:
# Normalize continuous columns to [0, 1]
cols_continuous = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient','number_diagnoses']

for col in cols_continuous:
    lo, hi = df[col].min(), df[col].max()
    df[col] = (df[col] - lo) / (hi - lo)

In [6]:
# Normalize numerically-categorical columns (i.e. age)
AGE_MAPPING = {"[{}-{})".format(i * 10, (i + 1) * 10): i * 0.1 + 0.05 for i in range(10)}
df["age"] = df["age"].apply(lambda s: AGE_MAPPING[s])
df[["age"] + cols_continuous].head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,0.05,0.0,0.305344,0.0,0.0,0.0,0.0,0.0,0.0
1,0.15,0.153846,0.442748,0.0,0.2125,0.0,0.0,0.0,0.533333
2,0.25,0.076923,0.076336,0.833333,0.15,0.047619,0.0,0.047619,0.333333
3,0.35,0.076923,0.328244,0.166667,0.1875,0.0,0.0,0.0,0.4
4,0.45,0.0,0.381679,0.0,0.0875,0.0,0.0,0.0,0.266667


In [7]:
# Manipulate categorical columns
cols_categorical = ['race', 'gender',
        'admission_type', 'admission_source',
        'max_glu_serum', 'A1Cresult',
        'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
        'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
        'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
        'tolazamide', 'insulin',
        'glyburide-metformin', 'glipizide-metformin',
        'glimepiride-pioglitazone', 'metformin-rosiglitazone',
        'metformin-pioglitazone', 'change', 'diabetesMed',
        'examide', 'citoglipton']

# Create an "Unknown" category for race
df["race"] = df["race"].replace("?", "Unknown")

# Consolidate admissions types
ADMISSION_TYPE_MAPPING = {
    1: "Emergency",
    2: "Urgent",
    3: "Elective"
}
df["admission_type_id"] = df["admission_type_id"].apply(lambda s: ADMISSION_TYPE_MAPPING.get(s, "Other"))

# Discharge disposition is either going home OK or not going home OK
df["discharge_disposition_id"] = (df["discharge_disposition_id"] == 1).astype("int")

# Admission source is either Emergency, Referral, or Other
ADMISSION_SOURCE_MAPPING = {
    1: "EmergencyRoom",
    7: "Referral"
}
df["admission_source_id"] = df["admission_type_id"].apply(lambda s: ADMISSION_SOURCE_MAPPING.get(s, "Other"))

df.rename(columns={
    "admission_type_id": "admission_type",
    "discharge_disposition_id": "discharged_home",
    "admission_source_id": "admission_source"}, inplace=True)
#df["race", "admission_type", "admission_source", "discharged_home"].head()
df.head()

Unnamed: 0,race,gender,age,admission_type,discharged_home,admission_source,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,primary_diag
0,Caucasian,Female,0.05,Other,0,Other,0.0,0.305344,0.0,0.0,...,No,No,No,No,No,No,No,No,NO,1
1,Caucasian,Female,0.15,Emergency,1,Other,0.153846,0.442748,0.0,0.2125,...,Up,No,No,No,No,No,Ch,Yes,>30,1
2,AfricanAmerican,Female,0.25,Emergency,1,Other,0.076923,0.076336,0.833333,0.15,...,No,No,No,No,No,No,No,Yes,NO,1
3,Caucasian,Male,0.35,Emergency,1,Other,0.076923,0.328244,0.166667,0.1875,...,Up,No,No,No,No,No,Ch,Yes,NO,1
4,Caucasian,Male,0.45,Emergency,1,Other,0.0,0.381679,0.0,0.0875,...,Steady,No,No,No,No,No,Ch,Yes,NO,1


In [8]:
# Handle categorical columns...
#  1. Remove categorical columns that are almost uniform (>99% of entries are one thing)
#  2. Split other columns into multiple boolean columns
n_rows = df.shape[0]
cols_insignificant = []
cols_significant = []
for col in cols_categorical:
    freqs = dict(df.groupby(col).size())
    
    # Remove this column
    if any([float(value)/float(n_rows) > 0.99 for value in freqs.values()]):
        cols_insignificant.append(col)
    
    else:
        # Sanity check
        assert(len(freqs) > 1)
        cols_significant.append(col)
        
        # Convert to column to string if not already string type
        if df[col].dtype != str:
            df[col] = df[col].astype(str)
            
# Extract output
df_output = pd.get_dummies(df["readmitted"])
df.drop(["readmitted"], axis=1, inplace=True)

df_cat = pd.get_dummies(df[cols_significant], drop_first=False)
df_input = pd.concat([df, df_cat], axis=1)

df_input.drop(cols_insignificant + cols_significant, axis=1, inplace=True)

print("Insignificant: {}".format(cols_insignificant))
print("Significant: {}".format(cols_significant))
df_input.head()

Insignificant: ['admission_source', 'nateglinide', 'chlorpropamide', 'acetohexamide', 'tolbutamide', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide', 'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'examide', 'citoglipton']
Significant: ['race', 'gender', 'admission_type', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'change', 'diabetesMed']


Unnamed: 0,age,discharged_home,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,...,rosiglitazone_Steady,rosiglitazone_Up,insulin_Down,insulin_No,insulin_Steady,insulin_Up,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
0,0.05,0,0.0,0.305344,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,1,1,0
1,0.15,1,0.153846,0.442748,0.0,0.2125,0.0,0.0,0.0,0.533333,...,0,0,0,0,0,1,1,0,0,1
2,0.25,1,0.076923,0.076336,0.833333,0.15,0.047619,0.0,0.047619,0.333333,...,0,0,0,1,0,0,0,1,0,1
3,0.35,1,0.076923,0.328244,0.166667,0.1875,0.0,0.0,0.0,0.4,...,0,0,0,0,0,1,1,0,0,1
4,0.45,1,0.0,0.381679,0.0,0.0875,0.0,0.0,0.0,0.266667,...,0,0,0,0,1,0,1,0,0,1


At this point, all cells in the data should be numeric between 0 and 1. There are three different outcomes represented. Outputs are left in the same table as inputs to ensure that they do not get scrambled.

In [9]:
df = pd.concat([df_input, df_output], axis=1)
df.rename(columns={"<30": "OUTPUT_<30", ">30": "OUTPUT_>30", "NO": "OUTPUT_NO"}, inplace=True)
df.head()

Unnamed: 0,age,discharged_home,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,...,insulin_No,insulin_Steady,insulin_Up,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,OUTPUT_<30,OUTPUT_>30,OUTPUT_NO
0,0.05,0,0.0,0.305344,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,1,1,0,0,0,1
1,0.15,1,0.153846,0.442748,0.0,0.2125,0.0,0.0,0.0,0.533333,...,0,0,1,1,0,0,1,0,1,0
2,0.25,1,0.076923,0.076336,0.833333,0.15,0.047619,0.0,0.047619,0.333333,...,1,0,0,0,1,0,1,0,0,1
3,0.35,1,0.076923,0.328244,0.166667,0.1875,0.0,0.0,0.0,0.4,...,0,0,1,1,0,0,1,0,0,1
4,0.45,1,0.0,0.381679,0.0,0.0875,0.0,0.0,0.0,0.266667,...,0,1,0,1,0,0,1,0,0,1


# Neural Network

Bose, et al. used a neural network with the following parameters to model the given data:

 * Layers:
    1. Input (size = 70)
    2. Hidden Layer 1 (size = 70, ReLU activation)
    3. Hidden Layer 2 (size = 20, ReLU activation)
    4. Output Layer (size = 2, softmax)  
 * Dropout Rate: 0.1
 * Batch Size: 100
 * Epoch Size: 50
 * Optimization Algorithm: Adaptive Moment Estimation
 * Cost Function: Optimizes for recall
 
[This tutorial](https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/3_NeuralNetworks/multilayer_perceptron.py) is useful for implementing a basic multi-layer perceptron classifier.

In [10]:
# Pull out validation (15%) and test (15%) data from training (70%) data
df_valid_test = df.sample(frac=0.3, random_state=0xda)
df_training = df.drop(df_valid_test.index)

df_valid = df_valid_test.sample(frac=0.5)
df_test = df_valid_test.drop(df_valid.index, axis=0)

df_training.shape[0], df_valid.shape[0], df_test.shape[0]

(69538, 14901, 14901)

The following code defines an iterator object which remembers where it is while iterating through the training data, meaning that it can be placed in a `for` loop. It gives each category equal representation.

In [11]:
# Separate training data into three groups based on output category
df_training_less = df_training[df_training["OUTPUT_<30"] == 1].to_numpy()
df_training_more = df_training[df_training["OUTPUT_>30"] == 1].to_numpy()
df_training_never = df_training[df_training["OUTPUT_NO"] == 1].to_numpy()

# Get a batch of training data, using the last batch as a reference
class TrainingBatches:
    
    def __init__(self, batch_size=16):
        self.batch_size = batch_size
        self.bounds = max([
            df_training_less.shape[0],
            df_training_more.shape[0],
            df_training_never.shape[0]])
        self.it = 0
        
    def __iter__(self):
        return self
    
    def getSlice(self, array, start, length):
        start = start % array.shape[0]
        if start + length >= array.shape[0]:
            return np.concatenate((array[start:], array[:(start + length) % array.shape[0]]), axis=0)
        else:
            return array[start:start + length]
    
    def next(self):
        if self.it >= self.bounds:
            raise StopIteration
            
        output = np.concatenate((
            self.getSlice(df_training_less, self.it, self.batch_size),
            self.getSlice(df_training_more, self.it, self.batch_size),
            self.getSlice(df_training_never, self.it, self.batch_size)
        ), axis=0)
        self.it += self.batch_size
        
        return output[:, :67], output[:, 67:]

In [12]:
import tensorflow as tf
import matplotlib.pyplot as plt

In [13]:
# Hyperparameters
inlayer_size = 67
hlayer1_size = 70
hlayer2_size = 20
outlayer_size = 3
dropout_rate = 0.1
batch_size = 100
epoch_size = 50
learning_rate = 0.01

In [14]:
# Input/Output
X = tf.placeholder("float", [None, inlayer_size])
Y = tf.placeholder("float", [None, outlayer_size])

# Weights
weights = {
    "h1": tf.Variable(tf.random_normal([inlayer_size, hlayer1_size])),
    "h2": tf.Variable(tf.random_normal([hlayer1_size, hlayer2_size])),
    "out": tf.Variable(tf.random_normal([hlayer2_size, outlayer_size]))
}

# Biases
biases = {
    "h1": tf.Variable(tf.random_normal([hlayer1_size])),
    "h2": tf.Variable(tf.random_normal([hlayer2_size])),
    "out": tf.Variable(tf.random_normal([outlayer_size]))
}

In [15]:
# Create model
def multilayer_perceptron(x):
    layer1 = tf.add(tf.matmul(x, weights["h1"]), biases["h1"])
    layer2 = tf.add(tf.matmul(layer1, weights["h2"]), biases["h2"])
    return tf.matmul(layer2, weights["out"]) + biases["out"]

logits = multilayer_perceptron(X)

loss_op = tf.reduce_mean(tf.losses.softmax_cross_entropy(Y, logits))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)
init = tf.global_variables_initializer()

W0919 00:38:48.985447 139704929851200 deprecation.py:323] From /usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/losses/losses_impl.py:121: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [16]:
# Train the model
with tf.Session() as sess:
    sess.run(init)
    
    for epoch in range(10):
        total_cost = 0.0
        num_batches = 0
        for batch_in, batch_out in iter(TrainingBatches(16)):
            _, cost = sess.run([train_op, loss_op], feed_dict={X: batch_in, Y: batch_out})
            total_cost += cost
            num_batches += 1
            
        print("Epoch {}: cost = {}".format(epoch, total_cost / num_batches))
        
    pred = tf.nn.softmax(logits)
    valid_data = df_valid.to_numpy()
    valid_in = valid_data[:, :67]
    valid_out = valid_data[:, 67:]
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy = ", accuracy.eval({X: valid_in, Y: valid_out}))

Epoch 0: cost = 3.2591192308
Epoch 1: cost = 1.26700463964
Epoch 2: cost = 1.01665586651
Epoch 3: cost = 0.975290271879
Epoch 4: cost = 0.980874980913
Epoch 5: cost = 0.979611031114
Epoch 6: cost = 0.975267961414
Epoch 7: cost = 0.968668671331
Epoch 8: cost = 0.963179378274
Epoch 9: cost = 0.960471337369
('Accuracy = ', 0.4178914)
