In [1]:
import argparse
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import numpy as np
import logging
from utils.logger import logger_initialization
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

# seed for numpy and sklearn
random_state = 7
np.random.seed(random_state)

In [2]:
# ignore warning of compiling tensorflow from source
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

input_file='datasets/training.csv'

dataset = pd.read_csv(filepath_or_buffer=input_file, delimiter='|')


In [3]:
print(dataset.iloc[0:5])

   PATIENT_KEY  TIME_TO_APPT  NOSHOW  NENCOUNTERTYPE  AGEGROUP  \
0     10977031             7       0               3         1   
1     10977031             7       1               3         1   
2     10977031             7       1               3         1   
3     10977031             7       0               3         1   
4     10977031             7       0               3         1   

  ENCOUNTER_DEPARTMENT_ABBR ENCOUNTER_DEPARTMENT_SPECIALTY  \
0                      KFCH             GENERAL PEDIATRICS   
1                      KFCH             GENERAL PEDIATRICS   
2                      KFCH             GENERAL PEDIATRICS   
3                      KFCH             GENERAL PEDIATRICS   
4                      KFCH             GENERAL PEDIATRICS   

  ENCOUNTER_APPOINTMENT_WEEK_DAY ENCOUNTER_APPOINTMENT_TYPE PATIENT_GENDER  \
0                      Tuesday                     SAME DAY              F   
1                      Tuesday                     SAME DAY              F

In [4]:
# encode class values as integers
encoder = LabelEncoder()
categorical_keys = ['ENCOUNTER_DEPARTMENT_ABBR', 'ENCOUNTER_DEPARTMENT_SPECIALTY', 'ENCOUNTER_APPOINTMENT_WEEK_DAY',
                    'ENCOUNTER_APPOINTMENT_TYPE', 'PATIENT_GENDER']

dataset_floats = dataset.copy()

for key in categorical_keys:
    dataset_floats[key] = encoder.fit_transform(dataset[key])

# remove every row that is missing a value
dataset_floats.dropna(axis=0, inplace=True)

# labels 0 == SHOWUP, 1 == NOSHOW
y = np.array(dataset_floats['NOSHOW'])

dataset_floats['ENCOUNTER_APPOINTMENT_DATETIME'] = pd.to_datetime(dataset_floats['ENCOUNTER_APPOINTMENT_DATETIME'])


dataset=dataset_floats

In [5]:
print('Dataset converted to only digits')
print(dataset.iloc[0:5])

Dataset converted to only digits
   PATIENT_KEY  TIME_TO_APPT  NOSHOW  NENCOUNTERTYPE  AGEGROUP  \
0     10977031             7       0               3         1   
1     10977031             7       1               3         1   
2     10977031             7       1               3         1   
3     10977031             7       0               3         1   
4     10977031             7       0               3         1   

   ENCOUNTER_DEPARTMENT_ABBR  ENCOUNTER_DEPARTMENT_SPECIALTY  \
0                          0                               0   
1                          0                               0   
2                          0                               0   
3                          0                               0   
4                          0                               0   

   ENCOUNTER_APPOINTMENT_WEEK_DAY  ENCOUNTER_APPOINTMENT_TYPE  PATIENT_GENDER  \
0                               1                           0               0   
1                      

In [6]:
number_ones = len(y[y==1])
print('data points NOSHOW true = {0}'.format(number_ones))
number_zeros = len(y[y==0])
print('data points NOSHOW False = {0}'.format(number_zeros))


data points NOSHOW true = 5
data points NOSHOW False = 5


In [7]:
x_train, x_test, y_train, y_test = train_test_split(dataset, y, test_size=0.33, random_state=42) 

In [8]:
x_train = x_train.assign(SHOW_FREQUENCY=np.ones(np.shape(x_train)[0]))
x_test = x_test.assign(SHOW_FREQUENCY=np.ones(np.shape(x_test)[0]))


In [9]:
print(x_train[:5])
print(x_test[:5])

   PATIENT_KEY  TIME_TO_APPT  NOSHOW  NENCOUNTERTYPE  AGEGROUP  \
7      9268929             0       1               3         4   
2     10977031             7       1               3         1   
9      9268929             0       1               3         4   
4     10977031             7       0               3         1   
3     10977031             7       0               3         1   

   ENCOUNTER_DEPARTMENT_ABBR  ENCOUNTER_DEPARTMENT_SPECIALTY  \
7                          1                               0   
2                          0                               0   
9                          1                               0   
4                          0                               0   
3                          0                               0   

   ENCOUNTER_APPOINTMENT_WEEK_DAY  ENCOUNTER_APPOINTMENT_TYPE  PATIENT_GENDER  \
7                               0                           1               0   
2                               1                       

In [10]:
train_data = x_train
test_data = x_test

In [11]:
unique_patient_ids = train_data['PATIENT_KEY'].unique()
print('there are {0} unique patient IDs in the training dataset.'.format(len(unique_patient_ids)))
print('The first 5 patients are:\n{0}'.format(unique_patient_ids[:5]))

there are 2 unique patient IDs in the training dataset.
The first 5 patients are:
[ 9268929 10977031]


In [12]:
# for each patient in the training dataset
for patient_key in unique_patient_ids:
    # get the data point matching the current patient_key
    patient_dataframe = train_data[train_data['PATIENT_KEY'] == patient_key]
    # number of encounters processed
    encounters_processed = 0.0
    # total number of encounter that the patient showed up
    total_shows = 0.0
    # total number of encounter that the patient did not show up
    total_no_shows = 0.0
    # loop through each encounter
    for index, data_point in patient_dataframe.iterrows():
        encounters_processed += 1.0
        # if the patient did not show up
        if data_point['NOSHOW']:
            total_no_shows += 1.0
            prob = 1 - (total_no_shows / encounters_processed)
        else:
            total_shows += 1.0
            prob = total_shows / encounters_processed

        # update the SHOW_FREQUENCY for the specific patient's based on the index processed
        train_data.loc[index, 'SHOW_FREQUENCY'] = prob

In [13]:
train_data[['PATIENT_KEY','NOSHOW', 'SHOW_FREQUENCY', 'ENCOUNTER_APPOINTMENT_DATETIME']]

Unnamed: 0,PATIENT_KEY,NOSHOW,SHOW_FREQUENCY,ENCOUNTER_APPOINTMENT_DATETIME
7,9268929,1,0.0,2015-11-17 11:30:00
2,10977031,1,0.0,2017-08-17 11:00:00
9,9268929,1,0.0,2015-11-19 09:30:00
4,10977031,0,0.5,2017-08-27 11:00:00
3,10977031,0,0.666667,2017-08-20 11:00:00
6,9268929,0,0.333333,2015-11-17 09:30:00


In [28]:
test_data[['PATIENT_KEY','NOSHOW', 'SHOW_FREQUENCY', 'ENCOUNTER_APPOINTMENT_DATETIME']]

Unnamed: 0,PATIENT_KEY,NOSHOW,SHOW_FREQUENCY,ENCOUNTER_APPOINTMENT_DATETIME
8,9268929,1,0.0,2015-11-18 09:30:00
1,10977031,1,0.0,2017-08-16 11:00:00
5,9268929,0,0.0,2015-11-16 09:30:00
0,10977031,0,0.666667,2017-08-28 11:00:00


In [29]:
test_data.loc[0, 'ENCOUNTER_APPOINTMENT_DATETIME'] = pd.datetime(2017,8,28,11,0,0)

In [30]:
for patient_key in unique_testing_patient_ids:
    # if the patient in the testing dataset is not in the training dataset, then continue to the next patient and
    # do not modified the SHOW_FREQUENCY i.e. leave it to 100% chance of showing up
    if patient_key not in train_data['PATIENT_KEY'].values:
        continue

    training_patient_dataframe = train_data[train_data['PATIENT_KEY'] == patient_key]
    testing_patient_dataframe = test_data[test_data['PATIENT_KEY'] == patient_key]

    for test_index, testing_patient in testing_patient_dataframe.iterrows():
        last_encounter_time = pd.datetime(1, 1, 1, 7, 0, 0)
        for _, training_patient in training_patient_dataframe.iterrows():
            if testing_patient['ENCOUNTER_APPOINTMENT_DATETIME'] > \
                    training_patient['ENCOUNTER_APPOINTMENT_DATETIME'] > last_encounter_time:
                show_frequency = training_patient['SHOW_FREQUENCY']
                last_encounter_time = training_patient['ENCOUNTER_APPOINTMENT_DATETIME']
        test_data.loc[test_index, 'SHOW_FREQUENCY'] = show_frequency

In [31]:
test_data[['PATIENT_KEY','NOSHOW', 'SHOW_FREQUENCY', 'ENCOUNTER_APPOINTMENT_DATETIME']]

Unnamed: 0,PATIENT_KEY,NOSHOW,SHOW_FREQUENCY,ENCOUNTER_APPOINTMENT_DATETIME
8,9268929,1,0.0,2015-11-18 09:30:00
1,10977031,1,0.0,2017-08-16 11:00:00
5,9268929,0,0.0,2015-11-16 09:30:00
0,10977031,0,0.5,2017-08-28 11:00:00


In [20]:
train_data[['PATIENT_KEY','NOSHOW', 'SHOW_FREQUENCY', 'ENCOUNTER_APPOINTMENT_DATETIME']]

Unnamed: 0,PATIENT_KEY,NOSHOW,SHOW_FREQUENCY,ENCOUNTER_APPOINTMENT_DATETIME
7,9268929,1,0.0,2015-11-17 11:30:00
2,10977031,1,0.0,2017-08-17 11:00:00
9,9268929,1,0.0,2015-11-19 09:30:00
4,10977031,0,0.5,2017-08-27 11:00:00
3,10977031,0,0.666667,2017-08-20 11:00:00
6,9268929,0,0.333333,2015-11-17 09:30:00
