# MAM01 Assignment Data Science in Python

Course: Fundamentals of data science in medicine (Amsterdam UMC, UvA)

The goal of this notebook is to reproduce table 3 from the paper by Strack et al. [1] in python. 

### Data prep 

In [35]:
#imports 
import pandas as pd
import numpy as np

# set up the database with pandas
data_path = "data/diabetic_data_initial.csv"
data_diabetic_df = pd.read_csv(data_path, header='infer', sep=",")
print("rows:", data_diabetic_df.shape[0])

# preprocessing mentioned in the paper
# use only first encounters
data_diabetic_first_encounters = data_diabetic_df.drop_duplicates('patient_nbr') 
print("rows:", data_diabetic_first_encounters.shape[0])

# remove all encounters that resulted in either discharge to a hospice or patient death
rows_to_remove = [11,13,14]
data_diabetic_selected = data_diabetic_first_encounters
for row in rows_to_remove:
    data_diabetic_selected = data_diabetic_selected[data_diabetic_selected['discharge_disposition_id'] != row]
print("rows:", data_diabetic_selected.shape[0])

rows: 101766
rows: 71518
rows: 69980


### Helper functions

In [37]:
def find_encounters(column_name, condition, extra_column_name='', extra_condition=''):
    if len(extra_column_name) > 0 or len(extra_condition) > 0:
        return data_diabetic_selected[(data_diabetic_selected[column_name] == condition) & (data_diabetic_selected[extra_column_name] == extra_condition)].shape[0]
    else:
        return sum(data_diabetic_selected[column_name] == condition)

def find_percentage(no_encounters):
    population_size = data_diabetic_selected.shape[0]
    return round(no_encounters / population_size * 100, 1)

def find_readmitted_encounters(column_name, condition, extra_column_name='', extra_condition=''):
    placeholder_df = data_diabetic_selected[data_diabetic_selected['readmitted'] == '<30']
    if len(extra_column_name) > 0 or len(extra_condition) > 0:
        return placeholder_df[(placeholder_df[column_name] == condition) & (placeholder_df[extra_column_name] == extra_condition)].shape[0]
    else:
        return placeholder_df[placeholder_df[column_name] == condition].shape[0]

def find_readmitted_percentage(no_readmitted, no_encounters):
    return round(no_readmitted / no_encounters * 100, 1)

def calculate_row(row_name, column_name, condition, extra_column_name='', extra_condition=''):
    encounters = find_encounters(column_name, condition, extra_column_name, extra_condition)
    percentage = find_percentage(encounters)
    readmitted_encounter = find_readmitted_encounters(column_name, condition, extra_column_name, extra_condition)
    readmitted_percentage = find_readmitted_percentage(readmitted_encounter, encounters)
    return [row_name, encounters, percentage, readmitted_encounter, readmitted_percentage]

def calculate_range_row(row_name, column_name, row_range, extra_column_name='', extra_condition=''):
    encounters = 0
    readmitted_encounter = 0
    
    for i in row_range:
        encounters += find_encounters(column_name, i, extra_column_name, extra_condition)
        readmitted_encounter += find_readmitted_encounters(column_name, i, extra_column_name, extra_condition)

    percentage = find_percentage(encounters)
    readmitted_percentage = find_readmitted_percentage(readmitted_encounter, encounters)

    return [row_name, encounters, percentage, readmitted_encounter, readmitted_percentage]

def filter_values(value_str, column_name):
    return np.unique([x for x in data_diabetic_selected[column_name] if value_str in x])

### Calculate & assemble the table as a dataframe

In [95]:
# make an empty dataframe
table3_df = pd.DataFrame(columns=['Variable', 'Number of encounters', '% of the population', '(Readmitted) number of encounters', '(Readmitted) % of the group'])
table3_df

Unnamed: 0,Variable,Number of encounters,% of the population,(Readmitted) number of encounters,(Readmitted) % of the group


In [96]:
# populate the empty dataframe to recreate table 3

# HbA1c
table3_df.loc[len(table3_df)] = calculate_row('No test was performed', 'A1Cresult', 'None')
table3_df.loc[len(table3_df)] = calculate_row('Result was high and the diabetic medication was changed', 'A1Cresult', '>8', 'change', 'Ch')
table3_df.loc[len(table3_df)] = calculate_row('Result was high but the diabetic medication was not changed', 'A1Cresult', '>8', 'change', 'No')
table3_df.loc[len(table3_df)] = calculate_row('Normal result of the test', 'A1Cresult', 'Norm')

# Gender
table3_df.loc[len(table3_df)] = calculate_row('Female', 'gender', 'Female')
table3_df.loc[len(table3_df)] = calculate_row('Male', 'gender', 'Male')

# Discharge disposition
table3_df.loc[len(table3_df)] = calculate_row('Discharged to home', 'discharge_disposition_id', 1)
table3_df.loc[len(table3_df)] = calculate_range_row('Otherwise', 'discharge_disposition_id', range(2,29)) # take all other discharge ID's

# Admission source
table3_df.loc[len(table3_df)] = calculate_row('Admitted from emergency room', 'admission_source_id', 7)
table3_df.loc[len(table3_df)] = calculate_range_row('Admitted because of physician/clinic referral', 'admission_source_id', range(1,3)) # physician and clinical referral ID
other_admission_ids = np.setdiff1d(np.array(range(1,26)), np.array([1,2,7]))
table3_df.loc[len(table3_df)] = calculate_range_row('Otherwise', 'admission_source_id', other_admission_ids) # take all other admission ID's

# Specialty of the admitting physician
table3_df.loc[len(table3_df)] = calculate_row('Internal Medicine', 'medical_specialty', 'InternalMedicine')
cardiology_specialties = filter_values("Cardiology", "medical_specialty") # find all cardiology specialties
table3_df.loc[len(table3_df)] = calculate_range_row('Cardiology', 'medical_specialty', cardiology_specialties)
surgeon_specialties = filter_values("Surge", "medical_specialty") # find all surgery specialties
table3_df.loc[len(table3_df)] = calculate_range_row('Surgery', 'medical_specialty', surgeon_specialties)
table3_df.loc[len(table3_df)] = calculate_row('Family/GeneralPractice', 'medical_specialty', 'Family/GeneralPractice')
table3_df.loc[len(table3_df)] = calculate_row('Missing or unknown', 'medical_specialty', '?')
all_used_specialties = np.concatenate((np.array(['InternalMedicine', 'Family/GeneralPractice', '?']), np.concatenate((cardiology_specialties, surgeon_specialties)))) # array of all used specialties
all_specialties = [np.unique(data_diabetic_selected['medical_specialty'])] # array of all specialties
other_specialties = np.setdiff1d(all_specialties, all_used_specialties) # all other specialties
table3_df.loc[len(table3_df)] = calculate_range_row('Other', 'medical_specialty', other_specialties)

# Primary diagnosis is excluded

# Race
table3_df.loc[len(table3_df)] = calculate_row('African American', 'race', 'AfricanAmerican')
table3_df.loc[len(table3_df)] = calculate_row('Caucasian', 'race', 'Caucasian')
other_races = ['Asian', 'Other', 'Hispanic'] # define 'other'  races
table3_df.loc[len(table3_df)] = calculate_range_row('Other', 'race', other_races)
table3_df.loc[len(table3_df)] = calculate_row('Missing', 'race', '?')

# Age
under_30 = ['[0-10)', '[10-20)', '[20-30)']
table3_df.loc[len(table3_df)] = calculate_range_row('30 years old or younger', 'age', under_30)
thirty_sixty = ['[30-40)', '[40-50)', '[50-60)']
table3_df.loc[len(table3_df)] = calculate_range_row('30–60 years old', 'age', thirty_sixty)
over_60 = ['[60-70)', '[70-80)', '[80-90)', '[90-100)']
table3_df.loc[len(table3_df)] = calculate_range_row('Older than 60', 'age', over_60)


table3_df

Unnamed: 0,Variable,Number of encounters,% of the population,(Readmitted) number of encounters,(Readmitted) % of the group
0,No test was performed,57134,81.6,5199,9.1
1,Result was high and the diabetic medication wa...,4058,5.8,348,8.6
2,Result was high but the diabetic medication wa...,2181,3.1,161,7.4
3,Normal result of the test,3742,5.3,323,8.6
4,Female,37234,53.2,3361,9.0
5,Male,32743,46.8,2916,8.9
6,Discharged to home,44317,63.3,3078,6.9
7,Otherwise,25663,36.7,3199,12.5
8,Admitted from emergency room,37266,53.3,3446,9.2
9,Admitted because of physician/clinic referral,22655,32.4,1952,8.6


In [135]:
# make an empty dataframe (age numeric)
age_num_df = pd.DataFrame(columns=['Age (Numeric)', 'Mean', 'Median', '1st Qu.', '3rd Qu.'])
age_num_df

Unnamed: 0,Age (Numeric),Mean,Median,1st Qu.,3rd Qu.


In [136]:
# populate the empty dataframe to recreate table 3 (age numeric)
estimated_ages = [int(x[-3:-1])-5 for x in data_diabetic_selected['age']] # take the estimated mean of the age group
age_num_df.loc[len(age_num_df)] = ["Age in years", round(np.mean(estimated_ages), 1), np.median(estimated_ages), np.quantile(estimated_ages, 0.25), np.quantile(estimated_ages, 0.75)]

estimated_days = [int(x) for x in data_diabetic_selected['time_in_hospital'] if int(x) < 14]
age_num_df.loc[len(age_num_df)] = ["Age in years", round(np.mean(estimated_days), 1), np.median(estimated_days), np.quantile(estimated_days, 0.25), np.quantile(estimated_days, 0.75)]


age_num_df

Unnamed: 0,Age (Numeric),Mean,Median,1st Qu.,3rd Qu.
0,Age in years,62.9,65.0,55.0,75.0
1,Age in years,4.2,3.0,2.0,6.0


## References
1. Strack, B., DeShazo, J. P., Gennings, C., Olmo, J. L., Ventura, S., Cios, K. J., & Clore, J. N. (2014). Impact of HbA1c measurement on hospital readmission rates: analysis of 70,000 clinical database patient records. BioMed research international, 2014.