In [1]:
## Set up directory and load libraries

setwd("C:/Users/HerComputer/Desktop/DSC630Files")
library(caret)

# Read in datafile for charts
hospital_data <- read.csv('diabetic_data.csv', header=TRUE, stringsAsFactors=FALSE,
                         na.strings=c("","?"))
head(hospital_data)

Loading required package: lattice
"package 'lattice' was built under R version 3.6.3"Loading required package: ggplot2
"package 'ggplot2' was built under R version 3.6.3"

encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted
2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
35754,82637451,Caucasian,Male,[50-60),,2,1,2,3,...,No,Steady,No,No,No,No,No,No,Yes,>30


In [2]:
# Quick evaluation of data
str(hospital_data)
dim(hospital_data)

'data.frame':	101766 obs. of  50 variables:
 $ encounter_id            : int  2278392 149190 64410 500364 16680 35754 55842 63768 12522 15738 ...
 $ patient_nbr             : int  8222157 55629189 86047875 82442376 42519267 82637451 84259809 114882984 48330783 63555939 ...
 $ race                    : chr  "Caucasian" "Caucasian" "AfricanAmerican" "Caucasian" ...
 $ gender                  : chr  "Female" "Female" "Female" "Male" ...
 $ age                     : chr  "[0-10)" "[10-20)" "[20-30)" "[30-40)" ...
 $ weight                  : chr  NA NA NA NA ...
 $ admission_type_id       : int  6 1 1 1 1 2 3 1 2 3 ...
 $ discharge_disposition_id: int  25 1 1 1 1 1 1 1 1 3 ...
 $ admission_source_id     : int  1 7 7 7 7 2 2 7 4 4 ...
 $ time_in_hospital        : int  1 3 2 2 1 3 4 5 13 12 ...
 $ payer_code              : chr  NA NA NA NA ...
 $ medical_specialty       : chr  "Pediatrics-Endocrinology" NA NA NA ...
 $ num_lab_procedures      : int  41 59 11 44 51 31 70 73 68 33 ...
 $ num_p

In [3]:
# Keep only observations where patients were released home
keep_discharge <- c(1,6,8,13,16,17)
hospital_data <- subset(hospital_data, discharge_disposition_id %in% keep_discharge)
dim(hospital_data)

In [4]:
# Keep only observations where patients answered gender question
keep_gender <- c('Male', 'Female')
hospital_data <- subset(hospital_data, gender %in% keep_gender)
dim(hospital_data)

In [5]:
# Make sure all observations have a diagnosis code
sum(is.na(hospital_data$diag_1))

In [6]:
# Drop any observations without a diagnosis code
hospital_data <- subset(hospital_data, !is.na(diag_1))
sum(is.na(hospital_data$diag_1))

In [7]:
# Convert readmitted (target variable) to 0/1
hospital_data['readmitted'][hospital_data['readmitted'] == 'NO'] <- '0'
hospital_data['readmitted'][hospital_data['readmitted'] == '>30'] <- '0'
hospital_data['readmitted'][hospital_data['readmitted'] == '<30'] <- '1'
head(hospital_data)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted
2,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,0
3,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,0
4,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,0
5,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,0
6,35754,82637451,Caucasian,Male,[50-60),,2,1,2,3,...,No,Steady,No,No,No,No,No,No,Yes,0
7,55842,84259809,Caucasian,Male,[60-70),,3,1,2,4,...,No,Steady,No,No,No,No,No,Ch,Yes,0


In [8]:
# Make sure there are no N/A values for readmitted
sum(is.na(hospital_data$readmitted))

In [9]:
# Check how many na values are in data
sum(is.na(hospital_data))

In [10]:
# Replace all na with 0
hospital_data[is.na(hospital_data)] <- 0
sum(is.na(hospital_data))

In [11]:
# On all medicines, only need to know if patient is taking it or not
# Change in dose is being treated as irrelevant
med_cols = c('metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride', 
            'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
            'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
            'insulin', 'glyburide.metformin', 'glipizide.metformin', 'glimepiride.pioglitazone',
            'metformin.rosiglitazone', 'metformin.pioglitazone')

for (i in med_cols) {
    hospital_data[[i]][hospital_data[[i]] == 'Steady'] <- '1'
    hospital_data[[i]][hospital_data[[i]] == 'Up'] <- '1'
    hospital_data[[i]][hospital_data[[i]] == 'Down'] <- '1'
    hospital_data[[i]][hospital_data[[i]] == 'No'] <- '1'
}

head(hospital_data)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide.metformin,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted
2,149190,55629189,Caucasian,Female,[10-20),0,1,1,7,3,...,No,1,1,1,1,1,1,Ch,Yes,0
3,64410,86047875,AfricanAmerican,Female,[20-30),0,1,1,7,2,...,No,1,1,1,1,1,1,No,Yes,0
4,500364,82442376,Caucasian,Male,[30-40),0,1,1,7,2,...,No,1,1,1,1,1,1,Ch,Yes,0
5,16680,42519267,Caucasian,Male,[40-50),0,1,1,7,1,...,No,1,1,1,1,1,1,Ch,Yes,0
6,35754,82637451,Caucasian,Male,[50-60),0,2,1,2,3,...,No,1,1,1,1,1,1,No,Yes,0
7,55842,84259809,Caucasian,Male,[60-70),0,3,1,2,4,...,No,1,1,1,1,1,1,Ch,Yes,0


In [12]:
# Separate into training and testing
inTraining <- createDataPartition(hospital_data$readmitted, p = .80, list = FALSE)
training_data <- hospital_data[inTraining,]
training_x <- subset(training_data, select = -readmitted)
training_y <- training_data$readmitted
testing_data <- hospital_data[-inTraining,]
testing_x <- subset(testing_data, select = -readmitted)
testing_y <- testing_data$readmitted

# Check that training and testing data is balanced
training_percentages <- 100*(table(training_data$readmitted)/length(training_data$readmitted))
testing_percentages <- 100*(table(testing_data$readmitted)/length(testing_data$readmitted))

dim(training_data)
training_percentages

dim(testing_data)
testing_percentages


        0         1 
90.124735  9.875265 


        0         1 
90.128988  9.871012 

In [13]:
# Save data to files for Python to read in

write.csv(training_data, 'train_data.csv', row.names=FALSE)
write.csv(testing_data, 'test_data.csv', row.names=FALSE)