# Supervised Machine Learning with 2017 Rush University Medical Center and Rush Oak Park Hospital Inpatient Data Part 1

In [None]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense

In [None]:
# load Inpatient dataset
# The path is relative, so the notebook must be run from a directory that sits
# next to the "Datasets" folder. The loaded frame is kept in the global `IP`.
IP = pd.read_csv("../Datasets/Rush_IP_2017_ACS_Cleaned.csv")

In [None]:
# Preview the first rows to sanity-check the columns that were loaded
IP.head()

In [None]:
# Cast the integer feature columns to float; downcast='float' lets pandas pick
# the smallest float dtype that can represent the values.
for feature in ("Chemotherapy", "HipKneeJoint"):
    IP[feature] = pd.to_numeric(IP[feature], downcast='float')

In [None]:
# Build 0/1 indicator columns for three race/ethnicity categories.
# Any value not equal to the category (including a missing value) scores 0.
race_ethnicity = IP["Race_Ethnicity"]
for category, column in (
    ("Non-Hispanic Black", "RE_NHB"),
    ("Hispanic", "RE_Hisp"),
    ("Other/Unknown", "RE_Other"),
):
    # The boolean match mask becomes 1.0 where it is True and 0.0 elsewhere
    IP[column] = (race_ethnicity == category).astype(float)

In [None]:
# remove rows with missing data
# dropna() with its default arguments discards every row that has at least one
# missing value in any column; the filtered frame replaces IP.
IP = IP.dropna()

In [None]:
# Separate the target (y) from the feature matrix (X).
# ID, the target itself and the raw Race_Ethnicity text column are left out of X.
y = IP["Readmit"]
X = IP.drop(columns=["ID", "Readmit", "Race_Ethnicity"])
print(X.shape, y.shape)

In [None]:
# Split data into training and testing
# stratify=y keeps the Readmit class proportions the same in both subsets and
# random_state=1 makes the split repeatable. No test_size is given, so
# scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [None]:
# Standardize the features. The scaler is fit on the training split only and is
# then applied to both splits, so no test-set statistics enter the scaling.
X_scaler = StandardScaler()
X_scaler.fit(X_train)

X_test_scaled = X_scaler.transform(X_test)
X_train_scaled = X_scaler.transform(X_train)

## Logistic Regression

In [None]:
# Create a logistic regression model
# max_iter is raised above scikit-learn's default of 100 because this model is
# fit on the unscaled feature matrix, where the solver may hit the iteration
# cap (ConvergenceWarning) before reaching its tolerance. If the solver already
# converges within 100 iterations the fitted model is unchanged.
classifier = LogisticRegression(max_iter=1000)
classifier

In [None]:
# Fit the model
# NOTE(review): this trains on the unscaled X_train, whereas the KNN, SVM and
# neural-network cells use X_train_scaled — confirm that this is intentional.
classifier.fit(X_train, y_train)

In [None]:
# Report accuracy on the (unscaled) training and testing splits
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_accuracy}")
print(f"Testing Data Score: {test_accuracy}")

## Decision Tree

In [None]:
# Fit a decision tree on the unscaled training data and show its test accuracy.
# random_state is fixed (same seed as the train/test split) so the score is
# reproducible from run to run; without it, tie-breaking between equally good
# splits is random.
clf = tree.DecisionTreeClassifier(random_state=1)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

## Random Forest

In [None]:
# Fit a 200-tree random forest on the unscaled training data and show its test
# accuracy. random_state is fixed (same seed as the train/test split) so the
# bootstrap samples and feature subsets, and therefore the score, are
# reproducible from run to run.
rf = RandomForestClassifier(n_estimators=200, random_state=1)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

## K-Nearest Neighbors

In [None]:
# Loop through different k values to search for highest accuracy
# Note: Only odd numbers are used to avoid ties
k_values = range(1, 20, 2)  # defined once so the loop and both plots stay in sync
train_scores = []
test_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train)
    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both the training and the testing curve are drawn, so each gets a legend
# entry and the y-axis carries a title that applies to both.
plt.plot(k_values, train_scores, marker='o', label='Training')
plt.plot(k_values, test_scores, marker='x', label='Testing')
plt.xlabel('k neighbors')
plt.ylabel('Accuracy Score')
plt.legend()
plt.show()

In [None]:
# k=9 looked like the best choice in the sweep, so refit with that value and
# report its accuracy on the test split.
knn = KNeighborsClassifier(n_neighbors=9).fit(X_train_scaled, y_train)
k9_test_accuracy = knn.score(X_test_scaled, y_test)
print('k=9 Test Acc: %.3f' % k9_test_accuracy)

## Support Vector Machine

In [None]:
# Train a support vector classifier with a linear kernel on the standardized features
svc_model = SVC(kernel='linear')
svc_model.fit(X_train_scaled, y_train)

In [None]:
# Report the SVM's accuracy on the standardized test split, to three decimals
print('Test Acc: %.3f' % svc_model.score(X_test_scaled, y_test))

## Neural Network

In [None]:
# One-hot encoding
# num_classes is passed explicitly so the training and testing label matrices
# always have two columns, matching the two-unit softmax output layer. When it
# is omitted the width is inferred separately from the largest label in each
# array, which would silently differ if a split lacked the highest class.
y_train_categorical = to_categorical(y_train, num_classes=2)
y_test_categorical = to_categorical(y_test, num_classes=2)

In [None]:
# Create a sequential model
# Starts empty; the layers are appended one at a time in the following cells.
nn_model = Sequential()

In [None]:
# Set up input layer and hidden layer
# The input width is read from the training matrix rather than hard-coded, so
# the layer stays correct if the set of feature columns ever changes.
nn_model.add(Dense(units=24, activation='relu', input_dim=X_train_scaled.shape[1]))

In [None]:
# Set up output layer
# Two softmax units, i.e. one probability per column of the one-hot target
# (assumes Readmit is a binary 0/1 label — confirm against the dataset).
nn_model.add(Dense(units=2, activation='softmax'))

In [None]:
# Compile the model
# categorical_crossentropy pairs with the one-hot targets and softmax output;
# accuracy is tracked as the reported metric during fit() and evaluate().
nn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model
# Runs 100 epochs over the standardized training data, reshuffling the samples
# each epoch; verbose=2 prints one summary line per epoch.
nn_model.fit(X_train_scaled, y_train_categorical, epochs=100, shuffle=True, verbose=2)

In [None]:
# Evaluate the single-hidden-layer network on the standardized test split.
# Because the model was compiled with metrics=['accuracy'], evaluate() returns
# the loss followed by the accuracy.
nn_model_loss, nn_model_accuracy = nn_model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {nn_model_loss}, Accuracy: {nn_model_accuracy}")

## Deep Learning

In [None]:
# Build a network with two 24-unit ReLU hidden layers and a 2-unit softmax output.
# The input width is read from the training matrix rather than hard-coded, so
# the first layer stays correct if the set of feature columns ever changes.
deep_model = Sequential()
deep_model.add(Dense(units=24, activation='relu', input_dim=X_train_scaled.shape[1]))
deep_model.add(Dense(units=24, activation='relu'))
deep_model.add(Dense(units=2, activation='softmax'))

In [None]:
# Compile with the same optimizer, loss and metric as the single-hidden-layer
# network so that the two models' results are directly comparable.
deep_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 100 epochs on the standardized training data, reshuffling the
# samples each epoch; verbose=2 prints one summary line per epoch.
deep_model.fit(X_train_scaled, y_train_categorical, epochs=100, shuffle=True, verbose=2)

In [None]:
# Evaluate the two-hidden-layer network on the standardized test split.
# Because the model was compiled with metrics=['accuracy'], evaluate() returns
# the loss followed by the accuracy.
deep_model_loss, deep_model_accuracy = deep_model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(f"Deep Neural Network - Loss: {deep_model_loss}, Accuracy: {deep_model_accuracy}")

###### Logistic regression, k-nearest neighbors, support vector machine, and neural network yielded accuracies of about 85-86%.