# Unsupervised Machine Learning with 2017 Rush University Medical Center and Rush Oak Park Hospital Inpatient Data 

In [None]:
# import dependencies
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# load Inpatient dataset
# Reads the CSV at this path (relative to the current working directory) into
# the DataFrame IP, which every later cell works from.
IP = pd.read_csv("../Datasets/Rush_IP_2017_ACS_Cleaned.csv")

In [None]:
# list column names
# Bare expression with no assignment: it only shows the column index as the
# cell output when run interactively.
IP.columns

In [None]:
# get dummy variables for race/ethnicity variable
# Each indicator is 1.0 where Race_Ethnicity equals the named category and 0.0
# everywhere else. Rows whose value matches none of the three categories get 0
# in all three columns, so they act as the reference level.
IP["RE_NHB"] = (IP["Race_Ethnicity"] == "Non-Hispanic Black").astype(float)
IP["RE_Hisp"] = (IP["Race_Ethnicity"] == "Hispanic").astype(float)
IP["RE_Other"] = (IP["Race_Ethnicity"] == "Other/Unknown").astype(float)

In [None]:
# drop missing rows
# dropna() with no arguments removes every row that has a missing value in ANY
# column of IP, not only in the columns selected for clustering further down.
IP = IP.dropna()

In [None]:
# select columns for analysis
# Race_Ethnicity itself is left out; it is represented by the three RE_*
# indicator columns at the end of the list.
IP_Analysis = IP.loc[:, [
    'Age',
    'no_diploma_pct',
    'HS_Grad_ge25_pct',
    'HS_Grad_orHigher_pct',
    'Bach_Deg_ge25_pct',
    'Bach_Deg_orHigher_pct',
    'Labor_Partic_Rate',
    'Emp_Pop_Ratio',
    'Unemp_Rate',
    'Uninsured_Percent',
    'Household_Med_Income',
    'Family_Med_income',
    'Married_Fam_Med_Income',
    'NonFamHousehold_Med_Income',
    'Housing_Occ_Percent',
    'Housing_Vacant_Percent',
    'Below_Poverty_Percent',
    'Household_SNAP_Percent',
    'English',
    'Female',
    'Readmit',
    'RE_NHB',
    'RE_Hisp',
    'RE_Other',
]]

In [None]:
# scale the dataset
# Min-max scaling puts every column on the same 0-1 range so that no single
# variable dominates the Euclidean distances K-means relies on.
# fit_transform learns the per-column min/max and applies them in one call;
# mms stays fitted and available afterwards.
mms = MinMaxScaler()
data_transformed = mms.fit_transform(IP_Analysis)

In [None]:
# get sum of squared distances
# Fit K-means for k = 1..14 and record each model's inertia (the within-cluster
# sum of squared distances) for the elbow plot.
# random_state pins the centroid initialisation so the curve is the same on
# every run; n_init=10 spells out the number of restarts explicitly rather than
# relying on the library default, which differs between scikit-learn versions.
Sum_of_squared_distances = []
K = range(1, 15)
for k in K:
    km = KMeans(n_clusters=k, n_init=10, random_state=0)
    km.fit(data_transformed)
    Sum_of_squared_distances.append(km.inertia_)

In [None]:
# plot sum of squared distances to look for elbow
# The "elbow" is the k after which adding more clusters yields only small
# further reductions in the sum of squared distances.
# The optimal k for this dataset appears to be 4.
# https://blog.cambridgespark.com/how-to-determine-the-optimal-number-of-clusters-for-k-means-clustering-14f27070048f
# 'bx-' = blue x markers joined by a solid line
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of Squared Distances')
plt.title('Elbow Method for Optimal k')
plt.show()

In [None]:
# run K means clustering for 4 clusters
# K-means starts from random centroids, so without a seed both the cluster
# assignments and the arbitrary cluster numbering can change between runs.
# random_state fixes that, which matters because the written interpretation of
# the results refers to clusters by number. n_init=10 spells out the number of
# restarts explicitly rather than relying on a version-dependent default.
# NOTE(review): the numbering under this seed may differ from the run the
# write-up was based on -- re-check the cluster descriptions after re-running.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
# fit_predict fits the model and returns each row's cluster label in one step
predicted_clusters = kmeans.fit_predict(data_transformed)

In [None]:
# add 1 to every cluster number to avoid the 0 label for cluster 1
# (K-means labels are zero-based; the shifted labels run from 1 upwards)
clusters = [label + 1 for label in predicted_clusters]

In [None]:
# add predicted clusters as a column
# clusters is a plain list, so it is assigned by position. It has one entry per
# row of IP because the clustered data was built from IP after the dropna step.
IP['Cluster'] = clusters

In [None]:
# split up categorical and continuous variables
# Categorical columns are summarised with cross-tabulations against the cluster
# label; continuous columns are summarised with describe() per cluster.
categorical_variables = [
    'Race_Ethnicity',
    'English',
    'Female',
    'Readmit',
]
continuous_variables = [
    'Age',
    'no_diploma_pct',
    'HS_Grad_ge25_pct',
    'HS_Grad_orHigher_pct',
    'Bach_Deg_ge25_pct',
    'Bach_Deg_orHigher_pct',
    'Labor_Partic_Rate',
    'Emp_Pop_Ratio',
    'Unemp_Rate',
    'Uninsured_Percent',
    'Household_Med_Income',
    'Family_Med_income',
    'Married_Fam_Med_Income',
    'NonFamHousehold_Med_Income',
    'Housing_Occ_Percent',
    'Housing_Vacant_Percent',
    'Below_Poverty_Percent',
    'Household_SNAP_Percent',
]

In [None]:
# check categorical variable descriptive statistics of clusters
# For each categorical variable, cross-tabulate its values against the cluster
# label (with "All" margins), then divide every row by the "All" totals row.
# The division aligns on the cluster columns, so each cell becomes the share of
# that cluster's rows falling in the category (the "All" column gives the
# overall share).
table_counts_list = []
table_percents_list = []

for col in categorical_variables:
    table_counts = pd.crosstab(IP[col], IP["Cluster"], margins=True)
    # .loc replaces the old .ix indexer, which was removed in pandas 1.0
    table_percents = table_counts / table_counts.loc["All"]
    table_counts_list.append(table_counts)
    table_percents_list.append(table_percents)

# only the proportions are printed; the raw counts stay in table_counts_list
for table in table_percents_list:
    print(table)

In [None]:
# check continuous variable descriptive statistics of clusters
# Collect describe() output for every (variable, cluster) pair. The list is
# ordered variable-major: all clusters for the first variable in ascending
# label order, then all clusters for the second variable, and so on.
table_stats_list = []

# Take the cluster labels from the data rather than hard-coding 1-4, so this
# cell keeps working if the number of clusters is changed.
cluster_labels = sorted(IP['Cluster'].unique())

for col in continuous_variables:
    for cluster_label in cluster_labels:
        # single .loc lookup (row mask + column) instead of chained indexing
        cluster_stats = IP.loc[IP['Cluster'] == cluster_label, col].describe()
        table_stats_list.append(cluster_stats)

table_stats_list

###### Readmission status proportions were consistent across the clusters.
###### Cluster 1: Hispanic, slightly fewer English speakers, mostly female, lower socioeconomic status (SES)
###### Cluster 2: Non-Hispanic Black, mostly English speakers, mostly female, lower SES
###### Cluster 3: Non-Hispanic White, mostly English speakers, all male, more educated, more employed, more insured, higher income, higher SES
###### Cluster 4: Non-Hispanic White, mostly English speakers, all female, more educated, more employed, more insured, higher income, higher SES

###### Race/ethnicity, gender, and SES appear to be what differentiate the clusters. Cluster 1 (Hispanic patients) and Cluster 2 (Non-Hispanic Black patients) appear to be the lower-SES groups. Clusters 3 and 4 both appear to consist of Non-Hispanic White patients with higher SES, distinguished only by gender.