In [1]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MeanShift
import scipy.cluster.hierarchy as shc
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import boto3
import s3fs

In [2]:
# Create a boto3 S3 client.
# NOTE(review): `client` is not referenced by any later cell visible here; the CSVs
# below are read directly from s3:// URLs by pandas — confirm whether this is still needed.
client=boto3.client('s3')

# Import/Format Data

In [3]:
# Load the patient and observation tables from S3 and derive the numeric
# patient-feature frame `data`.
S3_CSV_DIR = 's3://ms-syntheamass-1m-mldata/syntheamassCSV/output_1/csv'

# patients.csv is downloaded once: `dataOG` stays untouched (its ID column keys the
# observation frame built later) and `data` is an independent copy that gets cleaned.
dataOG = pd.read_csv(S3_CSV_DIR + '/patients.csv')
dfo = pd.read_csv(S3_CSV_DIR + '/observations.csv')
data = dataOG.copy()

# Turn the birth date into a plain number by stripping the dashes (yields e.g. 19960726.0).
data['BIRTHDATE'] = data['BIRTHDATE'].str.replace('-', '', regex=False).astype(float)

# Remove identifier and free-text columns that are not used as clustering features.
data = data.drop(
    ['ID', 'DEATHDATE', 'MAIDEN', 'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX',
     'FIRST', 'LAST', 'SUFFIX', 'BIRTHPLACE', 'ADDRESS'],
    axis=1,
)

# Binary encodings: MARITAL missing -> 0, 'M' -> 1, 'S' -> 0; GENDER 'M' -> 1, 'F' -> 0.
data['MARITAL'] = data['MARITAL'].fillna(0).replace({'M': 1, 'S': 0})
data['GENDER'] = data['GENDER'].replace({'M': 1, 'F': 0})
data.head()

Unnamed: 0,BIRTHDATE,MARITAL,RACE,ETHNICITY,GENDER
0,19960726.0,0,white,irish,0
1,19960924.0,0,white,french_canadian,0
2,19440901.0,1,white,irish,1
3,19640514.0,1,white,french,0
4,19460305.0,0,white,irish,0


# Set Up Observation Data Frame

In [4]:
# Skeleton of the per-patient observation table: one row per patient ID, with each
# vital-sign column created up front and filled with NaN.
vital_sign_columns = [
    'Body Height',
    'Body Weight',
    'Body Mass Index',
    'Diastolic Blood Pressure',
    'Systolic Blood Pressure',
    'Heart rate',
    'Respiratory rate',
]
dfm = pd.DataFrame({'PATIENT': dataOG['ID']})
for vital_sign in vital_sign_columns:
    dfm[vital_sign] = np.nan
dfm.head()

Unnamed: 0,PATIENT,Body Height,Body Weight,Body Mass Index,Diastolic Blood Pressure,Systolic Blood Pressure,Heart rate,Respiratory rate
0,660bec03-9e58-47f2-98b9-2f1c564f3838,,,,,,,
1,5125d2b2-3aef-4ae2-aa5c-335f7e206b92,,,,,,,
2,26626faf-cbd5-48d5-a3bf-a7b21ae08e4b,,,,,,,
3,f509a0f0-77ef-477f-977d-e2784a241b52,,,,,,,
4,4c763cac-b1df-4bcc-b89c-834942c4d3d6,,,,,,,


In [5]:
# Collect the vital-sign observations as three parallel lists (patient id, description,
# value), preserving the row order of `dfo`.
# Credit: the original row-by-row version of this extraction came from Justin.
columns=["Body Height","Body Weight","Body Mass Index","Diastolic Blood Pressure","Systolic Blood Pressure"
         ,"Heart rate","Respiratory rate"]

# A single vectorized filter replaces the per-row Python loop: keep only the rows whose
# DESCRIPTION is one of the wanted vital signs, then export each column as a list.
_vital_rows = dfo.loc[dfo["DESCRIPTION"].isin(columns), ["PATIENT", "DESCRIPTION", "VALUE"]]
patients = _vital_rows["PATIENT"].tolist()
descriptions = _vital_rows["DESCRIPTION"].tolist()
values = _vital_rows["VALUE"].tolist()

## Populate Dataframe

In [None]:
# Fill each vital-sign column of `dfm` with the patient's last-listed value.
# The previous nested loop compared every patient with every observation and rebuilt a
# full-frame mask per match; the same "last entry wins" result is obtained here by
# de-duplicating on (patient, description) and looking the values up per patient.
observations_long = pd.DataFrame(
    {'PATIENT': patients, 'DESCRIPTION': descriptions, 'VALUE': values}
)
latest_values = (
    observations_long
    .drop_duplicates(subset=['PATIENT', 'DESCRIPTION'], keep='last')
    .pivot(index='PATIENT', columns='DESCRIPTION', values='VALUE')
)
for desc in latest_values.columns:
    # Patients without this observation stay NaN. VALUE may arrive as text from the CSV,
    # so it is coerced to numeric for the clustering steps that follow.
    dfm[desc] = pd.to_numeric(dfm['PATIENT'].map(latest_values[desc]), errors='coerce')

In [None]:
# Preview the first rows of the observation frame after the populate step.
dfm.head()

# Merge Patient/Observation DataFrames

In [None]:
# Preview the patient-feature frame before it is combined with the observations.
data.head()

In [None]:
# Place patient features and observations side by side (column-wise concat on the index).
# NOTE(review): this relies on `data` and `dfm` sharing the row index of patients.csv.
df = pd.concat([data, dfm], axis=1)

# errors='ignore' is required: 'SSN' was already removed from `data` when it was cleaned,
# and the preview of `data` shows no LAT/LON columns, so a strict drop raises KeyError.
df = df.drop(['PATIENT', 'SSN', 'LAT', 'LON'], axis=1, errors='ignore')
df.head()

In [None]:
# Human-readable companion of `df`: complete rows only, shuffled with a fixed seed.
# It is taken before RACE/ETHNICITY are encoded, so it keeps the original labels.
dfcopy = df.dropna().sample(frac=1, random_state=7896)
dfcopy.head()

## Encode Categories

In [None]:
# Fit an ordinal encoder on the two categorical columns.
# The columns are selected by name rather than by position (previously iloc[:, 2:4]) so
# that a change in column order cannot silently encode the wrong features.
enc = preprocessing.OrdinalEncoder()
X = df[['RACE', 'ETHNICITY']].values
enc.fit(X)

In [None]:
# Replace the RACE / ETHNICITY labels in `df` with their ordinal codes.
# `X` is deliberately left holding the raw labels: rebinding it to the encoded array made
# this cell fail when run a second time (the encoder would be fed its own output).
cols = ['RACE', 'ETHNICITY']
df[cols] = enc.transform(X)

In [None]:
# Keep complete rows only and shuffle with the same seed that was used for `dfcopy`.
df = df.dropna().sample(frac=1, random_state=7896)
df.head()

# Hierarchical Clustering

## Dendrogram

In [None]:
# Draw the dendrogram of a Ward-linkage hierarchical clustering over every column of `df`.
plt.figure(figsize=(10, 7))
plt.title("Patients Dendrograms")
ward_links = shc.linkage(df, method='ward')
dend = shc.dendrogram(ward_links)

## Agglomerative Clustering

In [None]:
# Ward-linkage agglomerative clustering into three groups.
# The `affinity` keyword is omitted on purpose: it has been deprecated and later removed
# from scikit-learn, and Ward linkage uses Euclidean distance by default, so the result
# is unchanged while the cell works on both old and new versions.
cluster = AgglomerativeClustering(n_clusters=3, linkage='ward')
cluster.fit(df)  # only labels_ is used below, so fit() is enough
print(cluster.labels_)

## Mean Shift

In [None]:
# Mean Shift with default settings on the feature frame; keep the per-row labels and the
# cluster centres, then reserve an empty 'cluster_group' column in both frames.
# NOTE(review): if this cell is re-run after 'cluster_group' exists in `df`, that column
# is fed to MeanShift as a feature.
clf = MeanShift().fit(df)
labels, cluster_centers = clf.labels_, clf.cluster_centers_
for frame in (dfcopy, df):
    frame['cluster_group'] = np.nan

In [None]:
# Attach the Mean Shift label of every row to both frames and report the cluster count.
# Whole-column assignment replaces the per-row `frame['cluster_group'].iloc[i] = ...`
# loop, which is chained indexing: it emits SettingWithCopy warnings and may write to a
# temporary copy instead of the frame. Labels are matched to rows by position; a length
# mismatch raises ValueError.
df['cluster_group'] = labels
dfcopy['cluster_group'] = labels
n_clusters_ = len(np.unique(labels))
print(n_clusters_)

In [None]:
# Preview the readable frame, now carrying the 'cluster_group' column.
dfcopy.head()

### Analyzing Groups

In [None]:
# Rows assigned to Mean Shift cluster 0, followed by the size of that cluster.
in_cluster_0 = dfcopy['cluster_group'].eq(0)
temp0 = dfcopy.loc[in_cluster_0]
len(temp0)

In [None]:
# Rows assigned to Mean Shift cluster 1, followed by the size of that cluster.
in_cluster_1 = dfcopy['cluster_group'].eq(1)
temp1 = dfcopy.loc[in_cluster_1]
len(temp1)

In [None]:
# Rows assigned to Mean Shift cluster 2, followed by the size of that cluster.
in_cluster_2 = dfcopy['cluster_group'].eq(2)
temp2 = dfcopy.loc[in_cluster_2]
len(temp2)

In [None]:
# Rows assigned to Mean Shift cluster 3, followed by the size of that cluster.
in_cluster_3 = dfcopy['cluster_group'].eq(3)
temp3 = dfcopy.loc[in_cluster_3]
len(temp3)

#### Looking at Outlier Group

In [None]:
# Show the first seven rows of cluster 3, the group this section treats as the outliers.
temp3.head(7)

# PCA for Data Visualization

In [None]:
# Project the patient features onto their first three principal components.
pca = PCA(n_components=3)

# The Mean Shift label column must not be part of the PCA input: it is the quantity being
# visualised, not a feature. errors='ignore' keeps the cell usable before labels exist.
pca_features = df.drop(columns=['cluster_group'], errors='ignore')
# NOTE(review): features are not standardised (StandardScaler is imported but unused), so
# large-magnitude columns such as BIRTHDATE likely dominate the components — confirm intent.
principalComponents = pca.fit_transform(pca_features)

# Default RangeIndex: row i of principalDF corresponds to the i-th row of `df` by position.
principalDF = pd.DataFrame(
    data=principalComponents,
    columns=['Principal Component 1', 'Principal Component 2', 'Principal Component 3'],
)
principalDF.head()

In [None]:
# Attach each row's cluster label to its principal components.
# The labels are added by position (.values) rather than with an index-aligned concat:
# `df` carries a shuffled, gappy index after dropna()/sample(), while `principalDF` has a
# fresh 0..n-1 index, so aligning on index paired components with the wrong labels and
# produced NaN-padded rows.
finalDF = principalDF.assign(cluster_group=df['cluster_group'].values)
finalDF.head()

### 2D

In [None]:
# 2-D scatter of the first two principal components, one colour per cluster id 0-3.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 Component PCA', fontsize=20)

targets = [0, 1, 2, 3]
colors = ['r', 'g', 'b', 'y']
for cluster_id, cluster_color in zip(targets, colors):
    # Draw the points of one cluster at a time so each gets its own legend entry.
    members = finalDF.loc[finalDF['cluster_group'] == cluster_id]
    ax.scatter(
        members['Principal Component 1'],
        members['Principal Component 2'],
        c=cluster_color,
        s=50,
    )
ax.legend(targets)
ax.grid()

### 3D

In [None]:
# 3-D scatter of the three principal components, one colour per cluster id 0-3.
fig, ax = plt.subplots(figsize=(15, 15), subplot_kw={'projection': '3d'})
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_zlabel('Principal Component 3', fontsize=15)
ax.set_title('3 Component PCA', fontsize=20)

targets = [0, 1, 2, 3]
colors = ['r', 'g', 'b', 'y']
for cluster_id, cluster_color in zip(targets, colors):
    # Draw the points of one cluster at a time so each gets its own legend entry.
    members = finalDF.loc[finalDF['cluster_group'] == cluster_id]
    ax.scatter(
        members['Principal Component 1'],
        members['Principal Component 2'],
        members['Principal Component 3'],
        c=cluster_color,
        s=50,
    )
ax.legend(targets)
ax.grid()