In [52]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MeanShift
import scipy.cluster.hierarchy as shc
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import boto3
import s3fs

In [53]:
client=boto3.client('s3')

# Import/Format Data

In [54]:
data=pd.read_csv('s3://ms-syntheamass-1m-mldata/syntheamassCSV/output_1/csv/patients.csv')
dataOG=pd.read_csv('s3://ms-syntheamass-1m-mldata/syntheamassCSV/output_1/csv/patients.csv')
dfo=pd.read_csv('s3://ms-syntheamass-1m-mldata/syntheamassCSV/output_1/csv/observations.csv')
data['BIRTHDATE']=data['BIRTHDATE'].str.replace('-', '')
data['BIRTHDATE']=data['BIRTHDATE'].astype(float)
data=data.drop(['ID','DEATHDATE','MAIDEN','SSN','DRIVERS','PASSPORT','PREFIX','FIRST','LAST','SUFFIX','BIRTHPLACE','ADDRESS'],axis=1)
data['MARITAL']=data['MARITAL'].fillna(0)
data['MARITAL']=data['MARITAL'].replace({'M':1})
data['MARITAL']=data['MARITAL'].replace({'S':0})
data['GENDER']=data['GENDER'].replace({'M':1})
data['GENDER']=data['GENDER'].replace({'F':0})
data.head()

Unnamed: 0,BIRTHDATE,MARITAL,RACE,ETHNICITY,GENDER
0,19960726.0,0,white,irish,0
1,19960924.0,0,white,french_canadian,0
2,19440901.0,1,white,irish,1
3,19640514.0,1,white,french,0
4,19460305.0,0,white,irish,0


## Shrink Observation dataframe

In [55]:
dfo.head()

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,VALUE,UNITS
0,2011-07-02,33f33990-ae8b-4be8-938f-e47ad473abfe,673daa98-67e9-4e80-be46-a0b547533653,8302-2,Body Height,175.76,cm
1,2011-03-12,36d131ee-dd5b-4acb-acbe-19961c32c099,bac018de-114a-481d-b4f9-87980d7ef8b8,8302-2,Body Height,167.28,cm
2,2011-07-02,33f33990-ae8b-4be8-938f-e47ad473abfe,673daa98-67e9-4e80-be46-a0b547533653,29463-7,Body Weight,56.51,kg
3,2011-03-12,36d131ee-dd5b-4acb-acbe-19961c32c099,bac018de-114a-481d-b4f9-87980d7ef8b8,29463-7,Body Weight,63.35,kg
4,2011-07-02,33f33990-ae8b-4be8-938f-e47ad473abfe,673daa98-67e9-4e80-be46-a0b547533653,39156-5,Body Mass Index,18.29,kg/m2


In [56]:
dfo.shape

(5383318, 7)

In [57]:
dfobmi=dfo.drop(dfo.loc[dfo["DESCRIPTION"]!="Body Mass Index"].index)
dfobw=dfo.drop(dfo.loc[dfo["DESCRIPTION"]!="Body Weight"].index)
dfobh=dfo.drop(dfo.loc[dfo["DESCRIPTION"]!="Body Height"].index)
dfodbp=dfo.drop(dfo.loc[dfo["DESCRIPTION"]!="Diastolic Blood Pressure"].index)
dfosbp=dfo.drop(dfo.loc[dfo["DESCRIPTION"]!="Systolic Blood Pressure"].index)
dfobhr=dfo.drop(dfo.loc[dfo["DESCRIPTION"]!="Heart rate"].index)
dfobrr=dfo.drop(dfo.loc[dfo["DESCRIPTION"]!="Respiratory rate"].index)
dfo=pd.concat([dfobmi,dfobw,dfobh,dfodbp,dfosbp,dfobhr,dfobrr])
dfo=dfo.sample(frac=1,random_state=6)
dfo.head()

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,VALUE,UNITS
4731049,2011-06-28,a5a2180e-9269-4917-a930-2aee0661dedb,89a4ac7a-ea1a-4d5e-8b6d-87c0c60194b6,8302-2,Body Height,132.35,cm
2258683,2013-03-18,84fc7b80-4b98-4b04-afe9-f55cff80863c,a7a2f4c6-198e-4528-84b2-c32c4cdab36c,8462-4,Diastolic Blood Pressure,113.0,mmHg
1178322,2014-10-22,132b6961-28b0-41d3-972e-259079e9580d,09942dd6-197b-4e5e-aa3d-923d22c6be1e,8462-4,Diastolic Blood Pressure,120.0,mmHg
1048865,2011-10-17,ba347cb1-06de-4844-ac94-0a5771976789,2ae819ae-c6cd-4862-8660-de16d7e83f0e,39156-5,Body Mass Index,33.39,kg/m2
2862323,2013-03-15,0ab01083-b19c-47f6-a5c5-537bbbbc4e5e,de25b4c3-bef8-4bcd-b5e0-99eb5028e33c,8302-2,Body Height,151.11,cm


In [58]:
dfo.shape

(2872837, 7)

# Set Up Observation Data Frame

In [59]:
patient_ids=set()
for patient_id in dfo["PATIENT"]:
    patient_ids.add(patient_id)
patient_ids=list(patient_ids)
patient_ids.sort()

In [60]:
columns=["patient_id","Body Height","Body Weight", "Body Mass Index","Diastolic Blood Pressure",
         "Systolic Blood Pressure","Heart rate","Respiratory rate"]
dfm=pd.DataFrame(columns=columns)
dfm["patient_id"]=patient_ids
dfm.head()

Unnamed: 0,patient_id,Body Height,Body Weight,Body Mass Index,Diastolic Blood Pressure,Systolic Blood Pressure,Heart rate,Respiratory rate
0,00004593-87ae-40ac-bfec-7e2dd37b690f,,,,,,,
1,00005a31-23bc-46dc-9dcd-13337680b90e,,,,,,,
2,000120e2-63aa-404c-8842-d6ddc96f6dec,,,,,,,
3,0002bd42-7fcc-4a88-aa2b-6d6539b34b58,,,,,,,
4,0002c430-5526-4389-8335-595c918610d3,,,,,,,


## Populate Dataframe

In [61]:
query=pd.DataFrame()
for column in columns[1:]:
    query=pd.concat([query,dfo.query(f"DESCRIPTION=='{column}'")])

In [62]:
for index_dfo,row_dfo in list(query[["PATIENT","DESCRIPTION","VALUE"]].iterrows()):
    if row_dfo["DESCRIPTION"] in columns:
        dfm.loc[dfm.patient_id==row_dfo["PATIENT"],row_dfo["DESCRIPTION"]]= row_dfo["VALUE"]

KeyboardInterrupt: 

In [None]:
dfm.head()

# Merge Patient/observation Dataframe

In [9]:
data.head()

Unnamed: 0,BIRTHDATE,MARITAL,RACE,ETHNICITY,GENDER
0,19960726.0,0,white,irish,0
1,19960924.0,0,white,french_canadian,0
2,19440901.0,1,white,irish,1
3,19640514.0,1,white,french,0
4,19460305.0,0,white,irish,0


In [11]:
df=pd.concat([data,dfm],axis=1)
df.to_csv()
df.head()

Unnamed: 0,BIRTHDATE,MARITAL,RACE,ETHNICITY,GENDER,patient_id,Body Height,Body Weight,Body Mass Index,Diastolic Blood Pressure,Systolic Blood Pressure,Heart rate,Respiratory rate
0,19960726.0,0,white,irish,0,00004593-87ae-40ac-bfec-7e2dd37b690f,,,,,,,
1,19960924.0,0,white,french_canadian,0,00005a31-23bc-46dc-9dcd-13337680b90e,,,,,,,
2,19440901.0,1,white,irish,1,000120e2-63aa-404c-8842-d6ddc96f6dec,,,,,,,
3,19640514.0,1,white,french,0,000177c6-f76b-432b-9493-5a88bc9fb6bd,,,,,,,
4,19460305.0,0,white,irish,0,0002bd42-7fcc-4a88-aa2b-6d6539b34b58,,,,,,,


In [None]:
dfcopy=df.copy()
dfcopy=dfcopy.dropna()
dfcopy=dfcopy.sample(frac=1, random_state=7896)
dfcopy.to_csv()
dfcopy.head()

## Encode Categories

In [None]:
enc = preprocessing.OrdinalEncoder()
X=df.iloc[:,2:4].values
enc.fit(X)

In [None]:
X=enc.transform(X)
cols=['RACE','ETHNICITY']
df[cols]=X

In [None]:
df=df.dropna()
df=df.sample(frac=1, random_state=7896)
df.head()

# Hierachical Clustering

## Dendrogram

In [None]:
plt.figure(figsize=(10, 7))
plt.title("Patients Dendrograms")
dend = shc.dendrogram(shc.linkage(df, method='ward'))

## Agglomerative Clustering

In [None]:
cluster= AgglomerativeClustering(n_clusters=3, affinity='euclidean',linkage='ward')
cluster.fit_predict(df)
print(cluster.labels_)

## Mean Shift

In [None]:
clf=MeanShift()
clf.fit(df)
labels=clf.labels_
cluster_centers=clf.cluster_centers_
dfcopy['cluster_group']=np.nan
df['cluster_group']=np.nan

In [None]:
for i in range(len(df)):
    dfcopy['cluster_group'].iloc[i]=labels[i]
    df['cluster_group'].iloc[i]=labels[i]
n_clusters_ =len(np.unique(labels))
print(n_clusters_)

In [None]:
dfcopy.head()

### Analyzing Groups

In [None]:
temp0=dfcopy[dfcopy['cluster_group']==0]
len(temp0)

In [None]:
temp1=dfcopy[dfcopy['cluster_group']==1]
len(temp1)

In [None]:
temp2=dfcopy[dfcopy['cluster_group']==2]
len(temp2)

In [None]:
temp3=dfcopy[dfcopy['cluster_group']==3]
len(temp3)

#### Looking at Outlier Group

In [None]:
temp3.head(7)

# PCA for Data Visualization

In [None]:
pca=PCA(n_components=3)
principalComponents=pca.fit_transform(df)
principalDF = pd.DataFrame(data=principalComponents, columns =['Principal Component 1', 'Principal Component 2','Principal Component 3'])
principalDF.head()

In [None]:
finalDF=pd.concat([principalDF, df[['cluster_group']]],axis=1)
finalDF.head()

### 2D

In [None]:
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 Component PCA', fontsize = 20)


targets = [0,1,2,3]
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
    indicesToKeep = finalDF['cluster_group'] == target
    ax.scatter(finalDF.loc[indicesToKeep, 'Principal Component 1']
               , finalDF.loc[indicesToKeep, 'Principal Component 2']
               , c = color
               , s = 50)
ax.legend(targets)
ax.grid()

### 3D

In [None]:
fig = plt.figure(figsize = (15,15))
ax = fig.add_subplot(111,projection='3d') 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_zlabel('Principal Component 3', fontsize = 15)
ax.set_title('3 Component PCA', fontsize = 20)


targets = [0,1,2,3]
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
    indicesToKeep = finalDF['cluster_group'] == target
    ax.scatter(finalDF.loc[indicesToKeep, 'Principal Component 1']
               , finalDF.loc[indicesToKeep, 'Principal Component 2']
               , finalDF.loc[indicesToKeep, 'Principal Component 3']
               , c = color
               , s = 50)
ax.legend(targets)
ax.grid()