In [1]:
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import MeanShift
import scipy.cluster.hierarchy as shc
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

# Import/Format Data

In [2]:
# Load the patient table once. `dataOG` keeps an untouched copy of the raw rows
# (including `Id`), while `data` is reshaped into numeric features below.
data=pd.read_csv('data/patients.csv')
dataOG=data.copy()
dfo=pd.read_csv('data/observations.csv')

# Remove the dashes from BIRTHDATE and SSN so both can be stored as floats.
# regex=False: '-' is a literal character here, which keeps the call independent
# of the pandas version's default for `regex`.
data['BIRTHDATE']=data['BIRTHDATE'].str.replace('-', '', regex=False).astype(float)
data['SSN']=data['SSN'].str.replace('-', '', regex=False).astype(float)

# Identifier, name and address columns are not used as features.
data=data.drop(columns=['Id','DEATHDATE','MAIDEN','DRIVERS','PASSPORT','PREFIX','FIRST','LAST','SUFFIX','BIRTHPLACE','ADDRESS','CITY','STATE','COUNTY'])

# Missing MARITAL / ZIP become 0; MARITAL (M/S) and GENDER (M/F) are recoded to 1/0.
data['MARITAL']=data['MARITAL'].fillna(0).replace({'M':1,'S':0})
data['ZIP']=data['ZIP'].fillna(0)
data['GENDER']=data['GENDER'].replace({'M':1,'F':0})

## Shrink Observation Dataframe

In [3]:
# (rows, columns) of the observations table as loaded, before any filtering.
dfo.shape

(299697, 8)

In [4]:
# One frame per vital sign of interest, selected by its DESCRIPTION value.
descriptions=dfo["DESCRIPTION"]
dfobmi=dfo[descriptions=="Body Mass Index"]
dfobw=dfo[descriptions=="Body Weight"]
dfobh=dfo[descriptions=="Body Height"]
dfodbp=dfo[descriptions=="Diastolic Blood Pressure"]
dfosbp=dfo[descriptions=="Systolic Blood Pressure"]
dfobhr=dfo[descriptions=="Heart rate"]
dfobrr=dfo[descriptions=="Respiratory rate"]

# Stack the seven subsets in this order, then shuffle the rows reproducibly.
vital_sign_frames=[dfobmi,dfobw,dfobh,dfodbp,dfosbp,dfobhr,dfobrr]
dfo=pd.concat(vital_sign_frames).sample(frac=1,random_state=6)
dfo.head()

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,VALUE,UNITS,TYPE
135534,2012-09-22T10:26:36Z,69b60335-4bd9-4043-ad76-26b75d7dd40a,4ca1fa3d-0429-473f-9dd5-f942f071a137,8480-6,Systolic Blood Pressure,123.0,mm[Hg],numeric
206667,2013-07-06T13:13:50Z,78a9a8d6-b3b2-47dc-b4a0-867abec7c78f,1c2ef8a9-7d30-4634-8806-8d0326238c24,39156-5,Body Mass Index,27.2,kg/m2,numeric
111336,1989-07-29T20:06:53Z,59cf17d9-6c13-4333-a1cb-cc5fdf63366d,7b05a570-1969-4cdf-91c5-c26b6d28b80a,8302-2,Body Height,186.8,cm,numeric
12201,2016-03-04T04:10:32Z,dadd331c-6143-41d1-bffb-d34b321ed157,40a0cbb2-de8f-48fe-9f77-1315c17cb4c3,9279-1,Respiratory rate,12.0,/min,numeric
146886,1999-02-25T02:48:14Z,60320709-0445-4112-ab79-1997cc6f3822,db7ef447-a886-4522-882c-14f29fae6853,39156-5,Body Mass Index,27.5,kg/m2,numeric


In [5]:
# (rows, columns) after keeping only the seven vital-sign descriptions.
dfo.shape

(87585, 8)

# Multiprocessing

In [13]:
def popdf(dfo):
    """Copy one chunk of observations into the wide per-patient frame ``dfm``.

    Only the rows passed in are processed (the module-level ``query`` frame is
    no longer consulted), so separate chunks can be handled by separate calls.

    Parameters
    ----------
    dfo : pandas.DataFrame or tuple
        Observation rows with ``PATIENT``, ``DESCRIPTION`` and ``VALUE``
        columns. A tuple is also accepted, in which case its last element is
        used as the chunk; this matches the ``(shared, chunk)`` pairs the
        notebook passes through ``pool.map``.

    Returns
    -------
    pandas.DataFrame
        The module-level ``dfm`` after the update. When this runs in a worker
        process the update lands in that process's own ``dfm``, so the
        returned frame is what carries the result back to the caller.

    Notes
    -----
    Reads the module-level ``columns`` and ``dfm``. Rows whose ``DESCRIPTION``
    is not in ``columns`` or whose ``PATIENT`` is not in ``dfm.patient_id``
    are ignored. If a patient has several rows for the same description, the
    last one in the chunk wins, as with a row-by-row overwrite.
    """
    chunk = dfo[-1] if isinstance(dfo, tuple) else dfo

    wanted = chunk.loc[chunk["DESCRIPTION"].isin(columns),
                       ["PATIENT", "DESCRIPTION", "VALUE"]]
    # Keep only the final row per (patient, description): earlier rows would be
    # overwritten anyway.
    wanted = wanted.drop_duplicates(subset=["PATIENT", "DESCRIPTION"], keep="last")

    # One vectorised lookup per description instead of one scan of dfm per row.
    for description, rows in wanted.groupby("DESCRIPTION", sort=False):
        lookup = rows.set_index("PATIENT")["VALUE"]
        found = dfm["patient_id"].isin(lookup.index)
        dfm.loc[found, description] = dfm["patient_id"].map(lookup)[found]
    return dfm

In [14]:
import multiprocessing as mp
import tqdm
from itertools import repeat
from multiprocessing import Process, Manager
from multiprocessing import Pool

In [15]:
# One worker process, and one chunk of observations, per CPU core.
num_partitions=num_processes=mp.cpu_count()

# Manager-backed dict proxy, repeated once per chunk so it can be zipped with
# the chunks when tasks are submitted.
manager=Manager()
d=manager.dict()
shared_arg=repeat(d,num_partitions)

# Split the observations into `num_partitions` roughly equal pieces.
df_split=np.array_split(dfo,num_partitions)

# NOTE(review): the pool is started here, before `dfm`, `columns` and `query`
# are assigned further down; with a fork-based start method the workers would
# presumably not see those names -- confirm.
pool=Pool(num_processes)

# Set Up Observation Data Frame

In [16]:
# Distinct patient identifiers that appear in the observations, sorted ascending.
patient_ids=sorted(set(dfo["PATIENT"]))

In [17]:
# Layout of the wide table: the patient identifier followed by one column per
# vital sign (the names match the DESCRIPTION values kept in `dfo`).
columns=["patient_id","Body Height","Body Weight", "Body Mass Index","Diastolic Blood Pressure",
         "Systolic Blood Pressure","Heart rate","Respiratory rate"]
# Start from an empty frame with those columns; only the identifiers are filled
# in here, so every measurement column starts out missing.
dfm=pd.DataFrame(columns=columns)
dfm["patient_id"]=patient_ids
dfm.head()

Unnamed: 0,patient_id,Body Height,Body Weight,Body Mass Index,Diastolic Blood Pressure,Systolic Blood Pressure,Heart rate,Respiratory rate
0,00185faa-2760-4218-9bf5-db301acf8274,,,,,,,
1,0042862c-9889-4a2e-b782-fac1e540ecb4,,,,,,,
2,0047123f-12e7-486c-82df-53b3a450e365,,,,,,,
3,010d4a3a-2316-45ed-ae15-16f01c611674,,,,,,,
4,01207ecd-9dff-4754-8887-4652eda231e2,,,,,,,


## Populate Dataframe

In [18]:
# Observation rows for every measurement column (all of `columns` except the
# leading "patient_id"), grouped in column order. The pieces are gathered first
# and concatenated once, rather than growing a frame inside the loop from an
# empty seed. Rows are picked with a boolean mask so that no query string has
# to be built from the column name.
query=pd.concat([dfo[dfo["DESCRIPTION"]==column] for column in columns[1:]])

In [None]:
# Fan the observation chunks out to worker processes and fold every returned
# frame back into this process's `dfm`. Each worker runs `popdf` in its own
# process, so only the frames it returns carry results back here.
#
# A fresh pool is started in this cell rather than reusing `pool`, which was
# created in an earlier cell before `dfm`, `columns` and `query` were assigned.
# NOTE(review): this relies on a fork-style start method so that workers can
# see notebook-defined functions and globals -- confirm on the target platform.
#
# `imap` yields results one by one in submission order, so the progress bar
# advances as chunks finish. `repeat(d)` is rebuilt on every run, which keeps
# the cell re-runnable (`shared_arg` is a one-shot iterator).
with Pool(num_processes) as workers:
    tasks=zip(repeat(d),df_split)
    for partial in tqdm.tqdm(workers.imap(popdf,tasks),total=num_partitions):
        # Non-missing cells of `partial` overwrite the matching cells of `dfm`.
        dfm.update(partial)

In [None]:
# Preview the wide per-patient observation table after the populate step.
dfm.head()

# Merge Patient/observation Dataframe

In [None]:
# Preview the cleaned patient table that is about to be merged.
data.head()

In [None]:
# Join demographics and observations on the patient identifier.
# A positional side-by-side concat would pair row i of `data` (file order) with
# row i of `dfm` (sorted by patient id), attaching measurements to the wrong
# patients. `data` no longer carries `Id`, so it is taken from `dataOG`, which
# was read from the same file; this relies on `data` still having the row index
# it was loaded with.
# Patients without observations keep NaN in the measurement columns.
df=(data.assign(patient_id=dataOG['Id'])
        .merge(dfm,on='patient_id',how='left'))
df=df.drop(['patient_id','SSN','LAT','LON'],axis=1)
df.head()

In [None]:
# Snapshot of the merged frame taken before RACE/ETHNICITY are encoded:
# complete rows only, shuffled with a fixed seed.
dfcopy=df.dropna().sample(frac=1, random_state=7896)
dfcopy.head()

## Encode Categories

In [None]:
# The two categorical columns are picked by position (columns 2 and 3).
# NOTE(review): presumably RACE and ETHNICITY, the names used when the codes
# are written back -- confirm the column order.
X=df.iloc[:,2:4].to_numpy()

# Learn the category -> ordinal code mapping for those two columns.
enc=preprocessing.OrdinalEncoder()
enc.fit(X)

In [None]:
cols=['RACE','ETHNICITY']
# Swap the raw category values for the fitted encoder's ordinal codes.
# Note that `X` is overwritten with its own transform here.
X=enc.transform(X)
df[cols]=X

In [None]:
# Keep complete rows only and shuffle with the same seed used for `dfcopy`.
df=df.dropna().sample(frac=1, random_state=7896)
df.head()

# Hierarchical Clustering

## Dendrogram

In [None]:
# Ward-linkage hierarchy over all rows of `df`, drawn as a dendrogram.
ward_links=shc.linkage(df, method='ward')

plt.figure(figsize=(10, 7))
plt.title("Patients Dendrograms")
dend = shc.dendrogram(ward_links)

## Agglomerative Clustering

In [None]:
# Three-cluster agglomerative model with Ward linkage.
# The distance keyword is omitted on purpose: Ward linkage only supports
# Euclidean distance, which is the default. The `affinity=` spelling was
# deprecated in scikit-learn 1.2 and later removed, so passing it breaks on
# current releases.
cluster= AgglomerativeClustering(n_clusters=3, linkage='ward')
# `fit` is enough here: the labels are read from `labels_` below.
cluster.fit(df)
print(cluster.labels_)

## Mean Shift

In [None]:
# Fit mean shift with default settings. This must happen before the
# placeholder column is added to `df` below.
clf=MeanShift().fit(df)
labels,cluster_centers=clf.labels_,clf.cluster_centers_

# Placeholder column, filled with the cluster labels in the next cell.
dfcopy['cluster_group']=np.nan
df['cluster_group']=np.nan

In [None]:
# Write the mean-shift label of row i into row i of both frames, by position.
# A single positional assignment per frame replaces the per-row
# `frame['cluster_group'].iloc[i] = ...` form. That form is chained indexing,
# which triggers SettingWithCopyWarning and does not modify the frame when
# pandas copy-on-write is active.
n_labelled=len(df)
for frame in (dfcopy,df):
    frame.iloc[:n_labelled,frame.columns.get_loc('cluster_group')]=labels[:n_labelled]

# Number of distinct clusters found.
n_clusters_ =len(np.unique(labels))
print(n_clusters_)

In [None]:
# Preview the un-encoded copy together with its cluster labels.
dfcopy.head()

### Analyzing Groups

In [None]:
# Rows assigned to cluster 0, and how many there are.
temp0=dfcopy.loc[dfcopy['cluster_group'].eq(0)]
len(temp0)

In [None]:
# Rows assigned to cluster 1, and how many there are.
temp1=dfcopy.loc[dfcopy['cluster_group'].eq(1)]
len(temp1)

In [None]:
# Rows assigned to cluster 2, and how many there are.
temp2=dfcopy.loc[dfcopy['cluster_group'].eq(2)]
len(temp2)

In [None]:
# Rows assigned to cluster 3, and how many there are.
temp3=dfcopy.loc[dfcopy['cluster_group'].eq(3)]
len(temp3)

#### Looking at Outlier Group

In [None]:
# First seven rows of the cluster labelled 3.
temp3.head(7)

# PCA for Data Visualization

In [None]:
# Project the feature columns onto three principal components.
# `cluster_group` is left out of the input: it is the clustering result, not a
# feature, and including it would let the labels shape the components they are
# later plotted against. errors='ignore' keeps the cell usable when the column
# has not been added yet.
# NOTE(review): the features are not standardised before PCA, so large-valued
# columns will presumably dominate -- confirm whether that is intended.
pca=PCA(n_components=3)
pca_features=df.drop(columns=['cluster_group'],errors='ignore')
principalComponents=pca.fit_transform(pca_features)
principalDF = pd.DataFrame(data=principalComponents, columns =['Principal Component 1', 'Principal Component 2','Principal Component 3'])
principalDF.head()

In [None]:
# Attach each row's cluster label to its principal components by position.
# `principalDF` was built from `df` row by row but has its own 0..n-1 index,
# while `df` keeps the shuffled index left over from dropna/sample. An
# index-aligned concat would therefore mismatch labels and add NaN rows, so the
# labels are passed as a plain array.
finalDF=principalDF.assign(cluster_group=df['cluster_group'].to_numpy())
finalDF.head()

### 2D

In [None]:
# Scatter of the first two principal components, one colour per cluster label.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title('2 Component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)

# Only cluster labels 0-3 are drawn; each gets its own colour.
targets = [0,1,2,3]
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
    in_cluster = finalDF['cluster_group'] == target
    members = finalDF.loc[in_cluster]
    ax.scatter(members['Principal Component 1'],
               members['Principal Component 2'],
               c=color,
               s=50)
ax.legend(targets)
ax.grid()

### 3D

In [None]:
# 3-D scatter of all three principal components, one colour per cluster label.
fig, ax = plt.subplots(figsize=(15, 15), subplot_kw={'projection': '3d'})
ax.set_title('3 Component PCA', fontsize=20)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_zlabel('Principal Component 3', fontsize=15)

# Only cluster labels 0-3 are drawn; each gets its own colour.
targets = [0,1,2,3]
colors = ['r', 'g', 'b','y']
for target, color in zip(targets,colors):
    in_cluster = finalDF['cluster_group'] == target
    members = finalDF.loc[in_cluster]
    ax.scatter(members['Principal Component 1'],
               members['Principal Component 2'],
               members['Principal Component 3'],
               c=color,
               s=50)
ax.legend(targets)
ax.grid()