# Sent2Vec: diagnosis clustering

test: group diagnosis with feature vectors from pretrained NLP model

In [None]:
import numpy as np
from collections import Counter

import sent2vec

# Calculate feature vector for each diagnosis string

In [None]:
patient_demo_dict = np.load('processed/patient_demographics.npy', allow_pickle=True).item()

diagnosis = patient_demo_dict['apacheadmissiondx']
diagnosis_vec = []

sent2vec_model = sent2vec.Sent2vecModel()
sent2vec_model.load_model('../pretrained_models/wiki_unigrams.bin')

for d in diagnosis:
    diagnosis_vec.append(sent2vec_model.embed_sentence(d))

np.save('processed/diagnosis', diagnosis)
np.save('processed/diagnosis_vec', diagnosis_vec)

In [None]:
diagnosis = np.load('processed/diagnosis.npy')
diagnosis_vec = np.load('processed/diagnosis_vec.npy')

# Feature vector clustering 

In [25]:
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.cluster import AffinityPropagation, DBSCAN, OPTICS

import matplotlib.pyplot as plt

In [3]:
diagnosis_vec = diagnosis_vec.reshape(diagnosis_vec.shape[0], -1)
diagnosis_vec.shape

(43540, 600)

### DBSCAN Clustering

In [4]:
# AP_clustering = AffinityPropagation(random_state=0).fit(diagnosis_vec)
DBSCAN_clusters = DBSCAN(eps=0.3, min_samples=10).fit(diagnosis_vec)
DBSCAN_clusters.core_sample_indices_.shape

In [6]:
diagnosis_labels = DBSCAN_clusters.labels_
core_samples_mask = np.zeros_like(diagnosis_labels, dtype=bool)
core_samples_mask[DBSCAN_clusters.core_sample_indices_] = True

In [7]:
n_clusters_ = len(set(diagnosis_labels)) - (1 if -1 in diagnosis_labels else 0)
n_noise_ = list(diagnosis_labels).count(-1)
n_clusters_, n_noise_

(127, 263)

In [16]:
diagnosis_dict = {}

for i, label in enumerate(diagnosis_labels):
    if label in diagnosis_dict:
        diagnosis_dict[label].append(i)
    else:
        diagnosis_dict[label] = [i]

In [19]:
diagnosis[diagnosis_dict[1]]

array(['Sepsis, GI', 'Asthma', 'Cardiomyopathy', ..., '', '', ''],
      dtype='<U177')

In [20]:
diagnosis[diagnosis_dict[2]]

array(['Angina, unstable (angina interferes w/quality of life or meds are tolerated poorly)',
       'Angina, unstable (angina interferes w/quality of life or meds are tolerated poorly)',
       'Angina, unstable (angina interferes w/quality of life or meds are tolerated poorly)',
       ...,
       'Angina, unstable (angina interferes w/quality of life or meds are tolerated poorly)',
       'Angina, unstable (angina interferes w/quality of life or meds are tolerated poorly)',
       'Angina, unstable (angina interferes w/quality of life or meds are tolerated poorly)'],
      dtype='<U177')

In [23]:
for i in range(128):
    print('\n',len(diagnosis[diagnosis_dict[i]]), '\n', diagnosis[diagnosis_dict[i]])


 28 
 ['Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction'
 'Thoracotomy for lung reduction' 'Thoracotomy for lung reduction']

 8014 
 ['Sepsis, GI' 'Asthma' 'Cardiomyopathy' ... '

KeyError: 127

### OPTICS Clustering

In [27]:
OPTICS_cluster = OPTICS(min_samples=50, xi=.05, min_cluster_size=.01)
OPTICS_cluster.fit(diagnosis_vec)

  ratio = reachability_plot[:-1] / reachability_plot[1:]


OPTICS(min_cluster_size=0.01, min_samples=50)

In [32]:
num_labels_optics = len(set(OPTICS_cluster.labels_))
num_labels_optics

20

In [33]:
diagnosis_dict_optics = {}

for i, label in enumerate(OPTICS_cluster.labels_):
    if label in diagnosis_dict_optics:
        diagnosis_dict_optics[label].append(i)
    else:
        diagnosis_dict_optics[label] = [i]

In [80]:
f = open('diagnosis_stats.txt', 'w')

for i in range(-1, 19):
    f.write(f'Group {i}\n')
    c = Counter(diagnosis[diagnosis_dict_optics[i]])
    for key in c:
        f.write(f'{key}: {c[key]}\n')
    f.write('\n\n\n')
f.close()

### save clustering models

In [87]:
import joblib

joblib.dump(OPTICS_cluster, 'diagnosis_cluster_OPTICS')
joblib.dump(DBSCAN_clusters, 'diagnosis_cluster_DBSCAN')

['diagnosis_cluster_DBSCAN']

In [88]:
model1 = joblib.load('diagnosis_cluster_OPTICS')

In [None]:
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(diagnosis_labels))]
for k, col in zip(diagnosis_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (diagnosis_labels == k)

    xy = diagnosis_vec[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    xy = diagnosis_vec[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
