# Sent2Vec: admission diagnosis clustering

Group admission diagnoses using feature vectors from a pretrained NLP model (Sent2Vec).

# Calculate feature vector for each diagnosis string

In [None]:
import os
import numpy as np
from collections import Counter
import sent2vec

# Directory for the intermediate .npy artifacts this notebook reads and writes.
os.makedirs("_cache", exist_ok=True)

# Pretrained Sent2Vec weights; the path is relative to the working directory.
SENT2VEC_MODEL_PATH = '../pretrained_models/wiki_unigrams.bin'
sent2vec_model = sent2vec.Sent2vecModel()
# Load the weights from disk into the model object created above.
sent2vec_model.load_model(SENT2VEC_MODEL_PATH)

In [None]:
# Load the cached patient demographics. np.load returns an object array for
# this file, and .item() unwraps it into the dict that is indexed below.
# NOTE(review): allow_pickle=True unpickles the file contents, which can
# execute arbitrary code -- only load cache files produced by this project.
patient_demo_dict = np.load('_cache/patient_demo.npy', allow_pickle=True).item()
admissiondx = patient_demo_dict['apacheadmissiondx']

# Reuse previously computed embeddings when a cache file exists; otherwise
# embed every diagnosis string with the Sent2Vec model and cache the result.
admissiondx_embs_cache_path = '_cache/admissiondx_embs.npy'
if os.path.exists(admissiondx_embs_cache_path):
    admissiondx_embs = np.load(admissiondx_embs_cache_path, allow_pickle=True)
else:
    admissiondx_embs = sent2vec_model.embed_sentences(admissiondx)
    # Write to the same path variable that the existence check reads, so the
    # two can never drift apart if the cache location is changed.
    np.save(admissiondx_embs_cache_path, admissiondx_embs)

In [3]:
# Collapse every axis after the first so each diagnosis is a single row vector.
admissiondx_embs = np.reshape(admissiondx_embs, (admissiondx_embs.shape[0], -1))
admissiondx_embs.shape

(43540, 600)

# Feature vector clustering 

In [25]:
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.cluster import AffinityPropagation, DBSCAN, OPTICS

import matplotlib.pyplot as plt

## DBSCAN Clustering

In [4]:
# Cluster the diagnosis embeddings with DBSCAN.
DBSCAN_clusters = DBSCAN(eps=0.3, min_samples=10)
DBSCAN_clusters.fit(admissiondx_embs)
# Report the count itself; printing `.shape` showed a 1-tuple such as "(123,)".
print("Number of core samples:", len(DBSCAN_clusters.core_sample_indices_))

admissiondx_dbscan_labels = DBSCAN_clusters.labels_
# Boolean mask over all samples that is True at the core-sample indices.
core_samples_mask = np.zeros_like(admissiondx_dbscan_labels, dtype=bool)
core_samples_mask[DBSCAN_clusters.core_sample_indices_] = True

# The label -1 marks noise, so it is excluded from the cluster count and
# tallied separately.
n_clusters_ = len(set(admissiondx_dbscan_labels)) - (1 if -1 in admissiondx_dbscan_labels else 0)
n_noise_ = int(np.count_nonzero(admissiondx_dbscan_labels == -1))
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

In [16]:
# Map each DBSCAN label to the list of row indices assigned to that label.
diagnosis_dict = {}

for row_idx, cluster_label in enumerate(admissiondx_dbscan_labels):
    diagnosis_dict.setdefault(cluster_label, []).append(row_idx)

In [None]:
# Display the diagnosis entries whose DBSCAN label is 1.
# NOTE(review): indexing with a list of row indices assumes admissiondx is a
# NumPy array -- confirm.
admissiondx[diagnosis_dict[1]]

In [None]:
# Display the diagnosis entries whose DBSCAN label is 2.
# NOTE(review): indexing with a list of row indices assumes admissiondx is a
# NumPy array -- confirm.
admissiondx[diagnosis_dict[2]]

In [None]:
# Print the size and members of every DBSCAN cluster in label order.
# The labels come from diagnosis_dict instead of a hard-coded range(128), so
# the loop neither raises KeyError when there are fewer clusters nor silently
# truncates when there are more. Noise (label -1) is skipped, as before.
for i in sorted(label for label in diagnosis_dict if label != -1):
    members = admissiondx[diagnosis_dict[i]]
    print('\n', len(members), '\n', members)

## OPTICS Clustering

In [27]:
# Cluster the same embeddings with OPTICS.
# Per the scikit-learn documentation, a float min_cluster_size is read as a
# fraction of the number of samples (here 1%).
OPTICS_cluster = OPTICS(min_samples=50, xi=.05, min_cluster_size=.01)
# Fit in place; as the last expression of the cell this also displays the estimator.
OPTICS_cluster.fit(admissiondx_embs)

  ratio = reachability_plot[:-1] / reachability_plot[1:]


OPTICS(min_cluster_size=0.01, min_samples=50)

In [32]:
# Count the distinct values in the OPTICS label array; every distinct value
# is counted, so a noise label (if present) is included in the total.
num_labels_optics = np.unique(OPTICS_cluster.labels_).size
print('Estimated number of labels: %d' % num_labels_optics)

20

In [33]:
# Map each OPTICS label to the list of row indices assigned to that label.
diagnosis_dict_optics = {}

for row_idx, cluster_label in enumerate(OPTICS_cluster.labels_):
    diagnosis_dict_optics.setdefault(cluster_label, []).append(row_idx)

In [80]:
# f = open('diagnosis_stats.txt', 'w')

# for i in range(-1, 19):
#     f.write(f'Group {i}\n')
#     c = Counter(admissiondx[diagnosis_dict_optics[i]])
#     for key in c:
#         f.write(f'{key}: {c[key]}\n')
#     f.write('\n\n\n')
# f.close()

## Save clustering models

In [87]:
import joblib

# Persist both fitted estimators. The target paths are relative, so the files
# land in the current working directory (not in _cache) and carry no extension.
joblib.dump(OPTICS_cluster, 'admission_diagnosis_cluster_OPTICS')
joblib.dump(DBSCAN_clusters, 'admission_diagnosis_cluster_DBSCAN')

['diagnosis_cluster_DBSCAN']

In [88]:
# Restore the fitted estimators written by the joblib.dump calls above.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code;
# only load files that this project produced.
OPTICS_cluster = joblib.load('admission_diagnosis_cluster_OPTICS')
DBSCAN_clusters = joblib.load('admission_diagnosis_cluster_DBSCAN')

In [None]:
# Scatter-plot the DBSCAN result: one colour per cluster, noise in black,
# core samples drawn larger than non-core samples.
# NOTE: only the first two embedding dimensions are used as plot coordinates.
admissiondx_dbscan_labels = DBSCAN_clusters.labels_

# Rebuild the derived values from the model itself so this cell does not rely
# on variables left over from the clustering cell (it runs after a reload).
core_samples_mask = np.zeros_like(admissiondx_dbscan_labels, dtype=bool)
core_samples_mask[DBSCAN_clusters.core_sample_indices_] = True
unique_labels = np.unique(admissiondx_dbscan_labels)
n_clusters_ = len(unique_labels) - (1 if -1 in unique_labels else 0)

# One colour per distinct label. Iterating over the per-sample label array
# instead would redraw each cluster once for every one of its members, each
# time with a different colour.
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (admissiondx_dbscan_labels == k)

    # Core samples of this cluster: large markers.
    xy = admissiondx_embs[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=14)

    # Remaining (non-core) samples of this cluster: small markers.
    xy = admissiondx_embs[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
