# Vectorize Asthma Trial inclusion data

## Using Google's pre-trained Word2Vec

Pros: Pre-trained  
Cons: Likely missing lots of medical terminology important to the meaning of criteria

Important check: Capture all words that Word2Vec doesn't recognize in the asthma study subset

In [1]:
import os
import pickle
import re
import string
from collections import defaultdict
from itertools import combinations

import community
import gensim
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import randomcolor
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TreebankWordTokenizer
from pymongo import MongoClient
from scipy.spatial.distance import pdist
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

## Connect to Mongo clinical_trials DB 

In [2]:
def connect_to_mongo(database, collection):
    """
    Create a Mongo client with default connection settings and look up
    a database and one of its collections by name.

    Input Parameters:
    database: name of the database to look up (str)
    collection: name of the collection inside that database (str)

    Returns:
    A 2-tuple:
    - the database object (no collection selected)
    - the collection object inside that database
    """
    mongo_db = MongoClient()[database]
    return mongo_db, mongo_db[collection]

In [3]:
# connect_to_mongo returns (database, collection): trials_loc is the handle to
# the 'clinical_trials' database and eligibility_loc is its 'eligibilities'
# collection.
trials_loc, eligibility_loc = connect_to_mongo('clinical_trials', 'eligibilities')

## Pre-process inclusion data

In [None]:
# Select every eligibility document whose inclusion_criteria text matches "asthma".
doc_cursor = eligibility_loc.find({"inclusion_criteria": { '$regex' : ".*asthma.*"}})

stoplist = stopwords.words('english')
# Membership is tested once per token, so look words up in a set rather than
# scanning the stop-word list every time.
stop_set = set(stoplist)

# A token is a run of two or more lowercase ASCII letters; nothing else in the
# criterion text is kept.
token_pattern = re.compile(r'[a-z][a-z]+')

# One list of tokens per criterion, across all matching studies.
inclusion_texts = []

for study in doc_cursor:
    for crit in study['cleaned_inclusion']:
        inclusion_texts.append(
            [word for word in token_pattern.findall(crit) if word not in stop_set]
        )
print(inclusion_texts[0:5])

## Load Google's pre-trained Word2Vec model 

In [5]:
# Location of the pre-trained GoogleNews Word2Vec binary. Set the
# GOOGLE_NEWS_W2V_PATH environment variable to point at a local copy;
# when it is unset, the original hard-coded path is used.
google_vec_file = os.environ.get(
    'GOOGLE_NEWS_W2V_PATH',
    '/Users/courtney/ds/Word2Vec/GoogleNews-vectors-negative300.bin',
)

In [6]:
# Read the word2vec-format binary file at google_vec_file via gensim's
# KeyedVectors loader.
# NOTE(review): despite its name, `model` should therefore be a KeyedVectors
# lookup rather than a trainable Word2Vec model - confirm against the gensim
# version in use.
model = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True)

## Get a vector for each criterion

### Function to vectorize each inclusion criterion

In [7]:
# Words for which the embedding model has no vector (first-seen order, no
# duplicates). Filled in as a side effect of get_doc_vec.
missing_words = []

def get_doc_vec(words, model):
    """
    Average the word vectors of one tokenized criterion.

    Input Parameters:
    words: tokens of a single criterion (iterable of str)
    model: word-vector lookup indexable by word, either directly
           (e.g. gensim KeyedVectors) or through a `.wv` attribute
           (e.g. a full Word2Vec model)

    Returns:
    The element-wise mean of the vectors of all recognized words,
    or None if none of the words are known to the model.

    Side effects:
    Every unrecognized word is appended once to the module-level
    `missing_words` list.
    """
    # Resolve the lookup once. Bare KeyedVectors in gensim >= 4 have no `.wv`
    # attribute; previously that AttributeError was swallowed by a bare
    # `except`, which made every word look "missing".
    vectors = getattr(model, 'wv', model)

    found_vectors = []
    for word in words:
        try:
            found_vectors.append(vectors[word])
        except KeyError:
            # Out-of-vocabulary word: record it and move on. Any other error
            # is a real problem and is allowed to propagate.
            if word not in missing_words:
                missing_words.append(word)
    # If no words are in the model there is nothing to average.
    if not found_vectors:
        return None
    # Mean over words (rows), giving a single vector per criterion.
    return np.vstack(found_vectors).mean(axis=0)

### Calculate number of unique keywords in the asthma criteria dataset 

In [8]:
# Flatten all criteria into a single stream of tokens.
all_words = [word for crit in inclusion_texts for word in crit]
total_word_count = len(all_words)
# dict.fromkeys removes duplicates in one pass while keeping first-seen order,
# so unique_words is the same list a per-word membership scan would build,
# without rescanning the list for every token.
unique_words = list(dict.fromkeys(all_words))
unique_word_count = len(unique_words)
print("Unique words:", unique_word_count, "\nTotal words:", total_word_count)

Unique words: 6741 
Total words: 122252


### Vectorize each inclusion criterion

#### Create an empty array to fill with vectorized criteria

In [9]:
# One row per criterion in inclusion_texts, initialised to zeros; the rows are
# filled in by the vectorization loop that follows.
# NOTE(review): 300 is assumed to be the length of the model's word vectors
# (the loaded file is named "...negative300") - confirm, e.g. against
# model.vector_size.
inclusion_vectors = np.zeros((len(inclusion_texts), 300))
inclusion_vectors.shape

(11298, 300)

#### Vectorize the criteria

In [10]:
# Row indices of criteria for which get_doc_vec found no usable word.
unvectorized_rows = []

for i, doc in enumerate(inclusion_texts):
    vec = get_doc_vec(doc, model)
    if vec is None:
        # None means no token of this criterion had a vector. Assigning None
        # into a float array silently stores NaN; do that explicitly and keep
        # track of the row so it can be filtered out downstream.
        unvectorized_rows.append(i)
        inclusion_vectors[i, :] = np.nan
        continue
    inclusion_vectors[i, :] = vec
print(f"\nMissing {len(missing_words)} out of {unique_word_count} unique words: {round(len(missing_words)/unique_word_count*100)}% missing\n")
if unvectorized_rows:
    print(f"{len(unvectorized_rows)} criteria had no recognized words; their rows are NaN\n")
# Show the last criterion's vector next to its tokens as a spot check.
print(vec, '\n')
print(inclusion_texts[-1])




Missing 1219 out of 6741 unique words: 18% missing

[ 0.10852051  0.23339844 -0.07384491  0.05639648 -0.02832031 -0.04815674
  0.22998047 -0.11035156  0.17773438  0.09515381  0.06445312 -0.04302979
 -0.12330627 -0.17285156 -0.07044983 -0.09249878  0.03405762  0.1496582
 -0.08163452 -0.12988281 -0.0647583  -0.11698914 -0.13134766 -0.0904541
 -0.0355835  -0.17211914 -0.05151367  0.11254883  0.04956055  0.02148438
 -0.04345703 -0.13574219  0.02716064  0.0284729   0.02648926  0.08056641
 -0.0916748  -0.17919922 -0.11181641  0.17480469  0.06563568  0.0970459
  0.09326172 -0.01611328  0.04931641 -0.01098633  0.04443359  0.18701172
 -0.1164856  -0.0880127  -0.08587646 -0.15991211 -0.21484375 -0.12158203
  0.12904739  0.04299927  0.07922363 -0.05505371 -0.18981934 -0.03833008
 -0.05554199  0.06396484  0.08081055 -0.13882446  0.18188477  0.04833984
  0.06176758  0.13983154  0.16992188  0.05651855 -0.04174805 -0.11138916
 -0.00317383  0.25830078 -0.00390625  0.0871582   0.04637146  0.08178711
 

18% of the unique words are missing from the pre-trained Word2Vec vocabulary. Most of them are numbers.

In [14]:
# print(missing_words)

In [12]:
# Number of distinct words recorded in missing_words, i.e. words for which the
# vector lookup in get_doc_vec failed.
print(len(missing_words))

1219


#### Exploring how well GoogleNews Word2Vec performs on medical words 

In [None]:
# Spot check: the 8 nearest neighbours of a biomedical term in the embedding.
model.most_similar('gene', topn=8)

In [None]:
# Spot check: the 8 nearest neighbours of an allergy-related term.
model.most_similar('pollen', topn=8)

In [None]:
# Spot check: the 8 nearest neighbours of an underscore-joined, mixed-case token.
model.most_similar('mg_dL', topn=8)

## Pickle vectorized eligibility criteria  

In [16]:
# Persist the criteria matrix. The context manager guarantees the file is
# flushed and closed even if pickling raises, instead of leaving an open
# handle for the garbage collector.
with open("vectorized_criteria.p", "wb") as pickle_file:
    pickle.dump(inclusion_vectors, pickle_file)