# Load Data
Load the word-embedding model and the raw FDA-approved drug names.

In [1]:
import numpy as np
from gensim.models import KeyedVectors
# word embedding model, stored in binary word2vec format
wv = KeyedVectors.load_word2vec_format('bio_embedding_intrinsic', binary=True)
# approved drug names: one per line, surrounding whitespace removed
drugs = None
with open('fda_approved/fda_approved.processed.names') as infile:
    drugs = list(map(str.strip, infile))

# Convert Raw FDA Approved Drug Names to Vectors
Here we take the FDA-approved drug names and convert them to word vectors.
First, we split each name into whitespace-separated tokens.
Then, for each drug, we look up the word vector of each token, scale it to unit length, and combine the unit vectors into a single unit-length "average" vector for the drug.
Individual tokens that are not in the vocabulary are dropped, which changes both the drug name (rebuilt from the remaining tokens) and the average.
Drug names with no tokens in the vocabulary are dropped entirely.

In [3]:
# we're first going to convert every multi-token drug into a word vector average
def wv_avg_tokens(tokens, wv):
    """Combine the embeddings of ``tokens`` into one unit-length vector.

    Every token found in ``wv`` is scaled to unit length, the unit vectors
    are summed in token order, and the sum is rescaled to unit length.

    Args:
        tokens: iterable of token strings making up one name.
        wv: embedding lookup supporting ``token in wv`` and ``wv[token]``
            (the latter returning a numpy vector).

    Returns:
        A ``(kept_tokens, vector)`` pair, where ``kept_tokens`` is the list
        of tokens that contributed to ``vector``. Returns ``(None, None)``
        when no token contributes, or when the contributing unit vectors
        cancel out exactly so that no direction is left.
    """
    # collect the unit vectors for each usable token
    new_toks = []
    unit_vecs = []
    for t in tokens:
        # skip tokens that aren't in the embedding
        if t not in wv:
            continue
        vec = wv[t]
        norm = np.linalg.norm(vec)
        # a zero vector has no direction; dividing by its norm would inject
        # NaNs into the result, so treat the token like a missing one
        if norm == 0:
            continue
        new_toks.append(t)
        unit_vecs.append(vec / norm)
    # nothing usable means the caller should drop this name
    if not unit_vecs:
        return None, None
    # sum sequentially, starting from the first unit vector
    ret_vec = sum(unit_vecs[1:], unit_vecs[0])
    total_norm = np.linalg.norm(ret_vec)
    # exact cancellation leaves a zero vector that cannot be rescaled
    if total_norm == 0:
        return None, None
    # and rescale
    return new_toks, ret_vec / total_norm

In [6]:
# report the size of the raw drug list before any conversion
print('Starting drug count: {0}'.format(len(drugs)))
# (name, vector) pairs; each name is rebuilt from the tokens that survived
drug_vec_tups = []
# raw names for which wv_avg_tokens produced no vector at all
dropped_drugs = []
for raw_name in drugs:
    kept_toks, vec = wv_avg_tokens(raw_name.split(), wv)
    if vec is None:
        # the drug was lost completely
        dropped_drugs.append(raw_name)
        continue
    drug_vec_tups.append((' '.join(kept_toks), vec))
# report how many raw names yielded a vector
print('Converted drug count: {0}'.format(len(drug_vec_tups)))
# rebuilt names can collide, so keep only the first pair seen for each name
drug_seen = set()
unique_tups = []
for pair in drug_vec_tups:
    name = pair[0]
    if name in drug_seen:
        continue
    drug_seen.add(name)
    unique_tups.append(pair)
drug_vec_tups = unique_tups
# report the number of distinct converted names
print('Final distinct drug vector count: {0}'.format(len(drug_vec_tups)))

Starting drug count: 8561
Converted drug count: 6506
Final distinct drug vector count: 5850
