In [2]:
import numpy as np

from gensim.models import KeyedVectors
# load the word vectors stored in binary word2vec format in the file
# 'bio_embedding_intrinsic'; `wv` is used by the lookup cells below
wv = KeyedVectors.load_word2vec_format('bio_embedding_intrinsic', binary=True)

In [3]:
# exploring how drug names could be matched against the embedding vocabulary,
# in particular multi-word names: is a hyphen-joined form present, or only
# the single-token form?
embedding_vocab = wv.vocab
print(type(embedding_vocab))
for candidate in ('butabarbital-sodium', 'butabarbital'):
    print(candidate in embedding_vocab)

<class 'dict'>
False
True


In [4]:
# read in all the potential drug names and active ingredients
# dataset from: https://www.fda.gov/drugs/drug-approvals-and-databases/drugsfda-data-files
drug_name_pool = set()
with open('fda_approved/Products.txt') as infile:
    # Only remove the line terminator before splitting: a full strip() would
    # also eat leading/trailing tabs, silently dropping empty edge columns and
    # shifting every field relative to the header positions.
    header = [col.strip() for col in infile.readline().rstrip('\r\n').split('\t')]
    name_ind = header.index('DrugName')
    actv_ing_ind = header.index('ActiveIngredient')
    # now go through and collect them all
    # TODO handle ; separated and parenthetical lists
    # TODO e.g. "drug;drug;drug;" and "drug (alias;alias;alias)"
    for line in infile:
        if not line.strip():
            # blank line (e.g. at end of file) - nothing to collect
            continue
        parts = line.rstrip('\r\n').lower().split('\t')
        for ind in (name_ind, actv_ing_ind):
            # strip each field on its own so padding never creates
            # near-duplicate entries, and never record an empty name
            value = parts[ind].strip()
            if value:
                drug_name_pool.add(value)
# sort them into a list (kept under a separate name from the set so that
# `drug_names` is only ever a sorted list)
drug_names = sorted(drug_name_pool)
print(len(drug_names))
print('\n'.join(drug_names[:50]))

9019
8-hour bayer
8-mop
a-hydrocort
a-methapred
a-poxide
a.p.l.
a/t/s
abacavir and lamivudine
abacavir sulfate
abacavir sulfate and lamivudine
abacavir sulfate, lamivudine and zidovudine
abacavir sulfate; dolutegravir sodium; lamivudine
abacavir sulfate; lamivudine
abacavir sulfate; lamivudine; zidovudine
abacavir sulfate;lamivudine
abacavir; lamivudine
abacavir;lamivudine
abaloparatide
abarelix
abatacept
abciximab
abelcet
abemaciclib
abilify
abilify maintena kit
abilify mycite kit
abiraterone acetate
abitrexate
ablavar
ablysinol
abobotulinumtoxina
abraxane
abreva
abrilada
absorica
absorica ld
abstral
acalabrutinib
acamprosate calcium
acanya
acarbose
accolate
accretropin
accrufer
accuneb
accupril
accurbron
accuretic
accutane
acebutolol hydrochloride


In [5]:
# ';'-separated alias lists without parentheses are the easy case
# (e.g. abacavir;lamivudine can simply be split)
# parenthesised names are harder, so list every one of them to see
# what they actually look like
names_with_paren = (name for name in drug_names if '(' in name)
for name in names_with_paren:
    print(name)

actonel with calcium (copackaged)
aminosyn 10% (ph6)
aminosyn 7% (ph6)
aminosyn 8.5% (ph6)
antara (micronized)
atropine (autoinjector)
clobetasol propionate (emollient)
cytoxan (lyophilized)
ddavp (needs no refrigeration)
desmopressin acetate (needs no refrigeration)
dextrose 5%, sodium chloride 0.2% and potassium chloride 15meq (k)
dextrose 5%, sodium chloride 0.2% and potassium chloride 20meq (k)
dextrose 5%, sodium chloride 0.2% and potassium chloride 5meq (k)
dextrose 5%, sodium chloride 0.45% and potassium chloride 20meq (k) in plastic container
dicyclomine hydrochloride (preservative free)
doxil (liposomal)
doxorubicin hydrochloride (liposomal)
enoxaparin sodium (preservative free)
epinephrine (autoinjector)
epinephrine (copackaged)
evzio (autoinjector)
excedrin (migraine)
famotidine preservative free (pharmacy bulk)
fenofibrate (micronized)
ferric hexacyanoferrate(ii)
glyburide (micronized)
hc (hydrocortisone)
hydroserpine plus (r-h-h)
infuvite pediatric (pharmacy bulk package)


In [6]:
# only a handful of names carry parentheses, and most of the parenthesised
# text is not a drug or alias name
# check whether the few that are drugs already appear on their own, in which
# case the ()s can simply be dropped
drug_name_set = set(drug_names)
names_to_check = [
    'premarin', 'cycrin', 'hydrocortisone', 'ampicillin',
    'sulfabenzamide', 'sulfacetamide', 'sulfathiazole',
    'sulfadiazine', 'sulfamerazine', 'sulfamethazine',
]
for candidate in names_to_check:
    is_present = candidate in drug_name_set
    print('{0}: {1}'.format(candidate, is_present))

premarin: True
cycrin: True
hydrocortisone: True
ampicillin: False
sulfabenzamide: False
sulfacetamide: False
sulfathiazole: False
sulfadiazine: True
sulfamerazine: False
sulfamethazine: False


In [7]:
# okay, to process these into individual drugs, let's do this
# 1. drop all text after the first occurrence of a '('
# 2. split remaining text by ';'
# 3. manually add our list of drugs from ()s [names_to_check above]
cleaned_names = set()
for dn in drug_names:
    # keep only the text before the first '(' (the whole name if there is none)
    before_paren = dn.strip().partition('(')[0]
    # split by ';' for multiple names; skip empty pieces so that a trailing
    # ';', a doubled ';;' or a name that starts with '(' never yields a
    # blank drug name in the output
    for piece in before_paren.split(';'):
        piece = piece.strip()
        if piece:
            cleaned_names.add(piece)
# and add all of our manually gathered names
cleaned_names.update(names_to_check)
# how'd we do? (the set keeps its own name so that `final_drug_names` is
# only ever the sorted list used by the following cells)
final_drug_names = sorted(cleaned_names)
print('\n'.join(final_drug_names[:50]))

8-hour bayer
8-mop
a-hydrocort
a-methapred
a-poxide
a.p.l.
a/t/s
abacavir
abacavir and lamivudine
abacavir sulfate
abacavir sulfate and lamivudine
abacavir sulfate, lamivudine and zidovudine
abaloparatide
abarelix
abatacept
abciximab
abelcet
abemaciclib
abilify
abilify maintena kit
abilify mycite kit
abiraterone acetate
abitrexate
ablavar
ablysinol
abobotulinumtoxina
abraxane
abreva
abrilada
absorica
absorica ld
abstral
acalabrutinib
acamprosate calcium
acanya
acarbose
accolate
accretropin
accrufer
accuneb
accupril
accurbron
accuretic
accutane
acebutolol hydrochloride
aceon
acephen
acetadote
acetaminophen
acetaminophen and codeine phosphate


In [8]:
# good enough - save the processed names, one per line with a trailing
# newline, for use in the word vector work
with open('fda_approved/fda_approved.processed.names', 'w') as outfile:
    outfile.write('\n'.join(final_drug_names) + '\n')

In [9]:
# side experiment with multi-token drug names:
# which vocabulary entries sit closest to the combination of a name's tokens?
acebutolol_tokens = ['acebutolol', 'hydrochloride']
wv.most_similar(positive=acebutolol_tokens)

[('penbutolol', 0.8522176146507263),
 ('hydrochloride-timolol', 0.8416755199432373),
 ('hydrochlorothizide', 0.8413857221603394),
 ('oxprenolol', 0.8394852876663208),
 ('atenolol', 0.8310449719429016),
 ('bufuralol-hydrochloride', 0.82554030418396),
 ('hydroxypropranolol', 0.8228106498718262),
 ('dimethylpropranolol', 0.8134440183639526),
 ('hydrochlorid', 0.8131069540977478),
 ('tiapride-hydrochloride', 0.8115503787994385)]

In [10]:
# same query for a four-token name
acetaminophen_codeine_tokens = ['acetaminophen', 'and', 'codeine', 'phosphate']
wv.most_similar(positive=acetaminophen_codeine_tokens)

[('acetaminophen-containing', 0.8329548835754395),
 ('acetaminophens', 0.8154647350311279),
 ('p-acetaminophen', 0.812239408493042),
 ('acetaminophen-sensitive', 0.8013252019882202),
 ('dextropropoxyphene-containing', 0.8009302616119385),
 ('non-acetaminophen', 0.7994862198829651),
 ('post-acetaminophen', 0.7923027276992798),
 ('paracetamol-containing', 0.791881799697876),
 ('acetaminophen-sulfate', 0.7916483879089355),
 ('paracetamol', 0.7904905080795288)]

In [12]:
# how does most_similar combine several tokens? just a normalized average?
# build both candidate combinations by hand so they can be compared below
def _unit_scaled(vec):
    """Return `vec` divided by its norm (np.linalg.norm with default settings)."""
    return vec / np.linalg.norm(vec)


ace_v, hcl_v = wv['acebutolol'], wv['hydrochloride']
ace_norm, hcl_norm = np.linalg.norm(ace_v), np.linalg.norm(hcl_v)
ace_v_unit, hcl_v_unit = ace_v / ace_norm, hcl_v / hcl_norm
for vector_norm in (ace_norm, hcl_norm):
    print(vector_norm)
# candidate 1: unit-scale each vector, average, then unit-scale the average
unit_then_avg = _unit_scaled((ace_v_unit + hcl_v_unit) / 2.0)
# candidate 2: average the raw vectors, then unit-scale the average
avg_then_unit = _unit_scaled((ace_v + hcl_v) / 2.0)

4.3719134
3.927033


In [13]:
# hypothesis 1: the vectors are scaled to unit length before averaging
unit_then_avg_query = np.array([unit_then_avg])
wv.most_similar(positive=unit_then_avg_query)

[('hydrochloride', 0.8997487425804138),
 ('acebutolol', 0.899748682975769),
 ('penbutolol', 0.8522176146507263),
 ('hydrochloride-timolol', 0.8416755199432373),
 ('hydrochlorothizide', 0.8413857221603394),
 ('oxprenolol', 0.8394852876663208),
 ('atenolol', 0.8310450315475464),
 ('bufuralol-hydrochloride', 0.82554030418396),
 ('hydroxypropranolol', 0.8228106498718262),
 ('dimethylpropranolol', 0.8134440183639526)]

In [14]:
# hypothesis 2: the raw vectors are averaged and only the average is
# scaled to unit length afterward
avg_then_unit_query = np.array([avg_then_unit])
wv.most_similar(positive=avg_then_unit_query)

[('acebutolol', 0.9107880592346191),
 ('hydrochloride', 0.8881013989448547),
 ('penbutolol', 0.859885573387146),
 ('oxprenolol', 0.8477539420127869),
 ('hydrochlorothizide', 0.8413622975349426),
 ('hydrochloride-timolol', 0.8393110036849976),
 ('atenolol', 0.8377487063407898),
 ('hydroxypropranolol', 0.825943112373352),
 ('bufuralol-hydrochloride', 0.8228414058685303),
 ('metoprolol', 0.8174272179603577)]

In [15]:
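# it looks like the most_similar function does the former:
# scale each vector to unit length, average them, and then re-scale the average to unit length
# (the unit-then-average scores match the most_similar(positive=[...]) scores for the same tokens)
# that makes sense, really - it keeps a single large-norm vector from dominating the average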