In [1]:
import numpy as np
from gensim.models import KeyedVectors

In [2]:
# Load the word embedding model (word2vec binary format) into `wv`.
wv = KeyedVectors.load_word2vec_format(
    'bio_embedding_intrinsic',
    binary=True,
)
# Read our approved drug names: every line of the file becomes one entry,
# with leading/trailing whitespace (including the newline) stripped.
drugs = None
with open('fda_approved/fda_approved.processed.names') as infile:
    drugs = list(map(str.strip, infile))

In [5]:
# Ask the embedding for the 1000 best-scoring words when 'sars' and 'metformin'
# are the positive terms and 'diabetes' is the negative term.
# The result is an ordered list of (word, score) tuples.
sars_1k = wv.most_similar_cosmul(
    positive=['sars', 'metformin'],
    negative=['diabetes'],
    topn=1000,
)
# Show only the first 100 hits to keep the output manageable.
print(sars_1k[0:100])

[('cantharimide', 0.8812281489372253), ('sars-3clpro', 0.8788511157035828), ('sars-3cl', 0.8778709173202515), ('chelidonine', 0.8775564432144165), ('sar405', 0.8769442439079285), ('ly2523355', 0.8736880421638489), ('turbinamide', 0.871955394744873), ('peramivir', 0.870306134223938), ('sars-cov', 0.8696171641349792), ('cantharidin-mediated', 0.8688936829566956), ('katamine', 0.8688304424285889), ('sars-cov-induced', 0.8673214912414551), ('dual-inhibitor', 0.8666826486587524), ('norcantharimide', 0.8661585450172424), ('2-rimantadine', 0.8660918474197388), ('delaviridine', 0.865639865398407), ('motuporamine', 0.8655034303665161), ('lycodine', 0.8650529980659485), ('cantharidin-based', 0.864751398563385), ('cantharidine', 0.8642935156822205), ('norcantharidin-induced', 0.8641064763069153), ('zanamivir', 0.8634259104728699), ('cantharimides', 0.8633273839950562), ('sars-cov-mediated', 0.8629789352416992), ('nvp-231', 0.8608194589614868), ('norcantharimides', 0.8604865074157715), ('cantharid

In [6]:
# Reminder of what the drug names look like: first 100 entries, one per line.
print(*drugs[:100], sep='\n')

8-hour bayer
8-mop
a-hydrocort
a-methapred
a-poxide
a.p.l.
a/t/s
abacavir
abacavir and lamivudine
abacavir sulfate
abacavir sulfate and lamivudine
abacavir sulfate, lamivudine and zidovudine
abaloparatide
abarelix
abatacept
abciximab
abelcet
abemaciclib
abilify
abilify maintena kit
abilify mycite kit
abiraterone acetate
abitrexate
ablavar
ablysinol
abobotulinumtoxina
abraxane
abreva
abrilada
absorica
absorica ld
abstral
acalabrutinib
acamprosate calcium
acanya
acarbose
accolate
accretropin
accrufer
accuneb
accupril
accurbron
accuretic
accutane
acebutolol hydrochloride
aceon
acephen
acetadote
acetaminophen
acetaminophen and codeine phosphate
acetaminophen and hydrocodone bitartrate
acetaminophen and pentazocine hydrochloride
acetaminophen, aspirin and caffeine
acetaminophen, aspirin, and codeine phosphate
acetaminophen, caffeine and dihydrocodeine bitartrate
acetaminophen, caffeine, and dihydrocodeine bitartrate
acetasol
acetasol hc
acetated ringer's in plastic container
acetazolamide
a

In [17]:
# Drug names can span several words, so break them into single tokens for matching.
# For each name: drop the commas, split on whitespace, and collect every piece.
# Using a set means each distinct token is kept only once.
unique_tokens = {
    token
    for name in drugs
    for token in name.replace(',', '').split()
}
# Later cells expect a sorted list rather than a set.
drug_tokens = sorted(unique_tokens)
# Compare the number of original names with the number of distinct tokens.
print(len(drugs))
print(len(drug_tokens))

8561
7310


In [18]:
# Inspect the first 100 tokens (in sorted order), one per line.
print(*drug_tokens[:100], sep='\n')

""25""
""50""
""mg""
#1
#2
#3
#4
&
'125'
'200'
'250'
'400'
'500'
'800'
'875'
-
.625
0.037%
0.04%
0.075%
0.08%
0.1%
0.11%
0.12%
0.149%
0.15
0.15%
0.15/30-21
0.15/30-28
0.16%
0.167
0.2%
0.22%
0.224%
0.225%
0.25%
0.3
0.3%
0.30%
0.32%
0.33%
0.4%
0.45%
0.5
0.5/35
0.5/35-21
0.5/35-28
0.5/50-21
0.5/50-28
0.75
0.8%
0.83%
0.86%
0.9%
0/0
0/0/1.2
0/2.5
0/3.5
1
1%
1+35
1+50
1+80
1.25
1.5
1.5%
1.5/30
1/20
1/35
1/35-21
1/35-28
1/35e-21
1/35e-28
1/50
1/50-21
1/50-28
1/50e-21
1/50e-28
1/50m-21
1/50m-28
1/6
1/80
10
10%
10-21
10-25
10-4
10/11-21
10/11-28
100
100/3.6
100/33
100/50
1000
10000
104
10meq
11.4%
111
12


In [19]:
# The leading tokens are mostly numbers and punctuation, which are unlikely to be
# useful or to match any top hits. Look at a slice from the middle of the sorted
# list instead (positions 3000 through 3099).
print(*drug_tokens[3000:3100], sep='\n')

goprelto
goserelin
gralise
gramicidin
granisetron
granisol
granix
grape
grazoprevir
green
grepafloxacin
grifulvin
gris-peg
grisactin
griseofulvin
griseofulvinultramicrosize
guaifenesin
guanabenz
guanadrel
guanethidine
guanfacine
guanidine
guselkumab
gvoke
gvs
gynazole-1
gyne-lotrimin
gyne-sulf
gynix
gynodiol
gynorest
h
h-cort
h-pylori
h.p.
h.r.-50
habitrol
hadlima
hailey
halaven
halazepam
halcinonide
halcion
haldol
haldrone
half-strength
halfan
halflytely
halobetasol
halofantrine
halog
halog-e
haloperidol
haloprogin
halotestin
halotex
halothane
harmonyl
harvoni
hb
hbc
hc
hca
hcl
hct
hd
head
heather
heavy
hectorol
hedspa
hedulin
helicosol
helidac
helium
hemabate
hemady
hemangeol
hemifumarate
hemihydrate
hemlibra
hemsol-hc
heparin
hepatamine
hepatasol
hepatolite
hepsera
heptahydrate
heptalac
her
herceptin
herplex
herzuma
hetacillin
hetlioz
hetrazan
hexa-betalin
hexa-germ
hexabrix
hexacetonide


In [21]:
# Might be good enough, let's see what we get.
# Each entry of sars_1k is a (name, similarity) tuple; keep just the names.
sars_1k_names = [hit[0] for hit in sars_1k]
# The match is fuzzy (a drug token counts if it appears anywhere inside the hit
# name), so every hit is compared against every token: n*m substring checks.
# A trie or similar would be more efficient, but this is fast enough here.
sars_drug_hits = []
for hit_name in sars_1k_names:
    matching_tokens = [tok for tok in drug_tokens if tok in hit_name]
    if not matching_tokens:
        # no approved-drug token occurs in this hit
        continue
    # keep the matching tokens alongside the name so we can debug how it matched
    sars_drug_hits.append((hit_name, matching_tokens))
# Report how many hits survived, then list each one with its matching tokens.
print('starting list: {0}'.format(len(sars_1k_names)))
print('ending list: {0}'.format(len(sars_drug_hits)))
print('*************')
sdh_str = list(map(str, sars_drug_hits))
print('\n'.join(sdh_str))

starting list: 1000
ending list: 1000
*************
('cantharimide', ['a', 'ar', 'c', 'd', 'e', 'h', 'i', 'im', 'm', 'n', 'r', 't'])
('sars-3clpro', ['-', '3', 'a', 'ar', 'c', 'cl', 'l', 'lp', 'p', 'pro', 'r', 's', 'sa'])
('sars-3cl', ['-', '3', 'a', 'ar', 'c', 'cl', 'l', 'r', 's', 'sa'])
('chelidonine', ['c', 'd', 'e', 'h', 'i', 'in', 'l', 'n'])
('sar405', ['4', '40', '5', 'a', 'ar', 'r', 's', 'sa'])
('ly2523355', ['2', '25', '3', '35', '5', 'l'])
('turbinamide', ['a', 'b', 'd', 'e', 'i', 'in', 'm', 'n', 'r', 't'])
('peramivir', ['a', 'e', 'er', 'i', 'iv', 'm', 'p', 'peramivir', 'r', 'v'])
('sars-cov', ['-', 'a', 'ar', 'c', 'r', 's', 'sa', 'v'])
('cantharidin-mediated', ['-', 'a', 'ar', 'at', 'c', 'd', 'e', 'h', 'i', 'in', 'm', 'n', 'r', 'rid', 't'])
('katamine', ['a', 'at', 'e', 'i', 'in', 'k', 'm', 'n', 't'])
('sars-cov-induced', ['-', 'a', 'ar', 'c', 'd', 'e', 'i', 'in', 'n', 'r', 's', 'sa', 'v'])
('dual-inhibitor', ['-', 'a', 'b', 'd', 'dual', 'dual-', 'h', 'i', 'ib', 'in', 'inh',

In [23]:
# Single letters and other very short tokens match nearly every hit as substrings,
# so they cannot be allowed. Quick and dirty approach: keep only the drug tokens
# that are at least 4 characters long.
drug_tokens_procd = list(filter(lambda tok: len(tok) >= 4, drug_tokens))

In [24]:
# Try the substring match again, this time against the length-filtered tokens.
# Lazily pair every hit name with the list of tokens found inside it...
candidate_matches = (
    (name, [tok for tok in drug_tokens_procd if tok in name])
    for name in sars_1k_names
)
# ...and keep only the pairs with at least one matching token.
# The token list stays in the tuple so we can debug how each hit matched.
sars_drug_hits = [pair for pair in candidate_matches if pair[1]]
# Report how many hits survived, then list each one with its matching tokens.
print('starting list: {0}'.format(len(sars_1k_names)))
print('ending list: {0}'.format(len(sars_drug_hits)))
print('*************')
sdh_str = list(map(str, sars_drug_hits))
print('\n'.join(sdh_str))

starting list: 1000
ending list: 228
*************
('peramivir', ['peramivir'])
('dual-inhibitor', ['dual', 'dual-'])
('2-rimantadine', ['rimantadine'])
('cantharidin-based', ['base'])
('zanamivir', ['zanamivir'])
('oseltamivir-conjugated', ['conjugated', 'oseltamivir'])
('rimantadine', ['rimantadine'])
('-oseltamivir', ['oseltamivir'])
('oseltamivir', ['oseltamivir'])
('axinellamine', ['ella'])
('acetylamoxapine', ['acetyl', 'amoxapine', 'cetyl'])
('norcantharidin-conjugated', ['conjugated'])
('rimantadine-resistant', ['rimantadine'])
('pironetin', ['iron'])
('acetylamantadine', ['acetyl', 'amantadine', 'cetyl'])
('7-halogenoindirubins', ['halog', 'logen', 'ogen'])
('archazolids', ['azolid'])
('oseltamivir-zanamivir', ['oseltamivir', 'zanamivir'])
('bpei-niclosamide', ['niclosamide'])
('delavirdine', ['delavirdine'])
('dual-inhibitors', ['dual', 'dual-'])
('gilteritinib', ['gilteritinib'])
('cyproheptadine-mediated', ['cyproheptadine'])
('erdafitinib', ['erdafitinib'])
('gefitinib-sen

In [25]:
# looking a little better, but still need to refine the matching