In [1]:
import numpy as np
from gensim.models import KeyedVectors

In [2]:
# word embedding model, read from a binary word2vec-format file
wv = KeyedVectors.load_word2vec_format('bio_embedding_intrinsic', binary=True)
# approved drug names: one name per line of the file, with the
# surrounding whitespace (including the newline) stripped off
drugs = None
with open('fda_approved/fda_approved.processed.names') as names_file:
    drugs = list(map(str.strip, names_file))

In [3]:
# we're first going to convert every multi-token drug into a word vector average
def wv_avg_tokens(tokens, wv):
    """Combine the embeddings of several tokens into a single unit vector.

    Each token found in ``wv`` is scaled to unit length, the unit vectors are
    added together, and the sum is scaled back to unit length (i.e. the result
    is the direction of the mean of the unit vectors).

    Parameters
    ----------
    tokens : iterable of str
        Tokens to look up. Tokens that are not in ``wv`` are skipped.
    wv : mapping-like
        Anything supporting ``token in wv`` and ``wv[token]`` that yields
        numpy vectors.

    Returns
    -------
    numpy.ndarray or None
        The unit-length combined vector, or ``None`` when no usable vector
        could be built: no token is in ``wv``, every matching vector has zero
        length, or the unit vectors cancel out to a zero sum.
    """
    # collect the unit vectors for each token
    unit_vecs = []
    for t in tokens:
        # skip tokens that aren't in the embedding
        if t not in wv:
            continue
        vec = wv[t]
        norm = np.linalg.norm(vec)
        # a zero-length vector has no direction; dividing by its norm would
        # put NaNs into the result, so treat it like a missing token
        if norm == 0:
            continue
        unit_vecs.append(vec / norm)
    # now add them up if we got at least one
    if not unit_vecs:
        return None
    # sum (sequentially, in token order)
    ret_vec = unit_vecs[0]
    for uvec in unit_vecs[1:]:
        ret_vec = ret_vec + uvec
    # and rescale -- unless the vectors cancelled each other out, in which
    # case there is no direction to report
    total_norm = np.linalg.norm(ret_vec)
    if total_norm == 0:
        return None
    return ret_vec / total_norm

In [4]:
# build one (name, vector) pair per drug; names are split on whitespace
# and handed to wv_avg_tokens, and any drug for which that returns
# None is reported instead of being stored
drug_vectors = []
for drug_name in drugs:
    name_vec = wv_avg_tokens(drug_name.split(), wv)
    if name_vec is not None:
        drug_vectors.append((drug_name, name_vec))
        continue
    print('No vector for: {0}'.format(drug_name))

No vector for: a-hydrocort
No vector for: a-methapred
No vector for: a-poxide
No vector for: a.p.l.
No vector for: a/t/s
No vector for: abitrexate
No vector for: ablysinol
No vector for: abrilada
No vector for: absorica
No vector for: acanya
No vector for: accretropin
No vector for: accrufer
No vector for: accuneb
No vector for: accurbron
No vector for: accuretic
No vector for: aceon
No vector for: acetasol
No vector for: aches-n-pain
No vector for: acilac
No vector for: aclovate
No vector for: acrisorcin
No vector for: actahist
No vector for: acticlate
No vector for: acticort
No vector for: actidil
No vector for: actin-n
No vector for: actinex
No vector for: activella
No vector for: adagen
No vector for: adakveo
No vector for: adalimumab-adaz
No vector for: adalimumab-adbm
No vector for: adalimumab-afzb
No vector for: adalimumab-atto
No vector for: adalimumab-bwwd
No vector for: adipex-p
No vector for: adlyxin
No vector for: admelog
No vector for: adphen
No vector for: adrenaclick
No 

No vector for: neopap
No vector for: neopasalate
No vector for: neoscan
No vector for: neothylline
No vector for: neotrizine
No vector for: nephroflow
No vector for: nerlynx
No vector for: netspot
No vector for: neuraceq
No vector for: neuramate
No vector for: neutrexin
No vector for: nexcede
No vector for: nexletol
No vector for: nexlizet
No vector for: niacor
No vector for: niclocide
No vector for: nicolar
No vector for: nilandron
No vector for: nilstat
No vector for: niravam
No vector for: nithiodote
No vector for: nitrol
No vector for: nitromist
No vector for: nitronal
No vector for: nityr
No vector for: nivestym
No vector for: nocdurna
No vector for: noctiva
No vector for: nor-qd
No vector for: norcet
No vector for: nordette-21
No vector for: nordette-28
No vector for: norethin 1/35e-21
No vector for: norethin 1/35e-28
No vector for: norethin 1/50m-21
No vector for: norethin 1/50m-28
No vector for: noritate
No vector for: norlutate
No vector for: normodyne
No vector for: normozide

In [5]:
# build the "sars treatment" target by analogy:
#   metformin - diabetes + sars
# each term is scaled to unit length before being combined
metf_v, diab_v, sars_v = (
    wv[term] / np.linalg.norm(wv[term])
    for term in ('metformin', 'diabetes', 'sars')
)
treat_sars_vec = metf_v - diab_v + sars_v
# the combination is no longer unit length, so rescale it
treat_sars_vec = treat_sars_vec / np.linalg.norm(treat_sars_vec)

In [6]:
# score every drug vector against the sars treatment vector
# drug_vectors holds (name, vector) tuples, so pull the vectors out into one array
dvs = np.array([vec for _, vec in drug_vectors])
drug2treat_sims = wv.cosine_similarities(treat_sars_vec, dvs)
# pair each drug name with its similarity, keeping the order of drug_vectors
drug_sim_tuples = [
    (name, sim)
    for (name, _), sim in zip(drug_vectors, drug2treat_sims)
]
# rank a copy from most to least similar (drug_sim_tuples itself stays unsorted)
ranked_results = list(drug_sim_tuples)
ranked_results.sort(key=lambda pair: pair[1], reverse=True)
# show the top 100 as text
ranked_results_str = [str(pair) for pair in ranked_results]
print('\n'.join(ranked_results_str[:100]))

('gilteritinib fumarate', 0.5596476)
('peramivir', 0.55696976)
('zanamivir', 0.5469999)
('erdafitinib', 0.528765)
('atovaquone and proguanil hydrochloride', 0.5266413)
('rimantadine hydrochloride', 0.52499855)
('delavirdine mesylate', 0.5242566)
('atazanavir sulfate and ritonavir', 0.5215566)
('cobimetinib fumarate', 0.520225)
('niclosamide', 0.51958627)
('lopinavir and ritonavir', 0.51907957)
('temsirolimus', 0.51461643)
('rilpivirine hydrochloride', 0.51084423)
('alectinib hydrochloride', 0.5094976)
('lefamulin acetate', 0.5077356)
('perphenazine and amitriptyline hydrochloride', 0.5071963)
('alogliptin and metformin hydrochloride', 0.506837)
('tamiflu', 0.50654095)
('selinexor', 0.5053856)
('amprenavir', 0.50434387)
('ibuprofen and diphenhydramine citrate', 0.5035539)
('olanzapine and fluoxetine hydrochloride', 0.50291455)
('probenecid and colchicine', 0.50227225)
('erlotinib hydrochloride', 0.5016403)
('bicalutamide', 0.5016144)
('alomide', 0.50147206)
('amantadine hydrochloride', 

In [7]:
# look up every drug whose name contains 'prazosin'
# NOTE(review): the number printed is the position in drug_sim_tuples (the
# unsorted list), not the similarity rank -- confirm that is what is wanted
for idx, (name, sim) in enumerate(drug_sim_tuples):
    if 'prazosin' in name:
        print((idx, name, sim))

(4775, 'prazosin hydrochloride', 0.41114467)


In [8]:
# numbered list of the 100 best hits for manual review
# (similarity shown to 3 decimals inside backslash-escaped square brackets)
for rank, (name, score) in enumerate(ranked_results[:100], start=1):
    print('{0}. {1} \\[{2:.3f}\\]'.format(rank, name, score))

1. gilteritinib fumarate \[0.560\]
2. peramivir \[0.557\]
3. zanamivir \[0.547\]
4. erdafitinib \[0.529\]
5. atovaquone and proguanil hydrochloride \[0.527\]
6. rimantadine hydrochloride \[0.525\]
7. delavirdine mesylate \[0.524\]
8. atazanavir sulfate and ritonavir \[0.522\]
9. cobimetinib fumarate \[0.520\]
10. niclosamide \[0.520\]
11. lopinavir and ritonavir \[0.519\]
12. temsirolimus \[0.515\]
13. rilpivirine hydrochloride \[0.511\]
14. alectinib hydrochloride \[0.509\]
15. lefamulin acetate \[0.508\]
16. perphenazine and amitriptyline hydrochloride \[0.507\]
17. alogliptin and metformin hydrochloride \[0.507\]
18. tamiflu \[0.507\]
19. selinexor \[0.505\]
20. amprenavir \[0.504\]
21. ibuprofen and diphenhydramine citrate \[0.504\]
22. olanzapine and fluoxetine hydrochloride \[0.503\]
23. probenecid and colchicine \[0.502\]
24. erlotinib hydrochloride \[0.502\]
25. bicalutamide \[0.502\]
26. alomide \[0.501\]
27. amantadine hydrochloride \[0.501\]
28. azelastine hydrochloride and 

In [9]:
# look up every drug whose name contains 'baricitinib'
# NOTE(review): the number printed is the position in drug_sim_tuples (the
# unsorted list), not the similarity rank -- confirm that is what is wanted
for idx, (name, sim) in enumerate(drug_sim_tuples):
    if 'baricitinib' in name:
        print((idx, name, sim))

(625, 'baricitinib', 0.40153417)


In [10]:
# number of ranked (drug name, similarity) pairs, i.e. drugs that got a vector
len(ranked_results)

6506

In [16]:
# dump the full ranking as a tab-separated file: a header row, then
# one "name<TAB>similarity" row per drug in ranked order
with open('sars.metformin_diabetes.approved.full.txt', 'w') as outfile:
    rows = ['hit\tsimilarity\n']
    rows.extend(
        '{0}\t{1}\n'.format(hit, sim) for hit, sim in ranked_results
    )
    outfile.writelines(rows)