# Load Data
Load the pretrained word-embedding model and the raw list of FDA-approved drug names.

In [3]:
import numpy as np
from gensim.models import KeyedVectors
# read the pretrained embedding, which is stored in binary word2vec format
wv = KeyedVectors.load_word2vec_format(
    'bio_embedding_intrinsic',
    binary=True,
)
# read the approved drug names, one per line, trimming surrounding whitespace
drugs = None
with open('fda_approved/fda_approved.processed.names') as names_file:
    drugs = list(map(str.strip, names_file))

# Convert Raw FDA Approved Drug Names to Vectors
Here we take the FDA approved drug names and convert them to word vectors.
First, we break them into tokens.
Then, for each drug, we look up the word vector of each token, scale it to unit length, and average the unit vectors (rescaling the result to unit length) to get a single vector for the drug.
We drop individual tokens that are not in the vocab, modifying the drug name along with the average.
Obviously, drug names with no tokens in the vocab are dropped.

In [4]:
# we're first going to convert every multi-token drug into a word vector average
def wv_avg_tokens(tokens, wv):
    """Combine the embeddings of a drug's tokens into one unit-length vector.

    Each token found in ``wv`` is scaled to unit length, the unit vectors are
    summed, and the sum is rescaled to unit length (i.e. the direction of the
    average token vector).

    Args:
        tokens: iterable of token strings making up one drug name.
        wv: embedding lookup supporting ``token in wv`` and ``wv[token]``
            (e.g. the KeyedVectors model loaded earlier in the notebook).

    Returns:
        A ``(kept_tokens, vector)`` tuple, where ``kept_tokens`` is the list
        of tokens that contributed to the vector, in their original order.
        ``(None, None)`` is returned when no token contributes a usable
        direction.
    """
    new_toks = []
    unit_vecs = []
    for t in tokens:
        # skip tokens that aren't in the embedding
        if t not in wv:
            continue
        vec = wv[t]
        norm = np.linalg.norm(vec)
        # a zero-length (or non-finite) vector has no direction; dividing by
        # its norm would put NaNs into the drug vector, so drop the token
        if norm == 0 or not np.isfinite(norm):
            continue
        new_toks.append(t)
        unit_vecs.append(vec / norm)
    # nothing usable means the whole drug is dropped
    if not unit_vecs:
        return None, None
    ret_vec = np.sum(unit_vecs, axis=0)
    total_norm = np.linalg.norm(ret_vec)
    # unit vectors that cancel out exactly leave no direction to rescale
    if total_norm == 0:
        return None, None
    return new_toks, ret_vec / total_norm

In [5]:
# report the size of the raw FDA list before any filtering
print('Starting drug count: {0}'.format(len(drugs)))
# (name, vector) pairs for drugs that kept at least one in-vocab token;
# the name is rebuilt from the surviving tokens only
drug_vec_tups = []
# raw names for which no token at all could be turned into a vector
dropped_drugs = []
for d in drugs:
    kept_toks, drug_vec = wv_avg_tokens(d.split(), wv)
    if drug_vec is None:
        # complete loss of this drug
        dropped_drugs.append(d)
        continue
    drug_vec_tups.append((' '.join(kept_toks), drug_vec))
# report how many raw names produced a vector
print('Converted drug count: {0}'.format(len(drug_vec_tups)))
# dropping tokens can make two different raw names collapse to the same
# "new" name, so keep only the first occurrence of each name
drug_seen = set()
first_occurrences = []
for dvt in drug_vec_tups:
    if dvt[0] in drug_seen:
        continue
    drug_seen.add(dvt[0])
    first_occurrences.append(dvt)
drug_vec_tups = first_occurrences
# report how many distinct converted names remain
print('Final distinct drug vector count: {0}'.format(len(drug_vec_tups)))

Starting drug count: 8561
Converted drug count: 6506
Final distinct drug vector count: 5850


# Build Treatment Analogy Vectors
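Now we generate treatment analogy vectors of the form `drug - disease + sars`, built from unit-length word vectors and rescaled to unit length.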
We use three analogies: Metformin-Diabetes, Benazepril-Hypertension, and Albuterol-Asthma.
These are the vectors we will use to rank the drugs.

In [6]:
def get_sars_treatment_analogy(seed_drug, seed_disease, wv, target_disease='sars'):
    """Build a unit-length treatment analogy vector for a target disease.

    The result is ``unit(seed_drug) - unit(seed_disease) + unit(target_disease)``
    rescaled to unit length, i.e. "seed_drug is to seed_disease as ? is to
    target_disease".

    Args:
        seed_drug: vocabulary key of a drug known to treat ``seed_disease``.
        seed_disease: vocabulary key of the disease treated by ``seed_drug``.
        wv: embedding lookup supporting ``wv[key]`` (e.g. the KeyedVectors
            model loaded earlier in the notebook).
        target_disease: vocabulary key of the disease to build the analogy
            for. Defaults to ``'sars'``, which is the original behavior.

    Returns:
        The unit-length analogy vector.

    Raises:
        Whatever ``wv[key]`` raises for a key that is missing from the
        embedding (presumably ``KeyError`` for KeyedVectors - confirm).
    """
    def _unit(key):
        # look the vector up once and scale it to unit length
        vec = wv[key]
        return vec / np.linalg.norm(vec)

    treat_vec = _unit(seed_drug) - _unit(seed_disease) + _unit(target_disease)
    return treat_vec / np.linalg.norm(treat_vec)

In [7]:
# one analogy vector per (seed drug, seed disease) pair
metf_diab_sars_v = get_sars_treatment_analogy(
    seed_drug='metformin', seed_disease='diabetes', wv=wv)
benz_hypr_sars_v = get_sars_treatment_analogy(
    seed_drug='benazepril', seed_disease='hypertension', wv=wv)
albu_asth_sars_v = get_sars_treatment_analogy(
    seed_drug='albuterol', seed_disease='asthma', wv=wv)

# Check Top 20 Hits for Treatment Vectors
We want to manually evaluate the top 20 hits for each treatment analogy to check for drugs and drug targets.
Finding drugs and drug targets in the top hits would suggest that the treatment vectors are in the right neighborhood.

In [27]:
# query the 20 nearest vocabulary entries for each analogy vector, in order
metf_diab_sars_top20, benz_hypr_sars_top20, albu_asth_sars_top20 = (
    wv.most_similar(positive=[analogy_v], topn=20)
    for analogy_v in (metf_diab_sars_v, benz_hypr_sars_v, albu_asth_sars_v)
)

### Metformin/Diabetes Top 20

In [28]:
# show each (word, similarity) hit on its own line
print(*metf_diab_sars_top20, sep='\n')

('sars', 0.7350299954414368)
('sars-cov', 0.5995138883590698)
('sars-3cl', 0.5938767194747925)
('sars-3clpro', 0.5917655825614929)
('sars-like', 0.588849663734436)
('sars-covs', 0.5769191980361938)
('sars-cov-induced', 0.5742613077163696)
('sars-cov-mediated', 0.5720081925392151)
('sars-cov-like', 0.5706111788749695)
('anti-sars-cov', 0.5702001452445984)
('pcsars-cov', 0.5684103965759277)
('hsars-cov', 0.5669524669647217)
('sars-co', 0.5651364922523499)
('anticoronaviral', 0.561847984790802)
('cantharimide', 0.5608478784561157)
('sar405', 0.5591368675231934)
('peramivir', 0.5569697618484497)
('norcantharidin-induced', 0.5555316209793091)
('cantharidin-mediated', 0.555138111114502)
('delaviridine', 0.5549775958061218)


In [29]:
# emit just the hit words, each terminated with a LaTeX row break
print('\n'.join('{0} \\\\'.format(word) for word, _ in metf_diab_sars_top20))

sars \\
sars-cov \\
sars-3cl \\
sars-3clpro \\
sars-like \\
sars-covs \\
sars-cov-induced \\
sars-cov-mediated \\
sars-cov-like \\
anti-sars-cov \\
pcsars-cov \\
hsars-cov \\
sars-co \\
anticoronaviral \\
cantharimide \\
sar405 \\
peramivir \\
norcantharidin-induced \\
cantharidin-mediated \\
delaviridine \\


### Benazepril/Hypertension Top 20

In [30]:
# show each (word, similarity) hit on its own line
print(*benz_hypr_sars_top20, sep='\n')

('sars', 0.6842663288116455)
('sars-3cl', 0.6048033237457275)
('sars-3clpro', 0.5865695476531982)
('sars-', 0.5783016085624695)
('sars-cov', 0.5710662007331848)
('sars-covs', 0.5611740946769714)
('p-sars', 0.5571820735931396)
('sars-like', 0.5532996654510498)
('sarsp', 0.5501185655593872)
('sars-cov-like', 0.5482956171035767)
('sars-hcov', 0.5416980981826782)
('anti-sars-cov', 0.5390284061431885)
('sars-s', 0.5341991186141968)
('coronavirion', 0.5340977907180786)
('lycodine', 0.5312058925628662)
('sarspp', 0.5307182669639587)
('sarse', 0.5294245481491089)
('sars-cov-s', 0.5278017520904541)
('sars-cov-', 0.5276678204536438)
('pcsars-cov', 0.5257420539855957)


In [31]:
# emit just the hit words, each terminated with a LaTeX row break
print('\n'.join('{0} \\\\'.format(word) for word, _ in benz_hypr_sars_top20))

sars \\
sars-3cl \\
sars-3clpro \\
sars- \\
sars-cov \\
sars-covs \\
p-sars \\
sars-like \\
sarsp \\
sars-cov-like \\
sars-hcov \\
anti-sars-cov \\
sars-s \\
coronavirion \\
lycodine \\
sarspp \\
sarse \\
sars-cov-s \\
sars-cov- \\
pcsars-cov \\


### Albuterol/Asthma Top 20

In [32]:
# show each (word, similarity) hit on its own line
print(*albu_asth_sars_top20, sep='\n')

('sars', 0.7238022685050964)
('sars-cov', 0.5882976055145264)
('csars', 0.5856403112411499)
('sars-covs', 0.5827745199203491)
('sarspp', 0.5768842101097107)
('sars-like', 0.5768049359321594)
('sars-cov-like', 0.5722691416740417)
('peramivir', 0.5660814046859741)
('vero-pipecuronium', 0.5657573938369751)
('sarsp', 0.5637783408164978)
('pancuronium-metocurine', 0.559868335723877)
('sars-hcov', 0.5589520931243896)
('sarse', 0.5587899684906006)
('pcsars-cov', 0.5573618412017822)
('sars-3cl', 0.5541790723800659)
('p-sars', 0.5507079362869263)
('sars-3clpro', 0.548815131187439)
('sars-', 0.5467952489852905)
('sars-coronavirus', 0.5443072319030762)
('pralidoxime', 0.5440007448196411)


In [33]:
# emit just the hit words, each terminated with a LaTeX row break
print('\n'.join('{0} \\\\'.format(word) for word, _ in albu_asth_sars_top20))

sars \\
sars-cov \\
csars \\
sars-covs \\
sarspp \\
sars-like \\
sars-cov-like \\
peramivir \\
vero-pipecuronium \\
sarsp \\
pancuronium-metocurine \\
sars-hcov \\
sarse \\
pcsars-cov \\
sars-3cl \\
p-sars \\
sars-3clpro \\
sars- \\
sars-coronavirus \\
pralidoxime \\


# Get Top 50 FDA Approved Drugs for Treatment Vectors
We now sort all of the FDA approved drug vectors by their cosine similarity to the analogy vectors.

In [34]:
def get_ranked_drugs_for_treat_vec(treat_vector, drug_vec_tups, wv):
    """Rank drugs by cosine similarity to a treatment analogy vector.

    Args:
        treat_vector: the analogy vector to compare every drug against.
        drug_vec_tups: sequence of ``(drug_name, drug_vector)`` tuples.
        wv: object providing ``cosine_similarities(vector, vectors)``, which
            is used to score all drug vectors in one call.

    Returns:
        A list of ``(drug_name, similarity)`` tuples sorted by similarity,
        highest first. Drugs with equal similarity keep their input order.
        An empty ``drug_vec_tups`` yields an empty list.
    """
    # nothing to rank: skip the similarity call entirely rather than handing
    # it an empty collection of vectors
    if not drug_vec_tups:
        return []
    # first get the similarities for all drugs in one call
    dvs = [dt[1] for dt in drug_vec_tups]
    drug_treat_sims = wv.cosine_similarities(treat_vector, dvs)
    # then pair each drug name with its similarity, position by position
    drug_sim_tups = [(dt[0], sim) for dt, sim in zip(drug_vec_tups, drug_treat_sims)]
    # and sort, descending
    return sorted(drug_sim_tups, key=lambda x: x[1], reverse=True)

In [38]:
# keep only the 50 best-ranked drugs for the metformin/diabetes analogy
metf_diab_drugs_top50 = get_ranked_drugs_for_treat_vec(
    treat_vector=metf_diab_sars_v,
    drug_vec_tups=drug_vec_tups,
    wv=wv,
)[:50]

In [39]:
# show each (drug name, similarity) pair on its own line
print(*metf_diab_drugs_top50, sep='\n')

('gilteritinib fumarate', 0.5596477)
('peramivir', 0.5569698)
('zanamivir', 0.547)
('erdafitinib', 0.5287651)
('atovaquone and proguanil hydrochloride', 0.52664137)
('rimantadine hydrochloride', 0.52499855)
('delavirdine mesylate', 0.52425665)
('atazanavir sulfate and ritonavir', 0.52155674)
('cobimetinib fumarate', 0.520225)
('niclosamide', 0.5195863)
('lopinavir and ritonavir', 0.5190796)
('temsirolimus', 0.5146165)
('rilpivirine hydrochloride', 0.5108443)
('alectinib hydrochloride', 0.5094977)
('lefamulin acetate', 0.5077357)
('perphenazine and amitriptyline hydrochloride', 0.50719637)
('alogliptin and metformin hydrochloride', 0.506837)
('tamiflu', 0.506541)
('selinexor', 0.50538564)
('amprenavir', 0.5043439)
('ibuprofen and diphenhydramine citrate', 0.5035539)
('olanzapine and fluoxetine hydrochloride', 0.50291455)
('probenecid and colchicine', 0.50227225)
('erlotinib hydrochloride', 0.5016404)
('bicalutamide', 0.50161445)
('alomide', 0.5014721)
('amantadine hydrochloride', 0.5013