# COVID-19 Drug Repurposing via gene-compounds relations
This example shows how to do drug repurposing with DRKG using just the pretrained model.

## Collecting COVID-19 related genes
First, we need to collect the list of genes associated with coronaviruses (CoV) in DRKG.

In [1]:
import pandas as pd
import numpy as np
# Load the two host-gene tables. In both files the gene identifier is read
# from the third column (index 2).
file = 'coronavirus-related-host-genes.tsv'
df = pd.read_csv(file, sep="\t")
cov_genes = np.unique(df.values[:, 2]).tolist()
file = 'covid19-host-genes.tsv'
df = pd.read_csv(file, sep="\t")
cov2_genes = np.unique(df.values[:, 2]).tolist()

# Keep each related gene once. sorted() is used instead of list(set(...))
# because the iteration order of a set of strings can differ between
# interpreter runs (hash randomization); a fixed order keeps every later
# per-gene result, including tie-breaking, reproducible.
cov_related_genes = sorted(set(cov_genes + cov2_genes))
# To restrict the analysis to the genes from covid19-host-genes.tsv only, use:
#   cov_related_genes = sorted(set(cov2_genes))
print(len(cov_related_genes))

442


## Candidate drugs
Now we use the FDA-approved drugs in DrugBank as candidate drugs (we exclude drugs with molecular weight < 250). The drug list is in `infer_drug.tsv`.

In [2]:
import csv

# Read the candidate drugs: the first tab-separated column of each row is the
# drug entity name, the second column ('ids') is not used here.
with open("./infer_drug.tsv", newline='', encoding='utf-8') as drug_tsv:
    drug_rows = csv.DictReader(drug_tsv, delimiter='\t', fieldnames=['drug','ids'])
    drug_list = [record['drug'] for record in drug_rows]

In [3]:
# Number of candidate drugs read from infer_drug.tsv
len(drug_list)

8104

## Inhibits relation

We use a single inhibition relation in this context.

In [4]:
# Relation name(s) used to score candidate drugs against the host genes.
# Other relations that could be added to this list:
#   'DRUGBANK::target::Compound:Gene', 'DGIDB::INHIBITOR::Gene:Compound'
treatment = ['GNBR::N::Compound:Gene']

## Get pretrained model
We can directly use the pretrained model to do drug repurposing.

In [5]:
import pandas as pd
import numpy as np
import sys
import csv
# Put ../utils on the module search path so its `utils` module can be imported
sys.path.insert(1, '../utils')
from utils import download_and_extract
# NOTE(review): presumably downloads and unpacks the DRKG data and pretrained
# embeddings read from ../data/drkg below — confirm in ../utils/utils.py
download_and_extract()

In [6]:
# Tab-separated files that are read below as (name, id) rows: one for
# entities and one for relations.
entity_idmap_file = '../data/drkg/embed/entities.tsv'
relation_idmap_file = '../data/drkg/embed/relations.tsv'

## Get embeddings for genes and drugs

In [7]:
# Build the lookup tables between names and integer ids:
#   entity_map:    entity name   -> entity id
#   entity_id_map: entity id     -> entity name
#   relation_map:  relation name -> relation id
entity_map = {}
entity_id_map = {}
with open(entity_idmap_file, newline='', encoding='utf-8') as entity_tsv:
    for record in csv.DictReader(entity_tsv, delimiter='\t', fieldnames=['name','id']):
        numeric_id = int(record['id'])
        entity_map[record['name']] = numeric_id
        entity_id_map[numeric_id] = record['name']

with open(relation_idmap_file, newline='', encoding='utf-8') as relation_tsv:
    relation_rows = csv.DictReader(relation_tsv, delimiter='\t', fieldnames=['name','id'])
    relation_map = {record['name']: int(record['id']) for record in relation_rows}

# Translate the candidate drugs, the host genes and the selected relations
# into their integer ids (a name missing from the maps raises KeyError).
drug_ids = [entity_map[drug_name] for drug_name in drug_list]
gene_ids = [entity_map[gene_name] for gene_name in cov_related_genes]
treatment_rid = [relation_map[relation_name] for relation_name in treatment]

In [8]:
# Load embeddings
import torch as th
# Pretrained embedding tables, indexed by entity id / relation id
rel_emb = np.load('../data/drkg/embed/DRKG_TransE_l2_relation.npy')
entity_emb = np.load('../data/drkg/embed/DRKG_TransE_l2_entity.npy')

# Turn the id lists into tensors (64-bit integers for the entity ids)
treatment_rid = th.tensor(treatment_rid)
gene_ids = th.tensor(gene_ids, dtype=th.long)
drug_ids = th.tensor(drug_ids, dtype=th.long)

# One embedding tensor per selected relation, and the stacked embeddings of
# all candidate drugs
treatment_embs = [th.tensor(rel_emb[relation_id]) for relation_id in treatment_rid]
drug_emb = th.tensor(entity_emb[drug_ids])

## Drug Repurposing Based on Edge Score
We use the following algorithm to calculate the edge score. Note that we apply logsigmoid here so that all scores are < 0. The larger the score, the more strongly $h$ is predicted to have relation $r$ with $t$.

$\mathbf{d} = \gamma - ||\mathbf{h}+\mathbf{r}-\mathbf{t}||_{2}$

$\mathbf{score} = \log\left(\frac{1}{1+\exp(\mathbf{-d})}\right)$

When doing drug repurposing, we only use the treatment related relations.

In [9]:
import torch.nn.functional as fn

# Offset gamma in d = gamma - ||h + r - t||_2
gamma = 12.0


def transE_l2(head, rel, tail):
    """Return gamma minus the L2 norm of (head + rel - tail).

    The norm is taken over the last dimension, so the remaining (broadcast)
    dimensions of the inputs are kept in the result.
    """
    translation_gap = head + rel - tail
    return gamma - translation_gap.norm(p=2, dim=-1)

# Score every candidate drug against every host gene, once per selected
# relation. Each appended entry is the score vector over all drugs for one
# (relation, gene) pair, with the matching drug ids stored alongside it.
scores_per_gene = []
dids_per_gene = []
for relation_name, treatment_emb in zip(treatment, treatment_embs):
    # This relation is scored with the gene as head and the drugs as tail;
    # every other relation is scored with the drugs as head.
    gene_is_head = relation_name == 'DGIDB::INHIBITOR::Gene:Compound'
    for gene_id in gene_ids:
        gene_emb = th.tensor(entity_emb[gene_id])
        if gene_is_head:
            distance = transE_l2(gene_emb, treatment_emb, drug_emb)
        else:
            distance = transE_l2(drug_emb, treatment_emb, gene_emb)
        scores_per_gene.append(fn.logsigmoid(distance))
        dids_per_gene.append(drug_ids)

# Flat views over all (relation, gene) pairs
scores = th.cat(scores_per_gene)
dids = th.cat(dids_per_gene)


### Check clinical trial drugs per gene
Here we load the clinical trial drugs

In [10]:
# Map drug id -> drug name for the drugs listed in the clinical-trial TSV
# (columns are read as: id, drug_name, drug_id).
clinical_drugs_file = './COVID19_clinical_trial_drugs.tsv'
with open(clinical_drugs_file, newline='', encoding='utf-8') as trials_tsv:
    trial_rows = csv.DictReader(trials_tsv, delimiter='\t', fieldnames=['id', 'drug_name','drug_id'])
    clinical_drug_map = {entry['drug_id']: entry['drug_name'] for entry in trial_rows}

Next we measure some statistics per gene.

In [13]:
# Per-gene statistics over the clinical-trial drugs.
# scores_per_gene / dids_per_gene hold one entry per (relation, gene) pair.
# For each entry the candidate drugs are ranked by score, the best `topk` are
# kept, and we record which of them appear in clinical_drug_map.
topk = 100
maxhit = 0
# Initialised up front so the summary print below cannot raise NameError when
# no clinical-trial drug is found in any ranking.
maxgene = None
max_dugs = ""
drugs_in_top_k = {}    # drug id -> number of rankings whose top-k contains it
drugsfr_in_top_k = {}  # drug id -> sum of 1/(rank+1) over those rankings
num_genes = len(cov_related_genes)
for i, (score, did) in enumerate(zip(scores_per_gene, dids_per_gene)):
    # Order the drug ids from highest to lowest score
    idx = th.flip(th.argsort(score), dims=[0])
    did = did[idx].numpy()
    # np.unique reports the first position of each id; sorting those positions
    # restores the score order, so each drug is kept once at its best rank.
    _, unique_indices = np.unique(did, return_index=True)
    topk_indices = np.sort(unique_indices)[:topk]
    proposed_did = did[topk_indices]
    found_in_top_k = 0
    found_lines = []
    # Iterate over the drugs actually available: there can be fewer than topk.
    for j, entity_id in enumerate(proposed_did):
        # Characters 10..16 of the entity name are used as the drug id
        # (assumes names of the form 'Compound::DBxxxxx' — TODO confirm).
        drug = entity_id_map[int(entity_id)][10:17]
        drug_name = clinical_drug_map.get(drug)
        if drug_name is None:
            continue
        found_in_top_k += 1
        drugs_in_top_k[drug] = drugs_in_top_k.get(drug, 0) + 1
        drugsfr_in_top_k[drug] = drugsfr_in_top_k.get(drug, 0) + 1 / (j + 1)
        found_lines.append("[{}]{}\n".format(j, drug_name))
    found_drugs = "\n" + "".join(found_lines)
    if maxhit < found_in_top_k:
        maxhit = found_in_top_k
        # The score lists are built relation by relation with the genes in the
        # inner loop, so the gene index wraps around for each relation.
        maxgene = cov_related_genes[i % num_genes]
        max_dugs = found_drugs
print("{}\t{}\t{}".format(maxgene, maxhit, max_dugs))

# Summary rows: drug id, drug name, hit count, summed reciprocal rank.
res = [[drug, clinical_drug_map[drug], drugs_in_top_k[drug], drugsfr_in_top_k[drug]] for drug in drugs_in_top_k]
# reversed(sorted(...)) is kept on purpose (instead of sorted(reverse=True)):
# the two order rows with equal hit counts differently.
res = reversed(sorted(res, key=lambda x: x[2]))
for row in res:
    print("{}\t{}\t{}\t{}".format(row[0], row[1], row[2], row[3]))
    

Gene::6441	9	
[0]Dexamethasone
[29]Methylprednisolone
[30]Ribavirin
[40]Thalidomide
[46]Chloroquine
[77]Losartan
[86]Sargramostim
[88]Azithromycin
[90]Hydroxychloroquine

DB01234	Dexamethasone	401	17.424322932617844
DB01041	Thalidomide	336	9.52602832899466
DB00608	Chloroquine	258	5.281556104219857
DB00746	Deferoxamine	111	2.3803197362314727
DB01394	Colchicine	108	1.9397152439066307
DB00959	Methylprednisolone	105	1.6800667504790185
DB00678	Losartan	92	1.9905909204249115
DB00811	Ribavirin	92	2.0302922908647756
DB08877	Ruxolitinib	47	0.7744534092963637
DB08895	Tofacitinib	33	0.46233716095307054
DB01611	Hydroxychloroquine	14	0.20167135495496702
DB05511	Piclidenoson	6	0.1513038675225646
DB00207	Azithromycin	5	0.05829492617697397
DB00198	Oseltamivir	1	0.2
DB00020	Sargramostim	1	0.011494252873563218
