In [2]:
import itertools
import math
import numpy as np
import pandas as pd
import os

In [3]:
drug_target = pd.read_csv(os.path.join(temp, drug_target))
target_seq = pd.read_csv(os.path.join(temp, drug_seq))
drug_goa = pd.read_csv(os.path.join(temp, drug_goa))
drug_smiles = pd.read_csv(os.path.join(temp, drug_smiles))
drug_se = pd.read_csv(os.path.join(temp, drug_se))

#drug_target = pd.read_csv('data/input/drugbank-drug-target.csv')
#target_seq = pd.read_csv('data/input/drugbank-target-seq.csv')
#drug_goa = pd.read_csv('data/input/drugbank-drug-goa.csv')
#drug_smiles = pd.read_csv('data/input/drugbank-drug-smiles.csv')
#drug_se = pd.read_csv('data/input/drugbank-sider-se.csv')
drug_smiles.head()


Unnamed: 0,drugid,smiles
0,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
1,DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...
2,DB00091,CCC1NC(=O)C(C(O)C(C)CC=CC)N(C)C(=O)C(C(C)C)N(C...
3,DB00104,C[C@@H](O)[C@@H](CO)NC(=O)[C@@H]1CSSC[C@H](NC(...
4,DB00114,CC1=NC=C(COP(O)(O)=O)C(C=O)=C1O


## Get the drug data

In [4]:
print ("%d drugs have all Target feature "%len(  drug_target.drugid.unique()))
print ("%d drugs have Target GOA feature "%len( drug_goa.drugid.unique()))
print ("%d drugs have Fingerprint feature "%len(  drug_smiles.drugid.unique()))
print ("%d drugs have Sideeffect feature "%len( drug_se.drugid.unique()))

579 drugs have all Target feature 
579 drugs have all Target GOA feature 
583 drugs have all Fingerprint feature 
524 drugs have all Sideeffect feature 


In [5]:
drug_target_seq = drug_target.merge(target_seq, on= ['geneid'])
print ("%d drugs have Target SEQ feature "%len( drug_target_seq.drugid.unique()))

579 drugs have all Target SEQ feature 


In [6]:
a=drug_goa['drugid'].unique()
b=drug_target['drugid'].unique()
c=drug_smiles['drugid'].unique()
d=drug_se['drugid'].unique()
commonDrugs= set(a).intersection(b).intersection(c).intersection(d)
print (len(a),len(b),len(c),len(d))
print (len(commonDrugs))

579 579 583 524
510


In [7]:
drug_se.head()

Unnamed: 0,drugid,umlsid
0,DB00583,C0000729
1,DB00583,C0000737
2,DB00583,C0002418
3,DB00583,C0002871
4,DB00583,C0003123


## Drug side effect similarity¶
## calculating Jaccard coefficient based on drug sideefects

In [None]:
os.chdir(abs_path)

In [8]:
drugSEDict = {k: g["umlsid"].tolist() for k,g in drug_se.groupby("drugid")}
scores = []

for comb in itertools.combinations(commonDrugs,2):
    drug1 =comb[0]
    drug2 =comb[1]

    sideeffects1 = drugSEDict[drug1]
    sideeffects2 = drugSEDict[drug2]
    c = set(sideeffects1).intersection(sideeffects2)
    u = set(sideeffects1).union(sideeffects2)
    score = len(c)/float(len(u))
    scores.append([drug1, drug2, score])


In [9]:
drug_se_df = pd.DataFrame(scores, columns =['Drug1','Drug2','SE-SIM'])


In [10]:
drug_se_df.head()

Unnamed: 0,Drug1,Drug2,SE-SIM
0,DB00882,DB00366,0.085502
1,DB00882,DB01029,0.131285
2,DB00882,DB01217,0.145215
3,DB00882,DB00304,0.07489
4,DB00882,DB01400,0.076


In [11]:
drug_se_df.to_csv(os.path.join(abs_path,'data/features/drugs-se-sim.csv'), index=False)

## PPI based drug-drug similarity
###  calculate distance between drugs on protein-protein interaction network

In [47]:
#!pip install -q networkx==1.11

In [30]:
# calcuate pairwise distance between proteins in the human PPI network
import networkx as nx
G= nx.Graph()
with open('data/input/human-interactome.csv') as ppiFile: # human PPI network
    next(ppiFile) # skip first line
    drugs=set()
    for line in ppiFile:
        line=line.replace("'","").strip().split(',')
        G.add_edge(line[0],line[1])


In [31]:
ppi=nx.shortest_path_length(G)

In [32]:

def grapDistance(ppi, target1, target2):
    """
    return the shortest path between two proteins in the PPI network
    ppi : dictonary that contains distance of PPI 
    target1 : first protein name
    target2 : second protein name
    """
    maxValue = 9999
    if target1 not in ppi:
        return maxValue
    else:
        if target2 not in ppi[target1]:
            return maxValue
        else:
            return ppi[target1][target2]

drug_targetlist = {k: g["geneid"].tolist() for k,g in drug_target.groupby("drugid")}
values = []

# calculate PPI-based pairwise drug similarity (Closeness)
# First distances between proteins were transformed to similarity values using the formula described in Perlman et al (2011)
# A, b were chosen according to Perlman et al (2011) to be 0.9 × e and 1, respectively.
# Self similarity was assigned a value of 1.

# For drugs similarities, maximal values between the two lists of associated genes were averaged 
# (taking into account both sides for symmetry).

A = 0.9
b = 1
for comb in itertools.combinations(commonDrugs,2) :
    drug1 = comb[0]
    drug2 = comb[1]
    if not(drug1 in drug_targetlist and drug2 in drug_targetlist) : continue
    targetList1 = drug_targetlist[drug1]
    targetList2 = drug_targetlist[drug2]
    allscores =[]
    for target1 in sorted(targetList1):
        genescores = []
        for target2 in sorted(targetList2):
            target1 =str(target1)
            target2 =str(target2)    
            if target1 == target2:
                score=1.0
            else:
                score = A*math.exp(-b* grapDistance(ppi, target1, target2))
            genescores.append(score)
    # add maximal values between the two lists of associated genes 
    allscores.append(max(genescores))
    if len(allscores) ==0: continue
    #average the maximal scores 
    maxScore =np.mean(allscores)
    if maxScore >= 0:
        values.append([drug1, drug2, maxScore])

In [33]:
drug_ppi_df = pd.DataFrame(values, columns =['Drug1','Drug2','PPI-SIM'])

In [34]:
drug_ppi_df.head()


Unnamed: 0,Drug1,Drug2,PPI-SIM
0,DB00882,DB00366,0.044808
1,DB00882,DB01029,0.121802
2,DB00882,DB01217,0.121802
3,DB00882,DB00304,0.331091
4,DB00882,DB01400,0.044808


In [35]:
drug_ppi_df.to_csv(os.path.join(abs_path,'data/features/drugs-ppi-sim.csv'), index=False)

## Drug fingerprint similarity
### calculating MACS based fingerprint (substructure) similarity

In [None]:
# install following packages oddt and openbabel using conda
#!conda install -c oddt oddt
#!conda install -c openbabel openbabel

In [133]:
from oddt import toolkit
from oddt import fingerprints

import pandas as pd
import numpy as np
import itertools

In [130]:
drug_smiles = drug_smiles[drug_smiles.drugid.isin(commonDrugs)]
drug_smiles.head()

Unnamed: 0,drugid,smiles
0,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
1,DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...
2,DB00091,CCC1NC(=O)C(C(O)C(C)CC=CC)N(C)C(=O)C(C(C)C)N(C...
3,DB00104,C[C@@H](O)[C@@H](CO)NC(=O)[C@@H]1CSSC[C@H](NC(...
6,DB00122,C[N+](C)(C)CCO


In [131]:
drug_smiles.head()

Unnamed: 0,drugid,smiles
0,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
1,DB00035,NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...
2,DB00091,CCC1NC(=O)C(C(O)C(C)CC=CC)N(C)C(=O)C(C(C)C)N(C...
3,DB00104,C[C@@H](O)[C@@H](CO)NC(=O)[C@@H]1CSSC[C@H](NC(...
6,DB00122,C[N+](C)(C)CCO


In [134]:
#Create a dictionary of chemicals to be compared:
input_dict = dict()
for index,line in drug_smiles.iterrows():
    id = line['drugid']
    
    smiles = line['smiles']
    mol = toolkit.readstring(format='smiles',string=smiles)
    fp =mol.calcfp(fptype='MACCS').raw
    input_dict[id] = fp

In [135]:
def tanimoto_score(fp1, fp2):
    return np.sum(fp1 &  fp2) / np.sum(fp1 | fp2)

In [136]:
sim_values=[]
for chemical1, chemical2 in itertools.combinations(input_dict.keys(),2):
    TC= tanimoto_score(input_dict[chemical1], input_dict[chemical2])
    if chemical1 != chemical2:
        sim_values.append([chemical1, chemical2, TC])

In [137]:
chem_sim_df = pd.DataFrame(sim_values, columns=['Drug1','Drug2','TC'])
chem_sim_df.head()

Unnamed: 0,Drug1,Drug2,TC
0,DB00014,DB00035,0.732558
1,DB00014,DB00091,0.574713
2,DB00014,DB00104,0.655172
3,DB00014,DB00122,0.313953
4,DB00014,DB00125,0.361446


In [138]:
chem_sim_df.to_csv(os.path.join(abs_path,'data/features/drugs-fingerprint-sim.csv'), index=False)

## Drug target sequence similarity
### Calculation of SmithWaterman sequence alignment scores

In [112]:
def fasta2seq(lines):
    lines = lines[lines.index('\n')+1:]
    lines =lines.replace('\n','')
    return lines

target_seq.seq =target_seq.seq.map(fasta2seq)
target_seq = target_seq[target_seq.geneid.isin(drug_target.geneid)]


In [113]:
target_seq.head()

Unnamed: 0,geneid,seq
0,1128,MNTSAPPAVSPNITVLAPGKGPWQVAFIGITTGLLSLATVTGNLLV...
1,1129,MNNSTNSSNNSLALTSPYKTFEVVFIVLVAGSLSLVTIIGNILVMV...
2,148,MVFLSGNASDSSNCTQPPAPVNISKAILLGVILGGLILFGVLGNIL...
3,147,MNPDLDTGHNTSAPAHWGELKNANFTGPNQTSSNSTLPQLDITRAI...
4,146,MTFRDLLSVSFEGPRPDSSAGGSSAGGGGGSAGGAAPSEGPAVGGV...


In [114]:
target_seq_file="data/intermediate/drugbank-target-seq-trimmed.tab"
target_seq.to_csv(target_seq_file,'\t',index=False,header=None)

In [None]:
target_seq_file="data/intermediate/drugbank-target-seq-trimmed.tab"
target_seq_sim_file="data/intermediate/target-target-seq-sim-biojava.tab"

!java -cp .:lib/smithwaterman.jar:lib/biojava-alignment-4.0.0.jar:lib/biojava-core-4.0.0.jar:lib/slf4j-api-1.7.10.jar biojava.targetseq.CalcLocalAlign {target_seq_file} > {target_seq_sim_file}


SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


In [123]:
targetSeqSim=dict()
target_seq_sim_file="data/intermediate/target-target-seq-sim-biojava.tab"
with open(target_seq_sim_file) as tarSimfile:
    for row in tarSimfile:
        row = row.strip().split("\t")
        t1 =row[0]
        t2 = row[1]
        sim = float(row[2])
        targetSeqSim[(t1,t2)]=sim
        targetSeqSim[(t2,t1)]=sim 


In [124]:

drug_targetlist = {k: g["geneid"].tolist() for k,g in drug_target_seq.groupby("drugid")}
values = []

for comb in itertools.combinations(commonDrugs,2) :
    drug1 = comb[0]
    drug2 = comb[1]
    if not(drug1 in drug_targetlist and drug2 in drug_targetlist) : continue
    targetList1 = drug_targetlist[drug1]
    targetList2 = drug_targetlist[drug2]
    allscores =[]
    for target1 in sorted(targetList1):
        genescores = []
        for target2 in sorted(targetList2):
            target1 =str(target1)
            target2 =str(target2)    
            if target1 == target2:
                score=1.0
            else:
                score = targetSeqSim[(target1,target2)] / (math.sqrt(targetSeqSim[(target1,target1)]) * math.sqrt(targetSeqSim[(target2,target2)]))
            genescores.append(score)
    # add maximal values between the two lists of associated genes 
    allscores.append(max(genescores))
    if len(allscores) ==0: continue
    #average the maximal scores 
    maxScore =np.mean(allscores)
    values.append([drug1, drug2, maxScore])

In [126]:
drug_seq_df = pd.DataFrame(values, columns =['Drug1','Drug2','TARGETSEQ-SIM'])


In [127]:
drug_seq_df.head()

Unnamed: 0,Drug1,Drug2,TARGETSEQ-SIM
0,DB00882,DB00366,0.033212
1,DB00882,DB01029,0.045411
2,DB00882,DB01217,0.043559
3,DB00882,DB00304,0.439197
4,DB00882,DB01400,0.032739


In [151]:
drug_seq_df.to_csv('data/features/drugs-target-seq-sim.csv', index=False)

## GO based drug-drug similarity

In [45]:
drug_goa.drugid = drug_goa.drugid.map(lambda d: 'http://purl.obolibrary.org/obo/'+d)
drug_goa.to_csv('data/intermediate/drug_goa.txt',sep='\t', header=False, index=False)

In [37]:
#cleaning GO annotations
rows = []
for comb in itertools.combinations(commonDrugs,2):
    t1=comb[0]
    t2=comb[1]
    rows.append(['http://purl.obolibrary.org/obo/'+str(t1),'http://purl.obolibrary.org/obo/'+str(t2)])

In [46]:
drug_query_df = pd.DataFrame(rows, columns =['Drug1','Drug2'])
drug_query_df.to_csv('data/intermediate/drug.gene.go.query',sep='\t', header=False, index=False)

In [47]:
### run the semantic relatedness library with given query and anotation file it will produce a file named: gene.go.sim.out
!java -jar lib/sml-toolkit-0.9.jar -t sm -xmlconf data/conf/sml.gene.go.conf

----------------------------------------------------------------------
	Semantic Measures Library Toolkit 0.9
----------------------------------------------------------------------
Toolkit dedicated to Semantic Measures computation and studies.
Website: http://www.semantic-measures-library.org
Developer: Sébastien Harispe

Please cite: 
The Semantic Measures Library and Toolkit: 
fast computation of semantic similarity and relatedness 
using biomedical ontologies.
Sébastien Harispe*, Sylvie Ranwez, Stefan Janaqi, Jacky Montmain
Bioinformatics 2013; doi: 10.1093/bioinformatics/btt581
----------------------------------------------------------------------
Args [-t, sm]
Args Tool    [-xmlconf, data/conf/sml.gene.go.conf]
Loading SM Tool
----------------------------------------------------------------------
	Semantic Measures Computer 0.9
----------------------------------------------------------------------
Tool used to compute Semantic similarity/relatedness considering an
XML configurati

In [108]:
go_sim_df = pd.read_csv('data/intermediate/drug.gene.go.sim.out',sep='\t')
go_sim_df.head()

Unnamed: 0,e1,e2,bma
0,http://purl.obolibrary.org/obo/DB00882,http://purl.obolibrary.org/obo/DB00366,0.628361
1,http://purl.obolibrary.org/obo/DB00882,http://purl.obolibrary.org/obo/DB01029,0.747916
2,http://purl.obolibrary.org/obo/DB00882,http://purl.obolibrary.org/obo/DB01217,0.548093
3,http://purl.obolibrary.org/obo/DB00882,http://purl.obolibrary.org/obo/DB00304,0.965051
4,http://purl.obolibrary.org/obo/DB00882,http://purl.obolibrary.org/obo/DB01400,0.583699


In [109]:
go_sim_df.rename(columns={'e1':'Drug1','e2':'Drug2','bma':'GO-SIM'}, inplace=True)

In [110]:
go_sim_df.Drug1 = go_sim_df.Drug1.str.replace('http://purl.obolibrary.org/obo/','')
go_sim_df.Drug2 = go_sim_df.Drug2.str.replace('http://purl.obolibrary.org/obo/','')
go_sim_df.head()

Unnamed: 0,Drug1,Drug2,GO-SIM
0,DB00882,DB00366,0.628361
1,DB00882,DB01029,0.747916
2,DB00882,DB01217,0.548093
3,DB00882,DB00304,0.965051
4,DB00882,DB01400,0.583699


In [111]:
go_sim_df.to_csv('data/features/drugs-target-go-sim.csv')

## Disease Phenotype Similarity
### MESH term based Similarity

In [139]:
predict_df = pd.read_csv('data/input/openpredict-omim-drug.csv')
predict_df.head()

Unnamed: 0,drugid,omimid
0,DB01148,231200
1,DB01148,155100
2,DB01148,273800
3,DB00575,607554
4,DB00575,171300


In [140]:
predict_df.rename(columns={'drugid':'Drug','omimid':'Disease'}, inplace=True)

In [141]:
gold_diseases = set( predict_df.Disease.unique())
print ('Gold std. diseases',len(gold_diseases))

Gold std. diseases 313


In [142]:
mesh_ann = {}
allmeshterm = []
with open('data/input/omim-disease-mesh.csv') as meshfile:
    next(meshfile)
    for line in meshfile:
        line = line.strip().split(',')
        if len(line) != 2: continue
        di = line[0]
        mesh = line[1].split(',')
        mesh_ann[di]=mesh
        allmeshterm.extend(mesh)

In [143]:
vocabulary = list(set(allmeshterm))
len(vocabulary)

6166

In [144]:
# create a co-occurrence matrix
co_mat = np.zeros((len(mesh_ann),len(vocabulary)))

In [145]:
commonDiseases = mesh_ann.keys()
mesh2id= { di:i for i,di in enumerate(mesh_ann.keys())}
# fill in the co-occurrence matrix
for key in mesh_ann:
    annotations = mesh_ann[key]
    col_index = [vocabulary.index(a) for a in annotations]
    co_mat[mesh2id[key],col_index] =1

In [146]:
def cosine_similarity(a,b):
    return  np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))

In [147]:
values = []
# calculate cosine similarity between diseases using mesh annotation vector
for comb in itertools.combinations(commonDiseases,2) :
    disease1 = comb[0]
    disease2 = comb[1]
    sim = cosine_similarity(co_mat[mesh2id[disease1],:], co_mat[mesh2id[disease2],:])
    values.append([disease1, disease2, sim])

In [148]:
disease_pheno_df = pd.DataFrame(values, columns =['Disease1','Disease2','PHENO-SIM'])


In [149]:
disease_pheno_df.head()

Unnamed: 0,Disease1,Disease2,PHENO-SIM
0,231200,155100,0.0
1,231200,273800,0.0
2,231200,607554,0.0
3,231200,171300,0.0
4,231200,102300,0.0


In [150]:
disease_pheno_df.to_csv('data/features/diseases-pheno-sim.csv',index=False)

## HPO based disease-disease similarity

In [69]:
# reading the hpo annotation file taken from compbio.charite.de/jennkins/jobs/hpo.annotations/
disease_hpo = pd.read_csv('data/input/omim-disease-hpo.csv')
disease_hpo.head()


Unnamed: 0,diseaseid,hpoid
0,231200,hpo:0000007
1,231200,hpo:0002239
2,231200,hpo:0000132
3,231200,hpo:0003010
4,231200,hpo:0001873


In [70]:
disease_hpo.rename(columns={'diseaseid':'Disease','hpoid':'HPO'}, inplace=True)
disease_hpo.HPO= disease_hpo.HPO.str.replace('hpo','hp')

In [71]:
disease_hpo.head()

Unnamed: 0,Disease,HPO
0,231200,hp:0000007
1,231200,hp:0002239
2,231200,hp:0000132
3,231200,hp:0003010
4,231200,hp:0001873


In [72]:
diseasesWithFeatures= set(disease_hpo.Disease.unique()).intersection( gold_diseases )
print (len(diseasesWithFeatures))
rows = []
for comb in itertools.combinations(diseasesWithFeatures,2):
    t1=comb[0]
    t2=comb[1]
    rows.append(['omim:'+str(t1),'omim:'+str(t2)])

310


In [73]:
disease_hpo["Disease"]=disease_hpo["Disease"].map(lambda d: 'omim:'+str(d))
disease_hpo.to_csv('data/intermediate/disease_hpo.txt',sep='\t', header=False, index=False)

In [74]:
disease_query_df = pd.DataFrame(rows, columns =['Disease1','Disease2'])
disease_query_df.to_csv('data/intermediate/hpo.sml.omim.query',sep='\t', header=False, index=False)

In [77]:
### run the semantic relatedness library with given query and anotation file it will produce a file named: hpo.sim.out
!java -jar lib/sml-toolkit-0.9.jar -t sm -xmlconf data/conf/sml.omim.hpo.conf

----------------------------------------------------------------------
	Semantic Measures Library Toolkit 0.9
----------------------------------------------------------------------
Toolkit dedicated to Semantic Measures computation and studies.
Website: http://www.semantic-measures-library.org
Developer: Sébastien Harispe

Please cite: 
The Semantic Measures Library and Toolkit: 
fast computation of semantic similarity and relatedness 
using biomedical ontologies.
Sébastien Harispe*, Sylvie Ranwez, Stefan Janaqi, Jacky Montmain
Bioinformatics 2013; doi: 10.1093/bioinformatics/btt581
----------------------------------------------------------------------
Args [-t, sm]
Args Tool    [-xmlconf, data/conf/sml.omim.hpo.conf]
Loading SM Tool
----------------------------------------------------------------------
	Semantic Measures Computer 0.9
----------------------------------------------------------------------
Tool used to compute Semantic similarity/relatedness considering an
XML configurat

In [101]:
hpo_sim_df = pd.read_csv('data/intermediate/omim.hpo.sim.out',sep='\t')

In [102]:
hpo_sim_df.head()

Unnamed: 0,e1,e2,bma
0,omim:157950,omim:606798,0.373642
1,omim:157950,omim:115300,0.344222
2,omim:157950,omim:270960,0.355539
3,omim:157950,omim:606842,0.322567
4,omim:157950,omim:246400,0.343388


In [104]:
hpo_sim_df.rename(columns={'e1':'Disease1','e2':'Disease2','bma':'HPO-SIM'}, inplace=True)

In [106]:
hpo_sim_df.Disease1 = hpo_sim_df.Disease1.str.replace('omim:','')
hpo_sim_df.Disease2 = hpo_sim_df.Disease2.str.replace('omim:','')
hpo_sim_df.head()

Unnamed: 0,Disease1,Disease2,HPO-SIM
0,157950,606798,0.373642
1,157950,115300,0.344222
2,157950,270960,0.355539
3,157950,606842,0.322567
4,157950,246400,0.343388


In [107]:
hpo_sim_df.to_csv('data/features/diseases-hpo-sim.csv', index=False)

In [2]:
import shutil
import os
shutil.copytree('data/features/', os.path.join(temp,'features'))

'/tmp/m02_lvtj/features'