In [1]:
"""
RDF generator for the PREDICT drug indication gold standard (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3159979/bin/msb201126-s4.xls)
@version 1.0
@author Remzi Celebi
"""


import pandas as pd
from csv import reader
from src.util import utils
from src.util.utils import Dataset, DataResource
from rdflib import Graph, URIRef, Literal, RDF, ConjunctiveGraph
from rdflib import Namespace
import datetime

In [2]:
# Load the OMIM disease -> UMLS concept mapping (supplementary table S4 of the PREDICT paper).
mapping_df = pd.read_excel(
    io='https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3159979/bin/msb201126-s4.xls'
)
mapping_df.head(5)

Unnamed: 0,OMIM ID,OMIM disease name,UMLS concept ID,UMLS concept name
0,102100,"Acromegaloid Changes, Cutis Verticis Gyrata, A...",C1868756,Corneal leukoma
1,102100,"Acromegaloid Changes, Cutis Verticis Gyrata, A...",C0263417,Cutis verticis gyrata
2,102300,"Restless Legs Syndrome, Susceptibility To, 1; ...",C0035258,Restless Legs Syndrome
3,102300,"Restless Legs Syndrome, Susceptibility To, 1; ...",C1876177,RLS1 (Ekbom Syndrome)
4,102400,Acroosteolysis,C0917990,Acroosteolysis (Acro-Osteolysis)


In [3]:
# Keep an untouched local CSV copy of the downloaded mapping table
# (written before any of the name corrections applied further down).
mapping_df.to_csv(
    path_or_buf='data/external/msb201126-s4.csv',
    index=False,
)

In [4]:
# One OMIM disease name appears cut off after "With Cough And" in the mapping sheet;
# replace it with the full title (presumably so it lines up with the spelling used
# in the gold-standard sheet when the two tables are joined by name -- confirm).
#
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `mapping_df[...]`: an in-place call on a column
# selection is a chained assignment, which warns on pandas 2.x and leaves the
# DataFrame unchanged once Copy-on-Write is active.
mapping_df['OMIM disease name'] = mapping_df['OMIM disease name'].replace(
    {'Neuropathy, Hereditary Sensory And Autonomic, Type I, With Cough And':
     'Neuropathy, Hereditary Sensory And Autonomic, Type I, With Cough And Gastroesophageal Reflux'}
)

In [5]:
# Load the drug-indication gold standard (supplementary table S1 of the PREDICT paper).
goldstd_df = pd.read_excel(
    io='https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3159979/bin/msb201126-s1.xls'
)
goldstd_df.head(5)

Unnamed: 0,Drug name,Disease name
0,Acamprosate,Alcohol Dependence
1,Acarbose,"Diabetes Mellitus, Noninsulin-Dependent; Niddm"
2,Acarbose,"Maturity-Onset Diabetes Of The Young, Type 1; ..."
3,Acarbose,"Maturity-Onset Diabetes Of The Young, Type 2; ..."
4,Acarbose,"Maturity-Onset Diabetes Of The Young, Type 3; ..."


In [6]:
# Rewrite selected drug names in the gold standard (salt forms, brand names, a
# misspelling and capitalisation variants). The names are later joined by exact
# string match against the DrugBank synonym list, so the targets are presumably
# the spellings found there -- confirm against the synonym export.
#
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `goldstd_df[...]`: an in-place call on a column
# selection is a chained assignment, which warns on pandas 2.x and leaves the
# DataFrame unchanged once Copy-on-Write is active.
goldstd_df['Drug name'] = goldstd_df['Drug name'].replace({
    'Divalproex Sodium': 'Valproic Acid',
    'Bismuth': 'Bismuth subsalicylate',
    'Clobetasol': 'Clobetasol propionate',
    'Guanadrel Sulfate': 'Guanadrel',
    'Marinol': 'Dronabinol',
    'Medroxyprogesterone': 'Medroxyprogesterone acetate',
    'Megestrol': 'Megestrol acetate',
    'Propoxyphene': 'Dextropropoxyphene',
    'Salicyclic Acid': 'Salicylic acid',
    'Ipratropium': 'Ipratropiumbromid',
    'Adenosine Monophosphate': 'Adenosine monophosphate',
    'Arsenic Trioxide': 'Arsenic trioxide',
    'Ethacrynic Acid': 'Ethacrynic acid',
    'Fondaparinux Sodium': 'Fondaparinux sodium',
    'Meclofenamic Acid': 'Meclofenamic acid',
    'Methyl Aminolevulinate': 'Methyl aminolevulinate',
})

In [7]:
# Inner-join the gold standard with the OMIM/UMLS mapping on the disease name,
# which attaches an OMIM ID to every indication whose name has an exact match.
merged_df = pd.merge(
    goldstd_df,
    mapping_df,
    left_on='Disease name',
    right_on='OMIM disease name',
)


In [8]:
# Preview the first rows of the joined table.
merged_df.head(5)

Unnamed: 0,Drug name,Disease name,OMIM ID,OMIM disease name,UMLS concept ID,UMLS concept name
0,Acamprosate,Alcohol Dependence,103780,Alcohol Dependence,C0001973,"Alcohol Dependence (Alcoholic Intoxication, Ch..."
1,Chlordiazepoxide,Alcohol Dependence,103780,Alcohol Dependence,C0001973,"Alcohol Dependence (Alcoholic Intoxication, Ch..."
2,Citalopram,Alcohol Dependence,103780,Alcohol Dependence,C0001973,"Alcohol Dependence (Alcoholic Intoxication, Ch..."
3,Disulfiram,Alcohol Dependence,103780,Alcohol Dependence,C0001973,"Alcohol Dependence (Alcoholic Intoxication, Ch..."
4,Naltrexone,Alcohol Dependence,103780,Alcohol Dependence,C0001973,"Alcohol Dependence (Alcoholic Intoxication, Ch..."


In [9]:
# SPARQL endpoint that is queried for DrugBank drug synonyms.
sparql_endpoint="https://graphdb.dumontierlab.com/repositories/openpredict"
# Shell escape: send the contents of data/sparql/drugbank-drug-synonym.rq URL-encoded as the
# `query` form field, ask for a CSV response, and redirect the response body to
# data/input/drugbank-drug-synonym.csv. `{sparql_endpoint}` is filled in from the Python variable.
# NOTE(review): curl is not given --fail, so an HTTP error body would be written to the CSV as well.
!curl -H "Accept: text/csv" --data-urlencode query@data/sparql/drugbank-drug-synonym.rq {sparql_endpoint} > data/input/drugbank-drug-synonym.csv


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  811k    0  810k  100   560  32354     21  0:00:26  0:00:25  0:00:01 96470


In [10]:
# Read the synonym table downloaded from the SPARQL endpoint.
drug_synonym_df = pd.read_csv(
    filepath_or_buffer='data/input/drugbank-drug-synonym.csv'
)
drug_synonym_df.head(5)

Unnamed: 0,drugid,name
0,DB01148,2-(1-Piperidinyl)ethyl 3-methyl-4-oxo-2-phenyl...
1,DB01148,2-Piperidinoethyl 3-methyl-4-oxo-2-phenyl-4H-1...
2,DB01148,2-Piperidinoethyl 3-methylflavone-8-carboxylate
3,DB01148,beta-Piperidinoethyl 3-methylflavone-8-carboxy...
4,DB01148,Flavoxate


In [11]:
# Attach DrugBank identifiers by matching each drug name against the synonym list
# (inner join, so drugs without an exact synonym match are dropped).
merged_df = pd.merge(merged_df, drug_synonym_df, left_on='Drug name', right_on='name')

# The gold standard is the set of distinct (DrugBank drug, OMIM disease) pairs.
gold_std_mapped_df = merged_df[['drugid', 'OMIM ID']].drop_duplicates()
print ('# of drug-disease associations', len(gold_std_mapped_df))

# Turn both identifiers into Bio2RDF URIs, name the disease column after the
# indication predicate, and index the frame by the drug URI.
gold_std_mapped_df = (
    gold_std_mapped_df
    .assign(**{
        'drugid': gold_std_mapped_df['drugid'].map(
            lambda drug_id: 'http://bio2rdf.org/drugbank:' + str(drug_id)),
        'OMIM ID': gold_std_mapped_df['OMIM ID'].map(
            lambda omim_id: 'http://bio2rdf.org/omim:' + str(omim_id)),
    })
    .rename(columns={'OMIM ID': 'http://bio2rdf.org/openpredict_vocabulary:indication'})
    .set_index('drugid', drop=True)
)



# of drug-disease associations 1933


In [12]:
# Named graph that will hold the indication triples.
graphURI = 'http://w3id.org/fairworkflows/dataset.openpredict.indications.R1'

# Marks the indication column as 'URI' for utils.to_rdf
# (presumably so its values are emitted as URI nodes rather than literals -- confirm in src.util.utils).
column_types = {
    'http://bio2rdf.org/openpredict_vocabulary:indication': 'URI',
}

g = ConjunctiveGraph(identifier=URIRef(graphURI))
g = utils.to_rdf(
    g,
    gold_std_mapped_df,
    column_types,
    'http://bio2rdf.org/drugbank:Drug',
)


In [13]:
# Write the indication graph to disk as N-Quads.
g.serialize(
    'data/rdf/predict_gold_standard_omim.nq',
    format='nquads',
)

In [14]:
def addMetaData(g, graphURI):
    """Describe the PREDICT gold-standard dataset and its distributions.

    Four resources are described in turn: the dataset itself, the two source
    spreadsheets (msb201126-s4.xls and msb201126-s1.xls) and the generated
    N-Quads file. Each resource is constructed on the graph returned by the
    previous resource's ``toRDF()``, starting from ``g``.

    Args:
        g: Graph handed to the ``Dataset`` constructor as the starting graph.
        graphURI: Used as the dataset URI and as the ``qname`` of every resource.

    Returns:
        The value of ``toRDF()`` on the last resource (the RDF distribution);
        presumably the graph carrying all four descriptions -- confirm against
        ``src.util.utils``.
    """
    # Format the timestamp once so the retrieval and creation dates agree.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # generate dataset
    data_source = Dataset(qname=graphURI, graph=g)
    data_source.setURI(graphURI)
    data_source.setTitle('Supplementary data used in the PREDICT')
    data_source.setDescription('Drug indications gold standard and mappings used in the study of "PREDICT: a method for inferring novel drug indications with application to personalized medicine" ')
    data_source.setPublisher('https://www.embopress.org/journal/17444292')
    data_source.setPublisherName('Molecular Systems Biology')
    data_source.addRight('use-share-modify')
    data_source.addTheme('http://www.wikidata.org/entity/Q56863002')
    data_source.setLicense('https://www.embopress.org/page/journal/17444292/about')
    data_source.setHomepage('https://dx.doi.org/10.1038%2Fmsb.2011.26')
    data_source.setVersion('1.0')

    # generate dataset distribution: OMIM/UMLS mapping spreadsheet
    data_dist1 = DataResource(qname=graphURI, graph=data_source.toRDF())
    # Was 'http:/w3id.org/...' (single slash), which is not in the same namespace as graphURI.
    data_dist1.setURI('http://w3id.org/fairworkflows/dataset.openpredict.mapping/version/1/source')
    data_dist1.setTitle('Mapping between OMIM diseases and UMLS concepts used in the PREDICT study (msb201126-s4.xls)')
    data_dist1.setDescription('This file contains the mappings between OMIM diseases and UMLS concepts used in the PREDICT study')
    data_dist1.setLicense('https://creativecommons.org/publicdomain/zero/1.0/')
    data_dist1.setVersion('1.0')
    data_dist1.setFormat('application/vnd.ms-excel')
    data_dist1.setMediaType('application/vnd.ms-excel')
    data_dist1.setPublisher('https://www.embopress.org/journal/17444292')
    data_dist1.addRight('use-share-modify')
    data_dist1.setDownloadURL('https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3159979/bin/msb201126-s4.xls')
    data_dist1.setRetrievedDate(timestamp)
    data_dist1.setDataset(data_source.getURI())

    # generate dataset distribution: gold-standard spreadsheet
    data_dist2 = DataResource(qname=graphURI, graph=data_dist1.toRDF())
    # Same single-slash fix as above.
    data_dist2.setURI('http://w3id.org/fairworkflows/dataset.openpredict.indications/version/1/source')
    data_dist2.setTitle('Drug indications gold standard used in the PREDICT study (msb201126-s1.xls)')
    data_dist2.setDescription('This file contains the gold standard drug indications used in the PREDICT study')
    data_dist2.setLicense('https://creativecommons.org/publicdomain/zero/1.0/')
    data_dist2.setVersion('1.0')
    data_dist2.setFormat('application/vnd.ms-excel')
    data_dist2.setMediaType('application/vnd.ms-excel')
    data_dist2.setPublisher('https://www.embopress.org/journal/17444292')
    data_dist2.addRight('use-share-modify')
    data_dist2.setDownloadURL('https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3159979/bin/msb201126-s1.xls')
    data_dist2.setRetrievedDate(timestamp)
    data_dist2.setDataset(data_source.getURI())

    # generate RDF data distribution
    rdf_dist = DataResource(qname=graphURI, graph=data_dist2.toRDF())
    # Same single-slash fix as above.
    rdf_dist.setURI('http://w3id.org/fairworkflows/dataset.openpredict.indications/version/1/rdf/data')
    rdf_dist.setTitle('RDF version of PREDICT drug indication gold standard')
    rdf_dist.setDescription('This file is the RDF version of PREDICT drug indication gold standard')
    rdf_dist.setLicense('http://creativecommons.org/licenses/by/3.0/')
    rdf_dist.setVersion('1.0')
    rdf_dist.setFormat('application/n-quads')
    rdf_dist.setMediaType('application/n-quads')
    rdf_dist.addRight('use-share-modify')
    rdf_dist.addRight('by-attribution')
    rdf_dist.addRight('restricted-by-source-license')
    rdf_dist.setCreateDate(timestamp)
    rdf_dist.setCreator('https://github.com/fair-workflows/openpredict/src/MappingPREDICTGoldstandard.py')
    rdf_dist.setDownloadURL('https://github.com/fair-workflows/openpredict/known_associations/predict-gold-standard-omim.nq.gz')
    # NOTE(review): this links to the S1 distribution's URI, whereas the two source
    # distributions link to the dataset URI -- confirm this is intended.
    rdf_dist.setDataset(data_dist2.getURI())
    rdf_dist.addSource('https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3159979/bin/msb201126-s1.xls')
    rdf_dist.addSource('https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3159979/bin/msb201126-s4.xls')

    return rdf_dist.toRDF()



In [15]:
# Start a fresh graph for the dataset metadata and fill it via addMetaData.
# The identifier is wrapped in URIRef to match how the data graph is created
# earlier in the notebook, instead of passing a bare string here.
# Note that `g` is rebound: from here on it refers to the metadata graph, not the data graph.
g = ConjunctiveGraph(identifier=URIRef(graphURI))
g = addMetaData(g, graphURI)


In [16]:
# Write the metadata graph to disk as N-Quads and report where it went.
outfile = 'data/rdf/predict_gold_standard_omim_metadata.nq'
g.serialize(outfile, format='nquads')
print(f'RDF is generated at {outfile}')

RDF is generated at data/rdf/predict_gold_standard_omim_metadata.nq
