In [1]:
import pandas as pd

In [2]:
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/pubchem.tsv'
drugbank_map_df = pd.read_table(url)

  


In [4]:
drugbank_map_df.to_csv('data/input/pubchem.tsv', sep='\t', index=False)

In [22]:
drugbank_map_df.head()

Unnamed: 0_level_0,http://bio2rdf.org/openpredict_vocabulary:x-pubchemcompound
drugbank_id,Unnamed: 1_level_1
http://bio2rdf.org/drugbank:DB00014,http://bio2rdf.org/pubchem.compound:11980055
http://bio2rdf.org/drugbank:DB00014,http://bio2rdf.org/pubchem.compound:11981235
http://bio2rdf.org/drugbank:DB00014,http://bio2rdf.org/pubchem.compound:11982741
http://bio2rdf.org/drugbank:DB00014,http://bio2rdf.org/pubchem.compound:16052011
http://bio2rdf.org/drugbank:DB00014,http://bio2rdf.org/pubchem.compound:23581804


In [6]:
from rdflib import Graph, URIRef, Literal, RDF, ConjunctiveGraph
def to_rdf(g, df, column_types, row_uri):
    """
    Parameters
    ----------
    g : input rdflib.Graph  
    df: DataFrame to be converted into RDF Graph
    column_types: dictonary of column and its type, type can be URI or Literal
    row_uri: rdf:type value for row index, should be URI
    Returns
    -------
    g: rdflib.Graph generated from DataFrame object
    """
    
    for (index, series) in df.iterrows():
        g.add((URIRef(index), RDF.type, URIRef(row_uri)) )
        for (column, value) in series.iteritems():
            if column_types[column] == 'URI':
                g.add((URIRef(index), URIRef(column), URIRef(value)))
            else:
                g.add((URIRef(index), URIRef(column), Literal(value)))
                
    return g

In [7]:
drugbank_map_df['drugbank_id'] = drugbank_map_df['drugbank_id'].map(lambda x: 'http://bio2rdf.org/drugbank:'+str(x))

In [8]:
drugbank_map_df['pubchem_id'] = drugbank_map_df['pubchem_id'].map(lambda x: 'http://bio2rdf.org/pubchem.compound:'+str(x))

In [9]:
drugbank_map_df.head()

Unnamed: 0,drugbank_id,pubchem_id
0,http://bio2rdf.org/drugbank:DB00014,http://bio2rdf.org/pubchem.compound:11980055
1,http://bio2rdf.org/drugbank:DB00014,http://bio2rdf.org/pubchem.compound:11981235
2,http://bio2rdf.org/drugbank:DB00014,http://bio2rdf.org/pubchem.compound:11982741
3,http://bio2rdf.org/drugbank:DB00014,http://bio2rdf.org/pubchem.compound:16052011
4,http://bio2rdf.org/drugbank:DB00014,http://bio2rdf.org/pubchem.compound:23581804


In [10]:
drugbank_map_df = drugbank_map_df.set_index('drugbank_id', drop=True)

In [11]:
drugbank_map_df.rename(columns={'pubchem_id':'http://bio2rdf.org/openpredict_vocabulary:x-pubchemcompound'},inplace=True)

In [19]:
column_types ={'http://bio2rdf.org/openpredict_vocabulary:x-pubchemcompound':'URI'}
graphURI = URIRef('http://fairworkflows.org/openpredict_resource:fairworkflows.dataset.openpredict.pubchem.R1')
g =  ConjunctiveGraph(identifier = graphURI)     

g=  to_rdf(g, drugbank_map_df, column_types, 'http://bio2rdf.org/drugbank:Drug' )

In [20]:
from rdflib import Namespace
import datetime
DC = Namespace("http://purl.org/dc/terms/")
def addProvanace(g, graphURI):
    now = datetime.datetime.now()
    
    datasetURI= URIRef('https://github.com/fair-workflows/openpredict/data/rdf/pubchem_mapping.nq')
    g.add((graphURI, RDF.type, DC.Dataset))
    g.add((graphURI, URIRef('http://www.w3.org/ns/dcat#distribution'), datasetURI))
    sourcedatasetURI =  URIRef('https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/pubchem.tsv')
    
    g.add((datasetURI, DC['title'], Literal('Pubchem mapping for Drugbank ids ')))
    g.add((datasetURI, DC['format'], Literal('application/n-quads')))
    g.add((datasetURI, DC['created'], Literal(now.strftime("%Y-%m-%d %H:%M:%S"))))
    g.add((datasetURI, DC['creator'], Literal('https://github.com/fair-workflows/openpredict/RDFConversionOfPubchemMapping.ipynb')))

    g.add((datasetURI, DC['homepage'], URIRef('https://github.com/fair-workflows/openpredict/')))
    g.add((datasetURI, DC['license'], URIRef('http://creativecommons.org/licenses/by/3.0/')))
    g.add((datasetURI, DC['rights'], Literal('use-share-modify')))
    g.add((datasetURI, DC['rights'], Literal('by-attribution')))
    g.add((datasetURI, DC['rights'], Literal('restricted-by-source-license')))

    g.add((datasetURI, DC['source'], sourcedatasetURI))
        
    g.add((sourcedatasetURI, DC['title'], Literal('Mapping From Drugbank to Pubchem  (pubchem.tsv)')))
    g.add((sourcedatasetURI, RDF['type'], URIRef('http://www.w3.org/ns/dcat#Distribution')))
    g.add((sourcedatasetURI, DC['homepage'], URIRef('https://github.com/dhimmel/drugbank')))
    g.add((sourcedatasetURI, URIRef('http://purl.org/pav/retrievedOn'), Literal(now.strftime("%Y-%m-%d %H:%M:%S"))))
    g.add((sourcedatasetURI, DC['format'], Literal('text/tsv')))
    g.add((sourcedatasetURI, DC['rights'], URIRef('https://creativecommons.org/licenses/by-nc/4.0/')))
    g.add((sourcedatasetURI, DC['publisher'], Literal('https://github.com/dhimmel/drugbank')))
    g.add((sourcedatasetURI, DC['rights'], Literal('use')))
    g.add((sourcedatasetURI, DC['rights'], Literal('no-commercial')))
    
    return g

In [21]:
g=addProvanace(g)

TypeError: addProvanace() missing 1 required positional argument: 'graphURI'

In [None]:
g.serialize('data/rdf/pubchem_mapping.nq', format='nquads')