In [1]:
import pandas as pd

In [20]:
# Location of the OMIM -> MeSH annotation table (tab-separated, per its extension).
url = 'http://www.paccanarolab.org/static_content/disease_similarity/mim2mesh.tsv'
# No `sep` is given, so pandas splits on commas, not tabs; header=None keeps the
# first line as data instead of consuming it as column names.
# NOTE(review): presumably the file contains no commas, so every raw line ends up
# in a single column and the tabs survive untouched — confirm against the source.
df = pd.read_csv(filepath_or_buffer=url, header=None)

In [25]:
# Save the downloaded frame locally, writing neither the index nor a header row.
df.to_csv(
    path_or_buf='data/external/mim2mesh.tsv',
    sep=',',
    index=False,
    header=None,
)

In [39]:
# Flatten the local copy into [disease id, annotation id] pairs: the first
# tab-separated field of each line is the disease id, every following field
# is one annotation for that disease.
omim_annots = []
with open('data/external/mim2mesh.tsv', 'r') as mim2mesh_file:
    for raw_line in mim2mesh_file:
        # strip() drops surrounding whitespace, including the trailing newline.
        omim, *mesh_terms = raw_line.strip().split('\t')
        omim_annots.extend([omim, mesh_term] for mesh_term in mesh_terms)

In [40]:
# One row per [disease id, annotation id] pair held in `omim_annots`.
pair_columns = ['disease', 'annotation']
mim2mesh_df = pd.DataFrame(data=omim_annots, columns=pair_columns)

In [41]:
# Preview the first five disease/annotation pairs.
mim2mesh_df.head(n=5)

Unnamed: 0,disease,annotation
0,110000,D000293
1,110000,D000328
2,110000,D000368
3,110000,D003483
4,110000,D003937


In [42]:
from rdflib import Graph, URIRef, Literal, RDF, ConjunctiveGraph
def to_rdf(g, df, column_types, row_uri):
    """
    Add one RDF resource per DataFrame row to a graph.

    Each row's index label becomes the subject. The subject is typed with
    ``row_uri`` and every column contributes one triple whose predicate is
    the column name.

    Parameters
    ----------
    g : rdflib.Graph
        Graph that receives the triples; it is modified in place.
    df : pandas.DataFrame
        Frame to convert. Index labels and column names are wrapped in
        ``URIRef`` unchanged, so they are expected to already be URI strings.
    column_types : dict
        Dictionary mapping each column name to its type. Cells of columns
        mapped to ``'URI'`` become ``URIRef`` objects; cells of columns
        mapped to anything else become ``Literal`` objects.
    row_uri : str
        rdf:type value assigned to every row subject, should be a URI.

    Returns
    -------
    g : rdflib.Graph
        The same graph object that was passed in.

    Raises
    ------
    KeyError
        If a column of ``df`` has no entry in ``column_types``.
    """
    row_type = URIRef(row_uri)

    for (index, series) in df.iterrows():
        subject = URIRef(index)
        g.add((subject, RDF.type, row_type))
        # Series.iteritems() was removed in pandas 2.0; items() yields the
        # same (column, value) pairs and exists in old and new pandas alike.
        for (column, value) in series.items():
            predicate = URIRef(column)
            if column_types[column] == 'URI':
                g.add((subject, predicate, URIRef(value)))
            else:
                g.add((subject, predicate, Literal(value)))

    return g

In [43]:
# Prefix every disease id so the column holds full Bio2RDF OMIM URIs.
# Re-running this cell prefixes the values a second time.
omim_base = 'http://bio2rdf.org/omim:'
mim2mesh_df['disease'] = omim_base + mim2mesh_df['disease'].astype(str)

In [44]:
# Prefix every annotation id so the column holds full Bio2RDF MeSH URIs.
# Re-running this cell prefixes the values a second time.
mesh_base = 'http://bio2rdf.org/mesh:'
mim2mesh_df['annotation'] = mesh_base + mim2mesh_df['annotation'].astype(str)

In [45]:
# Check that both columns now carry full URIs.
mim2mesh_df.head(n=5)

Unnamed: 0,disease,annotation
0,http://bio2rdf.org/omim:110000,http://bio2rdf.org/mesh:D000293
1,http://bio2rdf.org/omim:110000,http://bio2rdf.org/mesh:D000328
2,http://bio2rdf.org/omim:110000,http://bio2rdf.org/mesh:D000368
3,http://bio2rdf.org/omim:110000,http://bio2rdf.org/mesh:D003483
4,http://bio2rdf.org/omim:110000,http://bio2rdf.org/mesh:D003937


In [46]:
# Move the disease URI into the index (the column itself is dropped, which is
# the default); to_rdf uses each row's index label as the triple subject.
mim2mesh_df = mim2mesh_df.set_index(keys='disease')

In [47]:
# Rename the remaining column to the predicate URI, because to_rdf turns each
# column name into the predicate of the triples it emits.
mim2mesh_df = mim2mesh_df.rename(
    columns={'annotation': 'http://semanticscience.org/resource/SIO_000255'}
)

In [48]:
# The single data column holds URIs, so its cells must become URIRef objects.
annotation_predicate = 'http://semanticscience.org/resource/SIO_000255'
column_types = {annotation_predicate: 'URI'}

# Identifier given to the graph that receives all triples.
graphURI = URIRef(
    'http://fairworkflows.org/openpredict_resource:fairworkflows.dataset.openpredict.meshannot.R1'
)

g = ConjunctiveGraph(identifier=graphURI)
g = to_rdf(
    g=g,
    df=mim2mesh_df,
    column_types=column_types,
    row_uri='http://bio2rdf.org/omim_vocabulary:Phenotype',
)

In [49]:
from rdflib import Namespace
import datetime
# Despite the short name, this binds the DCMI Metadata Terms namespace
# (http://purl.org/dc/terms/), which is what the provenance triples use.
DCTERMS_BASE = "http://purl.org/dc/terms/"
DC = Namespace(DCTERMS_BASE)
def addProvanace(g, graphURI):
    """
    Attach provenance metadata about the generated dataset to the graph.

    Three resources are described: the graph itself (typed as a dataset and
    linked to its distribution), the N-Quads distribution file (title, format,
    creation time, creator, homepage, license, rights, source) and the source
    file the data was taken from (title, type, homepages, retrieval time,
    format, rights, publisher).

    Parameters
    ----------
    g : rdflib.Graph
        Graph that receives the provenance triples; it is modified in place.
    graphURI : rdflib.URIRef
        Identifier of the graph being described.

    Returns
    -------
    g : rdflib.Graph
        The same graph object that was passed in.

    Notes
    -----
    The creation and retrieval timestamps are both the local wall-clock time
    of this call, stored as plain string literals.
    NOTE(review): ``DC`` points at http://purl.org/dc/terms/, which to my
    knowledge defines neither a ``Dataset`` class nor a ``homepage`` property;
    confirm the intended vocabulary before relying on these terms.
    """
    # A single timestamp serves both 'created' and 'retrievedOn'.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    datasetURI = URIRef('https://github.com/fair-workflows/openpredict/data/rdf/omim_mesh_annotations.nq')
    sourcedatasetURI = URIRef('http://www.paccanarolab.org/static_content/disease_similarity/mim2mesh.tsv')

    # Triples are listed in the order in which they are inserted.
    provenance = [
        # The graph and its distribution.
        (graphURI, RDF.type, DC.Dataset),
        (graphURI, URIRef('http://www.w3.org/ns/dcat#distribution'), datasetURI),
        # The generated N-Quads file.
        (datasetURI, DC['title'], Literal('Mesh Annotations for OMIM ids')),
        (datasetURI, DC['format'], Literal('application/n-quads')),
        (datasetURI, DC['created'], Literal(timestamp)),
        (datasetURI, DC['creator'], Literal('https://github.com/fair-workflows/openpredict/RDFConversionOfMeshAnnotation.ipynb')),
        (datasetURI, DC['homepage'], URIRef('https://github.com/fair-workflows/openpredict/')),
        (datasetURI, DC['license'], URIRef('http://creativecommons.org/licenses/by/3.0/')),
        (datasetURI, DC['rights'], Literal('use-share-modify')),
        (datasetURI, DC['rights'], Literal('by-attribution')),
        (datasetURI, DC['rights'], Literal('restricted-by-source-license')),
        (datasetURI, DC['source'], sourcedatasetURI),
        # The upstream file the annotations were taken from.
        (sourcedatasetURI, DC['title'], Literal('OMIM Mesh Annotations (mim2mesh.tsv)')),
        (sourcedatasetURI, RDF['type'], URIRef('http://www.w3.org/ns/dcat#Distribution')),
        (sourcedatasetURI, DC['homepage'], URIRef('http://www.paccanarolab.org/disease_similarity/')),
        (sourcedatasetURI, DC['homepage'], URIRef('https://doi.org/10.1038/srep17658')),
        (sourcedatasetURI, URIRef('http://purl.org/pav/retrievedOn'), Literal(timestamp)),
        (sourcedatasetURI, DC['format'], Literal('text/tsv')),
        (sourcedatasetURI, DC['rights'], URIRef('http://creativecommons.org/licenses/by/4.0/')),
        (sourcedatasetURI, DC['publisher'], Literal('http://www.paccanarolab.org/')),
        (sourcedatasetURI, DC['rights'], Literal('use')),
        (sourcedatasetURI, DC['rights'], Literal('no-commercial')),
    ]
    for triple in provenance:
        g.add(triple)

    return g

In [50]:
# Add the dataset-level provenance triples to the same graph.
g = addProvanace(g=g, graphURI=graphURI)

In [51]:
# Write the annotation and provenance triples to disk in N-Quads format.
g.serialize(destination='data/rdf/omim_mesh_annotations.nq', format='nquads')