In [2]:
%load_ext autoreload
%autoreload 2

### Basic imports

In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
import datetime

In [18]:
DATA_DIR = Path("/home/jovyan/BioBLP/data")
biokg_disease_path = DATA_DIR.joinpath("raw/biokg.metadata.disease.tsv")
mesh_scopenotes_path = DATA_DIR.joinpath("raw/meshid_scopenote.csv")
mesh_scr_disease_notes_path = Path("./mesh_scr_disease_notes.tsv")


## Contents
1. Inspect BioKG disease entities, and collect mesh identifiers
2. Use MeSH identifiers to retrieve textual descriptions/'notes corresponding to disease entity
    * a. retrieve textual `scopeNote` attributes by executing SPARRQL against MeSH rdf graph for entities of type rdfs:type DISEASE 
    * b. retrieve textual `note` attributes by executing SPARRQL against MeSH rdf graph for entities of type rdfs:type SCR_DISEASE
          (SCR_DISEASE are additional to the regular MeSH DISEASE concepts, and are sourced from Supplementary Concept Diseases See https://www.nlm.nih.gov/bsd/indexing/training/CATC_053.html)
    * c. process results into useable form
3. Merge BioKG entities with their textual attributes




## 1. Inspect BioKG disease entities, and collect mesh identifiers

In [21]:
# get biokg disease data
biokg_dis_df = pd.read_csv(biokg_disease_path, sep="\t", names=["mesh_id", "type", "entity"], header=None)
biokg_dis_init_len = len(biokg_dis_df)
biokg_dis_counts = biokg_dis_df.entity.value_counts()
biokg_dis_counts

SCR_DISEASE                                       6479
DISEASE                                           4868
Aural Atresia, Congenital                            1
Snowflake vitreoretinal degeneration                 1
Carnitine-Acylcarnitine Translocase Deficiency       1
                                                  ... 
Severe Dengue                                        1
Neurodegenerative Diseases                           1
Uterine Inversion                                    1
Hepatitis, Autoimmune                                1
Hypobetalipoproteinemia, Familial, 1                 1
Name: entity, Length: 11349, dtype: int64

We see that the dataframe has several `rdfs:type` statements, where the entity either is of type DISEASE, or SCR_DISEASE.
This effectively duplicates the records against a certain MeSH identifier. We are only interested in the records which contain the actual entity nam label

In [29]:
print(f"mesh ids are duplicated with extra rows coming from rdf type triples.\nTotal rows: {biokg_dis_init_len}\n"\
f"# DISEASE type nodes: {biokg_dis_counts['DISEASE']} \n# SCR_DISEASE nodes {biokg_dis_counts['SCR_DISEASE']}")
biokg_dis_df = biokg_dis_df[~biokg_dis_df["entity"].isin(["DISEASE", "SCR_DISEASE"])]
print(f"Biokg rows on dropping rdf type rows: {biokg_dis_init_len} --> {len(biokg_dis_df)} ")
biokg_dis_df.head()

mesh ids are duplicated with extra rows coming from rdf type triples.
Total rows: 22694
# DISEASE type nodes: 4868 
# SCR_DISEASE nodes 6479
Biokg rows on dropping rdf type rows: 22694 --> 11347 


Unnamed: 0,mesh_id,type,entity
11347,D000006,NAME,"Abdomen, Acute"
11348,D000007,NAME,Abdominal Injuries
11349,D000008,NAME,Abdominal Neoplasms
11350,D000012,NAME,Abetalipoproteinemia
11351,D000013,NAME,Congenital Abnormalities


In [None]:
# so we see that there are actually 11347 unique mesh concepts here, instead of 22K. The rest stem from duplicated entries citing provenance of source type, such as 'DISEASE', or "SCR_DISEASE'

## 2. Use MeSH identifiers to retrieve textual descriptions/'notes corresponding to disease entity
#### 2a. retrieve textual `scopeNote` attributes by executing SPARRQL against MeSH rdf graph for entities of type rdfs:type DISEASE 

In [30]:
# Reusing mesh scopenotes for entities of type `DISEASE` from bioblp v0 work
mesh_notes_df = pd.read_csv(mesh_scopenotes_path, index_col=0)
print(len(mesh_notes_df))
mesh_notes_df.head()

29525


Unnamed: 0,concept,prefcon,name,scopeNote
0,http://id.nlm.nih.gov/mesh/2020/D014525,http://id.nlm.nih.gov/mesh/2020/M0022335,Urethral Stricture,Narrowing of any part of the URETHRA. It is ch...
1,http://id.nlm.nih.gov/mesh/2020/D017262,http://id.nlm.nih.gov/mesh/2020/M0026201,Siderophores,Low-molecular-weight compounds produced by mic...
2,http://id.nlm.nih.gov/mesh/2020/D001321,http://id.nlm.nih.gov/mesh/2020/M0001983,Autistic Disorder,A disorder beginning in childhood. It is marke...
3,http://id.nlm.nih.gov/mesh/2020/D015730,http://id.nlm.nih.gov/mesh/2020/M0024115,Djibouti,"A republic in eastern Africa, on the Gulf of A..."
4,http://id.nlm.nih.gov/mesh/2020/D002330,http://id.nlm.nih.gov/mesh/2020/M0003490,Carmustine,A cell-cycle phase nonspecific alkylating anti...


In [31]:
def parse_id_from_uri(uri: str):
    return uri.split("/")[-1]

mesh_notes_df["concept"] = mesh_notes_df['concept'].apply(lambda x: parse_id_from_uri(x))
mesh_notes_df = mesh_notes_df[["concept", "scopeNote"]]
mesh_notes_df = mesh_notes_df.rename(columns={"concept":"mesh_id", "scopeNote": "note"})
mesh_notes_df.head(2)

Unnamed: 0,mesh_id,note
0,D014525,Narrowing of any part of the URETHRA. It is ch...
1,D017262,Low-molecular-weight compounds produced by mic...


Let's merge biokg entities with the mesh notes using the mesh identifier

In [35]:
biokg_mesh_disease_df = biokg_dis_df.merge(mesh_notes_df, how="inner", left_on="mesh_id", right_on="mesh_id")
biokg_mesh_disease_df.head(2)

Unnamed: 0,mesh_id,type,entity,note
0,D000006,NAME,"Abdomen, Acute",A clinical syndrome with acute abdominal pain ...
1,D000007,NAME,Abdominal Injuries,General or unspecified injuries involving orga...


In [36]:
print(f"Number of biokg entities for which scope notes or mesh ids were found {len(biokg_dis_df)} -> {len(biokg_mesh_disease_df)}")

Number of biokg entities for which scope notes or mesh ids were found 11347 -> 4868


We seem to be losing several entities. This is because we are losing all the SCR_DISEASE nodes during the inner merge. 
The textual properties for nodes of type SCR_DISEASE are hidden under different properties, namely meshv:note, instead of meshv:scopeNote
We'll retrieve these separately

###   2b. retrieve textual `note` attributes by executing SPARRQL against MeSH rdf graph for entities of type rdfs:type SCR_DISEASE

(SCR_DISEASE are additional to the regular MeSH DISEASE concepts, and are sourced from Supplementary Concept Diseases See https://www.nlm.nih.gov/bsd/indexing/training/CATC_053.html)

In [13]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Specify the DBPedia endpoint
sparql = SPARQLWrapper("http://id.nlm.nih.gov/mesh/sparql")


sparql.setReturnFormat('n3')
# Query for the description of "Capsaicin", filtered by language
sparql.setQuery("""
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX meshv: <http://id.nlm.nih.gov/mesh/vocab#>
PREFIX mesh: <http://id.nlm.nih.gov/mesh/>
CONSTRUCT {?s meshv:note ?note.
         }
FROM <http://id.nlm.nih.gov/mesh>

WHERE {
  ?s a meshv:SCR_Disease.
  ?s meshv:note ?note.


     }
""")

results = sparql.queryAndConvert()


In [14]:
record_list = results.decode('utf-8').split("\n\n")
record_list[:5]

['<http://id.nlm.nih.gov/mesh/C538525>\n        <http://id.nlm.nih.gov/mesh/vocab#note>\n                "Mitochondrial myopathy encephalopathy lactic acidosis and strokelike episodes"@en .',
 '<http://id.nlm.nih.gov/mesh/C563160>\n        <http://id.nlm.nih.gov/mesh/vocab#note>\n                "A rare hereditary autosomal dominant condition that affects multiple parts of the body; particularly the face, eyes, teeth, and extremities. Affected individuals often have small eyes (MICROPHTHALMIA), small or missing teeth, weak enamel, multiple cavities, and early tooth loss. Other common features include a thin nose and SYNDACTYLY between the fourth and fifth fingers. HYPOTRICHOSIS, syndactyly of the toes, curvature of fingers, MICROCEPHALY, and CLEFT PALATE may also occur but are less common. Some patients may also experience ATAXIA, MUSCLE SPASTICITY, hearing loss, and speech difficulties. Mutations in the GJA1 gene have been identified. OMIM: 164200"@en .',
 '<http://id.nlm.nih.gov/mesh

In [6]:
record_list_nested = [record.split('\n') for record in record_list]

record_list_nested[:3]

[['<http://id.nlm.nih.gov/mesh/C538525>',
  '        <http://id.nlm.nih.gov/mesh/vocab#note>',
  '                "Mitochondrial myopathy encephalopathy lactic acidosis and strokelike episodes"@en .'],
 ['<http://id.nlm.nih.gov/mesh/C563160>',
  '        <http://id.nlm.nih.gov/mesh/vocab#note>',
  '                "A rare hereditary autosomal dominant condition that affects multiple parts of the body; particularly the face, eyes, teeth, and extremities. Affected individuals often have small eyes (MICROPHTHALMIA), small or missing teeth, weak enamel, multiple cavities, and early tooth loss. Other common features include a thin nose and SYNDACTYLY between the fourth and fifth fingers. HYPOTRICHOSIS, syndactyly of the toes, curvature of fingers, MICROCEPHALY, and CLEFT PALATE may also occur but are less common. Some patients may also experience ATAXIA, MUSCLE SPASTICITY, hearing loss, and speech difficulties. Mutations in the GJA1 gene have been identified. OMIM: 164200"@en .'],
 ['<http:

In [40]:
mesh_scr_notes_df = pd.DataFrame(record_list_nested, columns=["mesh_id", "predicate", "note", "tag"])
mesh_scr_notes_df = mesh_scr_notes_df[["mesh_id", "note"]]
print(mesh_scr_notes_df.shape)
mesh_scr_notes_df.head(2)

(2092, 2)


Unnamed: 0,mesh_id,note
0,<http://id.nlm.nih.gov/mesh/C538525>,"""Mitochondrial myopathy enceph..."
1,<http://id.nlm.nih.gov/mesh/C563160>,"""A rare hereditary autosomal d..."


In [41]:
def parse_id_from_uri_rdf(uri: str):
    return uri.rstrip(">").split("/")[-1]

mesh_scr_notes_df["mesh_id"] = mesh_scr_notes_df["mesh_id"].apply(lambda x: parse_id_from_uri_rdf(x))
mesh_scr_notes_df.head()

Unnamed: 0,mesh_id,note
0,C538525,"""Mitochondrial myopathy enceph..."
1,C563160,"""A rare hereditary autosomal d..."
2,C535998,"""A milk-filled retention cyst ..."
3,C548085,"""Replaced \""Progressive Transf..."
4,C537710,"""An autosomal dominant disorde..."


In [42]:
mesh_scr_notes_df.mesh_id.nunique()
mesh_scr_notes_df.to_csv("mesh_scr_disease_notes.tsv", sep="\t", header=True, index=False)

## Merge Biokg with DISEASE, SCR_DISEASE data

In [45]:
LOAD_MESH_SCR_NOTES_FROM_DISK = False
if LOAD_MESH_SCR_NOTES_FROM_DISK:
    mesh_scr_notes_df = pd.read_csv(mesh_scr_disease_notes_path, delimiter="\t", header=0, index_col=None)
    mesh_scr_notes_df.head(2)

In [49]:
mesh_notes_df_merged = pd.concat([mesh_notes_df, mesh_scr_notes_df], axis=0)
mesh_notes_df_merged.to_csv("mesh_disease_notes_merged.tsv", sep="\t", header=True, index=False)

In [50]:
biokg_dis_df = biokg_dis_df.merge(mesh_notes_df_merged, how="inner", left_on="mesh_id", right_on="mesh_id")
print(biokg_dis_df.shape)
biokg_dis_df.head(2)

(6865, 5)


Unnamed: 0,mesh_id,type,entity,note_x,note_y
0,D000006,NAME,"Abdomen, Acute",A clinical syndrome with acute abdominal pain ...,A clinical syndrome with acute abdominal pain ...
1,D000007,NAME,Abdominal Injuries,General or unspecified injuries involving orga...,General or unspecified injuries involving orga...
