# Sandbox for working with parsers

## Setup

In [1]:
## CX: allows multiple lines of code to print from one code block
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

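## nest_asyncio patches asyncio to allow nested event loops (works around "event loop is already running" errors inside Jupyter)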
import nest_asyncio
nest_asyncio.apply()

import pathlib
import re
from collections import defaultdict

import numpy as np
import pandas as pd

## DisGeNET disease->gene parsing

In [2]:
## put the path you want to use here
allgene_pmid_path = pathlib.Path.home().joinpath('Desktop', 'ScrippsJob', 'DisGeNET', 'all_gene_disease_pmid_associations.tsv.gz')

In [3]:
def process_gene(file_path_gene_disease):
    """Parse a DisGeNET gene-disease-PMID association file into per-disease docs.

    The file is read as a gzip-compressed, tab-separated table (ISO-8859-1
    encoded, lines starting with ``#`` ignored). Rows are grouped by
    (disease id, source, gene id); every group becomes one document that
    carries the group's source and gene id, the PubMed IDs of all its rows and
    the gene-level fields of the group's last row.

    Parameters
    ----------
    file_path_gene_disease : str or path-like
        Location of the ``*.tsv.gz`` association file.

    Returns
    -------
    collections.defaultdict(list)
        Maps each disease id to a list of documents with the keys ``source``,
        ``gene_id``, ``pubmed`` (sorted list of ints) and, when the columns are
        present, ``gene_name``, ``DSI``, ``DPI``, ``score``, ``EI``,
        ``YearInitial`` and ``YearFinal``. Missing values are ``None``.

    Raises
    ------
    ValueError
        If a year or PubMed value cannot be converted to ``int``.
    """
    rename_gene = {
        "diseaseId": "umls",
        "geneId": "gene_id",
        "geneSymbol": "gene_name",
        "diseaseName": "disease_name",
        "pmid": "pubmed",
    }
    passthrough_fields = ("gene_name", "DSI", "DPI", "score", "EI")
    year_fields = ("YearInitial", "YearFinal")

    def _native(value):
        # Normalise at the Python level instead of relying on
        # ``df.where(notnull, None)``: on newer pandas that call leaves NaN in
        # float columns, and NaN is truthy, so ``int(NaN)`` would raise below.
        if value is None or pd.isna(value):
            return None
        # numpy scalars (e.g. np.int64) become plain Python numbers
        if isinstance(value, np.generic):
            return value.item()
        return value

    df_gene_disease = pd.read_csv(
        file_path_gene_disease,
        encoding="ISO-8859-1",
        sep="\t",
        comment="#",
        compression="gzip",
    ).rename(columns=rename_gene)

    d = defaultdict(list)
    # one document per (disease, source, gene); all PubMed IDs of the group are merged
    grouped = df_gene_disease.groupby(["umls", "source", "gene_id"])
    for (umls, source, gene_id), subdf in grouped:
        doc = {"source": source, "gene_id": int(gene_id), "pubmed": set()}
        for record in subdf.to_dict(orient="records"):
            for k, raw in record.items():
                v = _native(raw)
                if k in passthrough_fields:
                    doc[k] = v
                elif k in year_fields:
                    doc[k] = int(v) if v else v
                elif k == "pubmed" and v:
                    doc[k].add(int(v))
        # sorted so the list order is stable from run to run
        doc["pubmed"] = sorted(doc["pubmed"])
        # kept from the original; a no-op for plain CUIs such as 'C0678222'
        d[umls.replace("umls", "umls_cui")].append(doc)
    return d

In [4]:
processed1 = process_gene(allgene_pmid_path)

In [11]:
len(processed1['C0678222'])

7473

## HPO parsing

In [2]:
## put the path you want to use here
HPO_path = pathlib.Path.home().joinpath('Desktop', 'ScrippsJob', 'phenotype.hpoa')

In [3]:
# (substring looked for in a reference, output field it is stored under);
# the order here is the order the fields are added to a record
_HPO_REFERENCE_FIELDS = (
    ("ISBN", "isbn_refs"),
    ("PMID", "pmid_refs"),
    ("http", "website_refs"),
    ("DECIPHER", "decipher_refs"),
    ("OMIM", "omim_refs"),
    ("ORPHA", "orphanet_refs"),
)

# columns that get dedicated handling and are never copied verbatim
_HPO_HANDLED_COLUMNS = frozenset(
    {"disease_id", "disease_name", "Aspect", "Sex", "Reference", "Frequency", "Modifier"}
)


def _hpo_parse_references(reference):
    """Split a ';'-delimited Reference value into per-type reference lists.

    Returns a dict holding only the reference types that were found. ISBN and
    ORPHA references get their prefix rewritten to 'ISBN:' / 'ORPHANET:'; all
    other references are kept verbatim (prefix included).
    """
    collected = {marker: [] for marker, _ in _HPO_REFERENCE_FIELDS}
    for ref in reference.split(";"):
        for marker in collected:
            if marker in ref:
                if marker == "ISBN":
                    collected[marker].append("ISBN:" + ref.split(":")[1])
                elif marker == "ORPHA":
                    collected[marker].append("ORPHANET:" + ref.split(":")[1])
                else:
                    collected[marker].append(ref)
    return {
        field: collected[marker]
        for marker, field in _HPO_REFERENCE_FIELDS
        if collected[marker]
    }


def _hpo_parse_frequency(frequency):
    """Translate a Frequency value into frequency fields (possibly none).

    An 'HP:' term is stored as ``hp_freq``; a percentage or an 'n/m' count is
    stored as ``numeric_freq`` (plus numerator/denominator for counts), but only
    when it is a valid fraction <= 1. Values containing 'http' are ignored.
    """
    if "http" in frequency:  # catching an error in the data
        return {}
    if "HP:" in frequency:
        return {"hp_freq": frequency}
    if "%" in frequency:
        fraction = float(frequency.strip("%")) / 100
        return {"numeric_freq": fraction} if fraction <= 1 else {}
    if "/" in frequency:
        # idx 0 is numerator, idx 1 is denominator
        parts = [int(ele) for ele in frequency.split("/")]
        if (parts[0] != 0) and (parts[1] != 0) and (parts[0] <= parts[1]):
            return {
                "freq_numerator": parts[0],
                "freq_denominator": parts[1],
                "numeric_freq": parts[0] / parts[1],
            }
    return {}


def process_disease2hp(file_path_disease_hpo, skiprows=4):
    """Parse an HPO annotation table (phenotype.hpoa) into per-disease records.

    Parameters
    ----------
    file_path_disease_hpo : str or path-like
        Tab-separated annotation file; every column is read as a string.
    skiprows : int, default 4
        Number of leading lines skipped before the header row.

    Returns
    -------
    dict
        Maps each disease id (with 'ORPHA' replaced by 'ORPHANET') to a
        five-element list:
        ``[phenotype_records, disease_name, course, modifiers, inheritance]``.
        Rows whose Aspect is 'C', 'M' or 'I' only contribute their HPO_ID to
        the course / modifiers / inheritance lists; every other row becomes one
        phenotype record dict.

    Notes
    -----
    Rows with Qualifier == 'NOT' are removed, because they state that the
    disease does *not* have the phenotypic feature.
    """
    df_disease_hpo = pd.read_csv(
        file_path_disease_hpo, sep="\t", skiprows=skiprows, dtype=str
    ).rename(columns={"DiseaseName": "disease_name", "#DatabaseID": "disease_id"})

    # filter and drop in one chained expression: no in-place mutation of a slice
    df_disease_hpo = df_disease_hpo[df_disease_hpo["Qualifier"] != "NOT"].drop(
        columns="Qualifier"
    )

    result = {}
    for did, subdf in df_disease_hpo.groupby("disease_id"):
        did = did.replace("ORPHA", "ORPHANET")
        # missing values are normalised to None per value, so the truthiness
        # checks below never see a (truthy) NaN
        records = [
            {k: (None if pd.isna(v) else v) for k, v in raw.items()}
            for raw in subdf.to_dict(orient="records")
        ]

        hpo_ids_by_aspect = {"C": [], "M": [], "I": []}
        phenotype_related = []

        for record in records:
            aspect = record["Aspect"]
            if aspect in hpo_ids_by_aspect:
                hpo_ids_by_aspect[aspect].append(record["HPO_ID"])
                continue

            record_dict = {}
            for k, v in record.items():
                if (k == "Sex") and v:
                    record_dict["sex"] = v.lower()
                elif (k == "Reference") and v:
                    record_dict.update(_hpo_parse_references(v))
                elif (k == "Frequency") and v:
                    record_dict.update(_hpo_parse_frequency(v))
                elif (k == "Modifier") and v:
                    if ";" in v:
                        # delimited list with repeated values: de-duplicate
                        # while keeping first-seen order (deterministic,
                        # unlike list(set(...)) on strings)
                        record_dict["modifier"] = list(dict.fromkeys(v.split(";")))
                    else:
                        record_dict["modifier"] = v
                elif k not in _HPO_HANDLED_COLUMNS:
                    record_dict[k.lower()] = v
            phenotype_related.append(record_dict)

        result[did] = [
            phenotype_related,
            records[0]["disease_name"],
            hpo_ids_by_aspect["C"],
            hpo_ids_by_aspect["M"],
            hpo_ids_by_aspect["I"],
        ]

    return result

In [4]:
processed1 = process_disease2hp(HPO_path)

In [8]:
## these are actually the same disease but obviously two diff sets of results. 
## see https://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=1777 and its id mapping
## mydisease resolves these two as one entity as well
len(processed1['OMIM:218340'][0])
processed1['OMIM:218340'][1]
len(processed1['ORPHANET:1777'][0])
processed1['ORPHANET:1777'][1]

30

'CRANIOFACIAL DYSMORPHISM WITH OCULAR COLOBOMA, ABSENT CORPUS CALLOSUM,AND AORTIC DILATATION'

25

'Temtamy syndrome'

25

[[{'hpo_id': 'HP:0000174',
   'orphanet_refs': ['ORPHANET:1777'],
   'evidence': 'TAS',
   'onset': None,
   'hp_freq': 'HP:0040283',
   'biocuration': 'ORPHA:orphadata[2021-02-08]'},
  {'hpo_id': 'HP:0000179',
   'orphanet_refs': ['ORPHANET:1777'],
   'evidence': 'TAS',
   'onset': None,
   'hp_freq': 'HP:0040283',
   'biocuration': 'ORPHA:orphadata[2021-02-08]'},
  {'hpo_id': 'HP:0000256',
   'orphanet_refs': ['ORPHANET:1777'],
   'evidence': 'TAS',
   'onset': None,
   'hp_freq': 'HP:0040282',
   'biocuration': 'ORPHA:orphadata[2021-02-08]'},
  {'hpo_id': 'HP:0000268',
   'orphanet_refs': ['ORPHANET:1777'],
   'evidence': 'TAS',
   'onset': None,
   'hp_freq': 'HP:0040282',
   'biocuration': 'ORPHA:orphadata[2021-02-08]'},
  {'hpo_id': 'HP:0000276',
   'orphanet_refs': ['ORPHANET:1777'],
   'evidence': 'TAS',
   'onset': None,
   'hp_freq': 'HP:0040282',
   'biocuration': 'ORPHA:orphadata[2021-02-08]'},
  {'hpo_id': 'HP:0000280',
   'orphanet_refs': ['ORPHANET:1777'],
   'evidence':

In [22]:
## CHECKS:
## decipher refs: 'DECIPHER:72', orphanet refs: ORPHANET:221 (none of these are in situations with multiple refs)
## omim and pmid refs in the same disease -> pheno association: 'OMIM:151610'
## combos of omim, isbn, website refs: 'OMIM:194190', OMIM:180849

processed1['OMIM:151610']

[[{'hpo_id': 'HP:0500043',
   'pmid_refs': ['PMID:3767680'],
   'omim_refs': ['OMIM:151610'],
   'evidence': 'IEA',
   'onset': None,
   'freq_numerator': 20,
   'freq_denominator': 20,
   'numeric_freq': 1.0,
   'biocuration': 'HPO:lccarmody[2018-10-05];HPO:skoehler[2018-10-08]'},
  {'hpo_id': 'HP:0000492',
   'pmid_refs': ['PMID:3767680'],
   'evidence': 'IEA',
   'onset': None,
   'biocuration': 'HPO:lccarmody[2018-10-05]'},
  {'hpo_id': 'HP:0000508',
   'pmid_refs': ['PMID:3767680'],
   'evidence': 'PCS',
   'onset': None,
   'biocuration': 'HPO:lccarmody[2018-10-05]'},
  {'hpo_id': 'HP:0000478',
   'omim_refs': ['OMIM:151610'],
   'evidence': 'IEA',
   'onset': None,
   'biocuration': 'HPO:iea[2009-02-17]'}],
 'LEVATOR-MEDIAL RECTUS SYNKINESIS',
 [],
 [],
 ['HP:0000006']]

In [7]:
rawHPO = pd.read_csv(HPO_path, sep="\t", skiprows=4, dtype=str)

In [13]:
rawHPO[rawHPO['#DatabaseID'] == 'OMIM:191900']

Unnamed: 0,#DatabaseID,DiseaseName,Qualifier,HPO_ID,Reference,Evidence,Onset,Frequency,Sex,Modifier,Aspect,Biocuration
41673,OMIM:191900,Muckle-Wells syndrome,,HP:0030953,PMID:25766347,PCS,,2/2,,HP:0025303,P,HPO:probinson[2020-11-17]
41674,OMIM:191900,Muckle-Wells syndrome,,HP:0003565,PMID:25766347,TAS,,2/2,,HP:0025303,P,HPO:probinson[2009-02-17];HPO:probinson[2020-1...
41675,OMIM:191900,Muckle-Wells syndrome,,HP:0003621,PMID:25766347,PCS,,1/2,,,C,HPO:probinson[2020-11-17]
41676,OMIM:191900,Muckle-Wells syndrome,,HP:0000365,PMID:14872505,PCS,HP:0011463,3/3,,,P,HPO:skoehler[2010-06-20];HPO:probinson[2020-11...
41677,OMIM:191900,Muckle-Wells syndrome,,HP:0005764,PMID:25766347,PCS,,2/2,,HP:0025303,P,HPO:probinson[2020-11-17]
41678,OMIM:191900,Muckle-Wells syndrome,,HP:0004322,PMID:14872505,PCS,,3/3,,,P,HPO:probinson[2020-11-17]
41679,OMIM:191900,Muckle-Wells syndrome,,HP:0000083,PMID:9704852,PCS,HP:0003584,,,,P,HPO:iea[2009-02-17];HPO:probinson[2020-11-17]
41680,OMIM:191900,Muckle-Wells syndrome,,HP:0011107,OMIM:191900,TAS,,,,,P,HPO:skoehler[2012-10-17]
41681,OMIM:191900,Muckle-Wells syndrome,,HP:0011227,PMID:25766347,PCS,,2/2,,,P,HPO:probinson[2020-11-17]
41682,OMIM:191900,Muckle-Wells syndrome,,HP:0012432,PMID:14872505,PCS,,3/3,,,P,HPO:probinson[2020-11-17]


## CTD troubleshooting

### disease -> chemical

In [2]:
CTD_path1 = pathlib.Path.home().joinpath('Downloads', 'CTD_chemicals_diseases.csv.gz')

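Changes to the parser: (1) keep only the direct annotations (rows with a `direct_evidence` value); (2) make chunked reading work correctly, so records from a later chunk are appended to a disease's entry instead of overwriting it; and (3) fix column typing issues by reading every column as a string.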

In [6]:
def process_chemical(file_path_chemical, chunksize=100000):
    """Parse a CTD chemical-disease file, keeping only direct annotations.

    The gzip-compressed CSV is read in chunks with every column as a string
    (lines starting with ``#`` are ignored). Rows without a ``direct_evidence``
    value (inferred annotations) are discarded. Records for the same disease
    are kept as separate entries, not merged, and records found in different
    chunks are appended to the same list.

    Parameters
    ----------
    file_path_chemical : str or path-like
        Location of the ``*.csv.gz`` file.
    chunksize : int, default 100000
        Number of rows read per chunk.

    Returns
    -------
    dict
        Maps each ``DiseaseID`` to a list of record dicts holding every column
        except ``DiseaseName`` / ``DiseaseID`` plus ``'source': 'CTD'``.
        Missing values are ``None``; ``inference_score`` is a float;
        ``omim_id`` / ``pubmed`` become lists when they contain ``'|'``.

    Raises
    ------
    ValueError
        If an ``inference_score`` value cannot be converted to ``float``.
    """
    column_names = ['chemical_name', 'mesh_chemical_id', 'cas_registry_number',
                    'DiseaseName', 'DiseaseID', 'direct_evidence',
                    'inference_gene_symbol', 'inference_score', 'omim_id', 'pubmed']
    # the values in these fields are separated by '|'
    list_fields = {'omim_id', 'pubmed'}
    dropped_fields = {'DiseaseName', 'DiseaseID'}

    def _clean(field, value):
        # Normalise per value rather than with ``df.where(notnull, None)``,
        # which leaves NaN in float columns on newer pandas.
        if value is None or pd.isna(value):
            return None
        if field == 'inference_score':
            return float(value)
        if field in list_fields and '|' in value:
            return value.split('|')
        return value

    merged = defaultdict(list)
    reader = pd.read_csv(file_path_chemical, chunksize=chunksize, sep=',', comment='#',
                         compression='gzip', names=column_names, dtype=str)
    for chunk in reader:
        # remove all inferred annotations
        direct = chunk[chunk['direct_evidence'].notna()]
        for did, subdf in direct.groupby('DiseaseID'):
            for row in subdf.to_dict(orient='records'):
                record = {k: _clean(k, v) for k, v in row.items() if k not in dropped_fields}
                record['source'] = 'CTD'
                # appending in place lets a later chunk extend the same
                # disease's list without re-copying it
                merged[did].append(record)
    # plain dict, so looking up an unknown disease still raises KeyError
    return dict(merged)

In [7]:
trial1 = process_chemical(CTD_path1)

In [8]:
trial1['MESH:D006974']

[{'chemical_name': 'Amitriptyline',
  'mesh_chemical_id': 'D000639',
  'cas_registry_number': '50-48-6',
  'direct_evidence': 'marker/mechanism',
  'inference_gene_symbol': None,
  'inference_score': None,
  'omim_id': None,
  'pubmed': '7123337',
  'source': 'CTD'},
 {'chemical_name': 'Angiotensin-Converting Enzyme Inhibitors',
  'mesh_chemical_id': 'D000806',
  'cas_registry_number': None,
  'direct_evidence': 'therapeutic',
  'inference_gene_symbol': None,
  'inference_score': None,
  'omim_id': None,
  'pubmed': ['17541236', '6168412'],
  'source': 'CTD'},
 {'chemical_name': 'Atenolol',
  'mesh_chemical_id': 'D001262',
  'cas_registry_number': '29122-68-7',
  'direct_evidence': 'therapeutic',
  'inference_gene_symbol': None,
  'inference_score': None,
  'omim_id': None,
  'pubmed': '2493837',
  'source': 'CTD'},
 {'chemical_name': 'Atrasentan',
  'mesh_chemical_id': 'D000077868',
  'cas_registry_number': None,
  'direct_evidence': 'therapeutic',
  'inference_gene_symbol': None,
  '

### disease -> pathway

In [9]:
CTD_path2 = pathlib.Path.home().joinpath('Downloads', 'CTD_diseases_pathways.csv.gz')

In [20]:
lookat = pd.read_csv(CTD_path2, sep=',', comment='#', compression='gzip', names=['DiseaseName', 'DiseaseID', 'pathway_name', 'pathway_id', 'inference_gene_symbol'])

In [34]:
lookat.dtypes
589408 == 572377 + 17031

DiseaseName              object
DiseaseID                object
pathway_name             object
pathway_id               object
inference_gene_symbol    object
dtype: object

True

In [23]:
lookat.DiseaseID.str.contains('OMIM:').value_counts()
lookat.DiseaseID.str.contains('MESH:').value_counts()

False    572377
True      17031
Name: DiseaseID, dtype: int64

True     572377
False     17031
Name: DiseaseID, dtype: int64

In [41]:
lookat[lookat['DiseaseID'] == 'MESH:D011471']

Unnamed: 0,DiseaseName,DiseaseID,pathway_name,pathway_id,inference_gene_symbol
493395,Prostatic Neoplasms,MESH:D011471,2-Oxocarboxylic acid metabolism,KEGG:hsa01210,IDH1
493396,Prostatic Neoplasms,MESH:D011471,Abacavir transmembrane transport,REACT:R-HSA-2161517,SLC22A3
493397,Prostatic Neoplasms,MESH:D011471,Abacavir transport and metabolism,REACT:R-HSA-2161522,SLC22A3
493398,Prostatic Neoplasms,MESH:D011471,Abasic sugar-phosphate removal via the single-...,REACT:R-HSA-73930,APEX1
493399,Prostatic Neoplasms,MESH:D011471,ABC-family proteins mediated transport,REACT:R-HSA-382556,ABCC4
...,...,...,...,...,...
504205,Prostatic Neoplasms,MESH:D011471,YAP1- and WWTR1 (TAZ)-stimulated gene expression,REACT:R-HSA-2032785,PPARA
504206,Prostatic Neoplasms,MESH:D011471,YAP1- and WWTR1 (TAZ)-stimulated gene expression,REACT:R-HSA-2032785,RXRA
504207,Prostatic Neoplasms,MESH:D011471,YAP1- and WWTR1 (TAZ)-stimulated gene expression,REACT:R-HSA-2032785,TBL1XR1
504208,Prostatic Neoplasms,MESH:D011471,Zinc influx into cells by the SLC39 gene family,REACT:R-HSA-442380,SLC39A1


In [10]:
def process_pathway(file_path_pathway):
    """Parse a CTD disease-pathway file into per-disease pathway records.

    The gzip-compressed CSV is read with every column as a string (lines
    starting with ``#`` are ignored). Rows sharing the same disease and pathway
    are merged into one record that collects their distinct
    ``inference_gene_symbol`` values.

    Returns a dict mapping each ``DiseaseID`` to a list of records with the
    keys ``source`` (always 'CTD'), ``pathway_name``, ``<db>_pathway_id`` (the
    field is named after the lower-cased pathway database prefix) and
    ``inference_gene_symbol`` (a single value when there is only one, else a
    list).
    """
    column_names = ['DiseaseName', 'DiseaseID', 'pathway_name', 'pathway_id', 'inference_gene_symbol']
    pathway_frame = pd.read_csv(
        file_path_pathway,
        sep=',',
        comment='#',
        compression='gzip',
        names=column_names,
        dtype=str,
    )

    by_disease = {}
    # one record per (disease, pathway) pair
    for (disease_id, pathway_curie), rows in pathway_frame.groupby(['DiseaseID', 'pathway_id']):
        pathway_name = rows['pathway_name'].iloc[0]
        # the CURIE prefix names the pathway database, e.g. 'KEGG:hsa00010'
        curie_parts = pathway_curie.split(':')
        id_field = curie_parts[0].lower() + '_pathway_id'
        gene_symbols = rows['inference_gene_symbol'].unique().tolist()
        entry = {
            'source': 'CTD',
            'pathway_name': pathway_name,
            id_field: curie_parts[1],
            # a lone symbol is stored bare rather than as a one-element list
            'inference_gene_symbol': gene_symbols[0] if len(gene_symbols) == 1 else gene_symbols,
        }
        by_disease.setdefault(disease_id, []).append(entry)
        ## note: <20 diseases have >1000 unique pathways linked to them
    return by_disease

In [11]:
pathwaydf = process_pathway(CTD_path2)

In [12]:
pathwaydf['MESH:D002294']

[{'source': 'CTD',
  'pathway_name': 'Glycolysis / Gluconeogenesis',
  'kegg_pathway_id': 'hsa00010',
  'inference_gene_symbol': ['ENO1', 'GAPDH', 'PGAM1', 'TPI1']},
 {'source': 'CTD',
  'pathway_name': 'Fructose and mannose metabolism',
  'kegg_pathway_id': 'hsa00051',
  'inference_gene_symbol': 'TPI1'},
 {'source': 'CTD',
  'pathway_name': 'Steroid hormone biosynthesis',
  'kegg_pathway_id': 'hsa00140',
  'inference_gene_symbol': ['CYP1A2', 'CYP1B1']},
 {'source': 'CTD',
  'pathway_name': 'Caffeine metabolism',
  'kegg_pathway_id': 'hsa00232',
  'inference_gene_symbol': 'CYP1A2'},
 {'source': 'CTD',
  'pathway_name': 'Pyrimidine metabolism',
  'kegg_pathway_id': 'hsa00240',
  'inference_gene_symbol': ['TYMS', 'UMPS']},
 {'source': 'CTD',
  'pathway_name': 'Glycine, serine and threonine metabolism',
  'kegg_pathway_id': 'hsa00260',
  'inference_gene_symbol': 'PGAM1'},
 {'source': 'CTD',
  'pathway_name': 'Tryptophan metabolism',
  'kegg_pathway_id': 'hsa00380',
  'inference_gene_symbo