# HPOA stuff

## Setup

In [1]:
## CX: allows multiple lines of code to print from one code block
## (with "all", every top-level expression in a cell is displayed, not only the last one;
##  the cells below rely on this to show several results at once)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## to get around bugs
## NOTE(review): presumably needed so an asyncio event loop can be re-entered inside Jupyter;
## nothing in the visible cells uses asyncio -- confirm before removing
import nest_asyncio
nest_asyncio.apply()

import pathlib
import pandas as pd
import re

In [44]:
## DisGeNET variant-disease-PMID associations (tab-separated file under the home directory)
folder = pathlib.Path.home() / 'Desktop' / 'ScrippsJob' / 'DisGeNET'
allgene_pmid_path = folder / 'all_variant_disease_pmid_associations.tsv'
allgene_pmid = pd.read_table(allgene_pmid_path)

In [45]:
## quick look at the table: (rows, columns), then the column names
allgene_pmid.shape
allgene_pmid.columns

(739842, 16)

Index(['snpId', 'chromosome', 'position', 'DSI', 'DPI', 'diseaseId',
       'diseaseName', 'diseaseType', 'diseaseClass', 'diseaseSemanticType',
       'score', 'EI', 'YearInitial', 'YearFinal', 'pmid', 'source'],
      dtype='object')

In [54]:
## every row for one variant-disease pair: each supporting PMID gets its own row
is_disease = allgene_pmid['diseaseId'] == 'C0000744'
is_variant = allgene_pmid['snpId'] == 'rs1367079155'
allgene_pmid[is_disease & is_variant]

Unnamed: 0,snpId,chromosome,position,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,pmid,source
185780,rs1367079155,4,99594764,1.0,0.08,C0000744,Abetalipoproteinemia,disease,C16;C18,Disease or Syndrome,0.7,1.0,1996.0,2015.0,25108285.0,UNIPROT
185781,rs1367079155,4,99594764,1.0,0.08,C0000744,Abetalipoproteinemia,disease,C16;C18,Disease or Syndrome,0.7,1.0,1996.0,2015.0,22236406.0,UNIPROT
185782,rs1367079155,4,99594764,1.0,0.08,C0000744,Abetalipoproteinemia,disease,C16;C18,Disease or Syndrome,0.7,1.0,1996.0,2015.0,10946006.0,UNIPROT
185783,rs1367079155,4,99594764,1.0,0.08,C0000744,Abetalipoproteinemia,disease,C16;C18,Disease or Syndrome,0.7,1.0,1996.0,2015.0,23475612.0,UNIPROT
185784,rs1367079155,4,99594764,1.0,0.08,C0000744,Abetalipoproteinemia,disease,C16;C18,Disease or Syndrome,0.7,1.0,1996.0,2015.0,26224785.0,UNIPROT
185785,rs1367079155,4,99594764,1.0,0.08,C0000744,Abetalipoproteinemia,disease,C16;C18,Disease or Syndrome,0.7,1.0,1996.0,2015.0,8939939.0,UNIPROT
185786,rs1367079155,4,99594764,1.0,0.08,C0000744,Abetalipoproteinemia,disease,C16;C18,Disease or Syndrome,0.7,1.0,1996.0,2015.0,10679949.0,UNIPROT


In [3]:
## phenotype.hpoa is tab-separated; the first 4 lines are skipped so that the
## "#DatabaseID ..." line is used as the header. Everything is read as text (dtype=str).
HPO_path = pathlib.Path.home() / 'Desktop' / 'ScrippsJob' / 'phenotype.hpoa'
hpoa = pd.read_csv(HPO_path, sep="\t", skiprows=4, dtype=str)
hpoa.columns
hpoa.shape

Index(['#DatabaseID', 'DiseaseName', 'Qualifier', 'HPO_ID', 'Reference',
       'Evidence', 'Onset', 'Frequency', 'Sex', 'Modifier', 'Aspect',
       'Biocuration'],
      dtype='object')

(214124, 12)

In [4]:
## rename: drop the leading "#" that the file's header line puts on the first column.
## Renaming by name (rather than overwriting all 12 labels positionally) cannot mislabel
## columns if the file's layout ever changes, and is a harmless no-op when the cell is re-run.
hpoa = hpoa.rename(columns={'#DatabaseID': 'DatabaseID'})

## Look at DatabaseID, HPO_ID

- DatabaseID = disease's ID, HPO_ID = phenotype-related ID (HPO terms)
- Almost all of the database uses OMIM or ORPHA disease IDs, a little bit of the database has DECIPHER IDs.  
- Only 1 DatabaseID and 1 HPO_ID per row   
- There are 12003 unique disease IDs, 9566 unique phenotype-related IDs (HPO terms)



In [None]:
## does any row hold more than one disease ID?
hpoa.DatabaseID.str.contains(';').value_counts()
## rows per ID namespace (the text before the ":")
hpoa.DatabaseID.str.extract('(.*):').value_counts()
## sanity check computed from the data (not from hard-coded counts, which go stale and
## always compare True): every row has a namespace prefix
hpoa.DatabaseID.str.extract('(.*):').value_counts().sum() == len(hpoa)
hpoa.DatabaseID.nunique()

In [None]:
## HPO_ID: look for multi-valued cells, check the "HP:" prefix, count distinct terms
hpo_ids = hpoa['HPO_ID']
hpo_ids.str.contains(';').value_counts()
hpo_ids.str.contains('HP:').value_counts()
hpo_ids.nunique()

In [None]:
## number of distinct DECIPHER disease IDs
hpoa.loc[hpoa['DatabaseID'].str.contains('DECIPHER:'), 'DatabaseID'].nunique()

## won't use: Look at DiseaseName

Only OMIM DiseaseNames are hard to parse: they are ";"-delimited (meaning multiple names), and they also contain a weird ";;". I'm thinking of fields like this as "lists in string form" or "free text" (not simple labels or categorical variables). 

It looks like having " or " or ", " in the DiseaseName does NOT mean multiple names

In [None]:
## per ID namespace: how many DiseaseName values contain ";"
name_has_semicolon = hpoa['DiseaseName'].str.contains(';')
name_has_semicolon[hpoa['DatabaseID'].str.contains('OMIM')].value_counts()
name_has_semicolon[hpoa['DatabaseID'].str.contains('ORPHA')].value_counts()
name_has_semicolon[hpoa['DatabaseID'].str.contains('DECIPHER')].value_counts()

In [None]:
## huh so Orphanet only gives P and C annotations
hpoa.loc[hpoa['DatabaseID'].str.contains('ORPHA'), 'Aspect'].value_counts()

## NEED DECISION: Qualifier

There are 1464 rows with annotations saying this disease DOES NOT have this phenotypic abnormality (aspect = "P"). Downstream services need to be able to deal with this, or we have to take these annotations out. 

In [28]:
## Qualifier values, then the Aspect of the rows qualified with "NOT"
qualifier = hpoa['Qualifier']
qualifier.value_counts()
hpoa.loc[qualifier == "NOT", 'Aspect'].value_counts()

NOT    1464
Name: Qualifier, dtype: int64

P    1464
Name: Aspect, dtype: int64

In [29]:
## the negated ("NOT") annotations themselves
hpoa.loc[hpoa['Qualifier'] == "NOT"]

Unnamed: 0,DatabaseID,DiseaseName,Qualifier,HPO_ID,Reference,Evidence,Onset,Frequency,Sex,Modifier,Aspect,Biocuration
158,OMIM:605822,#605822 SPONDYLOOCULAR SYNDROME; SOS,NOT,HP:0000164,OMIM:605822,TAS,,,,,P,HPO:skoehler[2015-08-16]
162,OMIM:605822,#605822 SPONDYLOOCULAR SYNDROME; SOS,NOT,HP:0000591,OMIM:605822,TAS,,,,,P,HPO:skoehler[2015-08-16]
187,OMIM:127350,127350 DYSCHONDROSTEOSIS AND NEPHRITIS,NOT,HP:0001249,OMIM:127350,TAS,,,,,P,HPO:skoehler[2014-11-26]
322,OMIM:278750,"XERODERMA PIGMENTOSUM, VARIANT TYPE",NOT,HP:0000252,OMIM:278750,TAS,,,,,P,HPO:probinson[2012-08-01]
334,OMIM:278750,"XERODERMA PIGMENTOSUM, VARIANT TYPE",NOT,HP:0001510,OMIM:278750,TAS,,,,,P,HPO:probinson[2012-08-01]
...,...,...,...,...,...,...,...,...,...,...,...,...
213455,ORPHA:79284,Methylmalonic acidemia with homocystinuria typ...,NOT,HP:0003658,ORPHA:79284,TAS,,,,,P,ORPHA:orphadata[2020-12-09]
213471,ORPHA:556030,Early-onset familial hypoaldosteronism,NOT,HP:0000811,ORPHA:556030,TAS,,,,,P,ORPHA:orphadata[2020-12-09]
213933,ORPHA:79273,Hereditary coproporphyria,NOT,HP:0001903,ORPHA:79273,TAS,,,,,P,ORPHA:orphadata[2020-12-09]
213934,ORPHA:79273,Hereditary coproporphyria,NOT,HP:0001945,ORPHA:79273,TAS,,,,,P,ORPHA:orphadata[2020-12-09]


## NEED DECISION: Aspect

So...'P' are the Disease - PhenotypicFeature annotations

I want to edit parsing to treat 'I' (disease inheritance) annots like the other two 'C' (disease clinical course) and 'M' (disease modifier). What to do with this (annotate disease nodes?).  

But the extra trickiness is that these annots come with baggage...Reference, Frequency, Sex, Modifier info. It's hard to know how to structure this. 

In [None]:
## rows per Aspect code
hpoa['Aspect'].value_counts()

In [None]:
## non-missing values per column among the rows whose Aspect is not 'P'
hpoa.loc[hpoa['Aspect'] != 'P'].count()

## NEED DECISION: Reference

### Intro

This column (every row has a value) is tricky because...  
- it is ";" delimited: ~ 900 rows have multiple values. Max number of values is 7 o_0. 
- there are multiple kinds of IDs: DECIPHER, ISBN, ISBN-10, ISBN-13, OMIM, ORPHA, PMID and websites (http and https)

So....with the current metadata, publications would be ISBN, ISBN-10, ISBN-13, and PMID (separate by ID namespace).     
Websites would be http, https, OMIM, ORPHA, DECIPHER (construct URLs, then keep all together). 

In [None]:
## one list of reference strings per row (the Reference column is ";"-delimited)
references = hpoa['Reference'].str.split(';')

In [None]:
## see how many rows have multiple references
count = [len(refs) for refs in references]
pd.Series(count).value_counts()

In [None]:
## what prefixes are there?
## The prefix is the text before the FIRST ":" of a reference. The pattern is anchored and
## excludes ":" from the capture, so a reference with several colons (e.g. a URL with a
## port, "http://host:8080/...") still reports "http" rather than everything up to the last colon.
set_prefixes = set()
p = re.compile("^([^:]*):")
for i in references:
    for n in i:
        ## findall returns [] when the reference has no ":" at all, so nothing is added
        tempPrefix = p.findall(n)
        set_prefixes.update(tempPrefix)

In [None]:
## display the distinct reference prefixes collected in the previous cell
set_prefixes

### Some EDA

So...223 rows have a single DECIPHER ID as their "reference".

In [None]:
## rows referencing DECIPHER; whether any of them also carry a second reference; a sample
refs_decipher = hpoa['Reference'].str.contains('DECIPHER')
refs_multiple = hpoa['Reference'].str.contains(';')
refs_decipher.value_counts()
hpoa[refs_decipher & refs_multiple]

hpoa[refs_decipher].head()

So...
- 1 row has a single ISBN ID 
- 8 rows have a single ISBN-10 ID (1 unique value)
- 431 rows have ISBN-13 ID as reference
  - 45 of those rows have more than one reference; you can see mixes with OMIM and http IDs...

In [5]:
## ISBN-style references: the ISBN and ISBN-10 rows, how common ISBN-13 is,
## and how ISBN-13 mixes with other references in multi-reference rows
reference = hpoa['Reference']
hpoa.loc[reference.str.contains('ISBN:'), 'Reference']
hpoa.loc[reference.str.contains('ISBN-10:'), 'Reference']
has_isbn13 = reference.str.contains('ISBN-13:')
has_isbn13.value_counts()
isbn13_with_others = has_isbn13 & reference.str.contains(';')
hpoa[isbn13_with_others].shape
hpoa[isbn13_with_others].head().Reference.to_list()

21064    ISBN:3642035590
Name: Reference, dtype: object

84480    ISBN-10:0-19-262896-8
84482    ISBN-10:0-19-262896-8
84483    ISBN-10:0-19-262896-8
84484    ISBN-10:0-19-262896-8
84488    ISBN-10:0-19-262896-8
84489    ISBN-10:0-19-262896-8
84492    ISBN-10:0-19-262896-8
84494    ISBN-10:0-19-262896-8
Name: Reference, dtype: object

False    213693
True        431
Name: Reference, dtype: int64

(45, 12)

['http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ofd1;ISBN-13:978-0721606156',
 'http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ofd1;ISBN-13:978-0721606156',
 'ISBN-13:978-0721606156;OMIM:252100',
 'http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ofd1;ISBN-13:978-0721606156',
 'http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ofd1;ISBN-13:978-0721606156']

In [9]:
## full rows that cite an ISBN-10 reference
hpoa.loc[hpoa['Reference'].str.contains('ISBN-10')]


Unnamed: 0,DatabaseID,DiseaseName,Qualifier,HPO_ID,Reference,Evidence,Onset,Frequency,Sex,Modifier,Aspect,Biocuration
84480,OMIM:174900,JUVENILE POLYPOSIS SYNDROME JUVENILE INTESTINA...,,HP:0002014,ISBN-10:0-19-262896-8,PCS,,HP:0040282,,,P,HPO:iea[2009-02-17]
84482,OMIM:174900,JUVENILE POLYPOSIS SYNDROME JUVENILE INTESTINA...,,HP:0002573,ISBN-10:0-19-262896-8,PCS,,HP:0040283,,,P,HPO:iea[2009-02-17]
84483,OMIM:174900,JUVENILE POLYPOSIS SYNDROME JUVENILE INTESTINA...,,HP:0002035,ISBN-10:0-19-262896-8,PCS,,HP:0040283,,,P,HPO:iea[2009-02-17]
84484,OMIM:174900,JUVENILE POLYPOSIS SYNDROME JUVENILE INTESTINA...,,HP:0003003,ISBN-10:0-19-262896-8,PCS,,20%,,,P,HPO:iea[2009-02-17]
84488,OMIM:174900,JUVENILE POLYPOSIS SYNDROME JUVENILE INTESTINA...,,HP:0001903,ISBN-10:0-19-262896-8,PCS,,HP:0040282,,,P,HPO:iea[2009-02-17]
84489,OMIM:174900,JUVENILE POLYPOSIS SYNDROME JUVENILE INTESTINA...,,HP:0006771,ISBN-10:0-19-262896-8,PCS,,HP:0040283,,,P,HPO:iea[2010-06-08]
84492,OMIM:174900,JUVENILE POLYPOSIS SYNDROME JUVENILE INTESTINA...,,HP:0006753,ISBN-10:0-19-262896-8,PCS,,HP:0040283,,,P,HPO:iea[2010-06-08]
84494,OMIM:174900,JUVENILE POLYPOSIS SYNDROME JUVENILE INTESTINA...,,HP:0002027,ISBN-10:0-19-262896-8,PCS,,HP:0040282,,,P,HPO:iea[2009-02-17]


### Parsing to separate

Creating new columns for reference information:
- ISBN, ISBN-10, ISBN-13 -> put down as isbn
- PMID 
- one column of websites, including....
  - http, https (keep URLs)
  - OMIM, DECIPHER (construct URLs, then keep all together)
  - ORPHA (just give URL to search for disease, can't direct link to disease...)

In [None]:
## Sort every reference of every row into one of five buckets:
##   isbn / isbn10 / isbn13 / pmid -> the bare identifier (namespace prefix removed)
##   websites                      -> http(s) URLs as-is, plus URLs built for DECIPHER / OMIM / ORPHA
## Each list* below receives exactly one element per row of `references`:
## a de-duplicated list of values, or None when the row has nothing for that bucket.

def _unique_or_none(values):
    """Return `values` de-duplicated in first-seen order, or None if `values` is empty.

    Order is preserved (unlike list(set(...))) so the result is identical on every run.
    """
    return list(dict.fromkeys(values)) if values else None


## namespaces whose identifier is kept as-is once the prefix is removed
ID_PREFIXES = ('ISBN:', 'ISBN-10:', 'ISBN-13:', 'PMID:')

listISBN = []
listISBN10 = []
listISBN13 = []
listPMID = []
listWebsites = []

for entry in references:
    tempIDs = {prefix: [] for prefix in ID_PREFIXES}
    tempWebsites = []
    ## a missing Reference (NaN) does not split into a list; treat it as "no references"
    if not isinstance(entry, list):
        entry = []
    for i in entry:
        ## tolerate stray whitespace around ";" and skip empty pieces (e.g. from a trailing ";")
        i = i.strip()
        if not i:
            continue
        ## classify by how the reference STARTS: a URL that merely contains "PMID:" or
        ## "OMIM:" somewhere inside it must stay a URL
        idPrefix = next((prefix for prefix in ID_PREFIXES if i.startswith(prefix)), None)
        if idPrefix is not None:
            ## just substring for the rest
            tempIDs[idPrefix].append(i[len(idPrefix):])
        elif i.startswith('http'):
            ## add http or https stuff straight to websites
            tempWebsites.append(i)
        elif i.startswith('DECIPHER:'):
            ## create website url
            tempID = i[len('DECIPHER:'):]
            tempWebsites.append('https://decipher.sanger.ac.uk/syndrome/{0}/overview'.format(tempID))
        elif i.startswith('OMIM:'):
            ## create website url
            tempID = i[len('OMIM:'):]
            tempWebsites.append('https://www.omim.org/entry/{0}'.format(tempID))
        elif i.startswith('ORPHA:'):
            ## it doesn't look like I can directly link this website >.<
            tempWebsites.append('https://www.orpha.net/consor/cgi-bin/Disease.php?lng=EN')
        else:
            print("encountered unexpected reference, please check parsing:\n{0}".format(i))

    ## always append once per row so the five lists stay aligned with hpoa's rows
    listISBN.append(_unique_or_none(tempIDs['ISBN:']))
    listISBN10.append(_unique_or_none(tempIDs['ISBN-10:']))
    listISBN13.append(_unique_or_none(tempIDs['ISBN-13:']))
    listPMID.append(_unique_or_none(tempIDs['PMID:']))
    listWebsites.append(_unique_or_none(tempWebsites))

In [None]:
## create new columns (one per reference bucket built by the parsing cell)
reference_columns = {
    'isbn': listISBN,
    'isbn10': listISBN10,
    'isbn13': listISBN13,
    'pmid': listPMID,
    'websites': listWebsites,
}
hpoa = hpoa.assign(**reference_columns)

## won't use: Evidence

Every row has one value, either IEA, PCS, or TAS.  
I don't think I can use this for provenance easily though, since...
* sometimes Evidence says IEA but Biocuration has a person's ID
* sometimes Evidence isn't IEA but Biocuration says iea 

In [None]:
## rows per evidence code, and how many rows have one at all
evidence = hpoa['Evidence']
evidence.value_counts()
evidence.count()

In [None]:
## showing that evidence is IEA but Biocuration isn't: happens for ~2/5 of the IEA entries
iea_evidence_only = (hpoa['Evidence'] == 'IEA') & ~hpoa['Biocuration'].str.contains('iea')
hpoa[iea_evidence_only].shape
hpoa[iea_evidence_only].head()

In [None]:
## showing that evidence isn't but Biocuration is (a small percentage of the rows)
not_iea_evidence = hpoa['Evidence'] != 'IEA'
iea_biocuration_only = not_iea_evidence & hpoa['Biocuration'].str.contains('iea')
## share of the non-IEA rows whose Biocuration nonetheless says iea. Computed from the data
## so it cannot go stale; it replaces the hard-coded 4383/(142433+19903), which presumably
## was this count over the non-IEA row total -- confirm
iea_biocuration_only.sum() / not_iea_evidence.sum()
hpoa[iea_biocuration_only].shape
hpoa[iea_biocuration_only].head()

## decision: Onset

* very few rows (<1%) have this information
* currently each row has only one value (no ";" delimiter) but I dunno if this will change
* All rows with this info are "disease has phenotypic abnormality" rows (Aspect = P)
* I think this can be a categorical variable on the Disease - PhenotypicFeature edge. 
* There may be a rough chronological order to the terms (for listing possible values)
* likely use comments to make these terms actually understandable

In [None]:
## Aspect of the rows that have an Onset
hpoa.dropna(subset = ['Onset']).Aspect.value_counts()
hpoa.Onset.count()
## share of rows with an Onset, computed from the data instead of hard-coded counts
hpoa.Onset.count() / len(hpoa)
hpoa.Onset.nunique()
hpoa.Onset.value_counts()

In [None]:
## this is a weird annotation since this term just means "Onset"
## https://hpo.jax.org/app/browse/term/HP:0003674
## can it be removed or...? 
hpoa.loc[hpoa['Onset'] == "HP:0003674", 'Reference'].to_dict()

## NEED DECISION: Frequency

### Intro

This column is tricky because there are multiple kinds of values. Currently there is only one value per row (no ";" delimiter).
- some are fractions: could convert to decimal, but keeping numerator and denominator could give prevalence info (although the denominator is tricky to describe since it's people with the disease only; the other two are prevalence/specificity of phenotype for the disease). 
- one of the fractions is '0/0'. Please be careful with this
- some are percentages: these could convert to decimal easily
- some are HP terms: these are a categorical variable...

Extra trickiness: apparently other kinds of disease annotations (course, inheritance, modifier) also have frequency...

In [None]:
frequency = hpoa['Frequency']
## Aspect of the rows that have a Frequency
hpoa[~ frequency.isna()].Aspect.value_counts()
frequency.count()
## share of rows with a Frequency, computed from the data instead of hard-coded counts
frequency.count() / len(hpoa)
frequency.str.contains(";").value_counts()
frequency.str.contains("/").value_counts()
frequency.str.contains("%").value_counts()
frequency.str.contains("HP:").value_counts()
## sanity check computed from the data: fraction + percentage + HP-term rows account for
## every non-missing Frequency (na=False so missing cells count as "no match")
(frequency.str.contains("/", na=False).sum()
 + frequency.str.contains("%", na=False).sum()
 + frequency.str.contains("HP:", na=False).sum()) == frequency.count()

### Some EDA

note that some frequency terms are 1 since the fraction is 1/1

In [None]:
## eyeball the raw Frequency values
hpoa.Frequency

### Parsing to separate

Code below makes 4 columns: categorical frequency (for HP terms), numeric frequency (for percentages and fractions), numerator and denominator (for fractions). 

categorical frequency ends up with 5 diff HP terms. 

In [None]:
def _parse_frequency(value):
    """Split one Frequency cell into (category, numeric, numerator, denominator).

    - missing value      -> (None, None, None, None)
    - HP term            -> (term, None, None, None)
    - percentage "20%"   -> (None, 0.2, None, None)
    - fraction "3/10"    -> (None, 0.3, 3, 10)
    - fraction with a 0 numerator or denominator, or numerator > denominator
                         -> all None (current decision is to leave it blank)
    - anything else      -> a warning is printed and all None is returned

    A 4-tuple is returned in every case so the caller's lists always stay aligned.
    """
    blank = (None, None, None, None)
    if pd.isna(value):
        return blank
    text = str(value).strip()
    if 'HP:' in text:
        return (text, None, None, None)
    try:
        if '%' in text:
            return (None, float(text.strip('%')) / 100.0, None, None)
        if '/' in text:
            ## unpacking also rejects "a/b/c" (raises ValueError, handled below)
            top, bottom = (int(part) for part in text.split("/"))
            ## if numerator or denominator is 0, since that happened
            ## if numerator > denominator, since that happened too
            if (bottom == 0) or (top == 0) or (top > bottom):
                return blank
            ## plain division instead of eval() on text read from a file
            return (None, top / bottom, top, bottom)
    except ValueError:
        ## malformed number: fall through and report it instead of aborting the whole loop
        pass
    print("encountered unexpected format, please check parsing:\n{0}".format(value))
    return blank


categoryFreq = []
numericFreq = []
numerator = []
denominator = []

## since only one value in each row
for i in hpoa.Frequency:
    tempCategory, tempNumeric, tempNumerator, tempDenominator = _parse_frequency(i)
    categoryFreq.append(tempCategory)
    numericFreq.append(tempNumeric)
    numerator.append(tempNumerator)
    denominator.append(tempDenominator)

In [None]:
## create new columns (one per list built by the frequency-parsing cell)
frequency_columns = {
    'FreqCategories': categoryFreq,
    'NumericFreq': numericFreq,
    'FreqNumerator': numerator,
    'FreqDenominator': denominator,
}
hpoa = hpoa.assign(**frequency_columns)

In [None]:
## summarise the four parsed frequency columns
hpoa['FreqCategories'].value_counts()
hpoa['NumericFreq'].describe()
hpoa['FreqNumerator'].describe()
hpoa['FreqDenominator'].describe()

In [None]:
## wow some big numbers
hpoa.loc[hpoa['FreqNumerator'] > 300]

## NEED CHECK: Sex

so it's super rare to have this field. Extra trickiness: apparently other kinds of disease annotations (course, inheritance, modifier) also have sex...    
Also...it has inconsistent capitalization. I think the [current parsing code deals with this](https://github.com/biothings/mydisease.info/blob/master/src/plugins/hpo/parser.py#L52) but check

In [None]:
## how often Sex is filled in, its values, and the Aspect of the rows that have it
sex = hpoa['Sex']
sex.count()
sex.value_counts()
hpoa.loc[~sex.isna(), 'Aspect'].value_counts()

## NEED DECISION: Modifier

It's rare to have this field   
This column is tricky because...
- there can be multiple values. I currently don't support multiple values in my categorical variables...
- there's a lot of different values. Hard to list them all out

Extra trickiness: apparently other kinds of disease annotations (course, inheritance) also have modifier...  

In [None]:
## how often Modifier is filled in, the Aspect of the rows that have it, and its raw values
modifier = hpoa['Modifier']
modifier.count()
hpoa.loc[~modifier.isna(), 'Aspect'].value_counts()
modifier.value_counts()

In [None]:
## split ";"-delimited Modifier strings into lists.
## Only str cells are split: re-running this cell leaves already-split lists (and missing
## values) untouched, whereas .str.split on list cells would silently turn them into NaN.
hpoa['Modifier'] = hpoa['Modifier'].map(
    lambda cell: cell.split(';') if isinstance(cell, str) else cell)

In [None]:
## count individual modifier values: explode() gives each element of a list-valued cell
## its own row first, because value_counts() cannot hash list cells (TypeError)
hpoa['Modifier'].explode().value_counts()

## won't use: Biocuration

This column is tricky because...
- there can be multiple values.
- each value has two parts: a person/entity ID and a date

In [None]:
## how many Biocuration cells hold several ";"-separated values, plus the first five values
biocuration = hpoa['Biocuration']
biocuration.str.contains(";").value_counts()
biocuration.to_list()[:5]

# exploring CTD data

## disease -> chemical

In [2]:
## CTD chemical-disease associations: comma-separated, "#" starts a comment,
## and the column names are supplied here instead of being read from the file
CTD_COLUMNS = ['chemical_name', 'mesh_chemical_id', 'cas_registry_number', 'DiseaseName',
               'DiseaseID', 'direct_evidence', 'inference_gene_symbol', 'inference_score',
               'omim_id', 'pubmed']
CTD_path = pathlib.Path.home() / 'Downloads' / 'CTD_chemicals_diseases.csv'
CTD = pd.read_csv(CTD_path, sep=",", comment='#', names=CTD_COLUMNS)
CTD.columns
CTD.shape

Index(['chemical_name', 'mesh_chemical_id', 'cas_registry_number',
       'DiseaseName', 'DiseaseID', 'direct_evidence', 'inference_gene_symbol',
       'inference_score', 'omim_id', 'pubmed'],
      dtype='object')

(7119363, 10)

In [15]:
## all chemical associations for one example disease
CTD.loc[CTD['DiseaseID'] == 'MESH:D012128']

Unnamed: 0,chemical_name,mesh_chemical_id,cas_registry_number,DiseaseName,DiseaseID,direct_evidence,inference_gene_symbol,inference_score,omim_id,pubmed
1093,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,C476756,,"Respiratory Distress Syndrome, Adult",MESH:D012128,,SOD2,3.58,,25070658
1511,10-(fluoroethoxyphosphinyl)-N-(biotinamidopent...,C403065,,"Respiratory Distress Syndrome, Adult",MESH:D012128,,ALB,3.92,,12394941
4688,"1,10-phenanthroline",C025205,66-71-7,"Respiratory Distress Syndrome, Adult",MESH:D012128,,ACE,8.33,,12204859|22009550
4689,"1,10-phenanthroline",C025205,66-71-7,"Respiratory Distress Syndrome, Adult",MESH:D012128,,ALB,8.33,,12394941
4690,"1,10-phenanthroline",C025205,66-71-7,"Respiratory Distress Syndrome, Adult",MESH:D012128,,SOD2,8.33,,25070658
...,...,...,...,...,...,...,...,...,...,...
7115048,zoxamide,C451427,,"Respiratory Distress Syndrome, Adult",MESH:D012128,,CCL2,4.16,,25070658
7118966,Zymosan,D015054,9010-72-4,"Respiratory Distress Syndrome, Adult",MESH:D012128,,CCL2,8.56,,25070658
7118967,Zymosan,D015054,9010-72-4,"Respiratory Distress Syndrome, Adult",MESH:D012128,,PLA2G4A,8.56,,10881173
7118968,Zymosan,D015054,9010-72-4,"Respiratory Distress Syndrome, Adult",MESH:D012128,,THBD,8.56,,12707536


In [19]:
## number of distinct chemicals linked to the example disease
CTD.loc[CTD['DiseaseID'] == 'MESH:D012128', 'chemical_name'].nunique()

2795

In [18]:
## example disease restricted to chemicals whose name starts with "Zinc":
## the same chemical-disease pair repeats once per inference gene
is_example_disease = CTD['DiseaseID'] == 'MESH:D012128'
is_zinc = CTD['chemical_name'].str.startswith('Zinc')
CTD[is_example_disease & is_zinc]

Unnamed: 0,chemical_name,mesh_chemical_id,cas_registry_number,DiseaseName,DiseaseID,direct_evidence,inference_gene_symbol,inference_score,omim_id,pubmed
7076704,Zinc,D015032,7440-66-6,"Respiratory Distress Syndrome, Adult",MESH:D012128,,ACAA2,27.23,,25070658
7076705,Zinc,D015032,7440-66-6,"Respiratory Distress Syndrome, Adult",MESH:D012128,,ACE,27.23,,12204859|22009550
7076706,Zinc,D015032,7440-66-6,"Respiratory Distress Syndrome, Adult",MESH:D012128,,ACO2,27.23,,25070658
7076707,Zinc,D015032,7440-66-6,"Respiratory Distress Syndrome, Adult",MESH:D012128,,ALAD,27.23,,25070658
7076708,Zinc,D015032,7440-66-6,"Respiratory Distress Syndrome, Adult",MESH:D012128,,ALB,27.23,,12394941
7076709,Zinc,D015032,7440-66-6,"Respiratory Distress Syndrome, Adult",MESH:D012128,,APC,27.23,,25070658
7076710,Zinc,D015032,7440-66-6,"Respiratory Distress Syndrome, Adult",MESH:D012128,,CCL2,27.23,,25070658
7076711,Zinc,D015032,7440-66-6,"Respiratory Distress Syndrome, Adult",MESH:D012128,,CYB5A,27.23,,25070658
7076712,Zinc,D015032,7440-66-6,"Respiratory Distress Syndrome, Adult",MESH:D012128,,EDN1,27.23,,16625121
7076713,Zinc,D015032,7440-66-6,"Respiratory Distress Syndrome, Adult",MESH:D012128,,EIF2AK1,27.23,,25070658


In [None]:
## rows per direct_evidence value
CTD['direct_evidence'].value_counts()

In [None]:
## how many rows carry an inference score, and its distribution
inference_score = CTD['inference_score']
inference_score.count()
inference_score.describe()

In [None]:
ctd_disease_ids = CTD['DiseaseID']
ctd_disease_ids.str.contains('OMIM:').value_counts()
ctd_disease_ids.str.contains('MESH:').value_counts()
## so all diseaseIDs are either MESH or OMIM

## disease -> pathway

In [20]:
## CTD disease-pathway associations: comma-separated, "#" starts a comment,
## and the column names are supplied here instead of being read from the file
CTD2_COLUMNS = ['DiseaseName', 'DiseaseID', 'PathwayName', 'PathwayID', 'InferenceGeneSymbol']
CTD2_path = pathlib.Path.home() / 'Downloads' / 'CTD_diseases_pathways.csv'
CTD2 = pd.read_csv(CTD2_path, sep=",", comment='#', names=CTD2_COLUMNS)
CTD2.columns
CTD2.shape

Index(['DiseaseName', 'DiseaseID', 'PathwayName', 'PathwayID',
       'InferenceGeneSymbol'],
      dtype='object')

(588819, 5)

In [21]:
## all pathway associations for the example disease
CTD2.loc[CTD2['DiseaseID'] == 'MESH:D012128']

Unnamed: 0,DiseaseName,DiseaseID,PathwayName,PathwayID,InferenceGeneSymbol
518361,"Respiratory Distress Syndrome, Adult",MESH:D012128,2-Oxocarboxylic acid metabolism,KEGG:hsa01210,ACO2
518362,"Respiratory Distress Syndrome, Adult",MESH:D012128,ABC-family proteins mediated transport,REACT:R-HSA-382556,PSMA4
518363,"Respiratory Distress Syndrome, Adult",MESH:D012128,ABC transporter disorders,REACT:R-HSA-5619084,PSMA4
518364,"Respiratory Distress Syndrome, Adult",MESH:D012128,Activation of APC/C and APC/C:Cdc20 mediated d...,REACT:R-HSA-176814,PSMA4
518365,"Respiratory Distress Syndrome, Adult",MESH:D012128,Activation of Matrix Metalloproteinases,REACT:R-HSA-1592389,TIMP1
...,...,...,...,...,...
519008,"Respiratory Distress Syndrome, Adult",MESH:D012128,Vesicle-mediated transport,REACT:R-HSA-5653656,PLA2G4A
519009,"Respiratory Distress Syndrome, Adult",MESH:D012128,Vif-mediated degradation of APOBEC3G,REACT:R-HSA-180585,PSMA4
519010,"Respiratory Distress Syndrome, Adult",MESH:D012128,Vitamin C (ascorbate) metabolism,REACT:R-HSA-196836,CYB5A
519011,"Respiratory Distress Syndrome, Adult",MESH:D012128,Vpu mediated degradation of CD4,REACT:R-HSA-180534,PSMA4


In [24]:
## number of distinct pathways linked to the example disease
CTD2.loc[CTD2['DiseaseID'] == 'MESH:D012128', 'PathwayID'].nunique()

428

In [None]:
ctd2_disease_ids = CTD2['DiseaseID']
ctd2_disease_ids.str.contains('OMIM:').value_counts()
ctd2_disease_ids.str.contains('MESH:').value_counts()
## so all diseaseIDs are either MESH or OMIM

In [None]:
## which namespaces the pathway IDs use
ctd2_pathway_ids = CTD2['PathwayID']
ctd2_pathway_ids.str.contains('REACT:').value_counts()
ctd2_pathway_ids.str.contains('KEGG:').value_counts()
In [None]:
CTD2.loc[CTD2['DiseaseID'].str.contains('MESH'), 'PathwayID'].str.contains('REACT:').value_counts()
## so disease MESH - REACT pathway and disease MESH - KEGG pathway both exist

In [None]:
CTD2.loc[CTD2['DiseaseID'].str.contains('OMIM'), 'PathwayID'].str.contains('REACT:').value_counts()
## so disease OMIM - REACT pathway and disease OMIM - KEGG pathway both exist