# HPOA stuff

## Setup

In [1]:
## CX: allows multiple lines of code to print from one code block
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## to get around bugs
import nest_asyncio
nest_asyncio.apply()

import pathlib
import pandas as pd
import re



In [2]:
def process_gene(file_path_gene_disease):
    df_gene_disease = pd.read_csv(
        file_path_gene_disease,
        encoding="ISO-8859-1",
        sep="\t",
        comment="#",
        compression="gzip",
    )
    rename_gene = {
        "diseaseId": "umls",
        "geneId": "gene_id",
        "geneSymbol": "gene_name",
        "diseaseName": "disease_name",
        "pmid": "pubmed",
    }
    df_gene_disease = df_gene_disease.where((pd.notnull(df_gene_disease)), None)
    # source field could be multiple data sources concatenated by ";", break them into a list
    # df_gene_disease.source = to_list(df_gene_disease.source)
    # df_gene_disease.diseaseType = to_list(df_gene_disease.diseaseType)
    # df_gene_disease.diseaseSemanticType = to_list(df_gene_disease.diseaseSemanticType)
    d = defaultdict(list)
    # rename pandas columns
    df_gene_disease = df_gene_disease.rename(columns=rename_gene)
    # for each gene, group the results based on source, and merge all pubmed IDs together
    for grp, subdf in df_gene_disease.groupby(["umls", "source", "gene_id"]):
        records = subdf.to_dict(orient="records")
        doc = {"source": grp[1], "gene_id": int(grp[2]), "pubmed": set()}
        for record in records:
            for k, v in record.items():
                if isinstance(v, np.int64):
                    record[k] = int(v)
                if k in ["gene_name", "DSI", "DPI", "score", "EI"]:
                    doc[k] = v
                elif k in ["YearInitial", "YearFinal"]:
                    doc[k] = int(v) if v else v
                elif k == "pubmed" and v:
                    doc[k].add(int(v))
        doc["pubmed"] = list(doc["pubmed"])
        d[grp[0].replace("umls", "umls_cui")].append(doc)
    return d

In [5]:
folder = pathlib.Path.home().joinpath('Desktop', 'ScrippsJob', 'DisGeNET')
allgene_pmid_path = folder.joinpath('all_gene_disease_pmid_associations.tsv.gz')
allgene_pmid = process_gene(allgene_pmid_path)

NameError: name 'defaultdict' is not defined

In [2]:
HPO_path = pathlib.Path.home().joinpath('Desktop', 'ScrippsJob', 'phenotype.hpoa')
hpoa = pd.read_table(HPO_path, sep="\t", skiprows=4, 
                    dtype=str)
hpoa.columns
hpoa.shape

Index(['#DatabaseID', 'DiseaseName', 'Qualifier', 'HPO_ID', 'Reference',
       'Evidence', 'Onset', 'Frequency', 'Sex', 'Modifier', 'Aspect',
       'Biocuration'],
      dtype='object')

(216197, 12)

In [3]:
## rename
hpoa.columns = ['DatabaseID', 'DiseaseName', 'Qualifier', 'HPO_ID', 'Reference',
                'Evidence', 'Onset', 'Frequency', 'Sex', 'Modifier', 'Aspect',
                'Biocuration']

## Look at DatabaseID, HPO_ID

- DatabaseID = disease's ID, HPO_ID = phenotype-related ID (HPO terms)
- Almost all of the database uses OMIM or ORPHA disease IDs, a little bit of the database has DECIPHER IDs.  
- Only 1 DatabaseID and 1 HPO_ID per row   
- There are 12082 unique disease IDs, 9646 unique phenotype-related IDs (HPO terms)

In [4]:
hpoa.DatabaseID.str.contains(';').value_counts()
hpoa.DatabaseID.str.extract('(.*):').value_counts()
112500+103410+287 == 216197
hpoa.DatabaseID.nunique()

hpoa[hpoa.DatabaseID.str.contains('DECIPHER:')].DatabaseID.nunique()

False    216197
Name: DatabaseID, dtype: int64

OMIM        112500
ORPHA       103410
DECIPHER       287
dtype: int64

True

12082

47

In [5]:
hpoa[hpoa.DatabaseID.str.contains('DECIPHER:')].DatabaseID.unique()

array(['DECIPHER:72', 'DECIPHER:45', 'DECIPHER:85', 'DECIPHER:18',
       'DECIPHER:15', 'DECIPHER:37', 'DECIPHER:38', 'DECIPHER:21',
       'DECIPHER:43', 'DECIPHER:66', 'DECIPHER:42', 'DECIPHER:67',
       'DECIPHER:57', 'DECIPHER:46', 'DECIPHER:16', 'DECIPHER:19',
       'DECIPHER:58', 'DECIPHER:17', 'DECIPHER:70', 'DECIPHER:4',
       'DECIPHER:39', 'DECIPHER:8', 'DECIPHER:14', 'DECIPHER:2',
       'DECIPHER:81', 'DECIPHER:44', 'DECIPHER:51', 'DECIPHER:59',
       'DECIPHER:47', 'DECIPHER:53', 'DECIPHER:34', 'DECIPHER:92',
       'DECIPHER:31', 'DECIPHER:20', 'DECIPHER:35', 'DECIPHER:68',
       'DECIPHER:54', 'DECIPHER:52', 'DECIPHER:32', 'DECIPHER:1',
       'DECIPHER:76', 'DECIPHER:62', 'DECIPHER:29', 'DECIPHER:74',
       'DECIPHER:65', 'DECIPHER:3', 'DECIPHER:48'], dtype=object)

In [None]:
hpoa.HPO_ID.str.contains(';').value_counts()
hpoa.HPO_ID.str.contains('HP:').value_counts()
hpoa.HPO_ID.nunique()

## won't use: Look at DiseaseName

Only OMIM DiseaseNames are hard to parse (have ";" delimited, meaning multiple names). OMIM DiseaseNames also have weird ";;". Thinking of fields like this as "lists in string form" or "free text" (not simple labels or categorical variables). 

It looks like having " or " or ", " in the DiseaseName does NOT mean multiple names

In [None]:
hpoa[hpoa['DatabaseID'].str.contains('OMIM')].DiseaseName.str.contains(';').value_counts()
hpoa[hpoa['DatabaseID'].str.contains('ORPHA')].DiseaseName.str.contains(';').value_counts()
hpoa[hpoa['DatabaseID'].str.contains('DECIPHER')].DiseaseName.str.contains(';').value_counts()

In [None]:
## huh so Orphanet only gives P and C annotations
hpoa[hpoa['DatabaseID'].str.contains('ORPHA')].Aspect.value_counts()

In [None]:
hpoa[hpoa['DatabaseID'].str.contains('OMIM')].Aspect.value_counts()
hpoa[hpoa['DatabaseID'].str.contains('DECIPHER')].Aspect.value_counts()

## modified parser: Qualifier

There are 1464 rows with annotations saying this disease DOES NOT have this phenotypic abnormality (aspect = "P"). Downstream services need to be able to deal with this, or we have to take these annotations out. 

I checked https://hpo.jax.org/app/browse/disease/OMIM:614856 and I don't see the NOT annotations. Using Chunlei's words, this would mean NOT exposing the info. 

In [None]:
hpoa['Qualifier'].value_counts()
hpoa[hpoa['Qualifier'] == "NOT"].Aspect.value_counts()

In [None]:
hpoa[hpoa['Qualifier'] == "NOT"]

In [None]:
hpoa.where((pd.notnull(hpoa)), None)

## modified parser: Aspect

So...'P' are the Disease - PhenotypicFeature annotations

I want to edit parsing to treat 'I' (disease inheritance) annots like the other two 'C' (disease clinical course) and 'M' (disease modifier). What to do with this (annotate disease nodes?).  

But the extra trickiness is that these annots come with baggage...Reference, Frequency, Sex, Modifier info. It's hard to know how to structure this. 

In [None]:
hpoa.Aspect.value_counts()

In [None]:
hpoa[hpoa['Aspect'] != 'P'].count()

## modified parser: Reference

### Intro

This column (every row has a value) is tricky because...  
- it is ";" delimited: ~ 1000 rows have multiple values. Max number of values is 7 o_0. 
- there are multiple kinds of IDs: DECIPHER, ISBN (ISBN, ISBN-10, ISBN-13), OMIM, ORPHA, PMID and websites (http and https)

So....with the current metadata, publications would be ISBN and PMID (separate by ID namespace).     
Websites would be http, https, OMIM, ORPHA, DECIPHER (construct URLs, then keep all together). 

In [8]:
references = hpoa.Reference.str.split(';')

In [9]:
## see how many rows have multiple references
count = []
for i in references:
    count.append(len(i))
pd.Series(count).value_counts()

1    215223
2       876
3        82
4         8
5         4
6         3
7         1
dtype: int64

In [10]:
hpoa[pd.Series(count) == 3].Reference

2131              PMID:23692737;PMID:17206620;PMID:15728585
2136              PMID:23692737;PMID:17206620;PMID:15728585
3036      http://www.ncbi.nlm.nih.gov/books/NBK1526/;PMI...
4976              PMID:22512499;PMID:15814878;PMID:17392301
9320                PMID:11050632;PMID:7987310;PMID:8588586
                                ...                        
106140             PMID:11180245;PMID:8213919;PMID:12210303
106147             PMID:3377005;PMID:14684697;PMID:12210303
106165             PMID:8213919;PMID:14684697;PMID:12618959
106169              PMID:3377005;PMID:8213919;PMID:12210303
106178              PMID:3377005;PMID:8213919;PMID:12210303
Name: Reference, Length: 82, dtype: object

In [None]:
## what prefixes are there?
set_prefixes = set()
p = re.compile("(.*):")
for i in references:
    for n in i:
        tempPrefix = p.findall(n)
        set_prefixes.update(tempPrefix)

In [None]:
set_prefixes

### Some EDA

So...214 rows have a single DECIPHER IDs as their "reference".

In [None]:
hpoa[(hpoa.Reference.str.split(";"))]

In [None]:
hpoa.Reference.str.contains('DECIPHER').value_counts()
hpoa[(hpoa.Reference.str.contains('DECIPHER')) &
     (hpoa.Reference.str.contains(';'))]

hpoa[(hpoa.Reference.str.contains('DECIPHER'))].head()

So...
- 1 row has a single ISBN ID 
- 8 rows have a single ISBN-10 ID (1 unique value)
- 431 rows have ISBN-13 ID as reference
  - 45 of those rows have more than one reference; you can see mixes with OMIM and http IDs...

In [6]:
hpoa[hpoa.Reference.str.contains('ISBN:')].Reference
hpoa[hpoa.Reference.str.contains('ISBN-10:')].Reference
hpoa.Reference.str.contains('ISBN-13:').value_counts()
hpoa[(hpoa.Reference.str.contains('ISBN-13:')) &
     (hpoa.Reference.str.contains(';'))].shape
hpoa[(hpoa.Reference.str.contains('ISBN-13:')) &
     (hpoa.Reference.str.contains(';'))].head().Reference.to_list()

110660    ISBN:3642035590
Name: Reference, dtype: object

101045    ISBN-10:0-19-262896-8
101047    ISBN-10:0-19-262896-8
101048    ISBN-10:0-19-262896-8
101049    ISBN-10:0-19-262896-8
101053    ISBN-10:0-19-262896-8
101054    ISBN-10:0-19-262896-8
101057    ISBN-10:0-19-262896-8
101059    ISBN-10:0-19-262896-8
Name: Reference, dtype: object

False    215768
True        429
Name: Reference, dtype: int64

(45, 12)

['http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ofd1;ISBN-13:978-0721606156',
 'http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ofd1;ISBN-13:978-0721606156',
 'ISBN-13:978-0721606156;OMIM:252100',
 'http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ofd1;ISBN-13:978-0721606156',
 'http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?book=gene&part=ofd1;ISBN-13:978-0721606156']

In [17]:
hpoa[(hpoa.Reference.str.contains('http')) &
     (hpoa.Reference.str.contains(';'))]

Unnamed: 0,DatabaseID,DiseaseName,Qualifier,HPO_ID,Reference,Evidence,Onset,Frequency,Sex,Modifier,Aspect,Biocuration
2220,OMIM:252100,MOHR SYNDROME OFDS II ORAL-FACIAL-DIGITAL SYND...,,HP:0001156,http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?...,PCS,,HP:0040282,,,P,HPO:iea[2010-01-19]
2221,OMIM:252100,MOHR SYNDROME OFDS II ORAL-FACIAL-DIGITAL SYND...,,HP:0000161,http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?...,PCS,,HP:0040282,,,P,HPO:iea[2009-02-17]
2230,OMIM:252100,MOHR SYNDROME OFDS II ORAL-FACIAL-DIGITAL SYND...,,HP:0001841,http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?...,PCS,,HP:0040282,,,P,HPO:iea[2010-01-19]
2231,OMIM:252100,MOHR SYNDROME OFDS II ORAL-FACIAL-DIGITAL SYND...,,HP:0000456,http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?...,PCS,,HP:0040282,,,P,HPO:iea[2009-02-17]
2235,OMIM:252100,MOHR SYNDROME OFDS II ORAL-FACIAL-DIGITAL SYND...,,HP:0000199,http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?...,PCS,,HP:0040282,,,P,HPO:iea[2009-02-17]
2237,OMIM:252100,MOHR SYNDROME OFDS II ORAL-FACIAL-DIGITAL SYND...,,HP:0001162,http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?...,PCS,,HP:0040282,,,P,HPO:iea[2010-01-19]
2248,OMIM:252100,MOHR SYNDROME OFDS II ORAL-FACIAL-DIGITAL SYND...,,HP:0004209,http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?...,PCS,,HP:0040282,,,P,HPO:iea[2010-01-19]
2251,OMIM:252100,MOHR SYNDROME OFDS II ORAL-FACIAL-DIGITAL SYND...,,HP:0001830,http://www.ncbi.nlm.nih.gov/bookshelf/br.fcgi?...,PCS,,HP:0040282,,,P,HPO:iea[2010-01-19]
2941,OMIM:180849,RUBINSTEIN-TAYBI SYNDROME 1,,HP:0002788,http://www.ncbi.nlm.nih.gov/books/NBK1526/;PMI...,PCS,HP:0003593,,,,P,HPO:iea[2012-04-24]
2952,OMIM:180849,RUBINSTEIN-TAYBI SYNDROME 1,,HP:0008872,http://www.ncbi.nlm.nih.gov/books/NBK1526/;PMI...,PCS,,,,,P,HPO:iea[2012-04-24]


In [None]:
hpoa[hpoa.Reference.str.contains('ISBN-10')]


### Parsing to separate

Creating new columns for reference information:
- ISBN (includes ISBN, ISBN-10, ISBN-13)
- PMID 
- one column of websites, including....
  - http, https (keep URLs)
  - OMIM, DECIPHER (construct URLs, then keep all together)
  - ORPHA (just give URL to search for disease, can't direct link to disease...)

In [None]:
references = hpoa.Reference.str.split(';')

listISBN = []
listPMID = []
listWebsites = []

for entry in references:   
    tempISBN = []
    tempPMID = []
    tempWebsites = []
    for i in entry:
        if 'ISBN' in i:
            ## just substring for the rest
            tempISBN.append(i.split(":")[1])
        elif 'PMID:' in i:
            ## just substring for the rest
            tempPMID.append(i[5:])     
        elif 'http' in i:
        ## add http or https stuff straight to websites
            tempWebsites.append(i)
        ## generate website urls
        elif 'DECIPHER:' in i:
            tempWebsites.append('https://decipher.sanger.ac.uk/syndrome/{0}/overview'.format(i[9:]))
        elif 'OMIM:' in i:
            tempWebsites.append('https://www.omim.org/entry/' + i[5:])  
        elif 'ORPHA:' in i:
            tempWebsites.append('https://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=' + i[6:])    
        else:
            print("encountered unexpected reference, please check parsing:\n{0}".format(i))
    
    if not tempISBN: ## is empty
        listISBN.append(None)
    else:
        ## remove duplicates
        tempISBN = list(set(tempISBN))
        listISBN.append(tempISBN)   
        
    if not tempPMID: ## is empty
        listPMID.append(None)
    else:
        ## remove duplicates
        tempPMID = list(set(tempPMID))
        listPMID.append(tempPMID)   
        
    if not tempWebsites: ## is empty
        listWebsites.append(None)
    else:
        ## remove duplicates
        tempWebsites = list(set(tempWebsites))
        listWebsites.append(tempWebsites)          

In [None]:
## create new columns
hpoa = hpoa.assign(isbn = listISBN,
                   pmid = listPMID, websites = listWebsites)

In [None]:
hpoa[hpoa['DatabaseID'] == 'ORPHA:10']

## no change in parser: Evidence

I think this can be treated as a categorical variable

Every row has one value, either IEA, PCS, or TAS.  
I don't think I can use this for provenance easily though, since...
* sometimes Evidence says IEA but Biocuration has a person's ID
* sometimes Evidence isn't IEA but Biocuration says iea 

In [None]:
hpoa.Evidence.value_counts()
hpoa.Evidence.count()

In [None]:
## showing that evidence is IEA but Biocuration isn't: happens for ~2/5 of the IEA entries
hpoa[(hpoa['Evidence'] == 'IEA') &
     (~ hpoa['Biocuration'].str.contains('iea'))].shape
hpoa[(hpoa['Evidence'] == 'IEA') &
     (~ hpoa['Biocuration'].str.contains('iea'))].head()

In [None]:
## showing that evidence isn't but Biocuration is (a small percentage of the rows)
4383/(142433+19903)
hpoa[(hpoa['Evidence'] != 'IEA') &
     (hpoa['Biocuration'].str.contains('iea'))].shape
hpoa[(hpoa['Evidence'] != 'IEA') &
     (hpoa['Biocuration'].str.contains('iea'))].head()

## no change in parser: Onset

* very few rows (<1%) have this information
* currently each row has only one value (no ";" delimiter) but I dunno if this will change
* All rows with this info are "disease has phenotypic abnormality" rows (Aspect = P)
* I think this can be a categorical variable on the Disease - PhenotypicFeature edge. 
* There may be a rough chronological order to the terms (for listing possible values)
* likely use comments to make these terms actually understandable

In [None]:
hpoa.dropna(subset = ['Onset']).Aspect.value_counts()
hpoa.Onset.count()
1651/214124
hpoa.Onset.nunique()
hpoa.Onset.value_counts()

In [None]:
## this is a weird annotation since this term just means "Onset"
## currently it is shown on the website this way though https://hpo.jax.org/app/browse/disease/OMIM:222448
hpoa[hpoa.Onset == "HP:0003674"]

## modified parser: Frequency

### Intro

This column is tricky because there are multiple kinds of values. Currently only one value per row (no ";" delimitor) 
- some are fractions: could convert to decimal, but keeping numerator and denominator could give prevalence info (although the denominator is tricky to describe since it's people with the disease only; the other two are prevalence/specificity of phenotype for the disease). 
- one of the fractions is '0/0'. Please be careful with this
- some are percentages: these could convert to decimal easily
- some are HP terms: these are a categorical variable...

Extra trickiness: apparently other kinds of disease annotations (course, inheritance, modifier) also have frequency...

In [None]:
hpoa[~ hpoa['Frequency'].isna()].Aspect.value_counts()
hpoa.Frequency.count()
123155 / 216197
hpoa.Frequency.str.contains(";").value_counts()
hpoa.Frequency.str.contains("/").value_counts()
hpoa.Frequency.str.contains("%").value_counts()
hpoa.Frequency.str.contains("HP:").value_counts()
13677+306+109172 == 123155

### Some EDA

note that some frequency terms are 1 since the fraction is 1/1

In [None]:
hpoa[hpoa['Frequency'].isna()].Frequency

### Parsing to separate

Code below makes 4 columns: categorical frequency (for HP terms), numeric frequency (for percentages and fractions), numerator and denominator (for fractions). 

categorical frequency ends up with 5 diff HP terms. 

In [None]:
for i in hpoa.Frequency.astype(str):
    ## since only one value in each row
    if i == 'nan':
        categoryFreq.append(None)
        numericFreq.append(None)
        numerator.append(None)
        denominator.append(None)        
    elif 'HP:' in i:
        categoryFreq.append(i)
        numericFreq.append(None)
        numerator.append(None)
        denominator.append(None)
    elif '%' in i:
        tempN = float(i.strip('%')) / 100.0
        numericFreq.append(tempN)
        categoryFreq.append(None)
        numerator.append(None)
        denominator.append(None)
    elif '/' in i:
        tempL = [int(ele) for ele in i.split("/")]
        ## if numerator or denominator is 0, since that happened
        ## if numerator > denominator, since that happened too
        if (tempL[1] == 0) or (tempL[0] == 0) or (tempL[0] > tempL[1]):
            ## current decision is to leave it blank
            numericFreq.append(None)
            categoryFreq.append(None)
            numerator.append(None)
            denominator.append(None)    
        else:
            numerator.append(int(tempL[0]))
            denominator.append(int(tempL[1]))       
            numericFreq.append(eval(i))
            categoryFreq.append(None)        
    else:
        print("encountered unexpected format, please check parsing:\n{0}".format(i))

In [None]:
## create new columns
hpoa = hpoa.assign(FreqCategories = categoryFreq, NumericFreq = numericFreq, FreqNumerator = numerator,
                   FreqDenominator = denominator)

In [None]:
hpoa.FreqCategories.value_counts()
hpoa.NumericFreq.describe()
hpoa.FreqNumerator.describe()
hpoa.FreqDenominator.describe()

In [None]:
## wow some big numbers
hpoa[hpoa.FreqNumerator > 300]

## modified parser: Sex

currently dealt with for aspect "P" annotations -> made them all lowercase

so it's super rare to have this field. Extra trickiness: apparently other kinds of disease annotations (course, inheritance, modifier) also have sex...    
Also...it has inconsistent capitalization.

In [None]:
hpoa

In [None]:
hpoa.Sex.count()
hpoa.Sex.value_counts()
hpoa[~ hpoa['Sex'].isna()].Aspect.value_counts()

## modified parser: Modifier

It's rare to have this field   
This column is tricky because...
- there can be multiple values (although the truth is only two records have truly >1 value. The rest are REPEATS). 
- there's a lot of different values. Hard to list them all out

Extra trickiness: apparently other kinds of disease annotations (course, inheritance) also have modifier...  

In [None]:
hpoa[~ hpoa['Modifier'].isna()].Modifier.value_counts()

In [None]:
hpoa.Modifier.count()
hpoa[(hpoa['Modifier'] == 'HP:0012839;HP:0012840') |
     (hpoa['Modifier'] == 'HP:0025206;HP:0025303')
    ] 

## won't use: Biocuration

This column is tricky because...
- there can be multiple values.
- each value has two parts: a person/entity curie and a date

In [None]:
hpoa.Biocuration.str.contains(";").value_counts()
hpoa.Biocuration.to_list()[0:5]

# exploring CTD data

## disease -> chemical

In [2]:
CTD_path = pathlib.Path.home().joinpath('Downloads', 'CTD_chemicals_diseases.csv.gz')
CTD = pd.read_table(CTD_path, sep=",", comment='#', compression='gzip', 
                    names=['chemical_name', 'mesh_chemical_id', 'cas_registry_number', 'DiseaseName', 
                           'DiseaseID', 'direct_evidence', 'inference_gene_symbol', 'inference_score', 
                           'omim_id', 'pubmed'],
                   dtype=str)
CTD['inference_score'] = CTD['inference_score'].astype(float)

## convert inference score back to numeric

CTD.columns
CTD.shape

Index(['chemical_name', 'mesh_chemical_id', 'cas_registry_number',
       'DiseaseName', 'DiseaseID', 'direct_evidence', 'inference_gene_symbol',
       'inference_score', 'omim_id', 'pubmed'],
      dtype='object')

(7198752, 10)

In [8]:
CTD['inference_score'].describe()

count    7.097579e+06
mean     2.224442e+01
std      5.702759e+01
min      1.740000e+00
25%      4.380000e+00
50%      8.270000e+00
75%      1.985000e+01
max      2.478960e+03
Name: inference_score, dtype: float64

In [5]:
CTD['DiseaseID'].value_counts()
CTD['DiseaseID'].value_counts().describe()

MESH:D008106    120847
MESH:D011471    108432
MESH:D001943    104691
MESH:D006528     90997
MESH:D056486     72936
                 ...  
MESH:D014917         1
MESH:C531755         1
MESH:C538117         1
MESH:D020243         1
MESH:D014735         1
Name: DiseaseID, Length: 7236, dtype: int64

count      7236.000000
mean        994.852405
std        4656.083091
min           1.000000
25%          32.000000
50%          75.000000
75%         215.000000
max      120847.000000
Name: DiseaseID, dtype: float64

In [6]:
trial = CTD[~ CTD['direct_evidence'].isna()].copy()
trial.shape
trial[['mesh_chemical_id', 'DiseaseID', 'direct_evidence']].drop_duplicates().shape
trial.count()
## the size of these two are the same, meaning that there currently aren't duplicates 

(101173, 10)

(101173, 3)

chemical_name            101173
mesh_chemical_id         101173
cas_registry_number       76544
DiseaseName              101173
DiseaseID                101173
direct_evidence          101173
inference_gene_symbol         0
inference_score               0
omim_id                       0
pubmed                   101173
dtype: int64

In [41]:
trial[(trial['DiseaseID'] == 'MESH:D006974') &
      (trial['mesh_chemical_id'] == 'D003276')]

Unnamed: 0,chemical_name,mesh_chemical_id,cas_registry_number,DiseaseName,DiseaseID,direct_evidence,inference_gene_symbol,inference_score,omim_id,pubmed
2251290,"Contraceptives, Oral",D003276,,"Hypertension, Malignant",MESH:D006974,marker/mechanism,,,,1138738|4559540|5083217|6809465|7866596


In [45]:
trial['DiseaseID'].value_counts()[0:10]
trial['DiseaseID'].value_counts().describe()

MESH:D056486    1432
MESH:D012640    1274
MESH:D006973     952
MESH:D009336     928
MESH:D007674     794
MESH:D010146     777
MESH:D007022     737
MESH:D007249     716
MESH:D004487     673
MESH:D064420     637
Name: DiseaseID, dtype: int64

count    3255.000000
mean       31.082335
std        80.299929
min         1.000000
25%         2.000000
50%         6.000000
75%        22.000000
max      1432.000000
Name: DiseaseID, dtype: float64

In [46]:
trial2 = CTD[CTD['direct_evidence'].isna()]
trial2.count()
## so some have pubmed and others don't

chemical_name            7097579
mesh_chemical_id         7097579
cas_registry_number      4899533
DiseaseName              7097579
DiseaseID                7097579
direct_evidence                0
inference_gene_symbol    7097579
inference_score          7097579
omim_id                   551929
pubmed                   6674393
dtype: int64

(101173, 3)

In [35]:
count = 0
for did, subdf in CTD.groupby('DiseaseID'):
    print(did)
    print(subdf)
    count += 1
    if count>5:
        break

MESH:C000598644
                chemical_name mesh_chemical_id cas_registry_number  \
1890279  Carbon Tetrachloride          D002251             56-23-5   
2465352                   DDT          D003634             50-29-3   
2563708       Dextran Sulfate          D016264           9042-14-2   
2675659          Dietary Fats          D004041                 NaN   
2888927           Doxorubicin          D004317          23214-92-8   

                                               DiseaseName        DiseaseID  \
1890279  Leukoencephalopathy Brain Calcifications and C...  MESH:C000598644   
2465352  Leukoencephalopathy Brain Calcifications and C...  MESH:C000598644   
2563708  Leukoencephalopathy Brain Calcifications and C...  MESH:C000598644   
2675659  Leukoencephalopathy Brain Calcifications and C...  MESH:C000598644   
2888927  Leukoencephalopathy Brain Calcifications and C...  MESH:C000598644   

        direct_evidence inference_gene_symbol  inference_score omim_id pubmed  
1890279 

In [16]:
CTD.pubmed.value_counts().head()

25380136    77440
7979221     32466
16214533    31401
28284560    26399
27093858    23712
Name: pubmed, dtype: int64

In [9]:
## exploring what happens when disease has mesh and omim IDs. 
##   see http://ctdbase.org/detail.go?type=disease&acc=MESH%3aD006111

CTD[(CTD['DiseaseID'] == 'MESH:D006111') & 
    (~ CTD['direct_evidence'].isna())]

## this is mapped to the same mesh ID. see 
CTD[(CTD['DiseaseID'] == 'OMIM:275000')]
CTD[(CTD['DiseaseID'] == 'OMIM:603388')]

Unnamed: 0,chemical_name,mesh_chemical_id,cas_registry_number,DiseaseName,DiseaseID,direct_evidence,inference_gene_symbol,inference_score,omim_id,pubmed
970390,alfacalcidol,C008088,41294-56-8,Graves Disease,MESH:D006111,therapeutic,,,,9322804
1173741,Antithyroid Agents,D013956,,Graves Disease,MESH:D006111,therapeutic,,,,19263707
1871562,Carbimazole,D002231,22232-54-8,Graves Disease,MESH:D006111,therapeutic,,,,12201214|16018796|1681385|19263707|7950664|945...
4400969,Methimazole,D008713,60-56-0,Graves Disease,MESH:D006111,therapeutic,,,,11038449|12467281|15072706|16372246|1642096|18...
5566874,Potassium Iodide,D011193,7681-11-0,Graves Disease,MESH:D006111,therapeutic,,,,3840600
5579912,Prednisolone,D011239,50-24-8,Graves Disease,MESH:D006111,therapeutic,,,,9134835
5582350,Prednisone,D011241,53-03-2,Graves Disease,MESH:D006111,therapeutic,,,,2263031
5650871,Propranolol,D011433,525-66-6,Graves Disease,MESH:D006111,therapeutic,,,,17462097|7714072|9392993
5661294,Propylthiouracil,D011441,51-52-5,Graves Disease,MESH:D006111,therapeutic,,,,11922496|11956877|12521217|1339201|15283191|16...
6589387,Thyroxine,D013974,7488-70-2,Graves Disease,MESH:D006111,marker/mechanism,,,,9392993


Unnamed: 0,chemical_name,mesh_chemical_id,cas_registry_number,DiseaseName,DiseaseID,direct_evidence,inference_gene_symbol,inference_score,omim_id,pubmed


Unnamed: 0,chemical_name,mesh_chemical_id,cas_registry_number,DiseaseName,DiseaseID,direct_evidence,inference_gene_symbol,inference_score,omim_id,pubmed


In [3]:
CTD[CTD['DiseaseID'].str.startswith('OMIM:') & 
    (~ CTD['direct_evidence'].isna())]

Unnamed: 0,chemical_name,mesh_chemical_id,cas_registry_number,DiseaseName,DiseaseID,direct_evidence,inference_gene_symbol,inference_score,omim_id,pubmed
1103873,Amodiaquine,D000655,86-42-0,PLASMODIUM FALCIPARUM BLOOD INFECTION LEVEL,OMIM:248310,therapeutic,,,,2663213
1285363,Ascorbic Acid,D001205,50-81-7,IMMUNE SUPPRESSION,OMIM:146850,therapeutic,,,,15020195
1606493,bisphenol A,C006780,80-05-7,MUSCLE HYPERTROPHY,OMIM:614160,marker/mechanism,,,,31501865
1689896,Bupropion,D016642,34841-39-9,"TOBACCO ADDICTION, SUSCEPTIBILITY TO",OMIM:188890,therapeutic,,,,16785264
2137688,Cisplatin,D002945,15663-27-1,"HEARING LOSS, CISPLATIN-INDUCED, SUSCEPTIBILIT...",OMIM:613290,marker/mechanism,,,,19898482|25551397|25665007|30952644|9256891
2357064,CV 3988,C037913,85703-73-7,IMMUNE SUPPRESSION,OMIM:146850,therapeutic,,,,15020195
2387443,Cyclophosphamide,D003520,50-18-0,SPERMATOGENIC FAILURE 6,OMIM:102530,marker/mechanism,,,,16517039
2398579,Cyclosporine,D016572,59865-13-3,IMMUNE SUPPRESSION,OMIM:146850,marker/mechanism,,,,20156427
2895309,Doxorubicin,D004317,23214-92-8,"QT INTERVAL, VARIATION IN",OMIM:610141,marker/mechanism,,,,12597018|7919046
3974389,JP5 jet fuel,C029662,8008-20-6,IMMUNE SUPPRESSION,OMIM:146850,marker/mechanism,,,,15020195


In [20]:
CTD['DiseaseID'].str.startswith('OMIM:').value_counts()
CTD['DiseaseID'].str.startswith('MESH:').value_counts()

CTD[CTD['DiseaseID'].str.startswith('OMIM:')]

False    7072948
True      125804
Name: DiseaseID, dtype: int64

True     7072948
False     125804
Name: DiseaseID, dtype: int64

Unnamed: 0,chemical_name,mesh_chemical_id,cas_registry_number,DiseaseName,DiseaseID,direct_evidence,inference_gene_symbol,inference_score,omim_id,pubmed
60,10074-G5,C534883,,PROSTATE CANCER/BRAIN CANCER SUSCEPTIBILITY,OMIM:603688,,EPHB2,7.91,603688,
84,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,DEVELOPMENTAL AND EPILEPTIC ENCEPHALOPATHY 7,OMIM:613720,,KCNQ2,7.69,613720,
115,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,"SEIZURES, BENIGN FAMILIAL NEONATAL, 1",OMIM:121200,,KCNQ2,7.69,121200,
116,"10,10-bis(4-pyridinylmethyl)-9(10H)-anthracenone",C112297,,"SEIZURES, BENIGN FAMILIAL NEONATAL, 2",OMIM:121201,,KCNQ3,7.83,121201,
450,103D5R,C496879,,COWDEN SYNDROME 6,OMIM:615109,,AKT1,6.45,615109,
...,...,...,...,...,...,...,...,...,...,...
7197687,Zymosan,D015054,9010-72-4,"MYCOBACTERIUM TUBERCULOSIS, SUSCEPTIBILITY TO",OMIM:607948,,TLR2,13.18,607948,
7197908,Zymosan,D015054,9010-72-4,NEURODEVELOPMENTAL DISORDER WITH SPASTIC DIPLE...,OMIM:615075,,CTNNB1,5.16,615075,
7197992,Zymosan,D015054,9010-72-4,"OSTEOMYELITIS, STERILE MULTIFOCAL, WITH PERIOS...",OMIM:612852,,IL1RN,5.46,612852,
7197995,Zymosan,D015054,9010-72-4,"OSTEOPETROSIS, AUTOSOMAL RECESSIVE 2",OMIM:259710,,TNFSF11,5.51,259710,


In [10]:
CTD['mesh_chemical_id'].nunique()
CTD['DiseaseID'].nunique()

16291

7239

In [None]:
CTD[(CTD['DiseaseID'] == 'MESH:D012128')]

In [None]:
CTD[(CTD['DiseaseID'] == 'MESH:D012128')].chemical_name.nunique()

In [None]:
CTD[(CTD['DiseaseID'] == 'MESH:D012128') &
    (CTD.chemical_name.str.startswith('Zinc'))]

In [11]:
CTD.direct_evidence.value_counts()

marker/mechanism    64754
therapeutic         36239
Name: direct_evidence, dtype: int64

In [None]:
CTD.inference_score.count()
CTD.inference_score.describe()

In [None]:
CTD.DiseaseID.str.contains('OMIM:').value_counts()
CTD.DiseaseID.str.contains('MESH:').value_counts()
## so all diseaseIDs are either MESH or OMIM

## disease -> pathway

In [46]:
CTD2_path = pathlib.Path.home().joinpath('Downloads', 'CTD_diseases_pathways.csv.gz')
CTD2 = pd.read_table(CTD2_path, sep=",", comment='#', compression='gzip',
                    names=['DiseaseName', 'DiseaseID', 'PathwayName', 'PathwayID', 'InferenceGeneSymbol'],
                    dtype=str)
CTD2.columns
CTD2.shape

Index(['DiseaseName', 'DiseaseID', 'PathwayName', 'PathwayID',
       'InferenceGeneSymbol'],
      dtype='object')

(589408, 5)

In [47]:
CTD2[['DiseaseID', 'PathwayID']].drop_duplicates().DiseaseID.value_counts()

MESH:D011471    1558
MESH:D008106    1443
MESH:D001943    1416
MESH:D006528    1358
MESH:D013274    1284
                ... 
OMIM:615527        1
MESH:C537659       1
MESH:D002282       1
MESH:C567906       1
MESH:C567758       1
Name: DiseaseID, Length: 5031, dtype: int64

In [48]:
CTD2[['DiseaseID', 'PathwayID']].drop_duplicates().DiseaseID.value_counts()[10:20]

MESH:D017202    1054
MESH:D018450    1040
MESH:D009362    1039
MESH:D008545    1037
MESH:D003110    1036
MESH:D008325    1020
MESH:D001321    1014
MESH:D003924    1007
MESH:D009361     996
MESH:D002294     995
Name: DiseaseID, dtype: int64

In [49]:
CTD2[(CTD2['DiseaseID'] == 'MESH:D002294') &
     (CTD2['PathwayID'] == 'REACT:R-HSA-1059683')]

Unnamed: 0,DiseaseName,DiseaseID,PathwayName,PathwayID,InferenceGeneSymbol
108673,"Carcinoma, Squamous Cell",MESH:D002294,Interleukin-6 signaling,REACT:R-HSA-1059683,IL6
108674,"Carcinoma, Squamous Cell",MESH:D002294,Interleukin-6 signaling,REACT:R-HSA-1059683,PTPN11
108675,"Carcinoma, Squamous Cell",MESH:D002294,Interleukin-6 signaling,REACT:R-HSA-1059683,STAT3


In [None]:
CTD2[(CTD2['DiseaseID'] == 'MESH:D012128')].PathwayID.nunique()

In [None]:
CTD2.DiseaseID.str.contains('OMIM:').value_counts()
CTD2.DiseaseID.str.contains('MESH:').value_counts()
## so all diseaseIDs are either MESH or OMIM

In [None]:
CTD2.PathwayID.str.contains('REACT:').value_counts()
CTD2.PathwayID.str.contains('KEGG:').value_counts()

In [None]:
CTD2[CTD2['DiseaseID'].str.contains('MESH')].PathwayID.str.contains('REACT:').value_counts()
## so disease MESH - REACT pathway and disease MESH - KEGG pathway both exist

In [None]:
CTD2[CTD2['DiseaseID'].str.contains('OMIM')].PathwayID.str.contains('REACT:').value_counts()
## so disease OMIM - REACT pathway and disease OMIM - KEGG pathway both exist