# Maps Kevin's data onto reactome pathways

In [1]:
%load_ext autoreload
%autoreload 2

In [8]:
import os
import glob

import pylab as plt
import matplotlib
from IPython.display import display, HTML

import numpy as np
from scipy.sparse import lil_matrix
import pandas as pd
from pandas import HDFStore

import seaborn as sns
from IPython.display import display

from reactome import ensembl_to_uniprot, uniprot_to_reaction, compound_to_reaction, \
    reaction_to_metabolite_pathway, get_reaction_ids, get_reaction_df, get_reaction_entities, \
    produce_kegg_dict, get_all_pathways_formulae
    
from collections import defaultdict

from ipywidgets import FloatProgress

%matplotlib inline

## Load data

In [9]:
basedir = '/Users/joewandy/Dropbox/Analysis/omics_integration'

In [11]:
time = 7
parasite = 'INFEC'
treatment = 'Unsorted'

filename = os.path.join(basedir, 'data/data_%d_%s_%s.h5' % (time, parasite, treatment))
hdf = HDFStore(filename, complevel=9, complib='bzip2')

pvt = hdf['peak_vs_transcript'].replace([np.inf, -np.inf], np.nan).fillna(0)
# pvp = hdf['peak_vs_peak']
# tvt = hdf['transcript_vs_transcript']

hdf.close()

In [12]:
pvt.columns

Index(['ENSMUSG00000000001', 'ENSMUSG00000000028', 'ENSMUSG00000000031',
       'ENSMUSG00000000049', 'ENSMUSG00000000056', 'ENSMUSG00000000058',
       'ENSMUSG00000000078', 'ENSMUSG00000000085', 'ENSMUSG00000000088',
       'ENSMUSG00000000093',
       ...
       'ENSMUSG00000110391', 'ENSMUSG00000110393', 'ENSMUSG00000110397',
       'ENSMUSG00000110399', 'ENSMUSG00000110404', 'ENSMUSG00000110405',
       'ENSMUSG00000110410', 'ENSMUSG00000110414', 'ENSMUSG00000110419',
       'ENSMUSG00000110424'],
      dtype='object', length=21794)

In [13]:
species = 'Mus musculus'

## Maps ENSEMBL IDs to UniProt IDs

In [16]:
ens_id = pvt.columns[0]
print(ens_id)

ENSMUSG00000000001


In [17]:
print(len(pvt.columns))
ens_ids = pvt.columns.values.tolist()

21794


In [18]:
transcript_mapping = ensembl_to_uniprot(ens_ids, species, show_progress_bar=True)

In [24]:
ensembl_ids = list(transcript_mapping.keys())[0:10]
for ensembl_id in ensembl_ids:
    print(ensembl_id, transcript_mapping[ensembl_id])

ENSMUSG00000006235 ['P14753']
ENSMUSG00000026390 ['Q60754']
ENSMUSG00000025044 ['P30204']
ENSMUSG00000042286 ['G3X973', 'F7CT68', 'F7BK35']
ENSMUSG00000035279 ['Q8BV57']
ENSMUSG00000032115 ['Q9JKR6']
ENSMUSG00000029657 ['Q61699']
ENSMUSG00000038188 ['Q5ND28']
ENSMUSG00000008845 ['Q2VLH6']
ENSMUSG00000030895 ['Q91X72']


In [26]:
total_mapped = len(transcript_mapping)
total_genes = len(ens_ids)
certain = []
for ensembl_id in transcript_mapping:
    if len(transcript_mapping[ensembl_id]) == 1:
        certain.append(transcript_mapping[ensembl_id][0])

uncertain = total_mapped - len(certain)
print(len(certain), uncertain, total_mapped, total_genes)

5987 613 6600 21794


## Map UniProt IDs to Reactions

In [27]:
protein_mapping = uniprot_to_reaction(certain, species, show_progress_bar=True)

In [28]:
for uniprot_id in certain[0:3]:
    print(uniprot_id, protein_mapping[uniprot_id])
    print

P14753 [{'reaction_id': 'R-MMU-209310', 'reaction_name': 'Murine JAK2 binds to the Erythropoietin receptor, EpoR'}]
Q60754 [{'reaction_id': 'R-MMU-2247510', 'reaction_name': 'MARCO:ligand is endocytosed'}, {'reaction_id': 'R-MMU-2173783', 'reaction_name': 'Marco binds ligands'}]
P30204 [{'reaction_id': 'R-MMU-2507854', 'reaction_name': 'MSR1:ligand (SCARA1:ligand, SR-A:ligand) is endocytosed'}, {'reaction_id': 'R-MMU-2173779', 'reaction_name': 'Msr1 (Scara1) Binds Ligands'}]


## Map KEGG Compound IDs to Reactions

Load a dataframe containing the KEGG IDs of identified compounds only

In [29]:
peak_df = pd.read_csv(os.path.join(basedir, 'data/my_analysis_peaks.csv'), index_col=0)

In [30]:
peak_df.shape

(109, 11)

In [31]:
peak_df.head()

Unnamed: 0_level_0,sec_id,mass,rt,polarity,c_id,formula,adduct,rc_id,compound,db,identifier
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
741583,1,147.076381,905.000626,positive,2825862,C5H10N2O3,M+H,3453657,L-Glutamine,kegg,C00064
741598,16,162.076139,891.966177,positive,2826034,C6H11NO4,M+H,3453882,L-2-Aminoadipate,kegg,C00956
741599,17,116.070596,771.412376,positive,2826041,C5H9NO2,M+H,3453891,L-Proline,kegg,C00148
741611,29,132.101919,677.261473,positive,2826103,C6H13NO2,M+H,3453976,L-Leucine,kegg,C00123
741680,98,132.101919,646.385592,positive,2826221,C6H13NO2,M+H,3454135,L-Leucine,kegg,C00123


In [32]:
compound_ids = sorted(list(set(peak_df['identifier'].values.tolist())))

In [33]:
compound_mapping = compound_to_reaction(compound_ids, species, show_progress_bar=True)

In [35]:
print(list(compound_mapping.keys())[0:10])

['C00013', 'C00009', 'C00153', 'C00020', 'C00095', 'C00149', 'C00025', 'C00183', 'C00123', 'C00026']


## Map Reactions to Pathways

In [36]:
protein_reactions = list(set(get_reaction_ids(protein_mapping)))
compound_reactions = list(set(get_reaction_ids(compound_mapping)))
combined_reactions = list(set(protein_reactions + compound_reactions))

print(len(protein_reactions))
print(len(compound_reactions))
print(len(combined_reactions))

7096
870
7227


In [37]:
print(combined_reactions[0:10])

['R-MMU-5250579', 'R-MMU-349626', 'R-MMU-5246478', 'R-MMU-1799335', 'R-MMU-1369065', 'R-MMU-3002798', 'R-MMU-8863723', 'R-MMU-352232', 'R-MMU-5619440', 'R-MMU-3928620']


This is a map of reactions to pathways that are under the top-level biochemical ('Metabolism') pathways

In [38]:
pathway_mapping = reaction_to_metabolite_pathway(combined_reactions, species, show_progress_bar=True, leaf=True)
print(len(pathway_mapping))
print(pathway_mapping['R-MMU-2395768'])

1386
[{'pathway_id': 'R-MMU-975634', 'pathway_name': 'Retinoid metabolism and transport'}]


In [39]:
kegg_location = os.path.join(basedir, 'data/kegg/kegg.xml')
kegg_dict = produce_kegg_dict(kegg_location, 'id')

In [41]:
reaction_df = get_reaction_df(transcript_mapping, protein_mapping, compound_mapping, pathway_mapping, species)
reaction_df.sort_values(by=['compound_coverage', 'protein_coverage'], inplace=True, ascending=False)
print(reaction_df.shape)

(1386, 13)


In [42]:
reaction_df

Unnamed: 0,reaction_id,reaction_name,protein_coverage,compound_coverage,all_coverage,protein,observed_protein_count,all_protein_count,compound,observed_compound_count,all_compound_count,pathway_ids,pathway_names
151,R-MMU-70599,aspartate + glutamine + ATP <=> asparagine + g...,1.00,0.75,0.78,Q61024 (ENSMUSG00000029752),1,1,C00013:C00020:C00025:C00049:C00064:C00152,6,8,R-MMU-70614,Amino acid synthesis and interconversion (tran...
413,R-MMU-507775,alanine + alpha-ketoglutarate <=> pyruvate + g...,1.00,0.75,0.80,Q8BGT5 (ENSMUSG00000031700),1,1,C00025:C00026:C00041,3,4,R-MMU-70614,Amino acid synthesis and interconversion (tran...
527,R-MMU-70523,alanine + alpha-ketoglutarate <=> pyruvate + g...,1.00,0.75,0.80,Q8QZR5 (ENSMUSG00000022546),1,1,C00025:C00026:C00041,3,4,R-MMU-70614,Amino acid synthesis and interconversion (tran...
568,R-MMU-70524,pyruvate + glutamate <=> alanine + alpha-ketog...,1.00,0.75,0.80,Q8QZR5 (ENSMUSG00000022546),1,1,C00025:C00026:C00041,3,4,R-MMU-70614,Amino acid synthesis and interconversion (tran...
806,R-MMU-507749,pyruvate + glutamate <=> alanine + alpha-ketog...,1.00,0.75,0.80,Q8BGT5 (ENSMUSG00000031700),1,1,C00025:C00026:C00041,3,4,R-MMU-70614,Amino acid synthesis and interconversion (tran...
1011,R-MMU-70654,ornithine + alpha-ketoglutarate <=> glutamate ...,1.00,0.75,0.80,P29758 (ENSMUSG00000030934),1,1,C00025:C00026:C00077,3,4,R-MMU-70614,Amino acid synthesis and interconversion (tran...
1203,R-MMU-74213,Adenine + PRPP => AMP + PPi,1.00,0.75,0.80,P08030 (ENSMUSG00000006589),1,1,C00013:C00020:C00147,3,4,R-MMU-74217,Purine salvage
1299,R-MMU-70666,glutamate + L-glutamate gamma-semialdehyde <=>...,1.00,0.75,0.80,P29758 (ENSMUSG00000030934),1,1,C00025:C00026:C00077,3,4,R-MMU-70614,Amino acid synthesis and interconversion (tran...
87,R-MMU-70560,carbamoyl phosphate + ornithine => citrulline ...,0.00,0.75,0.60,,0,1,C00009:C00077:C00327,3,4,R-MMU-70635,Urea cycle
188,R-MMU-70952,alpha-aminoadipate + alpha-ketoglutarate <=> a...,0.00,0.75,0.60,,0,1,C00025:C00026:C00956,3,4,R-MMU-71064,Lysine catabolism


In [43]:
reaction_df.to_csv('reaction_df.tsv', sep='\t', index=False)

### How many unique pathways are linked to reactions?

In [44]:
reaction_df['pathway_names'].values

array(['Amino acid synthesis and interconversion (transamination)',
       'Amino acid synthesis and interconversion (transamination)',
       'Amino acid synthesis and interconversion (transamination)', ...,
       'Dermatan sulfate biosynthesis',
       'Chondroitin sulfate biosynthesis', 'CS/DS degradation'],
      dtype=object)

In [45]:
reaction_df['pathway_ids'].values

array(['R-MMU-70614', 'R-MMU-70614', 'R-MMU-70614', ..., 'R-MMU-2022923',
       'R-MMU-2022870', 'R-MMU-2024101'], dtype=object)

In [46]:
pathway_ids_to_names = {}
for ids, names in zip(reaction_df['pathway_ids'].values, reaction_df['pathway_names'].values):
    pathway_ids = ids.split(':')
    pathway_names = names.split(':')
    for pid, pn in zip(pathway_ids, pathway_names):
        pathway_ids_to_names[pid] = pn

In [48]:
data = []
for pid, name in pathway_ids_to_names.items():
    row = (pid, name.lower(), )
    data.append(row)
    
pathway_df = pd.DataFrame(data, columns=['pathway_id', 'pathway_name'])
pathway_df = pathway_df.sort_values(by='pathway_name').reset_index(drop=True)
display(pathway_df)

Unnamed: 0,pathway_id,pathway_name
0,R-MMU-73843,5-phosphoribose 1-diphosphate biosynthesis
1,R-MMU-1971475,a tetrasaccharide linker sequence is required ...
2,R-MMU-2161541,abacavir metabolism
3,R-MMU-156582,acetylation
4,R-MMU-399997,acetylcholine regulates insulin secretion
5,R-MMU-1482798,acyl chain remodeling of cl
6,R-MMU-1482883,acyl chain remodeling of dag and tag
7,R-MMU-1482788,acyl chain remodelling of pc
8,R-MMU-1482839,acyl chain remodelling of pe
9,R-MMU-1482925,acyl chain remodelling of pg


### Hypergeometric test

In [49]:
pw_f, pathway_id_to_name = get_all_pathways_formulae(species)

Missing formula for COMPOUND:C00013, retrieved H4P2O7 from kegg
Missing formula for COMPOUND:C00080, retrieved H from kegg
Missing formula for COMPOUND:C04637, retrieved C11H19O19P3R2 from kegg
Missing formula for COMPOUND:C05332, retrieved C8H11N from kegg
Missing formula for COMPOUND:C05261, retrieved C35H60N7O18P3S from kegg
Missing formula for COMPOUND:C00416, retrieved C5H7O8PR2 from kegg
Missing formula for COMPOUND:C05981, retrieved C11H20O22P4R2 from kegg
Missing formula for COMPOUND:C11557, retrieved C11H18O16P2R2 from kegg
Missing formula for COMPOUND:C01277, retrieved C11H18O16P2R2 from kegg
Missing formula for COMPOUND:C05974, retrieved C7H13NO9PR from kegg
Missing formula for COMPOUND:C05973, retrieved C6H13NO7PR from kegg


In [51]:
detected = set(peak_df[['formula']].values.flatten())
print(detected, len(detected))

{'C5H6O5', 'C6H6O6', 'H3O4P', 'C5H11NO2', 'C10H17N3O6S', 'C2H8NO4P', 'C5H8O5', 'C9H12N2O6', 'C7H7NO2', 'C5H4N4O2', 'C12H22O11', 'C4H6N4O3', 'C6H13N3O3', 'C5H9NO3', 'C6H13NO2', 'C5H9NO4', 'C10H12N2O3', 'C4H8O3', 'C8H11NO3', 'C5H7NO3', 'H4P2O7', 'C11H12N2O2', 'C2H5NO2', 'C5H4N2O4', 'C6H12O7', 'C5H11NO2S', 'C9H11NO2', 'C11H19NO9', 'C5H6O4', 'C4H7NO4', 'C7H10O7', 'C2H7NO3S', 'C5H6N2O2', 'C5H9NO2', 'C5H5N5', 'C5H12N2O2', 'C3H6O3', 'C6H11NO4', 'C6H6N2O', 'C7H14N2O4S', 'C5H4N4O', 'C5H5N5O', 'C9H13N3O5', 'C3H9O6P', 'C6H13NO5', 'C6H10O7', 'C7H6O2', 'C7H15NO3', 'C4H8N2O3', 'C6H12O6', 'C4H6O5', 'C5H10N2O3', 'C3H7NO3', 'C4H9NO3', 'C4H9NO2', 'C10H14N5O7P', 'C4H7N3O', 'C8H15NO6', 'C5H10O5', 'C9H17NO5', 'C3H7NO2'} 61


In [53]:
data = []
for pathway_id in pw_f:
    
    pathway_name = pathway_id_to_name[pathway_id]
    
    formulae = pw_f[pathway_id]
    formulae_count = len(formulae)
    formulae_str = ','.join(sorted(formulae))
    
    detected_f = set([x for x in formulae if x in detected])
    detected_count = len(detected_f)
    detected_str = ','.join(sorted(detected_f))
    
    row = [pathway_id, pathway_name, formulae_str, formulae_count, detected_str, detected_count]
    data.append(row)
    
all_pathway_df = pd.DataFrame(data, columns=[
    'pathway_id', 'pathway_name', 
    'formula', 'formula_count',
    'detected', 'detected_count'])
all_pathway_df.set_index('pathway_id', drop=True, inplace=True)

Compute hypergeometric p-values

In [54]:
from scipy.stats import hypergeom

In [55]:
# M = the number of unique formula in all pathways in Reactome
M = len(set(','.join(all_pathway_df['formula'].values).split(',')))

# N = the number of unique formula in all pathways in the dataset
N = len(set(','.join(all_pathway_df['detected'].values).split(',')))

SMOOTHING = 1

data = []
for idx, row in all_pathway_df.iterrows():
        
    # k = the number of unique formula in the pathway of interest in the dataset
    k = row['detected_count']
    
    # n = the number of unique formula in the pathway of interest
    n = row['formula_count'] + SMOOTHING
        
    p_value = hypergeom.sf(k, M, n, N)
    assert p_value > 0
    new_row = [idx, p_value]
    data.append(new_row)

p_value_df = pd.DataFrame(data, columns=[
    'pathway_id', 'p_value'])
p_value_df.set_index('pathway_id', drop=True, inplace=True)

In [56]:
combined = pd.concat([all_pathway_df, p_value_df], axis=1)
combined = combined.sort_values(by='p_value', ascending=True).reset_index(drop=True)
display(combined)
combined.to_csv('pathway_df.csv', index=False, encoding='utf-8')

Unnamed: 0,pathway_name,formula,formula_count,detected,detected_count,p_value
0,Amino acid synthesis and interconversion (tran...,"C10H14N5O7P,C10H15N5O10P2,C10H15N5O11P2,C10H16...",29,"C10H14N5O7P,C2H7NO3S,C3H7NO2,C5H12N2O2,C5H6O5,...",8,0.000718
1,Pyrimidine catabolism,"C10H14N2O5,C10H15N2O8P,C10H15N5O10P2,C17H21N4O...",32,"C3H7NO2,C4H8N2O3,C4H9NO2,C5H10N2O3,C5H6N2O2,C9...",8,0.001574
2,Conjugation of benzoate with glycine,"C10H14N5O7P,C10H16N5O13P3,C21H36N7O16P3S,C28H4...",7,"C10H14N5O7P,C7H6O2,H4P2O7",3,0.003569
3,Urea cycle,"C10H14N5O7P,C10H15N5O10P2,C10H16N5O13P3,C10H18...",17,"C10H14N5O7P,C5H12N2O2,C6H13N3O3,H3O4P,H4P2O7",5,0.003694
4,Purine catabolism,"C10H12N4O4,C10H12N4O5,C10H13N4O7P,C10H13N4O8P,...",30,"C10H14N5O7P,C10H17N3O6S,C5H4N4O,C5H4N4O2,C5H5N...",7,0.004514
5,Phosphate bond hydrolysis by NUDT proteins,"C10H13N4O7P,C10H13N4O8P,C10H14N5O7P,C5H11O8P,H...",8,"C10H14N5O7P,H3O4P,H4P2O7",3,0.005993
6,Recycling of bile acids and salts,"C10H14N5O7P,C10H15N5O10P2,C10H16N5O13P3,C21H36...",13,"C10H14N5O7P,C2H7NO3S,H3O4P,H4P2O7",4,0.006024
7,Conjugation of phenylacetate with glutamine,"C10H14N5O7P,C10H16N5O13P3,C21H36N7O16P3S,H4P2O7",4,"C10H14N5O7P,H4P2O7",2,0.006783
8,Pyrophosphate hydrolysis,"H2O,H3O4P,H4P2O7,Mg",4,"H3O4P,H4P2O7",2,0.006783
9,Purine salvage,"C10H12N4O4,C10H12N4O5,C10H13N4O7P,C10H13N4O8P,...",27,"C10H14N5O7P,C5H4N4O,C5H5N5,C5H5N5O,H3O4P,H4P2O7",6,0.009956
