# Map Kevin's data onto Reactome pathways

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to them take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import glob

import pylab as plt
import matplotlib
from IPython.display import display, HTML

import numpy as np
from scipy.sparse import lil_matrix
import pandas as pd
from pandas import HDFStore

import seaborn as sns
from IPython.display import display

from reactome import ensembl_to_uniprot, uniprot_to_reaction, compound_to_reaction, \
    reaction_to_metabolite_pathway, get_reaction_ids, get_reaction_df, get_reaction_entities, \
    produce_kegg_dict
    
from collections import defaultdict

from ipywidgets import FloatProgress

%matplotlib inline

## Load data

In [3]:
# Root directory that holds the `data/` folder read by the cells below.
# The OMICS_BASEDIR environment variable, when set, overrides the original
# author's local path so the notebook can run on other machines unchanged.
basedir = os.environ.get('OMICS_BASEDIR',
                         '/Users/joewandy/Dropbox/Analysis/omics_integration')

In [4]:
# Experimental condition to load; these three values select the HDF5 file.
time = 7
parasite = 'INFEC'
treatment = 'Unsorted'

filename = os.path.join(basedir, 'data/data_%d_%s_%s.h5' % (time, parasite, treatment))

hdf = HDFStore(filename, complevel=9, complib='bzip2')
try:
    # Replace +/-inf with NaN, then every NaN with 0, so the table holds
    # finite values only.
    pvt = hdf['peak_vs_transcript'].replace([np.inf, -np.inf], np.nan).fillna(0)
    # Further store keys the notebook previously read (currently disabled):
    # pvp = hdf['peak_vs_peak']
    # tvt = hdf['transcript_vs_transcript']
finally:
    # Always release the file handle, even if the read above raises.
    hdf.close()

In [5]:
# Display the column labels of the peak-vs-transcript table.
pvt.columns

Index([u'ENSMUSG00000000001', u'ENSMUSG00000000028', u'ENSMUSG00000000031',
       u'ENSMUSG00000000049', u'ENSMUSG00000000056', u'ENSMUSG00000000058',
       u'ENSMUSG00000000078', u'ENSMUSG00000000085', u'ENSMUSG00000000088',
       u'ENSMUSG00000000093',
       ...
       u'ENSMUSG00000110391', u'ENSMUSG00000110393', u'ENSMUSG00000110397',
       u'ENSMUSG00000110399', u'ENSMUSG00000110404', u'ENSMUSG00000110405',
       u'ENSMUSG00000110410', u'ENSMUSG00000110414', u'ENSMUSG00000110419',
       u'ENSMUSG00000110424'],
      dtype='object', length=21794)

In [6]:
# Species name passed to each of the reactome mapping helpers in this notebook.
species = 'Mus musculus'

## Map Ensembl IDs to UniProt IDs

In [7]:
# Show the first column label as a quick sanity check of the ID format.
ens_id = pvt.columns[0]
# Parenthesised single-argument print: identical output on Python 2, valid on Python 3.
print(ens_id)

ENSMUSG00000000001


In [8]:
# Plain Python list of every column label of `pvt`, used as input to the
# Ensembl -> UniProt lookup.
ens_ids = pvt.columns.values.tolist()
# One list entry per column, so this equals len(pvt.columns).
print(len(ens_ids))

21794


In [9]:
# Look up UniProt IDs for the Ensembl gene IDs via the project's reactome helper.
# Later cells use the result as a mapping from Ensembl ID to a list of UniProt IDs.
transcript_mapping = ensembl_to_uniprot(ens_ids, species, show_progress_bar=True)

In [10]:
# Print a sample of the mapping: the first ten Ensembl IDs with their UniProt IDs.
# list(mapping) yields the keys in the same order as .keys() and, unlike a
# Python 3 key view, can be sliced.
ensembl_ids = list(transcript_mapping)[0:10]
for ensembl_id in ensembl_ids:
    # '%s %s' reproduces the space-separated output of `print a, b`.
    print('%s %s' % (ensembl_id, transcript_mapping[ensembl_id]))

In [11]:
# Summarise the Ensembl -> UniProt mapping.
total_mapped = len(transcript_mapping)   # Ensembl IDs present in the mapping
total_genes = len(ens_ids)               # Ensembl IDs submitted for lookup

# "Certain" mappings are those with exactly one UniProt ID; keep that single ID.
certain = [
    transcript_mapping[ensembl_id][0]
    for ensembl_id in transcript_mapping
    if len(transcript_mapping[ensembl_id]) == 1
]

# Every other entry of the mapping is counted as uncertain.
uncertain = total_mapped - len(certain)
print('%d %d %d %d' % (len(certain), uncertain, total_mapped, total_genes))

## Map UniProt IDs to Reactions

In [12]:
# Look up reactions for the unambiguously mapped UniProt IDs only.
# Later cells index the result by UniProt ID.
protein_mapping = uniprot_to_reaction(certain, species, show_progress_bar=True)

In [13]:
# Print the reactions found for the first three unambiguous UniProt IDs,
# with a blank line after each entry.
for uniprot_id in certain[0:3]:
    print('%s %s' % (uniprot_id, protein_mapping[uniprot_id]))
    # print('') emits the blank line on both Python 2 and 3; a bare `print`
    # is a no-op expression on Python 3.
    print('')

P70404 [{'reaction_id': u'R-MMU-70967', 'reaction_name': u'isocitrate + NAD+ => alpha-ketoglutarate + CO2 + NADH + H+ [IDH3]'}]

Q61335 [{'reaction_id': u'R-MMU-351894', 'reaction_name': u'Caspase mediated cleavage of BAP31'}]

Q6DFW4 [{'reaction_id': u'R-MMU-4570467', 'reaction_name': u'SUMOylation of NOP58 with SUMO1'}, {'reaction_id': u'R-MMU-6791222', 'reaction_name': u'21S pre-rRNA is nucleolytically processed at site E (site2a) to yield 18SE pre-rRNA'}, {'reaction_id': u'R-MMU-6791227', 'reaction_name': u"47S pre-rRNA is nucleolytically processed at A' (01,A1), site A0, and site 02 (site 6)  to yield 45S pre-rRNA"}]



## Map KEGG Compound IDs to Reactions

Load a dataframe containing the KEGG IDs of identified compounds only

In [14]:
# Read the identified-peaks table, using the first CSV column as the index.
peak_df = pd.read_csv(os.path.join(basedir, 'data/my_analysis_peaks.csv'), index_col=0)

In [15]:
# (rows, columns) of the peak table.
peak_df.shape

(109, 11)

In [16]:
# Preview the first rows; the `identifier` column is the one used for the
# compound lookup.
peak_df.head()

Unnamed: 0_level_0,sec_id,mass,rt,polarity,c_id,formula,adduct,rc_id,compound,db,identifier
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
741583,1,147.076381,905.000626,positive,2825862,C5H10N2O3,M+H,3453657,L-Glutamine,kegg,C00064
741598,16,162.076139,891.966177,positive,2826034,C6H11NO4,M+H,3453882,L-2-Aminoadipate,kegg,C00956
741599,17,116.070596,771.412376,positive,2826041,C5H9NO2,M+H,3453891,L-Proline,kegg,C00148
741611,29,132.101919,677.261473,positive,2826103,C6H13NO2,M+H,3453976,L-Leucine,kegg,C00123
741680,98,132.101919,646.385592,positive,2826221,C6H13NO2,M+H,3454135,L-Leucine,kegg,C00123


In [17]:
# Distinct compound identifiers from the peak table, in sorted order.
distinct_identifiers = set(peak_df['identifier'].tolist())
compound_ids = sorted(distinct_identifiers)

In [18]:
# Look up reactions for each distinct compound identifier via the project's reactome helper.
compound_mapping = compound_to_reaction(compound_ids, species, show_progress_bar=True)

In [19]:
# Show up to ten of the compound IDs present in the mapping.
# list(mapping) gives the keys in .keys() order and is sliceable on Python 3.
print(list(compound_mapping)[0:10])

[u'C00025', u'C00328', u'C00149', u'C00020', u'C00009', u'C00147', u'C00013', u'C00026', u'C00864', u'C00099']


## Map Reactions to Pathways

In [20]:
# De-duplicated reaction IDs reached from proteins, from compounds, and from either.
protein_reactions = list(set(get_reaction_ids(protein_mapping)))
compound_reactions = list(set(get_reaction_ids(compound_mapping)))
combined_reactions = list(set(protein_reactions + compound_reactions))

# Report the size of each collection, one count per line.
for reaction_ids in (protein_reactions, compound_reactions, combined_reactions):
    print(len(reaction_ids))

7096
870
7227


In [21]:
# Show a sample of the combined reaction IDs.
print(combined_reactions[0:10])

[u'R-MMU-9014672', u'R-MMU-2130336', u'R-MMU-5694018', u'R-MMU-6813740', u'R-MMU-9014678', u'R-MMU-204485', u'R-MMU-202626', u'R-MMU-1254386', u'R-MMU-5693561', u'R-MMU-8939706']


This maps each reaction to the pathways it belongs to, restricted to pathways under the top-level biochemical ('Metabolism') pathway.

In [22]:
# Map every protein- or compound-derived reaction to its pathways.
pathway_mapping = reaction_to_metabolite_pathway(combined_reactions, species,
                                                 show_progress_bar=True,
                                                 last_pathway=True)
# Number of reactions that received a pathway entry.
print(len(pathway_mapping))
# Spot-check a single reaction.
# NOTE(review): this hard-coded key raises KeyError if the reaction is absent
# for another dataset or species.
print(pathway_mapping['R-MMU-2395768'])

1386
[{'pathway_id': u'R-MMU-975634', 'pathway_name': u'Retinoid metabolism and transport'}]


In [23]:
# Build a lookup from the local KEGG XML file.
# NOTE(review): the second argument ('id') presumably selects the key field of the
# resulting dict; confirm against produce_kegg_dict.
kegg_location = os.path.join(basedir, 'data/kegg/kegg.xml')
kegg_dict = produce_kegg_dict(kegg_location, 'id')

In [24]:
# Assemble the per-reaction summary table from the three mappings and the
# reaction -> pathway map.
reaction_df = get_reaction_df(transcript_mapping, protein_mapping, compound_mapping,
                              pathway_mapping, species)
# Rank by compound coverage first, then protein coverage, highest first.
# Reassigning instead of sorting in place keeps the cell free of hidden mutation.
reaction_df = reaction_df.sort_values(by=['compound_coverage', 'protein_coverage'],
                                      ascending=False)
print(reaction_df.shape)

(1386, 13)


In [25]:
# Display the ranked reaction table (rows appear in the sorted order set above).
reaction_df

Unnamed: 0,reaction_id,reaction_name,protein_coverage,compound_coverage,all_coverage,protein,observed_protein_count,all_protein_count,compound,observed_compound_count,all_compound_count,pathway_ids,pathway_names
270,R-MMU-70666,glutamate + L-glutamate gamma-semialdehyde <=>...,1.00,0.75,0.80,P29758 (ENSMUSG00000030934),1,1,C00025:C00026:C00077,3,4,R-MMU-70614,Amino acid synthesis and interconversion (tran...
344,R-MMU-507775,alanine + alpha-ketoglutarate <=> pyruvate + g...,1.00,0.75,0.80,Q8BGT5 (ENSMUSG00000031700),1,1,C00025:C00026:C00041,3,4,R-MMU-70614,Amino acid synthesis and interconversion (tran...
513,R-MMU-74213,Adenine + PRPP => AMP + PPi,1.00,0.75,0.80,P08030 (ENSMUSG00000006589),1,1,C00020:C00147:C00013,3,4,R-MMU-74217,Purine salvage
544,R-MMU-70524,pyruvate + glutamate <=> alanine + alpha-ketog...,1.00,0.75,0.80,Q8QZR5 (ENSMUSG00000022546),1,1,C00025:C00026:C00041,3,4,R-MMU-70614,Amino acid synthesis and interconversion (tran...
545,R-MMU-70523,alanine + alpha-ketoglutarate <=> pyruvate + g...,1.00,0.75,0.80,Q8QZR5 (ENSMUSG00000022546),1,1,C00025:C00026:C00041,3,4,R-MMU-70614,Amino acid synthesis and interconversion (tran...
954,R-MMU-70654,ornithine + alpha-ketoglutarate <=> glutamate ...,1.00,0.75,0.80,P29758 (ENSMUSG00000030934),1,1,C00025:C00026:C00077,3,4,R-MMU-70614,Amino acid synthesis and interconversion (tran...
1082,R-MMU-507749,pyruvate + glutamate <=> alanine + alpha-ketog...,1.00,0.75,0.80,Q8BGT5 (ENSMUSG00000031700),1,1,C00025:C00026:C00041,3,4,R-MMU-70614,Amino acid synthesis and interconversion (tran...
1360,R-MMU-70599,aspartate + glutamine + ATP <=> asparagine + g...,1.00,0.75,0.78,Q61024 (ENSMUSG00000029752),1,1,C00025:C00020:C00013:C00152:C00064:C00049,6,8,R-MMU-70614,Amino acid synthesis and interconversion (tran...
461,R-MMU-70560,carbamoyl phosphate + ornithine => citrulline ...,0.00,0.75,0.60,,0,1,C00009:C00077:C00327,3,4,R-MMU-70635,Urea cycle
704,R-MMU-893583,kynurenine + 2-oxoglutarate => 4-(2-aminopheny...,0.00,0.75,0.60,,0,1,C00025:C00328:C00026,3,4,R-MMU-71240,Tryptophan catabolism


In [26]:
# Export the ranked reaction table as tab-separated text in the working directory.
# The encoding is set explicitly because pandas on Python 2 defaults to ASCII and
# would raise on any non-ASCII character in the unicode name columns; on Python 3
# UTF-8 is already the default, so the output is unchanged there.
reaction_df.to_csv('reaction_df.tsv', sep='\t', index=False, encoding='utf-8')

How many unique pathways?

In [27]:
# Each `pathway_names` cell may list several pathway names joined by ':';
# flatten them into one list, then reduce to a sorted list of distinct names.
pathway_names = [
    name
    for names in reaction_df['pathway_names'].values
    for name in names.split(':')
]
unique_names = sorted(set(pathway_names))

In [28]:
# One row per distinct pathway name, saved to CSV in the working directory.
pathway_df = pd.DataFrame(unique_names, columns=['pathway_name'])
# Explicit UTF-8: pandas on Python 2 defaults to ASCII and would raise on
# non-ASCII pathway names; Python 3 already writes UTF-8.
pathway_df.to_csv('pathway_df.csv', index=False, encoding='utf-8')
print(pathway_df.shape)

(219, 1)


In [29]:
# Display the table of distinct pathway names (alphabetically sorted above).
pathway_df

Unnamed: 0,pathway_name
0,5-Phosphoribose 1-diphosphate biosynthesis
1,A tetrasaccharide linker sequence is required ...
2,AMPK inhibits chREBP transcriptional activatio...
3,Abacavir metabolism
4,Acetylation
5,Acetylcholine regulates insulin secretion
6,Acyl chain remodeling of CL
7,Acyl chain remodeling of DAG and TAG
8,Acyl chain remodelling of PC
9,Acyl chain remodelling of PE
