# Map Kevin's data onto Reactome pathways

In [1]:
import os
import glob

import pylab as plt
import matplotlib
from IPython.display import display, HTML

import numpy as np
from scipy.sparse import lil_matrix
import pandas as pd
from pandas import HDFStore

import seaborn as sns
from IPython.display import display

from reactome import ensembl_to_uniprot, uniprot_to_reaction

from ipywidgets import FloatProgress

%matplotlib inline

## Load data

In [2]:
# Experimental condition to load; the three values select the HDF5 file below.
time = 7
parasite = 'INFEC'
treatment = 'Unsorted'

filename = '../data/data_%d_%s_%s.h5' % (time, parasite, treatment)

# Open read-only so a wrong path fails immediately instead of silently creating
# an empty store, and use a context manager so the handle is closed even if
# the read raises. Compression options only matter when writing, so they are
# not needed here.
with HDFStore(filename, mode='r') as hdf:
    # Replace +/-inf with NaN, then NaN with 0, so the table has no
    # non-finite entries.
    pvt = hdf['peak_vs_transcript'].replace([np.inf, -np.inf], np.nan).fillna(0)
    # pvp = hdf['peak_vs_peak']
    # tvt = hdf['transcript_vs_transcript']

In [3]:
# Inspect the column labels of the peak-vs-transcript table loaded above.
pvt.columns

Index([u'ENSMUSG00000000001', u'ENSMUSG00000000028', u'ENSMUSG00000000031',
       u'ENSMUSG00000000049', u'ENSMUSG00000000056', u'ENSMUSG00000000058',
       u'ENSMUSG00000000078', u'ENSMUSG00000000085', u'ENSMUSG00000000088',
       u'ENSMUSG00000000093',
       ...
       u'ENSMUSG00000110391', u'ENSMUSG00000110393', u'ENSMUSG00000110397',
       u'ENSMUSG00000110399', u'ENSMUSG00000110404', u'ENSMUSG00000110405',
       u'ENSMUSG00000110410', u'ENSMUSG00000110414', u'ENSMUSG00000110419',
       u'ENSMUSG00000110424'],
      dtype='object', length=21794)

In [4]:
# Species name handed to the reactome mapping helpers in the cells below.
species = 'Mus musculus'

## Map Ensembl IDs to UniProt IDs

In [5]:
# Show the first column label as a sample identifier.
ens_id = pvt.columns[0]
# Call form of print: valid on both Python 2 and 3, with identical output.
print(ens_id)

ENSMUSG00000000001


In [6]:
# Report how many columns there are, then keep their labels as a plain list
# for the mapping call in the next cell.
# Call form of print: valid on both Python 2 and 3, with identical output.
print(len(pvt.columns))
ens_ids = pvt.columns.values.tolist()

21794


In [7]:
# Look up UniProt entries for the Ensembl IDs via the project's reactome helper.
# The result is used below as a dict keyed by Ensembl ID whose values are lists
# of {'url', 'protein_id'} records; IDs without a hit appear to be absent from
# the result (fewer keys than inputs) -- TODO confirm against the helper.
transcript_mapping = ensembl_to_uniprot(ens_ids, species, show_progress_bar=True)

In [8]:
# Preview the first ten mapped Ensembl IDs with their UniProt records.
# list(...) instead of .keys()[0:10]: dict views cannot be sliced on Python 3,
# while this form gives the same keys in the same order on Python 2.
ensembl_ids = list(transcript_mapping)[:10]
for ensembl_id in ensembl_ids:
    # A single formatted string prints identically on Python 2 and 3.
    print('%s %s' % (ensembl_id, transcript_mapping[ensembl_id]))

ENSMUSG00000002010 [{'url': u'http://www.uniprot.org/entry/P70404', 'protein_id': u'P70404'}]
ENSMUSG00000002015 [{'url': u'http://www.uniprot.org/entry/Q61335', 'protein_id': u'Q61335'}]
ENSMUSG00000027673 [{'url': u'http://www.uniprot.org/entry/Q9CQH3', 'protein_id': u'Q9CQH3'}]
ENSMUSG00000046027 [{'url': u'http://www.uniprot.org/entry/Q9EPQ7', 'protein_id': u'Q9EPQ7'}]
ENSMUSG00000046756 [{'url': u'http://www.uniprot.org/entry/Q80X85', 'protein_id': u'Q80X85'}]
ENSMUSG00000031805 [{'url': u'http://www.uniprot.org/entry/A0A0R4J0R7', 'protein_id': u'A0A0R4J0R7'}]
ENSMUSG00000035133 [{'url': u'http://www.uniprot.org/entry/E9PYT0', 'protein_id': u'E9PYT0'}]
ENSMUSG00000045482 [{'url': u'http://www.uniprot.org/entry/A0A1D5RLL4', 'protein_id': u'A0A1D5RLL4'}, {'url': u'http://www.uniprot.org/entry/D3YY11', 'protein_id': u'D3YY11'}, {'url': u'http://www.uniprot.org/entry/F7CGG2', 'protein_id': u'F7CGG2'}, {'url': u'http://www.uniprot.org/entry/E9PWT1', 'protein_id': u'E9PWT1'}, {'url': u'

In [9]:
# Tally how many Ensembl IDs received a mapping and how many of those are
# unambiguous.
total_mapped = len(transcript_mapping)
total_genes = len(ens_ids)

# "certain" collects the UniProt ID of every gene that maps to exactly one
# protein record.
certain = []
for ensembl_id, proteins in transcript_mapping.items():
    if len(proteins) == 1:
        certain.append(proteins[0]['protein_id'])

# Every other mapped gene is counted as uncertain.
uncertain = total_mapped - len(certain)
# Printed as: certain, uncertain, total mapped, total genes.
# A single formatted string prints identically on Python 2 and 3.
print('%d %d %d %d' % (len(certain), uncertain, total_mapped, total_genes))

5987 613 6600 21794


## Map UniProt IDs to Reactions

In [10]:
# Map the unambiguous UniProt IDs to reactions via the project's reactome helper.
# The result is indexed by UniProt ID below, each value being a list of
# {'reaction_id', 'reaction_name'} records.
protein_mapping = uniprot_to_reaction(certain, species, show_progress_bar=True)

In [11]:
# Preview the reactions found for the first ten unambiguous UniProt IDs.
for uniprot_id in certain[0:10]:
    # A single formatted string prints identically on Python 2 and 3.
    print('%s %s' % (uniprot_id, protein_mapping[uniprot_id]))
    # Blank separator line. A bare `print` is a no-op expression on Python 3
    # and `print()` would emit "()" on Python 2, so print an empty string.
    print('')

P70404 [{'reaction_id': u'R-MMU-70967', 'reaction_name': u'isocitrate + NAD+ => alpha-ketoglutarate + CO2 + NADH + H+ [IDH3]'}]

Q61335 [{'reaction_id': u'R-MMU-351894', 'reaction_name': u'Caspase mediated cleavage of BAP31'}]

Q9CQH3 [{'reaction_id': u'R-MMU-163217', 'reaction_name': u'Complex I oxidises NADH to NAD+, reduces CoQ to QH2'}, {'reaction_id': u'R-MMU-6799196', 'reaction_name': u'The MCIA complex, NDUFAF2-7 all dissociate from the 980kDa complex, resulting in Complex I'}, {'reaction_id': u'R-MMU-6799202', 'reaction_name': u'The 315kDa subcomplex binds the 370kDa subcomplex to form the 550kDa complex'}, {'reaction_id': u'R-MMU-6799179', 'reaction_name': u'Peripheral arm subunits bind the 815kDa complex to form a 980kDa complex'}, {'reaction_id': u'R-MMU-6799197', 'reaction_name': u'ND4, ND5 bind the 550kDa complex to form the 815kDa complex'}, {'reaction_id': u'R-MMU-6799191', 'reaction_name': u'Intermediate 2 binds MT-ND1:NDUFAF5:NDUFAF6 to form a 315kDa subcomplex'}, {'re

## Map KEGG Compound IDs to Reactions

Use RDKit to convert each compound's InChI string to its InChIKey

In [12]:
# import xmltodict
# kegg_location = '../data/kegg/kegg.xml'

# with open(kegg_location) as kegg_cmpd_file:
#     cmpd_dict = xmltodict.parse(kegg_cmpd_file.read())

In [20]:
# total = len(cmpd_dict['compounds']['compound'])
# f = FloatProgress(min=0, max=total)
# display(f)

# duplicates = []
# kegg_inchikey_to_compound = {}
# for compound in cmpd_dict['compounds']['compound']:
#     f.value += 1
#     inchi = compound['inchi']
#     if inchi is not None:
#         inchi_key = Chem.inchi.InchiToInchiKey(str(inchi))
#         if inchi_key in kegg_inchikey_to_compound:
#             print inchi_key            
#             print compound
#             print kegg_inchikey_to_compound[inchi_key]
#             print
#             duplicates.append(inchi_key)
#         else:
#             kegg_inchikey_to_compound[inchi_key] = compound   
            
# print len(kegg_inchikey_to_compound)
# print total

Load a dataframe containing the KEGG IDs of identified compounds only

In [17]:
# Read the peak table from CSV, using the first CSV column as the row index.
peak_df = pd.read_csv('../data/my_analysis_peaks.csv', index_col=0)

In [19]:
# Preview the first rows of the peak table.
peak_df.head()

Unnamed: 0_level_0,sec_id,mass,rt,polarity,c_id,formula,adduct,rc_id,compound,db,identifier
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
741583,1,147.076381,905.000626,positive,2825862,C5H10N2O3,M+H,3453657,L-Glutamine,kegg,C00064
741598,16,162.076139,891.966177,positive,2826034,C6H11NO4,M+H,3453882,L-2-Aminoadipate,kegg,C00956
741599,17,116.070596,771.412376,positive,2826041,C5H9NO2,M+H,3453891,L-Proline,kegg,C00148
741611,29,132.101919,677.261473,positive,2826103,C6H13NO2,M+H,3453976,L-Leucine,kegg,C00123
741680,98,132.101919,646.385592,positive,2826221,C6H13NO2,M+H,3454135,L-Leucine,kegg,C00123


In [24]:
# Collect the 'identifier' column as a plain list and order it; repeated
# identifiers are kept, one entry per row of the peak table.
compound_ids = peak_df['identifier'].values.tolist()
compound_ids.sort()

['C00009',
 'C00009',
 'C00013',
 'C00020',
 'C00020',
 'C00025',
 'C00025',
 'C00025',
 'C00026',
 'C00031',
 'C00037',
 'C00041',
 'C00041',
 'C00049',
 'C00049',
 'C00051',
 'C00051',
 'C00051',
 'C00064',
 'C00064',
 'C00064',
 'C00065',
 'C00065',
 'C00073',
 'C00073',
 'C00077',
 'C00077',
 'C00078',
 'C00078',
 'C00079',
 'C00079',
 'C00089',
 'C00093',
 'C00093',
 'C00095',
 'C00095',
 'C00099',
 'C00099',
 'C00121',
 'C00123',
 'C00123',
 'C00123',
 'C00140',
 'C00147',
 'C00147',
 'C00148',
 'C00148',
 'C00149',
 'C00152',
 'C00152',
 'C00152',
 'C00153',
 'C00178',
 'C00180',
 'C00180',
 'C00183',
 'C00183',
 'C00188',
 'C00188',
 'C00208',
 'C00216',
 'C00242',
 'C00245',
 'C00256',
 'C00257',
 'C00262',
 'C00262',
 'C00263',
 'C00263',
 'C00270',
 'C00270',
 'C00295',
 'C00299',
 'C00314',
 'C00314',
 'C00318',
 'C00327',
 'C00327',
 'C00327',
 'C00328',
 'C00329',
 'C00333',
 'C00334',
 'C00346',
 'C00346',
 'C00385',
 'C00417',
 'C00475',
 'C00475',
 'C00490',
 'C00539',