# Map Kevin's data onto Reactome pathways

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import glob

import pylab as plt
import matplotlib
from IPython.display import display, HTML

import numpy as np
from scipy.sparse import lil_matrix
import pandas as pd
from pandas import HDFStore

import seaborn as sns
from IPython.display import display
    
from collections import defaultdict

from ipywidgets import FloatProgress

%matplotlib inline

In [3]:
import sys
sys.path.append('../linker')

from reactome import ensembl_to_uniprot, uniprot_to_reaction, compound_to_reaction, \
    reaction_to_metabolite_pathway, get_reaction_ids, get_reaction_df, get_reaction_entities, \
    produce_kegg_dict, get_all_pathways_formulae


## Load data

In [4]:
# Root directory holding the analysis inputs. Set the OMICS_INTEGRATION_DIR
# environment variable to run the notebook on another machine; the original
# local path is kept as the fallback so existing setups keep working.
basedir = os.environ.get(
    'OMICS_INTEGRATION_DIR',
    '/Users/joewandy/Dropbox/Analysis/omics_integration')

In [5]:
# Experimental condition that selects which pre-computed HDF5 file is loaded.
time = 7
parasite = 'INFEC'
treatment = 'Unsorted'

filename = os.path.join(basedir, 'data/data_%d_%s_%s.h5' % (time, parasite, treatment))

# Open the store read-only inside a context manager: a missing file then fails
# immediately instead of being created empty, and the handle is closed even if
# reading raises. Compression options only matter when writing, so none are set.
with HDFStore(filename, mode='r') as hdf:
    # Turn +/-inf into NaN, then replace every NaN with 0.
    pvt = hdf['peak_vs_transcript'].replace([np.inf, -np.inf], np.nan).fillna(0)
    # pvp = hdf['peak_vs_peak']
    # tvt = hdf['transcript_vs_transcript']

In [6]:
# Inspect the column labels of the peak-vs-transcript table loaded above.
pvt.columns

Index(['ENSMUSG00000000001', 'ENSMUSG00000000028', 'ENSMUSG00000000031',
       'ENSMUSG00000000049', 'ENSMUSG00000000056', 'ENSMUSG00000000058',
       'ENSMUSG00000000078', 'ENSMUSG00000000085', 'ENSMUSG00000000088',
       'ENSMUSG00000000093',
       ...
       'ENSMUSG00000110391', 'ENSMUSG00000110393', 'ENSMUSG00000110397',
       'ENSMUSG00000110399', 'ENSMUSG00000110404', 'ENSMUSG00000110405',
       'ENSMUSG00000110410', 'ENSMUSG00000110414', 'ENSMUSG00000110419',
       'ENSMUSG00000110424'],
      dtype='object', length=21794)

In [7]:
# Species name passed to the reactome helper calls in the cells below.
species = 'Mus musculus'

## Map Ensembl IDs to UniProt IDs

In [8]:
# Take the first column label of pvt as an example identifier and show it.
ens_id = pvt.columns[0]
print(ens_id)

ENSMUSG00000000001


In [9]:
# Collect every column label of pvt as a plain Python list and report how many there are.
ens_ids = pvt.columns.values.tolist()
print(len(ens_ids))

21794


In [10]:
# Map the Ensembl IDs to UniProt IDs for the chosen species. Judging by the
# preview printed in the next cell, transcript_mapping maps each Ensembl ID to
# a list of UniProt IDs.
# NOTE: id_to_names is reassigned by the later uniprot_to_reaction and
# compound_to_reaction calls, so the value returned here is not kept.
transcript_mapping, id_to_names = ensembl_to_uniprot(ens_ids, species)

In [11]:
# Preview the first ten mapped Ensembl IDs together with their UniProt IDs.
ensembl_ids = list(transcript_mapping)[:10]
for ensembl_id in ensembl_ids:
    print(ensembl_id, transcript_mapping[ensembl_id])

ENSMUSG00000006235 ['P14753']
ENSMUSG00000026390 ['Q60754']
ENSMUSG00000025044 ['P30204']
ENSMUSG00000042286 ['G3X973', 'F7CT68', 'F7BK35']
ENSMUSG00000035279 ['Q8BV57']
ENSMUSG00000032115 ['Q9JKR6']
ENSMUSG00000029657 ['Q61699']
ENSMUSG00000038188 ['Q5ND28']
ENSMUSG00000008845 ['Q2VLH6']
ENSMUSG00000030895 ['Q91X72']


In [12]:
# Summarise the mapping: how many Ensembl IDs map to exactly one UniProt ID
# ("certain") versus any other number ("uncertain").
total_genes = len(ens_ids)
total_mapped = len(transcript_mapping)

# Keep the single UniProt ID of every unambiguous mapping.
certain = [
    transcript_mapping[ensembl_id][0]
    for ensembl_id in transcript_mapping
    if len(transcript_mapping[ensembl_id]) == 1
]

uncertain = total_mapped - len(certain)
print(len(certain), uncertain, total_mapped, total_genes)

5987 613 6600 21794


Fetch human-friendly gene names from the Ensembl online service

In [13]:
def batch(iterable, n=1):
    """Yield successive slices of ``iterable`` holding at most ``n`` items each.

    Args:
        iterable: A sequence supporting ``len()`` and slicing (e.g. a list).
        n: Maximum number of items per slice; must be at least 1.

    Yields:
        Consecutive slices of ``iterable`` in order; the last one may be
        shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts.
    """
    if n < 1:
        raise ValueError('batch size n must be at least 1, got %r' % (n,))
    total = len(iterable)
    for start in range(0, total, n):
        # Slicing clamps at the end of the sequence, so no explicit min() is needed.
        yield iterable[start:start + n]

In [14]:
from bioservices import Ensembl

# Look up metadata for every mapped Ensembl ID, sending the IDs to the service
# in chunks of 1000 and merging each response into one dictionary.
ens = Ensembl()
ensembl_ids = list(transcript_mapping)

ensembl_lookup = {}
for id_chunk in batch(ensembl_ids, 1000):
    batch_ids = list(id_chunk)
    print(len(batch_ids))
    lookup = ens.post_lookup_by_id(identifiers=batch_ids)
    ensembl_lookup.update(lookup)

# Show one record as a sanity check of what the lookup returns.
ensembl_lookup['ENSMUSG00000002100']

1000
1000
1000
1000
1000
1000
600


{'assembly_name': 'GRCm38',
 'biotype': 'protein_coding',
 'db_type': 'core',
 'description': 'myosin binding protein C, cardiac [Source:MGI Symbol;Acc:MGI:102844]',
 'display_name': 'Mybpc3',
 'end': 91136516,
 'id': 'ENSMUSG00000002100',
 'logic_name': 'ensembl_havana_gene',
 'object_type': 'Gene',
 'seq_region_name': '2',
 'source': 'ensembl_havana',
 'species': 'mus_musculus',
 'start': 91118144,
 'strand': 1,
 'version': 15}

## Map UniProt IDs to Reactions

In [15]:
# Find the reactions for the UniProt IDs that were mapped unambiguously (`certain`).
# NOTE: this overwrites the id_to_names returned by ensembl_to_uniprot earlier.
protein_mapping, id_to_names = uniprot_to_reaction(certain, species)

In [16]:
# Show the reactions found for the first three unambiguously mapped proteins.
for uniprot_id in certain[0:3]:
    print(uniprot_id, protein_mapping[uniprot_id])
    # Emit a blank separator line; a bare `print` is a no-op in Python 3.
    print()

P14753 [{'reaction_id': 'R-MMU-209310', 'reaction_name': 'Murine JAK2 binds to the Erythropoietin receptor, EpoR'}]
Q60754 [{'reaction_id': 'R-MMU-2247510', 'reaction_name': 'MARCO:ligand is endocytosed'}, {'reaction_id': 'R-MMU-2173783', 'reaction_name': 'Marco binds ligands'}]
P30204 [{'reaction_id': 'R-MMU-2507854', 'reaction_name': 'MSR1:ligand (SCARA1:ligand, SR-A:ligand) is endocytosed'}, {'reaction_id': 'R-MMU-2173779', 'reaction_name': 'Msr1 (Scara1) Binds Ligands'}]


Fetch human-friendly protein names from the UniProt online service

In [17]:
from bioservices import UniProt

# Retrieve the UniProt entries of all unambiguously mapped proteins and report
# how many results came back.
uniprot_ids = certain
uniprot = UniProt()
res = uniprot.retrieve(uniprot_ids)
print(len(res))

5987


In [18]:
# Build {accession: {'display_name': label}} from the retrieved UniProt entries.
# The label is the short name when the entry has one, otherwise the full name.
protein_metadata = {}
for r in res:
    for key in r['accession']:
        protein_id = key.contents[0]
        for x in r['recommendedname']:
            short_tag = x.find('shortname')
            tag = short_tag if short_tag is not None else x.find('fullname')
            label = tag.contents[0]
            protein_metadata[protein_id] = {'display_name': label}

In [19]:
# Print only the first entry of protein_metadata as a sanity check.
for protein_id, metadata in protein_metadata.items():
    print(protein_id, metadata)
    break

P14753 {'display_name': 'EPO-R'}


## Map KEGG Compound IDs to Reactions

Load a dataframe containing the KEGG IDs of identified compounds only

In [20]:
# Load the identified-peaks table; the first CSV column becomes the index.
peak_df = pd.read_csv(os.path.join(basedir, 'data/my_analysis_peaks.csv'), index_col=0)

In [21]:
# Number of rows and columns in the peaks table.
peak_df.shape

(109, 11)

In [22]:
# Preview the first rows of the peaks table.
peak_df.head()

Unnamed: 0_level_0,sec_id,mass,rt,polarity,c_id,formula,adduct,rc_id,compound,db,identifier
pid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
741583,1,147.076381,905.000626,positive,2825862,C5H10N2O3,M+H,3453657,L-Glutamine,kegg,C00064
741598,16,162.076139,891.966177,positive,2826034,C6H11NO4,M+H,3453882,L-2-Aminoadipate,kegg,C00956
741599,17,116.070596,771.412376,positive,2826041,C5H9NO2,M+H,3453891,L-Proline,kegg,C00148
741611,29,132.101919,677.261473,positive,2826103,C6H13NO2,M+H,3453976,L-Leucine,kegg,C00123
741680,98,132.101919,646.385592,positive,2826221,C6H13NO2,M+H,3454135,L-Leucine,kegg,C00123


In [23]:
# Sorted list of the distinct values in the 'identifier' column.
unique_identifiers = set(peak_df['identifier'].values.tolist())
compound_ids = sorted(unique_identifiers)

In [24]:
# Find the reactions for the compound identifiers.
# NOTE(review): the recorded run of this cell stopped with
# "UnboundLocalError: local variable 'session' referenced before assignment",
# so compound_mapping was never created and the cells below show no output.
# The error appears to originate inside compound_to_reaction — TODO confirm the cause.
compound_mapping, id_to_names = compound_to_reaction(compound_ids, species)

UnboundLocalError: local variable 'session' referenced before assignment

In [None]:
# Preview the first ten keys of compound_mapping.
print(list(compound_mapping)[:10])

In [None]:
# Dump the complete compound mapping as plain text.
print(compound_mapping)

## Map Reactions to Pathways

In [None]:
# De-duplicated reaction IDs reached via proteins, via compounds, and via either.
protein_reactions = list(set(get_reaction_ids(protein_mapping)))
compound_reactions = list(set(get_reaction_ids(compound_mapping)))
combined_reactions = list(set(protein_reactions + compound_reactions))

# Report the size of each list, in the order they were built.
for reaction_ids in (protein_reactions, compound_reactions, combined_reactions):
    print(len(reaction_ids))

In [None]:
# Preview the first ten combined reaction IDs.
print(combined_reactions[0:10])

This maps reactions to the pathways that sit under the top-level biochemical ('Metabolism') pathway.

In [None]:
# Map every combined reaction to its pathways. What `leaf=True` selects is
# defined in the reactome helper module, which is not visible here.
# NOTE(review): the lookup of 'R-MMU-2395768' assumes that reaction is present
# in pathway_mapping; presumably it fails otherwise — TODO confirm.
pathway_mapping = reaction_to_metabolite_pathway(combined_reactions, species, show_progress_bar=True, leaf=True)
print(len(pathway_mapping))
print(pathway_mapping['R-MMU-2395768'])

In [None]:
# Build a dictionary from the local KEGG XML file, keyed as requested by 'id'.
# NOTE(review): kegg_dict is not referenced again in the visible part of this notebook.
kegg_location = os.path.join(basedir, 'data/kegg/kegg.xml')
kegg_dict = produce_kegg_dict(kegg_location, 'id')

In [None]:
# Assemble the per-reaction table and order it by compound coverage, then
# protein coverage, both descending.
reaction_df = get_reaction_df(
    transcript_mapping, protein_mapping, compound_mapping, pathway_mapping, species
).sort_values(by=['compound_coverage', 'protein_coverage'], ascending=False)
print(reaction_df.shape)

In [None]:
# Display the sorted reaction table.
reaction_df

In [None]:
# Save the reaction table as a tab-separated file in the current working
# directory, without the index column.
reaction_df.to_csv('reaction_df.tsv', sep='\t', index=False)

### How many unique pathways are linked to reactions?

In [None]:
# Raw 'pathway_names' strings, one per reaction; a later cell splits them on ':'.
reaction_df['pathway_names'].values

In [None]:
# Raw 'pathway_ids' strings, one per reaction; a later cell splits them on ':'.
reaction_df['pathway_ids'].values

In [None]:
# Collect a pathway id -> pathway name lookup. Each row stores its ids and
# names as ':'-separated strings, paired up position by position.
pathway_ids_to_names = {}
for joined_ids, joined_names in zip(reaction_df['pathway_ids'].values,
                                    reaction_df['pathway_names'].values):
    pathway_ids_to_names.update(zip(joined_ids.split(':'), joined_names.split(':')))

In [None]:
# One row per unique pathway, with lower-cased names, ordered alphabetically by name.
data = [(pid, name.lower()) for pid, name in pathway_ids_to_names.items()]

pathway_df = (
    pd.DataFrame(data, columns=['pathway_id', 'pathway_name'])
    .sort_values(by='pathway_name')
    .reset_index(drop=True)
)
display(pathway_df)

### Hypergeometric test

In [None]:
# Fetch, for the chosen species, the formulae of each pathway (pw_f, keyed by
# pathway id) and a pathway id -> name lookup; both are used in the cells below.
pw_f, pathway_id_to_name = get_all_pathways_formulae(species)

In [None]:
# Distinct values of the 'formula' column of the peaks table.
formula_values = peak_df[['formula']].to_numpy().ravel()
detected = set(formula_values)
print(detected, len(detected))

In [None]:
# For every pathway record its formulae and the subset found in `detected`,
# both as a sorted comma-joined string and as a count.
data = []
for pathway_id, formulae in pw_f.items():
    pathway_name = pathway_id_to_name[pathway_id]
    detected_f = set(formulae) & detected
    data.append([
        pathway_id,
        pathway_name,
        ','.join(sorted(formulae)),
        len(formulae),
        ','.join(sorted(detected_f)),
        len(detected_f),
    ])

# Index the table by pathway id.
all_pathway_df = pd.DataFrame(
    data,
    columns=['pathway_id', 'pathway_name',
             'formula', 'formula_count',
             'detected', 'detected_count'],
).set_index('pathway_id', drop=True)

Compute hypergeometric p-values

In [None]:
from scipy.stats import hypergeom

In [None]:
def _unique_formulae(joined_values):
    """Return the distinct non-empty formulae found in comma-joined strings.

    Empty strings (a pathway with nothing to list) are skipped so they are not
    counted as a formula.
    """
    return {f for joined in joined_values for f in joined.split(',') if f}


# M = the number of unique formulae in all pathways (population size)
M = len(_unique_formulae(all_pathway_df['formula'].values))

# N = the number of unique detected formulae in all pathways (number of draws)
N = len(_unique_formulae(all_pathway_df['detected'].values))

# Added to each pathway's formula count before testing.
# NOTE(review): presumably introduced to avoid zero p-values; it may no longer
# be needed now that the tail probability includes k itself — confirm.
SMOOTHING = 1

data = []
for idx, row in all_pathway_df.iterrows():

    # k = the number of unique detected formulae in the pathway of interest
    k = row['detected_count']

    # n = the number of unique formulae in the pathway of interest (smoothed),
    # capped at M because a pathway cannot hold more formulae than the population
    n = min(row['formula_count'] + SMOOTHING, M)

    # hypergeom.sf(x, ...) is P(X > x). The enrichment p-value is the chance of
    # seeing at least k detected formulae, P(X >= k), hence the k - 1.
    p_value = hypergeom.sf(k - 1, M, n, N)
    assert p_value > 0
    data.append([idx, p_value])

p_value_df = pd.DataFrame(data, columns=[
    'pathway_id', 'p_value'])
p_value_df.set_index('pathway_id', drop=True, inplace=True)

In [None]:
# Join the per-pathway counts with their p-values (both indexed by pathway id)
# and order by increasing p-value.
combined = pd.concat([all_pathway_df, p_value_df], axis=1)
combined = (
    combined
    .sort_values(by='p_value', ascending=True)
    # Turn the pathway id index back into a column instead of discarding it,
    # so each row of the displayed and saved table can be traced to its pathway.
    .rename_axis('pathway_id')
    .reset_index()
)
display(combined)
combined.to_csv('pathway_df.csv', index=False, encoding='utf-8')