# Maps Kevin's data onto reactome pathways

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import os
import glob
import json

import pylab as plt
import matplotlib
from IPython.display import display, HTML

import numpy as np
from scipy.sparse import lil_matrix
import pandas as pd
from pandas import HDFStore

import seaborn as sns
from IPython.display import display
    
from collections import defaultdict

from ipywidgets import FloatProgress

%matplotlib inline

In [5]:
import sys
sys.path.append('../linker')

from reactome import *

## Load data

In [6]:
def load_json(infile):
    with open(infile) as json_data:
        d = json.load(json_data)
        return d

In [7]:
base_dir = '../static/data/'
genes_df = pd.read_json(base_dir + 'genes_json.json')
proteins_df = pd.read_json(base_dir + 'proteins_json.json')
compounds_df = pd.read_json(base_dir + 'compounds_json.json')
reactions_df = pd.read_json(base_dir + 'reactions_json.json')
pathway_df = pd.read_json(base_dir + 'pathways_json.json')

gene_proteins_df = pd.read_json(base_dir + 'gene_proteins_json.json')
protein_reactions_df = pd.read_json(base_dir + 'protein_reactions_json.json')
compound_reactions_df = pd.read_json(base_dir + 'compound_reactions_json.json')
reaction_pathways_df = pd.read_json(base_dir + 'reaction_pathways_json.json')

### Number of known enzymes and compounds per reactions

In [66]:
merged_df = pd.merge(protein_reactions_df, compound_reactions_df, on='reaction_pk', how='outer')
display(merged_df.head())

count_df1 = merged_df.groupby(['reaction_pk'])['protein_pk'].size().reset_index()
count_df1 = count_df1.rename({'protein_pk': 'E'}, axis='columns')
display(count_df1.head())

count_df2 = merged_df.groupby(['reaction_pk'])['compound_pk'].size().reset_index()
count_df2 = count_df2.rename({'compound_pk': 'C'}, axis='columns')
display(count_df2.head())

Unnamed: 0,protein_pk,reaction_pk,compound_pk
0,Q99N16,R-MMU-211904,16474
1,Q99N16,R-MMU-211904,15843
2,Q99N16,R-MMU-211904,15378
3,Q99N16,R-MMU-211904,15379
4,Q99N16,R-MMU-211904,63590


Unnamed: 0,reaction_pk,E
0,-,4
1,R-MMU-109278,11
2,R-MMU-109291,9
3,R-MMU-109380,14
4,R-MMU-109387,16


Unnamed: 0,reaction_pk,C
0,-,4
1,R-MMU-109278,11
2,R-MMU-109291,9
3,R-MMU-109380,14
4,R-MMU-109387,16


In [63]:
count_df = pd.merge(count_df1, count_df2, on='reaction_pk', how='outer')
display(count_df)

Unnamed: 0,reaction_pk,protein_hits,compound_hits
0,-,1,1
1,R-MMU-109278,1,11
2,R-MMU-109291,1,9
3,R-MMU-109380,1,14
4,R-MMU-109387,1,16
5,R-MMU-109415,1,5
6,R-MMU-109449,1,13
7,R-MMU-109624,1,7
8,R-MMU-109903,1,6
9,R-MMU-110137,4,9


### Number of known enzymes per pathway

In [18]:
reaction_pathways_df.head()

Unnamed: 0,pathway_pk,reaction_pk
0,R-MMU-211935,R-MMU-211904
1,R-MMU-6799198,R-MMU-6799202
2,R-MMU-193775,R-MMU-193727
3,R-MMU-196783,R-MMU-196754
4,R-MMU-6799198,R-MMU-6800868


In [19]:
protein_reactions_df.head()

Unnamed: 0,protein_pk,reaction_pk
0,Q99N16,R-MMU-211904
1,G3UW81,R-MMU-211904
2,Q9EP75,R-MMU-211904
3,Q99N18,R-MMU-211904
4,Q8VCA4,R-MMU-211904


In [75]:
merged_df = pd.merge(reaction_pathways_df, protein_reactions_df, on='reaction_pk', how='inner')
# merged_df = merged_df[merged_df['protein_pk'].isin(known_protein_ids)]

count_df = merged_df.groupby(['pathway_pk'])['protein_pk'].size().reset_index()
count_df = count_df.rename({'protein_pk': 'enzyme_count'}, axis='columns')

display(count_df)

Unnamed: 0,pathway_pk,enzyme_count
0,-,2
1,R-MMU-1237112,2
2,R-MMU-1474151,3
3,R-MMU-1482801,1
4,R-MMU-1483166,14
5,R-MMU-1483191,2
6,R-MMU-1483213,7
7,R-MMU-156581,2
8,R-MMU-156584,23
9,R-MMU-156590,19


In [56]:
count_df

<pandas.core.groupby.DataFrameGroupBy object at 0x10a365518>

### Hypergeometric test

In [None]:
pw_f, pathway_id_to_name = get_all_pathways_formulae(species)

In [None]:
detected = set(peak_df[['formula']].values.flatten())
print(detected, len(detected))

In [None]:
data = []
for pathway_id in pw_f:
    
    pathway_name = pathway_id_to_name[pathway_id]
    
    formulae = pw_f[pathway_id]
    formulae_count = len(formulae)
    formulae_str = ','.join(sorted(formulae))
    
    detected_f = set([x for x in formulae if x in detected])
    detected_count = len(detected_f)
    detected_str = ','.join(sorted(detected_f))
    
    row = [pathway_id, pathway_name, formulae_str, formulae_count, detected_str, detected_count]
    data.append(row)
    
all_pathway_df = pd.DataFrame(data, columns=[
    'pathway_id', 'pathway_name', 
    'formula', 'formula_count',
    'detected', 'detected_count'])
all_pathway_df.set_index('pathway_id', drop=True, inplace=True)

Compute hypergeometric p-values

In [None]:
from scipy.stats import hypergeom

In [None]:
# M = the number of unique formula in all pathways in Reactome
M = len(set(','.join(all_pathway_df['formula'].values).split(',')))

# N = the number of unique formula in all pathways in the dataset
N = len(set(','.join(all_pathway_df['detected'].values).split(',')))

SMOOTHING = 1

data = []
for idx, row in all_pathway_df.iterrows():
        
    # k = the number of unique formula in the pathway of interest in the dataset
    k = row['detected_count']
    
    # n = the number of unique formula in the pathway of interest
    n = row['formula_count'] + SMOOTHING
        
    p_value = hypergeom.sf(k, M, n, N)
    assert p_value > 0
    new_row = [idx, p_value]
    data.append(new_row)

p_value_df = pd.DataFrame(data, columns=[
    'pathway_id', 'p_value'])
p_value_df.set_index('pathway_id', drop=True, inplace=True)

In [None]:
combined = pd.concat([all_pathway_df, p_value_df], axis=1)
combined = combined.sort_values(by='p_value', ascending=True).reset_index(drop=True)
display(combined)
combined.to_csv('pathway_df.csv', index=False, encoding='utf-8')