# Simple t-test and PALS pathway analysis

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pylab as plt
import matplotlib
from IPython.display import display, HTML, Image

import numpy as np
import pandas as pd
from sklearn import preprocessing
from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests

%matplotlib inline

In [3]:
from linker.models import Analysis
from linker.views.functions import get_last_analysis_data, get_groups, get_dataframes, get_standardized_df, \
    get_group_members
from linker.views.pipelines import GraphOmicsInference
from linker.constants import *


        MATCH (n:Species) RETURN n.displayName AS name order by name        
        

            MATCH (tp:TopLevelPathway)-[:hasEvent*]->(p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent)
            WHERE
                tp.displayName = 'Metabolism' AND
                tp.speciesName IN {species_list} AND
                (p)-[:hasEvent]->(rle)
            RETURN DISTINCT
                p.speciesName AS species_name,            
                p.displayName AS pathway_name,
                p.stId AS pathway_id                       
            ORDER BY species_name, pathway_name
        


### Load test data

In [4]:
# Primary key of the Analysis record loaded below; hard-coded for this test run.
analysis_id = 29

In [5]:
# Fetch the Analysis record first, then its most recent metabolomics data.
analysis = Analysis.objects.get(pk=analysis_id)
data_type = METABOLOMICS
analysis_data = get_last_analysis_data(analysis, data_type)

# Group information derived from the loaded analysis data.
groups = get_groups(analysis_data)

In [6]:
# Unpack the analysis data into two frames. The display below shows data_df holds
# one row per compound with the sample columns; design_df presumably describes the
# sample-to-group design -- confirm against get_dataframes.
data_df, design_df = get_dataframes(analysis_data, PKS)

In [7]:
# Show the measurement table for a visual sanity check.
data_df

Unnamed: 0_level_0,obs,compound_id,UN_1,UN4,INFEC_1,INFEC_2,INFEC_3,INFEC_4,HK1,HK2,HK3,HK4,padj_HK_vs_UN,FC_HK_vs_UN
compound_pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
C00001,False,H2o,,,,,,,,,,,,
C00002,False,Atp,,,,,,,,,,,,
C00003,False,Nad+,,,,,,,,,,,,
C00004,False,Nadh,,,,,,,,,,,,
C00005,False,Nadph,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C14767,False,9(s)-hode,,,,,,,,,,,,
C15519,False,25-hydroxycholesterol,,,,,,,,,,,,
C15613,False,"(25r)-3alpha,7alpha,12alpha-trihydroxy-5beta-c...",,,,,,,,,,,,
C18125,False,1-acyl-sn-glycero-3-phosphoserine,,,,,,,,,,,,


### Run t-test

In [8]:
# Group names passed to run_ttest below: `case` is compared against `control`.
case = 'HK'
control = 'UN'

In [9]:
# Run the case-vs-control t-test on the metabolomics data.
# NOTE(review): min_value=5000 is an unexplained constant; presumably a floor applied
# to low measurements before testing -- confirm against GraphOmicsInference.
wi = GraphOmicsInference(data_df, design_df, data_type, min_value=5000)
result_df = wi.run_ttest(case, control)
# The rendered result has one row per compound with `padj` and `log2FoldChange` columns.
result_df.head()

Unnamed: 0,padj,log2FoldChange
C00009,0.937964,0.03141
C00020,0.552928,-0.319297
C00025,0.552928,0.162212
C00037,0.552928,-0.380872
C00041,0.552928,-0.201749


### Run pathway analysis

In [11]:
# Standardize the data along `axis` and rebind data_df / design_df to the frames
# returned here, replacing the ones loaded earlier in the notebook.
# NOTE(review): the saved execution counts show this cell (In [11]) was last run AFTER
# the experimental-design cell that follows it (In [10]); re-run top to bottom to confirm
# the results do not depend on that order.
axis = 1
X_std, data_df, design_df = get_standardized_df(analysis_data, axis, pk_cols=PKS)

In [10]:
# Describe the experiment for PALS: the members of each group, plus the list of
# case-vs-control comparisons to evaluate.
experimental_design = {
    'comparisons': [],
    'groups': get_group_members(analysis_data)
}

# Comparisons are discovered from columns named 'padj_<case>_vs_<control>'.
# The prefix is stripped and the remainder is split on the '_vs_' separator (not on
# every underscore), so group names that themselves contain '_' are parsed correctly.
# Loop-local names are used so the `case` / `control` globals chosen for the t-test
# are not silently overwritten by this cell.
padj_prefix = 'padj_'
comparison_cols = [col for col in data_df.columns if col.lower().startswith(padj_prefix)]
for comparison_col in comparison_cols:
    comparison_name = comparison_col[len(padj_prefix):]
    comparison_case, separator, comparison_control = comparison_name.partition('_vs_')
    if not (comparison_case and separator and comparison_control):
        raise ValueError(
            "Cannot parse case/control from comparison column %r; "
            "expected 'padj_<case>_vs_<control>'" % comparison_col
        )
    experimental_design['comparisons'].append({
        'case': comparison_case,
        'control': comparison_control,
        'name': '%s_vs_%s' % (comparison_case, comparison_control)
    })

# PALS needs at least one comparison to compute p-values for.
assert len(experimental_design['comparisons']) > 0, 'no padj_* comparison columns found in data_df'

In [12]:
# One-column lookup table from positional row id to the entity id currently held
# in X_std's index.
formula_df = pd.DataFrame({'entity_id': X_std.index}).rename_axis('row_id')
formula_df.head()

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
0,C00009
1,C00020
2,C00025
3,C00037
4,C00041


In [13]:
# Replace X_std's index, in place, with positional row ids named 'row_id'.
# This discards the entity ids, so formula_df (built from X_std.index) must already
# have been created before this cell runs.
X_std.index = pd.RangeIndex(len(X_std), name='row_id')
X_std.head()

Unnamed: 0_level_0,UN_1,UN4,INFEC_1,INFEC_2,INFEC_3,INFEC_4,HK1,HK2,HK3,HK4
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.379082,-0.293206,1.156148,-1.336673,-1.306171,0.933622,-1.240671,1.533671,-0.198068,0.372265
1,-0.651944,0.925004,-1.40403,1.968888,0.773244,0.738862,-0.510392,-0.879616,-0.821076,-0.13894
2,-0.211874,-0.727576,-0.040952,-1.163665,2.554087,-0.639875,0.869042,-0.416545,0.19168,-0.414321
3,1.407874,0.396857,-0.928242,-0.443247,1.779877,-1.486203,0.537985,-0.889365,0.182788,-0.558325
4,0.080268,-0.69256,0.714309,0.242121,2.449944,0.303803,-0.325113,-1.030245,-0.583833,-1.158695


In [14]:
# Inspect the comparisons and group membership that will be handed to PALS.
experimental_design

{'comparisons': [{'case': 'HK', 'control': 'UN', 'name': 'HK_vs_UN'}],
 'groups': {'HK': array(['HK1', 'HK2', 'HK3', 'HK4'], dtype=object),
  'INFEC': array(['INFEC_1', 'INFEC_2', 'INFEC_3', 'INFEC_4'], dtype=object),
  'UN': array(['UN_1', 'UN4'], dtype=object)}}

In [15]:
import os
import sys

# Location of the PALS source checkout whose modules are imported in the next cell.
# Set the PALS_DIR environment variable to point at your own checkout; the fallback
# is the original author's local path and will not exist on other machines.
PALS_DIR = os.environ.get('PALS_DIR', 'C:\\Users\\joewa\\Work\\git\\PALS\\pals')

# Guarded so that re-running this cell does not append duplicate entries.
if PALS_DIR not in sys.path:
    sys.path.append(PALS_DIR)

In [16]:
from feature_extraction import DataSource
from pathway_analysis import PALS
from common import DATABASE_REACTOME_KEGG, REACTOME_SPECIES_MUS_MUSCULUS

In [17]:
# Choose the PALS database matching the compound database recorded on the analysis.
# database_name is left as None when the metadata value is neither KEGG nor ChEBI.
# NOTE(review): DATABASE_REACTOME_CHEBI is not among the names imported from `common`
# in this notebook; unless the `linker.constants` star import provides it, the ChEBI
# branch raises NameError -- confirm.
compound_database_str = analysis.metadata['compound_database_str']
if compound_database_str == COMPOUND_DATABASE_KEGG:
    database_name = DATABASE_REACTOME_KEGG
elif compound_database_str == COMPOUND_DATABASE_CHEBI:
    database_name = DATABASE_REACTOME_CHEBI
else:
    database_name = None

# Remaining PALS settings are read straight from the analysis metadata.
reactome_metabolic_pathway_only = analysis.metadata['metabolic_pathway_only']

# Only the first species in the list is used.
reactome_species = analysis.metadata['species_list'][0]

In [18]:
# Bundle the standardized values, the row-id -> entity-id table, the experimental design
# and the database settings into a PALS DataSource.
ds = DataSource(X_std, formula_df, experimental_design, database_name, reactome_species, reactome_metabolic_pathway_only)

2019-11-05 13:00:53.367 | DEBUG    | feature_extraction:__init__:39 - Loading C:\Users\joewa\Work\git\PALS\pals\data\reactome\metabolic_pathways\COMPOUND\Mus musculus.json.zip


In [19]:
# Run the PALS pathway analysis on the data source.
# standardize=False: presumably because X_std was already standardized by
# get_standardized_df -- confirm.
# NOTE(review): the log output shows the p-values are computed by resampling and no
# random seed is set in this notebook, so results may differ between runs unless PALS
# seeds internally -- confirm.
pals = PALS(ds)
pathway_df = pals.get_pathway_df(standardize=False)

2019-11-05 13:00:53.502 | DEBUG    | pathway_analysis:get_plage_activity_df:67 - Mean values of the rows in the DF is [ 0. -0.  0.  0. -0. -0. -0. -0. -0.  0. -0. -0. -0. -0.  0.  0. -0.  0.
  0.  0.  0. -0.  0. -0. -0.  0.]
2019-11-05 13:00:53.504 | DEBUG    | pathway_analysis:get_plage_activity_df:68 - Variance in the rows of the DF is [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1.]
2019-11-05 13:00:53.850 | INFO     | pathway_analysis:set_up_resample_plage_p_df:79 - Calculating plage p-values with resampling
2019-11-05 13:00:53.852 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:83 - Comparison HK_vs_UN
2019-11-05 13:00:53.853 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:91 - Resampling 0/1000
2019-11-05 13:00:53.947 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:91 - Resampling 100/1000
2019-11-05 13:00:54.038 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:91 - Resampling 200/1000
2019-11-05 13:00:54.128 | DE

In [20]:
# Inspect the per-pathway results (one row per pathway id).
pathway_df

Unnamed: 0,pw_name,HK_vs_UN p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,COMPOUND HK_vs_UN comb_p
R-MMU-74259,Purine catabolism,0.999771,30,5,16.67,0.010019,1.96,6.53,0.998557
R-MMU-156584,Cytosolic sulfonation of small molecules,0.994478,13,2,15.38,0.055565,0.85,6.54,0.985352
R-MMU-1362409,Mitochondrial iron-sulfur cluster biogenesis,0.922604,4,1,25.00,0.036353,0.26,6.50,0.851575
R-MMU-1474151,"Tetrahydrobiopterin (BH4) synthesis, recycling...",0.992057,15,1,6.67,0.281512,0.98,6.53,0.987816
R-MMU-9023661,Biosynthesis of E-series 18(R)-resolvins,0.299811,5,1,20.00,0.052359,0.33,6.60,0.202443
...,...,...,...,...,...,...,...,...,...
R-MMU-196836,Vitamin C (ascorbate) metabolism,0.299811,8,1,12.50,0.111352,0.52,6.50,0.225458
R-MMU-1660517,Synthesis of PIPs at the late endosome membrane,1.000000,7,1,14.29,0.090155,0.46,6.57,1.000000
R-MMU-75876,Synthesis of very long-chain fatty acyl-CoAs,0.957880,13,1,7.69,0.230832,0.85,6.54,0.939274
R-MMU-196741,"Cobalamin (Cbl, vitamin B12) transport and met...",1.000000,14,1,7.14,0.256115,0.92,6.57,1.000000


### Add the results to pathway data in GraphOmics

In [21]:
from linker.views.inference_view import copy_analysis_data

In [22]:
# Keep only the combined p-value columns (names ending in 'comb_p').
result_cols = [col for col in pathway_df.columns if col.endswith('comb_p')]

# Rename each column by dropping its last space-separated token ('comb_p') and joining
# the remaining tokens with '_'. Splitting on a single space yields empty tokens for
# consecutive spaces, which is presumably why the rendered name keeps a trailing
# underscore ('COMPOUND_HK_vs_UN_'); the cell that copies these values into json_data
# relies on that trailing underscore when it builds its keys.
pals_df = pathway_df[result_cols].rename(
    columns=lambda col: '_'.join(col.split(' ')[:-1]).strip()
)
pals_df.head()

Unnamed: 0,COMPOUND_HK_vs_UN_
R-MMU-74259,0.998557
R-MMU-156584,0.985352
R-MMU-1362409,0.851575
R-MMU-1474151,0.987816
R-MMU-9023661,0.202443


In [23]:
# Convert to a nested dict keyed by column name, then by pathway id:
# {comparison column: {pathway id: combined p-value}}.
pals_dict = pals_df.to_dict()
pals_dict

{'COMPOUND_HK_vs_UN_': {'R-MMU-74259': 0.9985565305225553,
  'R-MMU-156584': 0.9853521906815588,
  'R-MMU-1362409': 0.8515754176278264,
  'R-MMU-1474151': 0.9878158427587491,
  'R-MMU-9023661': 0.20244260346218607,
  'R-MMU-1614635': 0.9999999669703273,
  'R-MMU-9018896': 0.21842982856639698,
  'R-MMU-1483213': 0.9999999563715601,
  'R-MMU-389661': 0.7461458843542305,
  'R-MMU-189451': 0.7053854223601137,
  'R-MMU-2142691': 0.8302700823104572,
  'R-MMU-1483191': 0.9999999669703273,
  'R-MMU-1855231': 0.9999999065532109,
  'R-MMU-70895': 0.999201670655569,
  'R-MMU-177135': 0.12568120069526434,
  'R-MMU-71262': 0.6818610399965258,
  'R-MMU-191273': 0.999999988080512,
  'R-MMU-611105': 0.9236868068773897,
  'R-MMU-197264': 0.9999922850178483,
  'R-MMU-209968': 0.9000355933034272,
  'R-MMU-156590': 0.20244260346218607,
  'R-MMU-6798163': 0.6268710200024415,
  'R-MMU-70921': 0.999256807653691,
  'R-MMU-2408508': 0.9579692693054424,
  'R-MMU-1855183': 0.9999999563715601,
  'R-MMU-6783984': 

In [24]:
# Switch to the pathway data of the same analysis.
# Note: this rebinds `data_type` and `analysis_data`, so from here on they no longer
# refer to the metabolomics data loaded at the top of the notebook.
data_type = PATHWAYS
analysis_data = get_last_analysis_data(analysis, data_type)

In [25]:
# The stored pathway records: a list of dicts, one per pathway (see output below).
# The loop further down adds keys to these dicts in place.
json_data = analysis_data.json_data
json_data

[{'obs': None,
  'pathway_pk': 'R-MMU-1237112',
  'pathway_id': 'Methionine salvage pathway',
  'species': 'Mus musculus'},
 {'obs': None,
  'pathway_pk': 'R-MMU-1362409',
  'pathway_id': 'Mitochondrial iron-sulfur cluster biogenesis',
  'species': 'Mus musculus'},
 {'obs': None,
  'pathway_pk': 'R-MMU-1474151',
  'pathway_id': 'Tetrahydrobiopterin (bh4) synthesis, recycling, salvage and regulation',
  'species': 'Mus musculus'},
 {'obs': None,
  'pathway_pk': 'R-MMU-1482788',
  'pathway_id': 'Acyl chain remodelling of pc',
  'species': 'Mus musculus'},
 {'obs': None,
  'pathway_pk': 'R-MMU-1482801',
  'pathway_id': 'Acyl chain remodelling of ps',
  'species': 'Mus musculus'},
 {'obs': None,
  'pathway_pk': 'R-MMU-1482839',
  'pathway_id': 'Acyl chain remodelling of pe',
  'species': 'Mus musculus'},
 {'obs': None,
  'pathway_pk': 'R-MMU-1483166',
  'pathway_id': 'Synthesis of pa',
  'species': 'Mus musculus'},
 {'obs': None,
  'pathway_pk': 'R-MMU-1483191',
  'pathway_id': 'Synthesis 

In [26]:
# Attach the PALS p-value of every comparison to each pathway record, in place.
#
# The result key is the comparison name with surrounding whitespace and trailing
# underscores removed. rstrip('_') is used so that a name WITHOUT a trailing underscore
# keeps its final token (splitting on the last '_' would drop the control group name).
# Keys are resolved once here rather than once per pathway.
# NOTE(review): the saved output of the next cell still shows keys ending in '_', so it
# appears to predate this stripping -- re-run to refresh it.
comparison_results = [
    (comparison.strip().rstrip('_'), pals_results)
    for comparison, pals_results in pals_dict.items()
]

for pathway_dict in json_data:
    pathway_pk = pathway_dict[PATHWAY_PK]
    for key, pals_results in comparison_results:
        # Pathways not present in the dataset are absent from the PALS results; mark them NA.
        pathway_dict[key] = pals_results.get(pathway_pk, NA)

In [27]:
# Inspect the pathway records after the PALS values have been added.
json_data

[{'obs': None,
  'pathway_pk': 'R-MMU-1237112',
  'pathway_id': 'Methionine salvage pathway',
  'species': 'Mus musculus',
  'COMPOUND_HK_vs_UN_': 0.9041819583560438},
 {'obs': None,
  'pathway_pk': 'R-MMU-1362409',
  'pathway_id': 'Mitochondrial iron-sulfur cluster biogenesis',
  'species': 'Mus musculus',
  'COMPOUND_HK_vs_UN_': 0.8515754176278264},
 {'obs': None,
  'pathway_pk': 'R-MMU-1474151',
  'pathway_id': 'Tetrahydrobiopterin (bh4) synthesis, recycling, salvage and regulation',
  'species': 'Mus musculus',
  'COMPOUND_HK_vs_UN_': 0.9878158427587491},
 {'obs': None,
  'pathway_pk': 'R-MMU-1482788',
  'pathway_id': 'Acyl chain remodelling of pc',
  'species': 'Mus musculus',
  'COMPOUND_HK_vs_UN_': '-'},
 {'obs': None,
  'pathway_pk': 'R-MMU-1482801',
  'pathway_id': 'Acyl chain remodelling of ps',
  'species': 'Mus musculus',
  'COMPOUND_HK_vs_UN_': '-'},
 {'obs': None,
  'pathway_pk': 'R-MMU-1482839',
  'pathway_id': 'Acyl chain remodelling of pe',
  'species': 'Mus musculus',

In [28]:
# Hand the annotated pathway records to copy_analysis_data, labelled with the PALS
# database name and tagged as a PALS inference; the output below shows it returns an
# AnalysisData object (presumably a newly stored copy -- confirm).
copy_analysis_data(analysis_data, json_data, 'PALS: %s' % ds.database_name, analysis_data.metadata, INFERENCE_PALS)

<AnalysisData: AnalysisData object (233)>