# Simple t-test and PALS pathway analysis

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pylab as plt
import matplotlib
from IPython.display import display, HTML, Image

import numpy as np
import pandas as pd
from sklearn import preprocessing
from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests

%matplotlib inline

In [3]:
from linker.models import Analysis
from linker.views.functions import get_last_analysis_data, get_groups, get_dataframes, get_standardized_df, \
    get_group_members
from linker.views.pipelines import WebOmicsInference
from linker.constants import *


        MATCH (n:Species) RETURN n.displayName AS name order by name        
        

            MATCH (tp:TopLevelPathway)-[:hasEvent*]->(p:Pathway)-[:hasEvent*]->(rle:ReactionLikeEvent)
            WHERE
                tp.displayName = 'Metabolism' AND
                tp.speciesName IN {species_list} AND
                (p)-[:hasEvent]->(rle)
            RETURN DISTINCT
                p.speciesName AS species_name,            
                p.displayName AS pathway_name,
                p.stId AS pathway_id                       
            ORDER BY species_name, pathway_name
        


### Load test data

In [4]:
# Primary key of the Analysis record that the next cell fetches.
analysis_id = 25

In [5]:
# Work on the metabolomics data of the selected analysis.
# METABOLOMICS presumably comes from the star import of linker.constants - TODO confirm.
data_type = METABOLOMICS
# Fetch the Analysis row whose primary key is analysis_id.
analysis = Analysis.objects.get(pk=analysis_id)
# Going by the helper's name, this is the most recent data stored for the
# analysis and data type - confirm against linker.views.functions.
analysis_data = get_last_analysis_data(analysis, data_type)
# NOTE(review): `groups` is not used again in the visible part of this notebook.
groups = get_groups(analysis_data)

In [6]:
# Unpack the two frames returned by get_dataframes for this analysis data.
# PKS presumably comes from the star import of linker.constants - TODO confirm.
data_df, design_df = get_dataframes(analysis_data, PKS)

In [7]:
# Display the data frame returned by get_dataframes.
data_df

Unnamed: 0_level_0,obs,compound_id,UN_1,UN4,INFEC_1,INFEC_2,INFEC_3,INFEC_4,HK1,HK2,HK3,HK4,padj_INFEC_vs_UN,FC_INFEC_vs_UN
compound_pk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
C00001,False,H2o,,,,,,,,,,,,
C00002,False,Atp,,,,,,,,,,,,
C00003,False,Nad+,,,,,,,,,,,,
C00004,False,Nadh,,,,,,,,,,,,
C00005,False,Nadph,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C14767,False,9(s)-hode,,,,,,,,,,,,
C15519,False,25-hydroxycholesterol,,,,,,,,,,,,
C15613,False,"(25r)-3alpha,7alpha,12alpha-trihydroxy-5beta-c...",,,,,,,,,,,,
C18125,False,1-acyl-sn-glycero-3-phosphoserine,,,,,,,,,,,,


### Run t-test

In [8]:
# Group labels handed to run_ttest below: 'HK' as the case, 'UN' as the control.
case = 'HK'
control = 'UN'

In [9]:
# Build the inference helper from the loaded frames and run the case-vs-control t-test.
# NOTE(review): min_value=5000 is an unexplained threshold; the same figure is
# later passed to PALS as min_replace - confirm the two are meant to match.
wi = WebOmicsInference(data_df, design_df, data_type, min_value=5000)
result_df = wi.run_ttest(case, control)
# Show the first rows of the t-test result.
result_df.head()

Unnamed: 0,padj,log2FoldChange
C00009,0.937964,0.03141
C00020,0.552928,-0.319297
C00025,0.552928,0.162212
C00037,0.552928,-0.380872
C00041,0.552928,-0.201749


### Run pathway analysis

In [10]:
# `axis` is passed straight to get_standardized_df; which direction the
# standardisation runs along is decided inside that helper - TODO confirm.
axis = 1
# NOTE: this rebinds data_df and design_df, replacing the frames that were
# loaded by get_dataframes earlier in the notebook.
X_std, data_df, design_df = get_standardized_df(analysis_data, axis, pk_cols=PKS)

In [11]:
# One row per entity: the current index labels of X_std become the
# 'entity_id' column, and the fresh positional index is labelled 'row_id'.
formula_df = pd.DataFrame({'entity_id': X_std.index})
formula_df.index.name = 'row_id'
formula_df.head()

Unnamed: 0_level_0,entity_id
row_id,Unnamed: 1_level_1
0,C00009
1,C00020
2,C00025
3,C00037
4,C00041


In [12]:
# Swap the entity-id index of X_std for positional row ids named 'row_id'
# (the entity ids themselves were copied into formula_df['entity_id']).
# Rebinding to a new frame, rather than mutating with inplace=True, leaves the
# object returned by get_standardized_df untouched for anything else that
# still holds a reference to it.
X_std = X_std.reset_index(drop=True).rename_axis('row_id')
X_std.head()

Unnamed: 0_level_0,UN_1,UN4,INFEC_1,INFEC_2,INFEC_3,INFEC_4,HK1,HK2,HK3,HK4
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.379082,-0.293206,1.156148,-1.336673,-1.306171,0.933622,-1.240671,1.533671,-0.198068,0.372265
1,-0.651944,0.925004,-1.40403,1.968888,0.773244,0.738862,-0.510392,-0.879616,-0.821076,-0.13894
2,-0.211874,-0.727576,-0.040952,-1.163665,2.554087,-0.639875,0.869042,-0.416545,0.19168,-0.414321
3,1.407874,0.396857,-0.928242,-0.443247,1.779877,-1.486203,0.537985,-0.889365,0.182788,-0.558325
4,0.080268,-0.69256,0.714309,0.242121,2.449944,0.303803,-0.325113,-1.030245,-0.583833,-1.158695


In [13]:
import re
import warnings

# Comparison columns look like 'padj_<case>_vs_<control>'. The 'padj_' prefix
# and the '_vs_' separator are matched case-insensitively; the two group names
# are captured verbatim, so names that themselves contain underscores are kept
# whole instead of being cut at a fixed token position.
_PADJ_COLUMN = re.compile(r'padj_(.+?)_vs_(.+)', re.IGNORECASE)

# Experimental design handed to the PALS DataSource: the group membership of
# the samples plus one entry per case-vs-control comparison found in data_df.
experimental_design = {
    'comparisons': [],
    'groups': get_group_members(analysis_data)
}

# populate comparison values
# Only string labels can be comparison columns; the isinstance guard keeps
# non-string column labels from raising on .lower().
comparison_cols = [col for col in data_df.columns
                   if isinstance(col, str) and col.lower().startswith('padj_')]
for comparison_col in comparison_cols:
    parsed = _PADJ_COLUMN.fullmatch(comparison_col)
    if parsed is None:
        # A 'padj_' column without a '<case>_vs_<control>' tail cannot be
        # turned into a comparison; report it and move on rather than fail
        # with an IndexError.
        warnings.warn('Skipping column %r: expected padj_<case>_vs_<control>'
                      % comparison_col)
        continue
    # NOTE: this rebinds the notebook-level `case` / `control` names that were
    # set before the t-test cell.
    case, control = parsed.groups()
    experimental_design['comparisons'].append({
        'case': case,
        'control': control,
        'name': '%s_vs_%s' % (case, control)
    })

In [14]:
import os
import sys

# Directory holding the PALS sources (feature_extraction, pathway_analysis,
# common). Set the PALS_PATH environment variable to point at another
# checkout; the fallback is the path this notebook was originally run with.
pals_dir = os.environ.get('PALS_PATH', 'C:\\Users\\joewa\\Work\\git\\PALS\\pals')

# Extend sys.path only once so that re-running the cell does not pile up
# duplicate entries.
if pals_dir not in sys.path:
    sys.path.append(pals_dir)

In [16]:
from feature_extraction import DataSource
from pathway_analysis import PALS
from common import DATABASE_REACTOME_KEGG, REACTOME_SPECIES_MUS_MUSCULUS

In [19]:
# Assemble the PALS data source from the standardised matrix (X_std), the
# entity-id table (formula_df) and the experimental design, using the
# Reactome/KEGG database constant with the species set to Mus musculus and
# reactome_metabolic_pathway_only switched on.
ds = DataSource(X_std, formula_df, experimental_design, DATABASE_REACTOME_KEGG, 
                reactome_species=REACTOME_SPECIES_MUS_MUSCULUS, reactome_metabolic_pathway_only=True)

2019-11-04 23:05:00.085 | DEBUG    | feature_extraction:__init__:38 - Loading C:\Users\joewa\Work\git\PALS\pals\data\reactome\metabolic_pathways\COMPOUND\Mus musculus.json.zip


In [20]:
# Run PALS on the data source and collect the per-pathway result table.
# NOTE(review): min_replace=5000 repeats the min_value given to
# WebOmicsInference in the t-test cell - confirm the two are meant to match.
# NOTE(review): the log output below reports p-values computed by resampling
# and no random seed is set in the visible cells, so results may differ
# between runs - confirm.
pals = PALS(ds, min_replace=5000)
# standardize=False: presumably because X_std already went through
# get_standardized_df - TODO confirm.
pathway_df = pals.get_pathway_df(standardize=False)

2019-11-04 23:05:26.523 | DEBUG    | pathway_analysis:get_plage_activity_df:67 - Mean values of the rows in the DF is [ 0. -0.  0.  0. -0. -0. -0. -0. -0.  0. -0. -0. -0. -0.  0.  0. -0.  0.
  0.  0.  0. -0.  0. -0. -0.  0.]
2019-11-04 23:05:26.526 | DEBUG    | pathway_analysis:get_plage_activity_df:68 - Variance in the rows of the DF is [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1.]
2019-11-04 23:05:26.962 | INFO     | pathway_analysis:set_up_resample_plage_p_df:79 - Calculating plage p-values with resampling
2019-11-04 23:05:26.964 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:83 - Comparison INFEC_vs_UN
2019-11-04 23:05:26.966 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:91 - Resampling 0/1000
2019-11-04 23:05:27.078 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:91 - Resampling 100/1000
2019-11-04 23:05:27.188 | DEBUG    | pathway_analysis:set_up_resample_plage_p_df:91 - Resampling 200/1000
2019-11-04 23:05:27.300 |

In [21]:
# Display the pathway result table returned by pals.get_pathway_df.
pathway_df

Unnamed: 0,pw_name,INFEC_vs_UN p-value,unq_pw_F,tot_ds_F,F_coverage,sf,exp_F,Ex_Cov,INFEC_vs_UN comb_p
R-MMU-159418,Recycling of bile acids and salts,0.999992,13,3,23.08,0.009109,0.85,6.54,0.999921
R-MMU-73614,Pyrimidine salvage,0.988290,21,2,9.52,0.166099,1.37,6.52,0.978939
R-MMU-163210,Formation of ATP by chemiosmotic coupling,1.000000,5,1,20.00,0.052359,0.33,6.60,1.000000
R-MMU-1614517,Sulfide oxidation to sulfate,0.774221,7,1,14.29,0.090155,0.46,6.57,0.682765
R-MMU-70688,Proline catabolism,1.000000,10,2,20.00,0.028724,0.65,6.50,1.000000
...,...,...,...,...,...,...,...,...,...
R-MMU-196843,Vitamin B2 (riboflavin) metabolism,1.000000,10,2,20.00,0.028724,0.65,6.50,1.000000
R-MMU-947581,Molybdenum cofactor biosynthesis,0.805528,12,2,16.67,0.045610,0.78,6.50,0.696222
R-MMU-197264,Nicotinamide salvaging,0.999435,25,2,8.00,0.236188,1.63,6.52,0.998862
R-MMU-9018896,Biosynthesis of E-series 18(S)-resolvins,0.774221,7,1,14.29,0.090155,0.46,6.57,0.682765
