Skip to content

Commit

Permalink
Added an option to filter pathway analysis results by the minimum number of hits (issue #40).
Browse files Browse the repository at this point in the history

Note: it doesn't matter for PLAGE, but for ORA and GSEA it's probably better to filter these pathways **before** analysis, rather than after.
  • Loading branch information
joewandy committed Jun 11, 2020
1 parent 09f5138 commit 2112804
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 7 deletions.
4 changes: 3 additions & 1 deletion pals/GSEA.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from gseapy.gsea import GSEA
from loguru import logger

from .common import is_comparison_used, GSEA_RANKING_SNR, NUM_RESAMPLES, Method
from .common import is_comparison_used, GSEA_RANKING_SNR, NUM_RESAMPLES, Method, post_filter_df_by_min_hits


class MSEA(GSEA):
Expand Down Expand Up @@ -162,4 +162,6 @@ def get_pathway_df(self):
# del pathway_df.index.name
pathway_df.rename_axis(None, inplace=True)

# post-processing to filter pathway dataframe by the minimum number of hits
pathway_df = post_filter_df_by_min_hits(pathway_df, self.data_source.min_hits)
return pathway_df
4 changes: 3 additions & 1 deletion pals/ORA.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from scipy.stats import ttest_ind
from statsmodels.sandbox.stats.multicomp import multipletests

from .common import SIGNIFICANT_THRESHOLD, is_comparison_used, Method
from .common import SIGNIFICANT_THRESHOLD, is_comparison_used, Method, post_filter_df_by_min_hits


class ORA(Method):
Expand Down Expand Up @@ -147,6 +147,8 @@ def get_pathway_df(self, correct_multiple_tests=True, standardize=True):
# del pathway_df.index.name
pathway_df.rename_axis(None, inplace=True)

# post-processing to filter pathway dataframe by the minimum number of hits
pathway_df = post_filter_df_by_min_hits(pathway_df, self.data_source.min_hits)
return pathway_df

####################################################################################################################
Expand Down
7 changes: 5 additions & 2 deletions pals/PLAGE.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from scipy.stats import hypergeom
from scipy.stats import ttest_ind

from .common import NUM_RESAMPLES, PLAGE_WEIGHT, HG_WEIGHT, is_comparison_used, Method
from .common import NUM_RESAMPLES, PLAGE_WEIGHT, HG_WEIGHT, is_comparison_used, Method, post_filter_df_by_min_hits


class PLAGE(Method):
Expand Down Expand Up @@ -254,7 +254,10 @@ def calculate_hg_values(self, pathway_df):
comb_p_df.columns = column_names
pathway_df_final = pd.merge(pathway_df_merge, comb_p_df[column_names], left_index=True, right_index=True,
how='outer')
return pathway_df_final

# post-processing to filter pathway dataframe by the minimum number of hits
pathway_df = post_filter_df_by_min_hits(pathway_df_final, self.data_source.min_hits)
return pathway_df

####################################################################################################################
# private methods
Expand Down
16 changes: 16 additions & 0 deletions pals/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@
from collections import defaultdict
from io import StringIO

import numpy as np
import pandas as pd
from loguru import logger

PW_F_OFFSET = 1
MIN_REPLACE = 5000
MIN_HITS = 0
NUM_RESAMPLES = 1000
PLAGE_WEIGHT = 1
HG_WEIGHT = 0 # TODO: remove this?
Expand Down Expand Up @@ -220,3 +222,17 @@ def get_table_download_link(df):
csv.encode()
).decode() # some strings <-> bytes conversions necessary here
return f'<a href="data:file/csv;base64,{b64}" download="results.csv">Download csv file</a>'


def post_filter_df_by_min_hits(pathway_df, min_hits):
    """
    Blank out the p-values of pathways that have too few hits.

    Rows are kept in the result; for every row whose 'tot_ds_F' value is below min_hits, the columns whose
    name contains 'p-value' or 'comb_p' are set to NaN. The input dataframe is not modified.
    :param pathway_df: a pathway dataframe returned by a pathway ranking method in PALS. It must contain
    a 'tot_ds_F' column.
    :param min_hits: the minimum number of hits in this pathway
    :return: a copy of the pathway dataframe with the p-values of under-threshold pathways set to NaN
    """
    pathway_df = pathway_df.copy()

    # Use a positional boolean mask rather than index labels: with a non-unique index, selecting by label
    # would also blank rows that share a label with a filtered row but have enough hits themselves.
    below_threshold = pathway_df['tot_ds_F'] < min_hits

    # Only string column names can be p-value columns; the isinstance check avoids a TypeError on
    # non-string labels (e.g. integers).
    p_value_columns = [
        col for col in pathway_df.columns
        if isinstance(col, str) and ('p-value' in col or 'comb_p' in col)
    ]

    if p_value_columns and below_threshold.any():
        pathway_df.loc[below_threshold, p_value_columns] = np.nan
    return pathway_df
5 changes: 3 additions & 2 deletions pals/feature_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@

from .loader import PiMP_KEGG_Loader, CompoundOnlineLoader, CompoundOfflineLoader, UniProtLoader, EnsemblLoader
from .common import DATABASE_PIMP_KEGG, DATABASE_REACTOME_KEGG, DATABASE_REACTOME_CHEBI, \
DATABASE_REACTOME_UNIPROT, DATABASE_REACTOME_ENSEMBL, MIN_REPLACE
DATABASE_REACTOME_UNIPROT, DATABASE_REACTOME_ENSEMBL, MIN_REPLACE, MIN_HITS


class DataSource(object):

def __init__(self, measurement_df, annotation_df, experimental_design, database_name,
reactome_species=None, reactome_metabolic_pathway_only=True, reactome_query=False, database=None,
min_replace=MIN_REPLACE):
min_replace=MIN_REPLACE, min_hits=MIN_HITS):
"""
Creates a data source for PALS analysis
:param measurement_df: a dataframe of peak intensities, where index = row id and columns = sample_name
Expand All @@ -32,6 +32,7 @@ def __init__(self, measurement_df, annotation_df, experimental_design, database_
self.reactome_metabolic_pathway_only = reactome_metabolic_pathway_only
self.reactome_query = reactome_query
self.min_replace = min_replace
self.min_hits = min_hits

self.groups = dict(self.experimental_design['groups'].items())
for group in self.groups:
Expand Down
5 changes: 4 additions & 1 deletion pals/run_pals.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ def build_parser():
# common parameters
parser.add_argument('--min_replace', type=float, default=MIN_REPLACE,
help='Minimum intensity of MS1 peaks for data imputation (default: %(default)s)')
parser.add_argument('--min_hits', type=float, default=MIN_HITS,
help='Minimum number of hits to keep pathways in the results (default: %(default)s)')

# reactome parameters
parser.add_argument('--species', default=REACTOME_SPECIES_HOMO_SAPIENS, help='Species name',
Expand Down Expand Up @@ -99,6 +101,7 @@ def main():
# extract other args
database_name = args['db']
min_replace = args['min_replace']
min_hits = args['min_hits']
reactome_species = args['species']
reactome_metabolic_pathway_only = not args['use_all_reactome_pathways']
reactome_query = args['connect_to_reactome_server']
Expand All @@ -107,7 +110,7 @@ def main():
ds = DataSource(int_df, annotation_df, experimental_design, database_name,
reactome_species=reactome_species,
reactome_metabolic_pathway_only=reactome_metabolic_pathway_only,
reactome_query=reactome_query, min_replace=min_replace)
reactome_query=reactome_query, min_replace=min_replace, min_hits=min_hits)

# run the selected pathway analysis method
method = None
Expand Down

0 comments on commit 2112804

Please sign in to comment.