Added an option to filter pathway analysis results by the minimum number of hits (issue #40).

Note: this makes no difference for PLAGE, but for ORA and GSEA it is probably better to filter these pathways **before** analysis, rather than after.
glasgowcompbio · Jun 11, 2020 · 2112804 · 2112804
1 parent 09f5138
commit 2112804
Show file tree

Hide file tree

Showing 6 changed files with 34 additions and 7 deletions.
diff --git a/pals/GSEA.py b/pals/GSEA.py
@@ -5,7 +5,7 @@
 from gseapy.gsea import GSEA
 from loguru import logger
 
-from .common import is_comparison_used, GSEA_RANKING_SNR, NUM_RESAMPLES, Method
+from .common import is_comparison_used, GSEA_RANKING_SNR, NUM_RESAMPLES, Method, post_filter_df_by_min_hits
 
 
 class MSEA(GSEA):
@@ -162,4 +162,6 @@ def get_pathway_df(self):
         # del pathway_df.index.name
         pathway_df.rename_axis(None, inplace=True)
 
+        # post-processing to filter pathway dataframe by the minimum number of hits
+        pathway_df = post_filter_df_by_min_hits(pathway_df, self.data_source.min_hits)
         return pathway_df
diff --git a/pals/ORA.py b/pals/ORA.py
@@ -8,7 +8,7 @@
 from scipy.stats import ttest_ind
 from statsmodels.sandbox.stats.multicomp import multipletests
 
-from .common import SIGNIFICANT_THRESHOLD, is_comparison_used, Method
+from .common import SIGNIFICANT_THRESHOLD, is_comparison_used, Method, post_filter_df_by_min_hits
 
 
 class ORA(Method):
@@ -147,6 +147,8 @@ def get_pathway_df(self, correct_multiple_tests=True, standardize=True):
         # del pathway_df.index.name
         pathway_df.rename_axis(None, inplace=True)
 
+        # post-processing to filter pathway dataframe by the minimum number of hits
+        pathway_df = post_filter_df_by_min_hits(pathway_df, self.data_source.min_hits)
         return pathway_df
 
     ####################################################################################################################

diff --git a/pals/PLAGE.py b/pals/PLAGE.py
@@ -11,7 +11,7 @@
 from scipy.stats import hypergeom
 from scipy.stats import ttest_ind
 
-from .common import NUM_RESAMPLES, PLAGE_WEIGHT, HG_WEIGHT, is_comparison_used, Method
+from .common import NUM_RESAMPLES, PLAGE_WEIGHT, HG_WEIGHT, is_comparison_used, Method, post_filter_df_by_min_hits
 
 
 class PLAGE(Method):
@@ -254,7 +254,10 @@ def calculate_hg_values(self, pathway_df):
         comb_p_df.columns = column_names
         pathway_df_final = pd.merge(pathway_df_merge, comb_p_df[column_names], left_index=True, right_index=True,
                                     how='outer')
-        return pathway_df_final
+
+        # post-processing to filter pathway dataframe by the minimum number of hits
+        pathway_df = post_filter_df_by_min_hits(pathway_df_final, self.data_source.min_hits)
+        return pathway_df
 
     ####################################################################################################################
     # private methods

diff --git a/pals/common.py b/pals/common.py
@@ -9,11 +9,13 @@
 from collections import defaultdict
 from io import StringIO
 
+import numpy as np
 import pandas as pd
 from loguru import logger
 
 PW_F_OFFSET = 1
 MIN_REPLACE = 5000
+MIN_HITS = 0
 NUM_RESAMPLES = 1000
 PLAGE_WEIGHT = 1
 HG_WEIGHT = 0  # TODO: remove this?
@@ -220,3 +222,17 @@ def get_table_download_link(df):
         csv.encode()
     ).decode()  # some strings <-> bytes conversions necessary here
     return f'<a href="data:file/csv;base64,{b64}" download="results.csv">Download csv file</a>'
+
+
+def post_filter_df_by_min_hits(pathway_df, min_hits):  # blanks p-value columns for low-hit rows; no rows are dropped
+    """
+    Filter the 'tot_ds_F' column of a pathway dataframe. If it's less than min_hits, then set all the p-values to NaN.
+    :param pathway_df: a pathway dataframe returned by a pathway ranking method in PALS
+    :param min_hits: the minimum number of hits in this pathway
+    :return: a filtered pathway dataframe
+    """
+    pathway_df = pathway_df.copy()  # work on a copy so the caller's dataframe is not modified
+    rows_to_remove = pathway_df[pathway_df['tot_ds_F'] < min_hits]  # rows whose 'tot_ds_F' is below min_hits
+    columns_to_remove = [col for col in rows_to_remove.columns.values if 'p-value' in col or 'comb_p' in col]
+    pathway_df.loc[rows_to_remove.index, columns_to_remove] = np.nan  # only the matched p-value columns become NaN
+    return pathway_df
diff --git a/pals/feature_extraction.py b/pals/feature_extraction.py
@@ -7,14 +7,14 @@
 
 from .loader import PiMP_KEGG_Loader, CompoundOnlineLoader, CompoundOfflineLoader, UniProtLoader, EnsemblLoader
 from .common import DATABASE_PIMP_KEGG, DATABASE_REACTOME_KEGG, DATABASE_REACTOME_CHEBI, \
-    DATABASE_REACTOME_UNIPROT, DATABASE_REACTOME_ENSEMBL, MIN_REPLACE
+    DATABASE_REACTOME_UNIPROT, DATABASE_REACTOME_ENSEMBL, MIN_REPLACE, MIN_HITS
 
 
 class DataSource(object):
 
     def __init__(self, measurement_df, annotation_df, experimental_design, database_name,
                  reactome_species=None, reactome_metabolic_pathway_only=True, reactome_query=False, database=None,
-                 min_replace=MIN_REPLACE):
+                 min_replace=MIN_REPLACE, min_hits=MIN_HITS):
         """
         Creates a data source for PALS analysis
         :param measurement_df: a dataframe of peak intensities, where index = row id and columns = sample_name
@@ -32,6 +32,7 @@ def __init__(self, measurement_df, annotation_df, experimental_design, database_
         self.reactome_metabolic_pathway_only = reactome_metabolic_pathway_only
         self.reactome_query = reactome_query
         self.min_replace = min_replace
+        self.min_hits = min_hits
 
         self.groups = dict(self.experimental_design['groups'].items())
         for group in self.groups:

diff --git a/pals/run_pals.py b/pals/run_pals.py
@@ -43,6 +43,8 @@ def build_parser():
     # common parameters
     parser.add_argument('--min_replace', type=float, default=MIN_REPLACE,
                         help='Minimum intensity of MS1 peaks for data imputation  (default: %(default)s)')
+    parser.add_argument('--min_hits', type=float, default=MIN_HITS,
+                        help='Minimum number of hits to keep pathways in the results (default: %(default)s)')
 
     # reactome parameters
     parser.add_argument('--species', default=REACTOME_SPECIES_HOMO_SAPIENS, help='Species name',
@@ -99,6 +101,7 @@ def main():
     # extract other args
     database_name = args['db']
     min_replace = args['min_replace']
+    min_hits = args['min_hits']
     reactome_species = args['species']
     reactome_metabolic_pathway_only = not args['use_all_reactome_pathways']
     reactome_query = args['connect_to_reactome_server']
@@ -107,7 +110,7 @@ def main():
     ds = DataSource(int_df, annotation_df, experimental_design, database_name,
                     reactome_species=reactome_species,
                     reactome_metabolic_pathway_only=reactome_metabolic_pathway_only,
-                    reactome_query=reactome_query, min_replace=min_replace)
+                    reactome_query=reactome_query, min_replace=min_replace, min_hits=min_hits)
 
     # run the selected pathway analysis method
     method = None