Permalink
Browse files

Merge pull request #44 from griffithlab/filter

Add new filter option to exclude NAs
  • Loading branch information...
susannasiebert committed Nov 13, 2017
2 parents 291eecb + 9978610 commit 9a0708bc65c501fb352695913591fb71dc112b31
View
@@ -5,12 +5,13 @@
from lib.filter import *
class BindingFilter:
def __init__(self, input_file, output_file, binding_threshold, minimum_fold_change, top_score_metric):
def __init__(self, input_file, output_file, binding_threshold, minimum_fold_change, top_score_metric, exclude_nas):
self.input_file = input_file
self.output_file = output_file
self.binding_threshold = binding_threshold
self.minimum_fold_change = minimum_fold_change
self.top_score_metric = top_score_metric
self.exclude_nas = exclude_nas
def execute(self):
filter_criteria = []
@@ -27,7 +28,7 @@ def execute(self):
column = 'Corresponding Fold Change'
filter_criteria.append({'column': column, 'operator': '>=', 'threshold': self.minimum_fold_change})
Filter(self.input_file, self.output_file, filter_criteria).execute()
Filter(self.input_file, self.output_file, filter_criteria, self.exclude_nas).execute()
@classmethod
def parser(cls, tool):
@@ -66,4 +67,10 @@ def parser(cls, tool):
+ "Default: median",
default='median',
)
parser.add_argument(
'--exclude-NAs',
help="Exclude NA values from the filtered output. Default: False",
default=False,
action='store_true'
)
return parser
View
@@ -1,10 +1,11 @@
import pandas
class Filter:
def __init__(self, input_file, output_file, filter_criteria):
def __init__(self, input_file, output_file, filter_criteria, exclude_nas):
self.input_file = input_file
self.output_file = output_file
self.filter_criteria = filter_criteria
self.exclude_nas = exclude_nas
def execute(self):
data = pandas.read_csv(self.input_file, delimiter='\t', float_precision='high')
@@ -14,6 +15,9 @@ def execute(self):
for criteria in self.filter_criteria:
clean_column = criteria['column'].replace(' ', '_')
# NaN never compares equal to itself, so `clean_column != clean_column` is true only for NA rows; OR-ing it in is a hacky way to keep all NA values
expression = "(%s %s %s) | (%s != %s)" % (clean_column, criteria['operator'], criteria['threshold'], clean_column, clean_column)
if self.exclude_nas:
expression = "(%s %s %s)" % (clean_column, criteria['operator'], criteria['threshold'])
else:
expression = "(%s %s %s) | (%s != %s)" % (clean_column, criteria['operator'], criteria['threshold'], clean_column, clean_column)
data = data.query(expression)
data.to_csv(self.output_file, sep='\t', header=header, index=False, na_rep='NA')
View
@@ -60,6 +60,7 @@ def __init__(self, **kwargs):
self.iedb_retries = kwargs.pop('iedb_retries', 5)
self.downstream_sequence_length = kwargs.pop('downstream_sequence_length', 1000)
self.keep_tmp_files = kwargs.pop('keep_tmp_files', False)
self.exclude_NAs = kwargs.pop('exclude_NAs', False)
tmp_dir = os.path.join(self.output_dir, 'tmp')
os.makedirs(tmp_dir, exist_ok=True)
self.tmp_dir = tmp_dir
@@ -263,6 +264,7 @@ def binding_filter(self):
self.binding_threshold,
self.minimum_fold_change,
self.top_score_metric,
self.exclude_NAs,
).execute()
status_message("Completed")
@@ -281,7 +283,7 @@ def coverage_filter(self):
filter_criteria.append({'column': "Tumor_RNA_VAF", 'operator': '>=', 'threshold': self.trna_vaf})
filter_criteria.append({'column': "Gene_Expression", 'operator': '>=', 'threshold': self.expn_val})
filter_criteria.append({'column': "Transcript_Expression", 'operator': '>=', 'threshold': self.expn_val})
Filter(self.binding_filter_out_path(), self.coverage_filter_out_path(), filter_criteria).execute()
Filter(self.binding_filter_out_path(), self.coverage_filter_out_path(), filter_criteria, self.exclude_NAs).execute()
elif self.input_file_type == 'bedpe':
shutil.copy(self.binding_filter_out_path(), self.coverage_filter_out_path())
status_message("Completed")
@@ -116,6 +116,12 @@ def __init__(self, run_name, input_file_help):
help="Cap to limit the downstream sequence length for frameshifts when creating the fasta file. "
+ "Use 'full' to include the full downstream sequence. Default: 1000"
)
self.parser.add_argument(
'--exclude-NAs',
help="Exclude NA values from the filtered output. Default: False",
default=False,
action='store_true'
)
class PvacseqRunArgumentParser(PredictionRunArgumentParser):
def __init__(self):
@@ -29,6 +29,7 @@ def test_binding_filter_runs_and_produces_expected_output(self):
500,
0,
'median',
False,
).execute())
self.assertTrue(cmp(
output_file.name,
@@ -0,0 +1 @@
Chromosome Start Stop Reference Variant Transcript Ensembl Gene ID Variant Type Mutation Protein Position Gene Name HLA Allele Peptide Length Sub-peptide Position MT Epitope Seq WT Epitope Seq Best MT Score Method Best MT Score Corresponding WT Score Corresponding Fold Change Tumor DNA Depth Tumor DNA VAF Tumor RNA Depth Tumor RNA VAF Normal Depth Normal VAF Gene Expression Transcript Expression Median MT Score Median WT Score Median Fold Change PickPocket WT Score PickPocket MT Score NetMHC WT Score NetMHC MT Score
View
@@ -28,6 +28,7 @@ def test_less_than(self):
'operator': "<",
'threshold': "500",
}],
False,
).execute())
self.assertTrue(cmp(
output_file.name,
@@ -48,6 +49,7 @@ def test_less_or_equal(self):
'operator': "<=",
'threshold': "500",
}],
False,
).execute())
self.assertTrue(cmp(
output_file.name,
@@ -68,6 +70,7 @@ def test_equal(self):
'operator': "==",
'threshold': "500",
}],
False,
).execute())
self.assertTrue(cmp(
output_file.name,
@@ -88,6 +91,7 @@ def test_greater_or_equal(self):
'operator': ">=",
'threshold': "500",
}],
False,
).execute())
self.assertTrue(cmp(
output_file.name,
@@ -108,6 +112,7 @@ def test_greater_than(self):
'operator': ">",
'threshold': "500",
}],
False,
).execute())
self.assertTrue(cmp(
output_file.name,
@@ -128,9 +133,31 @@ def test_NA(self):
'operator': ">",
'threshold': "100",
}],
False
).execute())
self.assertTrue(cmp(
output_file.name,
os.path.join(self.test_data_path, "Test.filtered.NA.tsv"),
False
))
def test_exclude_NA(self):
    """Run Filter with exclude_nas=True and compare against the fixture.

    Applies a single 'Tumor RNA Depth' > 100 criterion to the combined
    parsed test file, checks that execute() returns a falsy value, and
    then checks the written file against Test.filtered.exclude_NA.tsv.
    """
    output_file = tempfile.NamedTemporaryFile()
    source_path = os.path.join(
        self.test_data_path,
        'Test.combined.parsed.tsv'
    )
    depth_criteria = [{
        'column': "Tumor RNA Depth",
        'operator': ">",
        'threshold': "100",
    }]
    # Fourth positional argument is exclude_nas; True drops NA rows.
    na_filter = Filter(source_path, output_file.name, depth_criteria, True)
    self.assertFalse(na_filter.execute())

    expected_path = os.path.join(
        self.test_data_path, "Test.filtered.exclude_NA.tsv"
    )
    # NOTE(review): cmp is presumably filecmp.cmp, making the trailing
    # False shallow=False (content comparison) - confirm against imports.
    files_match = cmp(output_file.name, expected_path, False)
    self.assertTrue(files_match)
@@ -7,7 +7,7 @@ def main(args_input = sys.argv[1:]):
parser = define_parser()
args = parser.parse_args(args_input)
BindingFilter(args.input_file, args.output_file, args.binding_threshold, args.minimum_fold_change, args.top_score_metric).execute()
# The flag is declared as '--exclude-NAs'; argparse builds the attribute name
# by replacing '-' with '_' and keeping case, so it is args.exclude_NAs
# (args.exclude_nas does not exist and raises AttributeError).
BindingFilter(args.input_file, args.output_file, args.binding_threshold, args.minimum_fold_change, args.top_score_metric, args.exclude_NAs).execute()
if __name__ == "__main__":
main()
@@ -7,7 +7,7 @@ def main(args_input = sys.argv[1:]):
parser = define_parser()
args = parser.parse_args(args_input)
BindingFilter(args.input_file, args.output_file, args.binding_threshold, args.minimum_fold_change, args.top_score_metric).execute()
# The flag is declared as '--exclude-NAs'; argparse builds the attribute name
# by replacing '-' with '_' and keeping case, so it is args.exclude_NAs
# (args.exclude_nas does not exist and raises AttributeError).
BindingFilter(args.input_file, args.output_file, args.binding_threshold, args.minimum_fold_change, args.top_score_metric, args.exclude_NAs).execute()
if __name__ == "__main__":
main()
@@ -55,7 +55,13 @@ def define_parser():
'--expn-val', type=int,
help="Gene and Transcript Expression cutoff. Sites above this cutoff will be considered"
+ "Default: 1",
default=1
default=1
)
parser.add_argument(
'--exclude-NAs',
help="Exclude NA values from the filtered output. Default: False",
default=False,
action='store_true'
)
return parser
@@ -82,7 +88,7 @@ def main(args_input = sys.argv[1:]):
filter_criteria.append({'column': "Gene_Expression", 'operator': '>=', 'threshold': args.expn_val})
filter_criteria.append({'column': "Transcript_Expression", 'operator': '>=', 'threshold': args.expn_val})
Filter(args.input_file, args.output_file, filter_criteria).execute()
Filter(args.input_file, args.output_file, filter_criteria, args.exclude_NAs).execute()
if __name__ == "__main__":
main()

0 comments on commit 9a0708b

Please sign in to comment.