Add output file format option to config file

compomics · Aug 5, 2019 · a0fc03c · a0fc03c
1 parent b37cd00
commit a0fc03c
Show file tree

Hide file tree

Showing 4 changed files with 65 additions and 18 deletions.
diff --git a/README.md b/README.md
@@ -95,12 +95,31 @@ Several MS²PIP options need to be set in this config file.
 currently supported MS²PIP models (see [Multiple prediction models](#multiple-prediction-models)).
 - The fragment ion error tolerance is set as `frag_error=X` where X is the
 tolerance in Da.
+- Output formats to write predictions to, set as `out=X` where X is a
+comma-separated selection of the following formats: `csv`, `mgf`, `msp`,
+or `bibliospec` (SSL/MS2, also for Skyline). For example: `out=csv,msp`.
 - PTMs (see further) are set as `ptm=X,Y,opt,Z` for each internal PTM where X is
 a string that represents the PTM, Y is the difference in Da associated with the
 PTM, opt is required for compatibility with other CompOmics projects, and Z
 is the amino acid (AA) that is modified by the PTM. For N- and C-terminal
 modifications, Z should be `N-term` or `C-term`, respectively.
 
+
+
+Several MS²PIP options need to be set in this config file.
+- `model=X` where X is one of the currently supported MS²PIP models (see 
+[Multiple prediction models](#multiple-prediction-models)).
+- `frag_error=X` where X is the fragmentation spectrum mass tolerance in Da
+(only relevant if an MGF file is passed).
+- `out=X` where X is a comma-separated list of a selection of the currently
+supported output file formats: `csv`, `mgf`, `msp`, or `bibliospec` (SSL/MS2,
+also for Skyline). For example: `out=csv,msp`.
+- `ptm=X,Y,opt,Z` for every peptide modification where:
+  - `X` is a string that represents the 
+PTM name (needs to match the names in the [PEPREC file](#peprec-file)).
+  - `Y` is the mass shift in Da associated with the PTM.
+  - `Z` is the one-letter code of the amino acid (AA) that is modified by the PTM. For N- and C-terminal modifications, `Z` should be `N-term` or `C-term`, respectively.
+
 #### PEPREC file
 To apply the pre-trained models you need to pass *only* a `<PEPREC file>` to
 MS²PIP. This file contains the peptide sequences for which you want to predict
@@ -128,6 +147,8 @@ In the [conversion_tools](https://github.com/compomics/ms2pip_c/tree/releases/co
 folder, we provide a host of Python scripts to convert common search engine
 output files to a PEPREC file.
 
+To start from a FASTA file, see [fasta2speclib](http://compomics.github.io/projects/ms2pip_c/wiki/fasta2speclib).
+
 
 #### MGF file (optional)
 Optionally, an MGF file with measured spectra can be passed to MS²PIP. In this
@@ -139,6 +160,9 @@ vice versa) will be skipped.
 #### Examples
 Suppose the **config file** contains the following lines
 ```
+model=HCD
+frag_error=0.02
+out=csv,mgf,msp
 ptm=Carbamidomethyl,57.02146,opt,C
 ptm=Acetyl,42.010565,opt,N-term
 ptm=Glyloss,-58.005479,opt,C-term
@@ -153,7 +177,7 @@ peptide3 0|Acetyl|2|Carbamidomethyl ACDEFGHIK 2
 In this example, `peptide3` is N-terminally acetylated and carries a
 carbamidomethyl on its second amino acid.
 
-The corresponding (optional) **MGF file** could contain the following spectrum:
+The corresponding (optional) **MGF file** can contain the following spectrum:
 ```
 BEGIN IONS
 TITLE=peptide1

diff --git a/config.txt b/config.txt
@@ -1,5 +1,7 @@
 model=HCD
 frag_error=0.02
+out=csv
+
 ptm=PhosphoS,79.966331,opt,S
 ptm=PhosphoT,79.966331,opt,T
 ptm=PhosphoY,79.966331,opt,Y

diff --git a/ms2pip/ms2pipC.py b/ms2pip/ms2pipC.py
@@ -17,6 +17,9 @@
 from ms2pip.cython_modules import ms2pip_pyx
 
 
+# Supported output formats
+SUPPORTED_OUT_FORMATS = ['csv', 'mgf', 'msp', 'bibliospec']
+
 # Models and their properties
 # id is passed to get_predictions to select model
 # ion_types is required to write the ion types in the headers of the result files
@@ -614,6 +617,19 @@ def run(pep_file, spec_file=None, vector_file=None, config_file=None, num_cpu=23
 		exit(1)
 	fragerror = params["frag_error"]
 
+	# Validate requested output formats
+	if "out" in params:
+		out_formats = [o.lower().strip() for o in params["out"].split(',')]
+		for o in out_formats:
+			if o not in SUPPORTED_OUT_FORMATS:
+				print("Unknown output format: '{}'".format(o))
+				print("Should be one of the following formats: {}".format(SUPPORTED_OUT_FORMATS))
+				exit(1)
+	else:
+		print("No output format specified; defaulting to csv")
+		out_formats = ['csv']
+
+	# Validate requested model
 	if model in MODELS.keys():
 		print("using {} models".format(model))
 	else:
@@ -669,6 +685,11 @@ def run(pep_file, spec_file=None, vector_file=None, config_file=None, num_cpu=23
 		sys.stdout.write("Removed {} unsupported peptide sequences (< 3, > 99 \
 amino acids, or containing B, J, O, U, X or Z).\n".format(num_pep_filtered))
 
+	if len(data) == 0:
+		sys.stdout.write("No peptides for which to predict intensities. Please \
+provide at least one valid peptide sequence.\n")
+		exit(1)
+
 	sys.stdout.write("starting workers...\n")
 	myPool = multiprocessing.Pool(num_cpu)
 
@@ -840,23 +861,23 @@ def run(pep_file, spec_file=None, vector_file=None, config_file=None, num_cpu=23
 		all_preds["prediction"] = np.hstack(np.concatenate(prediction_bufs, axis=None))
 
 
-		mgf = False  # Set to True to write spectrum as MGF file
-		if mgf:
-			print("writing MGF file {}_predictions.mgf...".format(output_filename))
-			spectrum_output.write_mgf(all_preds, peprec=data, output_filename=output_filename)
-
-		msp = False  # Set to True to write spectra as MSP file
-		if msp:
-			print("writing MSP file {}_predictions.msp...".format(output_filename))
-			spectrum_output.write_msp(all_preds, data, output_filename=output_filename)
-
-		if bibliospec:
-			print("writing SSL/MS2 files...")
-			spectrum_output.write_bibliospec(all_preds, data, params, output_filename=output_filename)
-
 		if not return_results:
-			sys.stdout.write("writing file {}_predictions.csv...\n".format(output_filename))
-			all_preds.to_csv("{}_predictions.csv".format(output_filename), index=False)
+			if 'mgf' in out_formats:
+				print("writing MGF file {}_predictions.mgf...".format(output_filename))
+				spectrum_output.write_mgf(all_preds, peprec=data, output_filename=output_filename)
+
+			if 'msp' in out_formats:
+				print("writing MSP file {}_predictions.msp...".format(output_filename))
+				spectrum_output.write_msp(all_preds, data, output_filename=output_filename)
+
+			if 'bibliospec' in out_formats:
+				print("writing SSL/MS2 files...")
+				spectrum_output.write_bibliospec(all_preds, data, params, output_filename=output_filename)
+
+			if 'csv' in out_formats:
+				print("writing CSV {}_predictions.csv...".format(output_filename))
+				all_preds.to_csv("{}_predictions.csv".format(output_filename), index=False)
+
 			sys.stdout.write("done!\n")
 		else:
 			return all_preds
diff --git a/ms2pip/ms2pip_tools/spectrum_output.py b/ms2pip/ms2pip_tools/spectrum_output.py
@@ -158,7 +158,7 @@ def write(msp_output):
 		write(msp_output)
 		return msp_output
 	else:
-		with open("{}_predictions.mgf".format(output_filename), write_mode) as msp_output:
+		with open("{}_predictions.msp".format(output_filename), write_mode) as msp_output:
 			write(msp_output)