Merge remote-tracking branch 'origin/hotfix'

griffithlab · Mar 11, 2020 · 8ab7511 · 8ab7511
2 parents 878a208 + 1d96a18
commit 8ab7511
Show file tree

Hide file tree

Showing 8 changed files with 63 additions and 16 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -69,7 +69,7 @@
 # The short X.Y version.
 version = '1.5'
 # The full version, including alpha/beta/rc tags.
-release = '1.5.5'
+release = '1.5.6'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/index.rst b/docs/index.rst
@@ -52,17 +52,12 @@ New in release |release|
 
 This is a hotfix release. It fixes the following issues:
 
-- The ``pvacfuse run`` command would previously output a misleading warning
-  message if an AGFusion input directory didn't contain any processable fusion
-  entries. This warning message has been fixed.
-- Between VEP versions, the Downstream protein sequence prediction for some
-  frameshift mutations was changed to now include a leading wildtype amino
-  acid. This potential difference in VEP-predicted Downstream protein
-  sequences was not accounted for and would result in frameshift mutation
-  protein prediction that would duplicate this leading wildtype amino acid.
-  This version updates our prediction pipeline to remove this duplicated amino
-  acid and output a fatal error if the Downstream protein sequence does not
-  contain the leading wildtype amino acid.
+- The ``pvacbind run`` command would previously error out if one of the input
+  sequences contained an X stop codon. This update removes the X amino acid
+  and the downstream sequence before further processing the remaining
+  protein sequence.
+- A bug in the ``pvacfuse top_score_filter`` code would previously result
+  in an error when trying to run this command. This has now been fixed.
 
 New in version |version|
 ------------------------

diff --git a/docs/releases/1_5.rst b/docs/releases/1_5.rst
@@ -152,3 +152,13 @@ This is a hotfix release. It fixes the following issues:
   This version updates our prediction pipeline to remove this duplicated amino
   acid and output a fatal error if the Downstream protein sequence does not
   contain the leading wildtype amino acid.
+
+1.5.6
+-----
+
+- The ``pvacbind run`` command would previously error out if one of the input
+  sequences contained an X stop codon. This update removes the X amino acid
+  and the downstream sequence before further processing the remaining
+  protein sequence.
+- A bug in the ``pvacfuse top_score_filter`` code would previously result
+  in an error when trying to run this command. This has now been fixed.
diff --git a/lib/pipeline.py b/lib/pipeline.py
@@ -503,10 +503,16 @@ def uniquify_records(self, records):
             count += 1
         return (uniq_records, keys)
 
-    def create_per_length_fasta(self, length):
+    def create_per_length_fasta_and_process_stops(self, length):
+        stop_chars = set('X*')
         records = []
         for record in SeqIO.parse(self.input_file, "fasta"):
-            if len(str(record.seq)) >= length:
+            sequence = str(record.seq).upper()
+            x_index = sequence.index('X') if 'X' in sequence else len(sequence)
+            star_index = sequence.index('*') if '*' in sequence else len(sequence)
+            sequence = sequence[0:min(x_index, star_index)]
+            if len(sequence) >= length:
+                record.seq = Seq(sequence, IUPAC.protein)
                 records.append(record)
         SeqIO.write(records, self.fasta_basename(length), "fasta")
 
@@ -702,7 +708,7 @@ def execute(self):
 
         split_parsed_output_files = []
         for length in self.epitope_lengths:
-            self.create_per_length_fasta(length)
+            self.create_per_length_fasta_and_process_stops(length)
             chunks = self.split_fasta_file(length)
             self.call_iedb(chunks, length)
             split_parsed_output_files.extend(self.parse_outputs(chunks, length))

diff --git a/setup.py b/setup.py
@@ -61,7 +61,7 @@
 
 setup(
     name="pvactools",
-    version="1.5.5",
+    version="1.5.6",
     packages=[
         "tools",
         "tools.pvacbind",

diff --git a/tests/test_data/pvacbind/input_with_stops.fasta b/tests/test_data/pvacbind/input_with_stops.fasta
@@ -0,0 +1,10 @@
+>1
+LPLPPPPLLPLLLLLXGASGG
+>2
+LPLPPPPLLPLLP*LLLLGASGG
+>3
+DPASAAAAAAAAAAxAAAAVIPTVSTPPP
+>4
+DPASAAAAAXAAVIP*TVSTPPP
+>5
+VNSXATLSRTLLAAAGGSSLQ
diff --git a/tests/test_data/pvacbind/output_with_stops.fasta b/tests/test_data/pvacbind/output_with_stops.fasta
@@ -0,0 +1,8 @@
+>1
+LPLPPPPLLPLLLLL
+>2
+LPLPPPPLLPLLP
+>3
+DPASAAAAAAAAAA
+>4
+DPASAAAAA
diff --git a/tests/test_pvacbind.py b/tests/test_pvacbind.py
@@ -140,6 +140,24 @@ def test_run_compiles(self):
         ))
         self.assertTrue(compiled_run_path)
 
+    def test_process_stops(self):
+        output_dir = tempfile.TemporaryDirectory(dir = self.test_data_directory)
+        params = {
+            'input_file': os.path.join(self.test_data_directory, "input_with_stops.fasta"),
+            'input_file_type': 'fasta',
+            'sample_name': 'Test',
+            'alleles': ['HLA-G*01:09'],
+            'prediction_algorithms': ['NetMHC'],
+            'output_dir': output_dir.name,
+            'epitope_lengths': [9],
+        }
+        pipeline = PvacbindPipeline(**params)
+        pipeline.create_per_length_fasta_and_process_stops(9)
+        output_file   = os.path.join(output_dir.name, 'tmp', 'Test.9.fa')
+        expected_file = os.path.join(self.test_data_directory, 'output_with_stops.fasta')
+        self.assertTrue(cmp(output_file, expected_file))
+        output_dir.cleanup()
+
     def test_pvacbind_pipeline(self):
         with patch('requests.post', unittest.mock.Mock(side_effect = lambda url, data, files=None: make_response(
             data,