Commit

Allow collision of samples with warning only, custom ordered headers support (#66)

Closes #53, closes #52, closes #51.
clintval committed Aug 13, 2018
1 parent d8a1b9b commit 215ae05
Showing 4 changed files with 146 additions and 43 deletions.
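As a quick illustration of the headline change, the following sketch (assuming this commit is installed and the repository's test resources are on disk) shows that parsing a sheet with colliding Sample_IDs now emits a ``UserWarning`` instead of raising a ``ValueError``:

import warnings

from sample_sheet import SampleSheet

# Colliding samples now produce a warning rather than an exception.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    SampleSheet('tests/resources/single-end-colliding-sample-ids.csv')

assert any('Two equivalent samples added' in str(w.message) for w in caught)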
74 changes: 48 additions & 26 deletions sample_sheet/__init__.py
@@ -3,6 +3,7 @@
import os
import re
import sys
import warnings

from contextlib import ExitStack
from itertools import chain, repeat, islice
@@ -60,14 +61,14 @@ class ReadStructure(object):
A read structure is a sequence of tokens in the form ``<number><operator>``
where ``<operator>`` can describe template, skip, index, or UMI bases.
======== =====================================================
Operator Description
======== =====================================================
T Template base (*e.g.* experimental DNA, RNA)
S Bases to be skipped or ignored
B Bases to be used as an index to identify the sample
M Bases to be used as an index to identify the molecule
======== =====================================================
========== =====================================================
Operator Description
========== =====================================================
**T** Template base (*e.g.* experimental DNA, RNA)
**S** Bases to be skipped or ignored
**B** Bases to be used as an index to identify the sample
**M** Bases to be used as an index to identify the molecule
========== =====================================================
Args:
structure: Read structure string representation.
@@ -81,7 +82,7 @@ class ReadStructure(object):
>>> rs.tokens
['10M', '141T', '8B']
Notes:
Note:
This class does not currently support read structures where the last
operator has ambiguous length by using ``<+>`` preceding the
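As an aside, the ``<number><operator>`` grammar described in the table above can be tokenized with a short regular expression. This is only an illustrative sketch, not the library's own parser:

import re

# Hypothetical tokenizer for the <number><operator> grammar; the library's
# actual implementation may differ.
TOKEN_PATTERN = re.compile(r'(\d+)([TSBM])')

def tokenize(structure: str) -> list:
    return [number + operator for number, operator in TOKEN_PATTERN.findall(structure)]

assert tokenize('10M141T8B') == ['10M', '141T', '8B']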
@@ -274,9 +275,6 @@ def __init__(
self._store: Mapping
self.sample_sheet: Optional[SampleSheet] = None

for key in RECOMMENDED_KEYS:
self[key] = None

for key, value in data.items():
# Promote a ``Read_Structure`` key to :class:`ReadStructure`.
# Support case insensitivity and any amount of underscores.
@@ -418,11 +416,18 @@ def add_section(self, section_name: str) -> None:
setattr(self, section_name, Section())

@property
def all_sample_keys(self) -> Set[str]:
def all_sample_keys(self) -> List[str]:
"""Return the unique keys of all samples in this :class:`SampleSheet`.
The keys are discovered first by the order of samples and second by
the order of keys within each sample.
"""
return set(chain.from_iterable([sample.keys() for sample in self]))
all_keys: List[str] = []
for key in chain.from_iterable([sample.keys() for sample in self]):
if key not in all_keys:
all_keys.append(key)
return all_keys
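The membership loop above keeps the first occurrence of each key in the order it is seen. An equivalent order-preserving de-duplication, shown here only as a sketch and relying on insertion-ordered dicts (Python 3.7+), would be:

from itertools import chain

def unique_keys_in_order(samples) -> list:
    # dict.fromkeys drops duplicates while preserving first-seen order.
    return list(dict.fromkeys(chain.from_iterable(s.keys() for s in samples)))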

@property
def experimental_design(self) -> Any:
@@ -551,7 +556,24 @@ def add_sample(self, sample: Sample) -> None:
Args:
sample: :class:`Sample` to add to this :class:`SampleSheet`.
Note:
It is unclear whether the Illumina specification truly allows
equivalent samples to exist on the same sample sheet. To silence
the warnings this library emits when you encounter such a case,
use a code pattern like the following:
>>> import warnings
>>> warnings.simplefilter("ignore")
>>> from sample_sheet import SampleSheet
>>> SampleSheet('tests/resources/single-end-colliding-sample-ids.csv');
SampleSheet('tests/resources/single-end-colliding-sample-ids.csv')
"""
# Do not allow samples without Sample_ID defined.
if sample.Sample_ID is None:
raise ValueError('Sample must have "Sample_ID" defined.')

# Set whether the samples will have ``index`` or ``index2``.
if len(self.samples) == 0:
self.samples_have_index = sample.index is not None
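For context, this index bookkeeping is what later rejects mixed-index additions. A sketch of that behavior, assuming no other validation intervenes first:

from sample_sheet import Sample, SampleSheet

sheet = SampleSheet()
sheet.add_sample(Sample({'Sample_ID': 1, 'index': 'ACGTTNAT'}))

# A second sample without `index` is rejected once samples_have_index is set.
try:
    sheet.add_sample(Sample({'Sample_ID': 2}))
except ValueError as error:
    print(error)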
@@ -596,11 +618,12 @@ def add_sample(self, sample: Sample) -> None:
# both if they have been defined.
for other in self.samples:
if sample == other:
raise ValueError(
f'Cannot add two samples with the same '
f'`Sample_ID`, `Library_ID`, and `Lane`: '
f'sample - {sample}, other - {other}'
message = (
f'Two equivalent samples added:'
f'\n\n1): {sample.__repr__()}\n2): {other.__repr__()}\n'
)
# TODO: Look into whether this is truly illegal.
warnings.warn(UserWarning(message))
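Callers who want to keep other warnings visible can filter only this message rather than ignoring everything, mirroring the ``filterwarnings('ignore:Two equivalent')`` markers used in the tests below. A sketch, assuming the message prefix stays stable:

import warnings

# Suppress only the sample-collision warning; other warnings still surface.
warnings.filterwarnings('ignore', message='Two equivalent samples added')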
if sample.index is None and self.samples_have_index:
raise ValueError(
f'Cannot add a sample without attribute `index` if a '
@@ -847,13 +870,14 @@ def write(self, handle: TextIO, blank_lines: int = 1) -> None:
blank_lines: Number of blank lines to write between sections.
"""
writer = csv.writer(handle)
csv_width = max(len(RECOMMENDED_KEYS), len(self.all_sample_keys))
section_order = ['Header', 'Reads'] + self._sections + ['Settings']

if not isinstance(blank_lines, int) or blank_lines <= 0:
raise ValueError('Number of blank lines must be a positive int.')

writer = csv.writer(handle)
csv_width: int = max([len(self.all_sample_keys), 2])

section_order = ['Header', 'Reads'] + self._sections + ['Settings']

def pad_iterable(
iterable: Iterable, size: int = csv_width, padding: str = ''
) -> List[str]:
@@ -877,12 +901,10 @@ def write_blank_lines(
write_blank_lines(writer)

writer.writerow(pad_iterable(['[Data]'], csv_width))
other_keys = self.all_sample_keys - set(RECOMMENDED_KEYS)
samples_header = RECOMMENDED_KEYS + sorted(other_keys)
writer.writerow(pad_iterable(samples_header, csv_width))
writer.writerow(pad_iterable(self.all_sample_keys, csv_width))

for sample in self.samples:
line = [getattr(sample, key) for key in samples_header]
line = [getattr(sample, key) for key in self.all_sample_keys]
writer.writerow(pad_iterable(line, csv_width))
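Because the ``[Data]`` header row is now written straight from ``all_sample_keys``, the column order of the output follows the first-seen key order across samples. A small sketch using the same fixtures as the tests below:

from sample_sheet import Sample, SampleSheet

sheet = SampleSheet()
sheet.add_sample(Sample({'Sample_ID': 49, 'Key1': 1}))
sheet.add_sample(Sample({'Sample_ID': 23, 'Key2': 2}))

# This list becomes the [Data] header row (padded to the CSV width).
assert sheet.all_sample_keys == ['Sample_ID', 'Key1', 'Key2']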

def __len__(self) -> int:
66 changes: 66 additions & 0 deletions tests/resources/single-end-colliding-sample-ids.csv
@@ -0,0 +1,66 @@
[Header],,,,,,,,,,
IEMFileVersion,5,,,,,,,,,
Experiment Name,Tsqn180801,,,,,,,,,
Date,3/08/2018,,,,,,,,,
Workflow,GenerateFASTQ,,,,,,,,,
Application,NovaSeq FASTQ Only,,,,,,,,,
Instrument Type,NovaSeq,,,,,,,,,
Assay,TruSeq Nano DNA,,,,,,,,,
Index Adapters,IDT-ILMN TruSeq DNA UD Indexes (96 Indexes),,,,,,,,,
Description,Tsqn180801,,,,,,,,,
Chemistry,Amplicon,,,,,,,,,
,,,,,,,,,,
[Reads],,,,,,,,,,
151,,,,,,,,,,
151,,,,,,,,,,
,,,,,,,,,,
[Settings],,,,,,,,,,
Adapter,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,,,,,,,,,
AdapterRead2,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT,,,,,,,,,
,,,,,,,,,,
[Data],,,,,,,,,,
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Index_Plate_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
PRJ180538_VPH20T,,,,,SI-GA-G6_1,CTGACGCG,,,,
PRJ180538_VPH20T,,,,,SI-GA-G6_2,GGTCGTAC,,,,
PRJ180538_VPH20T,,,,,SI-GA-G6_3,TCCTTCTT,,,,
PRJ180538_VPH20T,,,,,SI-GA-G6_4,AAAGAAGA,,,,
PRJ180539_VCBPH5T,,,,,SI-GA-G7_1,GGTATGCA,,,,
PRJ180539_VCBPH5T,,,,,SI-GA-G7_2,CTCGAAAT,,,,
PRJ180539_VCBPH5T,,,,,SI-GA-G7_3,ACACCTTC,,,,
PRJ180539_VCBPH5T,,,,,SI-GA-G7_4,TAGTGCGG,,,,
PRJ180540_VCBP14T,,,,,SI-GA-G8_1,TATGAGCT,,,,
PRJ180540_VCBP14T,,,,,SI-GA-G8_2,CCGATAGC,,,,
PRJ180540_VCBP14T,,,,,SI-GA-G8_3,ATACCCAA,,,,
PRJ180540_VCBP14T,,,,,SI-GA-G8_4,GGCTGTTG,,,,
PRJ180541_VPH8T,,,,,SI-GA-G9_1,TAGGACGT,,,,
PRJ180541_VPH8T,,,,,SI-GA-G9_2,ATCCCACA,,,,
PRJ180541_VPH8T,,,,,SI-GA-G9_3,GGAATGTC,,,,
PRJ180541_VPH8T,,,,,SI-GA-G9_4,CCTTGTAG,,,,
PRJ180542_VPH23T,,,,,SI-GA-G10_1,TCGCCAGC,,,,
PRJ180542_VPH23T,,,,,SI-GA-G10_2,AATGTTAG,,,,
PRJ180542_VPH23T,,,,,SI-GA-G10_3,CGATAGCT,,,,
PRJ180542_VPH23T,,,,,SI-GA-G10_4,GTCAGCTA,,,,
PRJ180543_VPH36T,,,,,SI-GA-G11_1,TTATCGTT,,,,
PRJ180543_VPH36T,,,,,SI-GA-G11_2,AGCAGAGC,,,,
PRJ180543_VPH36T,,,,,SI-GA-G11_3,CATCTCCA,,,,
PRJ180543_VPH36T,,,,,SI-GA-G11_4,GCGGATAG,,,,
PRJ180544_PGL3,,,,,SI-GA-G12_1,ATTCTAAG,,,,
PRJ180544_PGL3,,,,,SI-GA-G12_2,CCCGATTA,,,,
PRJ180544_PGL3,,,,,SI-GA-G12_3,TGGAGGCT,,,,
PRJ180544_PGL3,,,,,SI-GA-G12_4,GAATCCGC,,,,
PRJ180545_LSI_noIAA,,,,,SI-GA-F3_1,TTCAGGTG,,,,
PRJ180545_LSI_noIAA,,,,,SI-GA-F3_2,ACGGACAT,,,,
PRJ180545_LSI_noIAA,,,,,SI-GA-F3_3,GATCTTGA,,,,
PRJ180545_LSI_noIAA,,,,,SI-GA-F3_4,CGATCACC,,,,
PRJ180546_LSI_IAA,,,,,SI-GA-F4_1,CCCAATAG,,,,
PRJ180546_LSI_IAA,,,,,SI-GA-F4_2,GTGTCGCT,,,,
PRJ180546_LSI_IAA,,,,,SI-GA-F4_3,AGAGTCGC,,,,
PRJ180546_LSI_IAA,,,,,SI-GA-F4_4,TATCGATA,,,,
PRJ180547_LL_30,,,,,SI-GA-F1_1,GTTGCAGC,,,,
PRJ180547_LL_30,,,,,SI-GA-F1_2,TGGAATTA,,,,
PRJ180547_LL_30,,,,,SI-GA-F1_3,CAATGGAG,,,,
PRJ180547_LL_30,,,,,SI-GA-F1_4,ACCCTCCT,,,,
PRJ180548_LL_38,,,,,SI-GA-F2_1,TTTACATG,,,,
PRJ180548_LL_38,,,,,SI-GA-F2_2,CGCGATAC,,,,
PRJ180548_LL_38,,,,,SI-GA-F2_3,ACGCGGGT,,,,
PRJ180548_LL_38,,,,,SI-GA-F2_4,GAATTCCA,,,,
14 changes: 4 additions & 10 deletions tests/test_sample.py
@@ -1,11 +1,11 @@
import pytest

from nose.tools import assert_dict_equal
from nose.tools import assert_is_instance
from nose.tools import assert_is_none
from nose.tools import assert_dict_equal
from nose.tools import assert_list_equal
from nose.tools import assert_not_equal
from nose.tools import assert_raises
from nose.tools import assert_set_equal
from nose.tools import eq_

from unittest import TestCase
@@ -27,21 +27,15 @@ def test_default_getattr(self):
for key in ('not_real', 'fake'):
assert_is_none(getattr(Sample(), key))

def test_keys_on_blank_init(self):
"""Test that recommended keys exist on blank initialization."""
sample = Sample()
assert_set_equal(set(sample.keys()), set(RECOMMENDED_KEYS))

def test_promotion_of_read_structure(self):
"""Test that a Read_Structure key is promoted to ``ReadStructure``."""
sample = Sample({'Read_Structure': '10M141T8B', 'index': 'ACGTGCNA'})
assert_is_instance(sample.Read_Structure, ReadStructure)

def test_additional_key_is_added(self):
"""Test that an additional key is added to ``keys()`` method."""
assert_set_equal(
set(Sample({'Read_Structure': '151T'}).keys()),
{'index', 'Read_Structure', 'Sample_ID', 'Sample_Name'},
assert_list_equal(
list(Sample({'Read_Structure': '151T'}).keys()), ['Read_Structure']
)
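A related consequence of dropping the ``RECOMMENDED_KEYS`` pre-population in ``Sample.__init__`` is that a blank ``Sample`` now starts with no keys at all. A sketch, assuming no other defaults are injected elsewhere:

from sample_sheet import Sample

assert list(Sample().keys()) == []
assert list(Sample({'Read_Structure': '151T'}).keys()) == ['Read_Structure']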

def test_read_structure_with_single_index(self):
35 changes: 28 additions & 7 deletions tests/test_sample_sheet.py
@@ -1,3 +1,5 @@
import pytest

from nose.tools import assert_false
from nose.tools import assert_is_instance
from nose.tools import assert_is_none
@@ -128,7 +130,7 @@ def test_add_samples(self):

def test_add_sample_with_index(self):
"""Test that the SampleSheet sets a sample with attribute ``index``"""
sample = Sample({'index': 'ACGTTNAT'})
sample = Sample({'Sample_ID': 0, 'index': 'ACGTTNAT'})
sample_sheet = SampleSheet()

sample_sheet.add_sample(sample)
@@ -138,7 +140,7 @@ def test_add_sample_with_index2(self):

def test_add_sample_with_index2(self):
"""Test that the SampleSheet sets a sample with attribute ``index2``"""
sample = Sample({'index2': 'ACGTTNAT'})
sample = Sample({'Sample_ID': 0, 'index2': 'ACGTTNAT'})
sample_sheet = SampleSheet()

assert_is_none(sample_sheet.samples_have_index)
@@ -162,12 +164,14 @@ def test_add_samples_with_same_index_different_index2(self):

assert_is_none(sample_sheet.add_sample(sample2))

@pytest.mark.xfail
@pytest.mark.filterwarnings("ignore:Two equivalent")
def test_add_sample_same_twice(self):
"""Test ``add_sample()`` when two samples having the same ``Sample_ID``
and ``Library_ID`` are added.
"""
sample = Sample()
sample = Sample({'Sample_ID': 0})
sample_sheet = SampleSheet()
sample_sheet.add_sample(sample)

@@ -271,16 +275,15 @@ def test_add_sample_with_different_index_combination(self):
assert_raises(ValueError, sample_sheet.add_sample, sample2)

def test_all_sample_keys(self):
"""Test ``all_sample_keys()`` to return set of all sample keys."""
"""Test ``all_sample_keys()`` to return list of all sample keys."""
sample1 = Sample({'Sample_ID': 49, 'Key1': 1})
sample2 = Sample({'Sample_ID': 23, 'Key2': 2})
sample_sheet = SampleSheet()
sample_sheet.add_sample(sample1)
sample_sheet.add_sample(sample2)

eq_(
sample_sheet.all_sample_keys,
{'Sample_ID', 'Sample_Name', 'index', 'Key1', 'Key2'},
assert_list_equal(
sample_sheet.all_sample_keys, ['Sample_ID', 'Key1', 'Key2']
)

def test_parse_invalid_ascii(self):
@@ -646,6 +649,24 @@ def test_write_custom_sections(self):
)
eq_(sample_sheet2.TestingSection.KeyNumber1, 'DNAMatrix.txt')

@pytest.mark.filterwarnings("ignore:Two equivalent")
def test_write_with_equal_samples_and_custom_ordered_header(self):
"""Test ``write()`` when given invalid number of blank lines"""
infile = RESOURCES / 'single-end-colliding-sample-ids.csv'
sample_sheet1 = SampleSheet(infile)

# Write to string and make temporary file
string_handle = StringIO(newline=None)
sample_sheet1.write(string_handle)
string_handle.seek(0)
filename = string_as_temporary_file(string_handle.read())

# Read the temporary file and confirm the sample key order is preserved.
sample_sheet2 = SampleSheet(filename)
assert_list_equal(
sample_sheet1.all_sample_keys, sample_sheet2.all_sample_keys
)

def test_write_invalid_num_blank_lines(self):
"""Test ``write()`` when given invalid number of blank lines"""
infile = RESOURCES / 'paired-end-single-index.csv'
