Commit

Allow collision of samples with warning only, custom ordered headers support (#66)

Closes #53, closes #52, closes #51.
clintval committed Aug 13, 2018
1 parent d8a1b9b commit 215ae05
Showing 4 changed files with 146 additions and 43 deletions.
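As a quick illustration of the headline change, the following sketch (assuming this commit is installed and the repository's test resources are on disk) shows that parsing a sheet with colliding Sample_IDs now emits a ``UserWarning`` instead of raising a ``ValueError``:

import warnings

from sample_sheet import SampleSheet

# Colliding samples now produce a warning rather than an exception.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    SampleSheet('tests/resources/single-end-colliding-sample-ids.csv')

assert any('Two equivalent samples added' in str(w.message) for w in caught)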
74 changes: 48 additions & 26 deletions sample_sheet/__init__.py
@@ -3,6 +3,7 @@
import os
import re
import sys
import warnings

from contextlib import ExitStack
from itertools import chain, repeat, islice
@@ -60,14 +61,14 @@ class ReadStructure(object):
A read structure is a sequence of tokens in the form ``<number><operator>``
where ``<operator>`` can describe template, skip, index, or UMI bases.
======== =====================================================
Operator Description
======== =====================================================
T Template base (*e.g.* experimental DNA, RNA)
S Bases to be skipped or ignored
B Bases to be used as an index to identify the sample
M Bases to be used as an index to identify the molecule
======== =====================================================
========== =====================================================
Operator Description
========== =====================================================
**T** Template base (*e.g.* experimental DNA, RNA)
**S** Bases to be skipped or ignored
**B** Bases to be used as an index to identify the sample
**M** Bases to be used as an index to identify the molecule
========== =====================================================
Args:
structure: Read structure string representation.
@@ -81,7 +82,7 @@ class ReadStructure(object):
>>> rs.tokens
['10M', '141T', '8B']
Notes:
Note:
This class does not currently support read structures where the last
operator has ambiguous length by using ``<+>`` preceding the
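As an aside, the ``<number><operator>`` grammar described in the table above can be tokenized with a short regular expression. This is only an illustrative sketch, not the library's own parser:

import re

# Hypothetical tokenizer for the <number><operator> grammar; the library's
# actual implementation may differ.
TOKEN_PATTERN = re.compile(r'(\d+)([TSBM])')

def tokenize(structure: str) -> list:
    return [number + operator for number, operator in TOKEN_PATTERN.findall(structure)]

assert tokenize('10M141T8B') == ['10M', '141T', '8B']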
@@ -274,9 +275,6 @@ def __init__(
self._store: Mapping
self.sample_sheet: Optional[SampleSheet] = None

for key in RECOMMENDED_KEYS:
self[key] = None

for key, value in data.items():
# Promote a ``Read_Structure`` key to :class:`ReadStructure`.
# Support case insensitivity and any amount of underscores.
@@ -418,11 +416,18 @@ def add_section(self, section_name: str) -> None:
setattr(self, section_name, Section())

@property
def all_sample_keys(self) -> Set[str]:
def all_sample_keys(self) -> List[str]:
"""Return the unique keys of all samples in this :class:`SampleSheet`.
The keys are discovered first by the order of samples and second by
the order of keys within each sample.
"""
return set(chain.from_iterable([sample.keys() for sample in self]))
all_keys: List[str] = []
for key in chain.from_iterable([sample.keys() for sample in self]):
if key not in all_keys:
all_keys.append(key)
return all_keys
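The membership loop above keeps the first occurrence of each key in the order it is seen. An equivalent order-preserving de-duplication, shown here only as a sketch and relying on insertion-ordered dicts (Python 3.7+), would be:

from itertools import chain

def unique_keys_in_order(samples) -> list:
    # dict.fromkeys drops duplicates while preserving first-seen order.
    return list(dict.fromkeys(chain.from_iterable(s.keys() for s in samples)))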

@property
def experimental_design(self) -> Any:
@@ -551,7 +556,24 @@ def add_sample(self, sample: Sample) -> None:
Args:
sample: :class:`Sample` to add to this :class:`SampleSheet`.
Note:
It is unclear whether the Illumina specification truly allows
equivalent samples to exist on the same sample sheet. To silence
the warnings this library emits when you encounter such a case,
use a code pattern like the following:
>>> import warnings
>>> warnings.simplefilter("ignore")
>>> from sample_sheet import SampleSheet
>>> SampleSheet('tests/resources/single-end-colliding-sample-ids.csv');
SampleSheet('tests/resources/single-end-colliding-sample-ids.csv')
"""
# Do not allow samples without Sample_ID defined.
if sample.Sample_ID is None:
raise ValueError('Sample must have "Sample_ID" defined.')

# Set whether the samples will have ``index`` or ``index2``.
if len(self.samples) == 0:
self.samples_have_index = sample.index is not None
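For context, this index bookkeeping is what later rejects mixed-index additions. A sketch of that behavior, assuming no other validation intervenes first:

from sample_sheet import Sample, SampleSheet

sheet = SampleSheet()
sheet.add_sample(Sample({'Sample_ID': 1, 'index': 'ACGTTNAT'}))

# A second sample without `index` is rejected once samples_have_index is set.
try:
    sheet.add_sample(Sample({'Sample_ID': 2}))
except ValueError as error:
    print(error)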
@@ -596,11 +618,12 @@ def add_sample(self, sample: Sample) -> None:
# both if they have been defined.
for other in self.samples:
if sample == other:
raise ValueError(
f'Cannot add two samples with the same '
f'`Sample_ID`, `Library_ID`, and `Lane`: '
f'sample - {sample}, other - {other}'
message = (
f'Two equivalent samples added:'
f'\n\n1): {sample.__repr__()}\n2): {other.__repr__()}\n'
)
# TODO: Look into whether this is truly illegal.
warnings.warn(UserWarning(message))
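Callers who want to keep other warnings visible can filter only this message rather than ignoring everything, mirroring the ``filterwarnings('ignore:Two equivalent')`` markers used in the tests below. A sketch, assuming the message prefix stays stable:

import warnings

# Suppress only the sample-collision warning; other warnings still surface.
warnings.filterwarnings('ignore', message='Two equivalent samples added')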
if sample.index is None and self.samples_have_index:
raise ValueError(
f'Cannot add a sample without attribute `index` if a '
@@ -847,13 +870,14 @@ def write(self, handle: TextIO, blank_lines: int = 1) -> None:
blank_lines: Number of blank lines to write between sections.
"""
writer = csv.writer(handle)
csv_width = max(len(RECOMMENDED_KEYS), len(self.all_sample_keys))
section_order = ['Header', 'Reads'] + self._sections + ['Settings']

if not isinstance(blank_lines, int) or blank_lines <= 0:
raise ValueError('Number of blank lines must be a positive int.')

writer = csv.writer(handle)
csv_width: int = max([len(self.all_sample_keys), 2])

section_order = ['Header', 'Reads'] + self._sections + ['Settings']

def pad_iterable(
iterable: Iterable, size: int = csv_width, padding: str = ''
) -> List[str]:
@@ -877,12 +901,10 @@ def write_blank_lines(
write_blank_lines(writer)

writer.writerow(pad_iterable(['[Data]'], csv_width))
other_keys = self.all_sample_keys - set(RECOMMENDED_KEYS)
samples_header = RECOMMENDED_KEYS + sorted(other_keys)
writer.writerow(pad_iterable(samples_header, csv_width))
writer.writerow(pad_iterable(self.all_sample_keys, csv_width))

for sample in self.samples:
line = [getattr(sample, key) for key in samples_header]
line = [getattr(sample, key) for key in self.all_sample_keys]
writer.writerow(pad_iterable(line, csv_width))
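Because the ``[Data]`` header row is now written straight from ``all_sample_keys``, the column order of the output follows the first-seen key order across samples. A small sketch using the same fixtures as the tests below:

from sample_sheet import Sample, SampleSheet

sheet = SampleSheet()
sheet.add_sample(Sample({'Sample_ID': 49, 'Key1': 1}))
sheet.add_sample(Sample({'Sample_ID': 23, 'Key2': 2}))

# This list becomes the [Data] header row (padded to the CSV width).
assert sheet.all_sample_keys == ['Sample_ID', 'Key1', 'Key2']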

def __len__(self) -> int:
66 changes: 66 additions & 0 deletions tests/resources/single-end-colliding-sample-ids.csv
@@ -0,0 +1,66 @@
[Header],,,,,,,,,,
IEMFileVersion,5,,,,,,,,,
Experiment Name,Tsqn180801,,,,,,,,,
Date,3/08/2018,,,,,,,,,
Workflow,GenerateFASTQ,,,,,,,,,
Application,NovaSeq FASTQ Only,,,,,,,,,
Instrument Type,NovaSeq,,,,,,,,,
Assay,TruSeq Nano DNA,,,,,,,,,
Index Adapters,IDT-ILMN TruSeq DNA UD Indexes (96 Indexes),,,,,,,,,
Description,Tsqn180801,,,,,,,,,
Chemistry,Amplicon,,,,,,,,,
,,,,,,,,,,
[Reads],,,,,,,,,,
151,,,,,,,,,,
151,,,,,,,,,,
,,,,,,,,,,
[Settings],,,,,,,,,,
Adapter,AGATCGGAAGAGCACACGTCTGAACTCCAGTCA,,,,,,,,,
AdapterRead2,AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT,,,,,,,,,
,,,,,,,,,,
[Data],,,,,,,,,,
Sample_ID,Sample_Name,Sample_Plate,Sample_Well,Index_Plate_Well,I7_Index_ID,index,I5_Index_ID,index2,Sample_Project,Description
PRJ180538_VPH20T,,,,,SI-GA-G6_1,CTGACGCG,,,,
PRJ180538_VPH20T,,,,,SI-GA-G6_2,GGTCGTAC,,,,
PRJ180538_VPH20T,,,,,SI-GA-G6_3,TCCTTCTT,,,,
PRJ180538_VPH20T,,,,,SI-GA-G6_4,AAAGAAGA,,,,
PRJ180539_VCBPH5T,,,,,SI-GA-G7_1,GGTATGCA,,,,
PRJ180539_VCBPH5T,,,,,SI-GA-G7_2,CTCGAAAT,,,,
PRJ180539_VCBPH5T,,,,,SI-GA-G7_3,ACACCTTC,,,,
PRJ180539_VCBPH5T,,,,,SI-GA-G7_4,TAGTGCGG,,,,
PRJ180540_VCBP14T,,,,,SI-GA-G8_1,TATGAGCT,,,,
PRJ180540_VCBP14T,,,,,SI-GA-G8_2,CCGATAGC,,,,
PRJ180540_VCBP14T,,,,,SI-GA-G8_3,ATACCCAA,,,,
PRJ180540_VCBP14T,,,,,SI-GA-G8_4,GGCTGTTG,,,,
PRJ180541_VPH8T,,,,,SI-GA-G9_1,TAGGACGT,,,,
PRJ180541_VPH8T,,,,,SI-GA-G9_2,ATCCCACA,,,,
PRJ180541_VPH8T,,,,,SI-GA-G9_3,GGAATGTC,,,,
PRJ180541_VPH8T,,,,,SI-GA-G9_4,CCTTGTAG,,,,
PRJ180542_VPH23T,,,,,SI-GA-G10_1,TCGCCAGC,,,,
PRJ180542_VPH23T,,,,,SI-GA-G10_2,AATGTTAG,,,,
PRJ180542_VPH23T,,,,,SI-GA-G10_3,CGATAGCT,,,,
PRJ180542_VPH23T,,,,,SI-GA-G10_4,GTCAGCTA,,,,
PRJ180543_VPH36T,,,,,SI-GA-G11_1,TTATCGTT,,,,
PRJ180543_VPH36T,,,,,SI-GA-G11_2,AGCAGAGC,,,,
PRJ180543_VPH36T,,,,,SI-GA-G11_3,CATCTCCA,,,,
PRJ180543_VPH36T,,,,,SI-GA-G11_4,GCGGATAG,,,,
PRJ180544_PGL3,,,,,SI-GA-G12_1,ATTCTAAG,,,,
PRJ180544_PGL3,,,,,SI-GA-G12_2,CCCGATTA,,,,
PRJ180544_PGL3,,,,,SI-GA-G12_3,TGGAGGCT,,,,
PRJ180544_PGL3,,,,,SI-GA-G12_4,GAATCCGC,,,,
PRJ180545_LSI_noIAA,,,,,SI-GA-F3_1,TTCAGGTG,,,,
PRJ180545_LSI_noIAA,,,,,SI-GA-F3_2,ACGGACAT,,,,
PRJ180545_LSI_noIAA,,,,,SI-GA-F3_3,GATCTTGA,,,,
PRJ180545_LSI_noIAA,,,,,SI-GA-F3_4,CGATCACC,,,,
PRJ180546_LSI_IAA,,,,,SI-GA-F4_1,CCCAATAG,,,,
PRJ180546_LSI_IAA,,,,,SI-GA-F4_2,GTGTCGCT,,,,
PRJ180546_LSI_IAA,,,,,SI-GA-F4_3,AGAGTCGC,,,,
PRJ180546_LSI_IAA,,,,,SI-GA-F4_4,TATCGATA,,,,
PRJ180547_LL_30,,,,,SI-GA-F1_1,GTTGCAGC,,,,
PRJ180547_LL_30,,,,,SI-GA-F1_2,TGGAATTA,,,,
PRJ180547_LL_30,,,,,SI-GA-F1_3,CAATGGAG,,,,
PRJ180547_LL_30,,,,,SI-GA-F1_4,ACCCTCCT,,,,
PRJ180548_LL_38,,,,,SI-GA-F2_1,TTTACATG,,,,
PRJ180548_LL_38,,,,,SI-GA-F2_2,CGCGATAC,,,,
PRJ180548_LL_38,,,,,SI-GA-F2_3,ACGCGGGT,,,,
PRJ180548_LL_38,,,,,SI-GA-F2_4,GAATTCCA,,,,
14 changes: 4 additions & 10 deletions tests/test_sample.py
@@ -1,11 +1,11 @@
import pytest

from nose.tools import assert_dict_equal
from nose.tools import assert_is_instance
from nose.tools import assert_is_none
from nose.tools import assert_dict_equal
from nose.tools import assert_list_equal
from nose.tools import assert_not_equal
from nose.tools import assert_raises
from nose.tools import assert_set_equal
from nose.tools import eq_

from unittest import TestCase
@@ -27,21 +27,15 @@ def test_default_getattr(self):
for key in ('not_real', 'fake'):
assert_is_none(getattr(Sample(), key))

def test_keys_on_blank_init(self):
"""Test that recommended keys exist on blank initialization."""
sample = Sample()
assert_set_equal(set(sample.keys()), set(RECOMMENDED_KEYS))

def test_promotion_of_read_structure(self):
"""Test that a Read_Structure key is promoted to ``ReadStructure``."""
sample = Sample({'Read_Structure': '10M141T8B', 'index': 'ACGTGCNA'})
assert_is_instance(sample.Read_Structure, ReadStructure)

def test_additional_key_is_added(self):
"""Test that an additional key is added to ``keys()`` method."""
assert_set_equal(
set(Sample({'Read_Structure': '151T'}).keys()),
{'index', 'Read_Structure', 'Sample_ID', 'Sample_Name'},
assert_list_equal(
list(Sample({'Read_Structure': '151T'}).keys()), ['Read_Structure']
)
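A related consequence of dropping the ``RECOMMENDED_KEYS`` pre-population in ``Sample.__init__`` is that a blank ``Sample`` now starts with no keys at all. A sketch, assuming no other defaults are injected elsewhere:

from sample_sheet import Sample

assert list(Sample().keys()) == []
assert list(Sample({'Read_Structure': '151T'}).keys()) == ['Read_Structure']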

def test_read_structure_with_single_index(self):
35 changes: 28 additions & 7 deletions tests/test_sample_sheet.py
@@ -1,3 +1,5 @@
import pytest

from nose.tools import assert_false
from nose.tools import assert_is_instance
from nose.tools import assert_is_none
@@ -128,7 +130,7 @@ def test_add_samples(self):

def test_add_sample_with_index(self):
"""Test that the SampleSheet sets a sample with attribute ``index``"""
sample = Sample({'index': 'ACGTTNAT'})
sample = Sample({'Sample_ID': 0, 'index': 'ACGTTNAT'})
sample_sheet = SampleSheet()

sample_sheet.add_sample(sample)
@@ -138,7 +140,7 @@ def test_add_sample_with_index2(self):

def test_add_sample_with_index2(self):
"""Test that the SampleSheet sets a sample with attribute ``index2``"""
sample = Sample({'index2': 'ACGTTNAT'})
sample = Sample({'Sample_ID': 0, 'index2': 'ACGTTNAT'})
sample_sheet = SampleSheet()

assert_is_none(sample_sheet.samples_have_index)
@@ -162,12 +164,14 @@ def test_add_samples_with_same_index_different_index2(self):

assert_is_none(sample_sheet.add_sample(sample2))

@pytest.mark.xfail
@pytest.mark.filterwarnings("ignore:Two equivalent")
def test_add_sample_same_twice(self):
"""Test ``add_sample()`` when two samples having the same ``Sample_ID``
and ``Library_ID`` are added.
"""
sample = Sample()
sample = Sample({'Sample_ID': 0})
sample_sheet = SampleSheet()
sample_sheet.add_sample(sample)

@@ -271,16 +275,15 @@ def test_add_sample_with_different_index_combination(self):
assert_raises(ValueError, sample_sheet.add_sample, sample2)

def test_all_sample_keys(self):
"""Test ``all_sample_keys()`` to return set of all sample keys."""
"""Test ``all_sample_keys()`` to return list of all sample keys."""
sample1 = Sample({'Sample_ID': 49, 'Key1': 1})
sample2 = Sample({'Sample_ID': 23, 'Key2': 2})
sample_sheet = SampleSheet()
sample_sheet.add_sample(sample1)
sample_sheet.add_sample(sample2)

eq_(
sample_sheet.all_sample_keys,
{'Sample_ID', 'Sample_Name', 'index', 'Key1', 'Key2'},
assert_list_equal(
sample_sheet.all_sample_keys, ['Sample_ID', 'Key1', 'Key2']
)

def test_parse_invalid_ascii(self):
@@ -646,6 +649,24 @@ def test_write_custom_sections(self):
)
eq_(sample_sheet2.TestingSection.KeyNumber1, 'DNAMatrix.txt')

@pytest.mark.filterwarnings("ignore:Two equivalent")
def test_write_with_equal_samples_and_custom_ordered_header(self):
"""Test ``write()`` when given invalid number of blank lines"""
infile = RESOURCES / 'single-end-colliding-sample-ids.csv'
sample_sheet1 = SampleSheet(infile)

# Write to string and make temporary file
string_handle = StringIO(newline=None)
sample_sheet1.write(string_handle)
string_handle.seek(0)
filename = string_as_temporary_file(string_handle.read())

# Read the temporary file and confirm the sample key order is preserved.
sample_sheet2 = SampleSheet(filename)
assert_list_equal(
sample_sheet1.all_sample_keys, sample_sheet2.all_sample_keys
)

def test_write_invalid_num_blank_lines(self):
"""Test ``write()`` when given invalid number of blank lines"""
infile = RESOURCES / 'paired-end-single-index.csv'
