Skip to content

Commit

Permalink
Merge pull request #160 from fls-bioinformatics-core/bcftbx-SampleSheetPredictor-handle-index-reads
Browse files (browse the repository at this point in the history)

bcftbx/IlluminaData: update SampleSheetPredictor to deal with index/custom reads
  • Loading branch information
pjbriggs committed May 20, 2020
2 parents bfe8ec7 + a9c2ee1 commit 279a120
Show file tree
Hide file tree
Showing 2 changed files with 162 additions and 6 deletions.
70 changes: 64 additions & 6 deletions bcftbx/IlluminaData.py
Expand Up @@ -1850,6 +1850,8 @@ def __init__(self,sample_sheet=None,sample_sheet_file=None):
self._predict_paired_end = False
self._predict_no_lane_splitting = False
self._predict_for_lanes = None
self._predict_for_reads = None
self._include_index_reads = False
self._force_sample_dir = False
# Read in data
if sample_sheet is None:
Expand Down Expand Up @@ -1941,6 +1943,7 @@ def add_project(self,project_name):

def set(self,package=None,paired_end=None,
no_lane_splitting=None,lanes=None,
reads=None,include_index_reads=None,
force_sample_dir=None):
"""
Configure settings for prediction
Expand All @@ -1949,11 +1952,18 @@ def set(self,package=None,paired_end=None,
(can be 'bcl2fastq2' or 'casava')
- paired_end: if True then predict outputs as if
data is paired end (i.e. R1 and R2 pairs)
(NB ignored if 'reads' argument is set)
- no_lane_splitting: if True then predict outputs
as if --no-lane-splitting was used for bcl2fastq
- lanes: if set then should be a list of lane
numbers that will be used when generating Fastq
names
- reads: if set then should be a list or other
iterable with the reads to include in the
prediction (e.g. ('R1','R2','I1','R3'))
- include_index_reads: if True then includes
index reads (i.e. I1,...) in prediction
(NB ignored if 'reads' argument is set)
- force_sample_dir: if True then force insertion
of a 'sample name' directory for IEM4 sample
sheets where sample name and ID are the same
Expand All @@ -1966,6 +1976,10 @@ def set(self,package=None,paired_end=None,
if no_lane_splitting is not None:
self._predict_no_lane_splitting = no_lane_splitting
self._predict_for_lanes = lanes
if reads is not None:
self._predict_for_reads = reads
if include_index_reads is not None:
self._include_index_reads = include_index_reads
if force_sample_dir is not None:
self._force_sample_dir = force_sample_dir
# Configure projects with same settings
Expand All @@ -1974,6 +1988,8 @@ def set(self,package=None,paired_end=None,
paired_end=paired_end,
no_lane_splitting=no_lane_splitting,
lanes=lanes,
reads=reads,
include_index_reads=include_index_reads,
force_sample_dir=force_sample_dir)

class SampleSheetProject(object):
Expand Down Expand Up @@ -2017,6 +2033,8 @@ def __init__(self,project_name):
self._predict_paired_end = False
self._predict_no_lane_splitting = False
self._predict_for_lanes = None
self._predict_for_reads = None
self._include_index_reads = False
self._force_sample_dir = False

@property
Expand Down Expand Up @@ -2092,6 +2110,7 @@ def add_sample(self,sample_id,sample_name=None,s_index=None):

def set(self,package=None,paired_end=None,
no_lane_splitting=None,lanes=None,
reads=None,include_index_reads=None,
force_sample_dir=None):
"""
Configure settings for prediction
Expand All @@ -2100,11 +2119,18 @@ def set(self,package=None,paired_end=None,
(can be 'bcl2fastq2' or 'casava')
- paired_end: if True then predict outputs as if
data is paired end (i.e. R1 and R2 pairs)
(NB ignored if 'reads' argument is set)
- no_lane_splitting: if True then predict outputs
as if --no-lane-splitting was used for bcl2fastq
- lanes: if set then should be a list of lane
numbers that will be used when generating Fastq
names
- reads: if set then should be a list or other
iterable with the reads to include in the
prediction (e.g. ('R1','R2','I1','R3'))
- include_index_reads: if True then includes
index reads (i.e. I1,...) in prediction
(NB ignored if 'reads' argument is set)
- force_sample_dir: if True then force insertion
of a 'sample name' directory for IEM4 sample
sheets where sample name and ID are the same
Expand All @@ -2117,6 +2143,10 @@ def set(self,package=None,paired_end=None,
if no_lane_splitting is not None:
self._predict_no_lane_splitting = no_lane_splitting
self._predict_for_lanes = lanes
if reads is not None:
self._predict_for_reads = reads
if include_index_reads is not None:
self._include_index_reads = include_index_reads
if force_sample_dir is not None:
self._force_sample_dir = force_sample_dir
# Cascade the settings to child samples
Expand All @@ -2125,6 +2155,8 @@ def set(self,package=None,paired_end=None,
paired_end=paired_end,
no_lane_splitting=no_lane_splitting,
lanes=lanes,
reads=reads,
include_index_reads=include_index_reads,
force_sample_dir=force_sample_dir)

def __repr__(self):
Expand Down Expand Up @@ -2180,6 +2212,8 @@ def __init__(self,sample_id,sample_name=None,s_index=None):
self._predict_paired_end = False
self._predict_no_lane_splitting = False
self._predict_for_lanes = None
self._predict_for_reads = None
self._include_index_reads = False
self._force_sample_dir = False

@property
Expand Down Expand Up @@ -2237,17 +2271,28 @@ def fastqs(self):
"""
predicted_fastqs = []
if self._predict_paired_end:
reads = (1,2)
if self._predict_for_reads:
base_reads = sorted([r for r in self._predict_for_reads])
elif self._predict_paired_end:
base_reads = ["R1","R2"]
else:
reads = (1,)
base_reads = ["R1",]
include_index_reads = (self._include_index_reads and
self._predict_for_reads is None)
if self._predict_for_package == "bcl2fastq2":
for barcode_seq in self.barcode_seqs:
# Add index reads?
if include_index_reads and barcode_seq:
index_reads = ["I%d" % (i+1)
for i in range(len(barcode_seq.split('-')))]
reads = index_reads + base_reads
else:
reads = base_reads
# Check if we need to split lanes
if self._predict_no_lane_splitting:
# No lanes
for read in reads:
fastq = "%s_S%d_R%d_001.fastq.gz" % \
fastq = "%s_S%d_%s_001.fastq.gz" % \
(self.sample_id,
self.s_index,
read)
Expand All @@ -2262,13 +2307,14 @@ def fastqs(self):
lanes = (1,)
for lane in lanes:
for read in reads:
fastq = "%s_S%d_L%03d_R%d_001.fastq.gz" % \
fastq = "%s_S%d_L%03d_%s_001.fastq.gz" % \
(self.sample_id,
self.s_index,
lane,
read)
predicted_fastqs.append(fastq)
elif self._predict_for_package == "casava":
reads = base_reads
for barcode_seq in self.barcode_seqs:
if self._predict_for_lanes:
lanes = self._predict_for_lanes
Expand All @@ -2278,7 +2324,7 @@ def fastqs(self):
lanes = (1,)
for lane in lanes:
for read in reads:
fastq = "%s_%s_L%03d_R%d_001.fastq.gz" % \
fastq = "%s_%s_L%03d_%s_001.fastq.gz" % \
(self.sample_id,
barcode_seq,
lane,read)
Expand All @@ -2288,6 +2334,7 @@ def fastqs(self):

def set(self,package=None,paired_end=None,
no_lane_splitting=None,lanes=None,
reads=None,include_index_reads=None,
force_sample_dir=None):
"""
Configure settings for prediction
Expand All @@ -2296,11 +2343,18 @@ def set(self,package=None,paired_end=None,
(can be 'bcl2fastq2' or 'casava')
- paired_end: if True then predict outputs as if
data is paired end (i.e. R1 and R2 pairs)
(NB ignored if 'reads' argument is set)
- no_lane_splitting: if True then predict outputs
as if --no-lane-splitting was used for bcl2fastq
- lanes: if set then should be a list of lane
numbers that will be used when generating Fastq
names
- reads: if set then should be a list or other
iterable with the reads to include in the
prediction (e.g. ('R1','R2','I1','R3'))
- include_index_reads: if True then includes
index reads (i.e. I1,...) in prediction
(NB ignored if 'reads' argument is set)
- force_sample_dir: if True then force insertion
of a 'sample name' directory for IEM4 sample
sheets where sample name and ID are the same
Expand All @@ -2313,6 +2367,10 @@ def set(self,package=None,paired_end=None,
if no_lane_splitting is not None:
self._predict_no_lane_splitting = no_lane_splitting
self._predict_for_lanes = lanes
if reads is not None:
self._predict_for_reads = reads
if include_index_reads is not None:
self._include_index_reads = include_index_reads
if force_sample_dir is not None:
self._force_sample_dir = force_sample_dir

Expand Down
98 changes: 98 additions & 0 deletions bcftbx/test/test_IlluminaData.py
Expand Up @@ -2797,6 +2797,104 @@ def test_samplesheet_predictor_iem_no_projects(self):
["Sample1_CGTGTAGG-GACCTGTA_L001_R1_001.fastq.gz"])
self.assertEqual(sample2.fastqs(),
["Sample2_CGTGTAGG-ATGTAACT_L001_R1_001.fastq.gz"])

def test_samplesheet_predictor_iem_with_index_reads(self):
    """SampleSheetPredictor: handle IEM4 sample sheet with index reads

    Builds a predictor from the sample sheet content held in
    'self.hiseq_sample_sheet_content', checks the projects, samples,
    barcodes, lanes and sample indices it reports, then configures a
    paired-end 'bcl2fastq2' prediction with index reads included and
    verifies the predicted Fastq names for each sample.
    """
    def expected_fastqs(sample_id, s_index):
        # Expected names run lane by lane; within a lane the index
        # reads come before the data reads
        return ["%s_S%d_L%03d_%s_001.fastq.gz" %
                (sample_id, s_index, lane, read)
                for lane in (1, 2)
                for read in ("I1", "I2", "R1", "R2")]

    sample_sheet = SampleSheet(
        fp=io.StringIO(self.hiseq_sample_sheet_content))
    predictor = SampleSheetPredictor(sample_sheet=sample_sheet)
    # Project-level checks
    self.assertEqual(predictor.nprojects, 1)
    self.assertEqual(predictor.project_names, ["PeterBriggs"])
    project = predictor.get_project("PeterBriggs")
    with self.assertRaises(KeyError):
        predictor.get_project("DoesntExist")
    # Sample-level checks
    self.assertEqual(project.sample_ids, ["PJB1-1579", "PJB2-1580"])
    first = project.get_sample("PJB1-1579")
    second = project.get_sample("PJB2-1580")
    with self.assertRaises(KeyError):
        project.get_sample("DoesntExist")
    # Barcodes, lanes and sample indices
    self.assertEqual(first.barcode_seqs, ["CGATGTAT-TCTTTCCC"])
    self.assertEqual(second.barcode_seqs, ["TGACCAAT-TCTTTCCC"])
    self.assertEqual(first.lanes("CGATGTAT-TCTTTCCC"), [1, 2])
    self.assertEqual(second.lanes("TGACCAAT-TCTTTCCC"), [1, 2])
    self.assertEqual(first.s_index, 1)
    self.assertEqual(second.s_index, 2)
    # Request bcl2fastq2-style prediction including the index reads
    predictor.set(package="bcl2fastq2",
                  paired_end=True,
                  include_index_reads=True)
    self.assertEqual(project.dir_name, "PeterBriggs")
    for sample, sample_id, s_index in ((first, "PJB1-1579", 1),
                                       (second, "PJB2-1580", 2)):
        self.assertEqual(sample.dir_name, None)
        self.assertEqual(sample.fastqs(),
                         expected_fastqs(sample_id, s_index))

def test_samplesheet_predictor_iem_with_custom_reads(self):
    """SampleSheetPredictor: handle IEM4 sample sheet with custom reads

    Builds a predictor from the sample sheet content held in
    'self.hiseq_sample_sheet_content', checks the projects, samples,
    barcodes, lanes and sample indices it reports, then configures a
    'bcl2fastq2' prediction with an explicit set of reads
    (R1, R2, R3 and I1) and verifies the predicted Fastq names for
    each sample.
    """
    def expected_fastqs(sample_id, s_index):
        # Expected names run lane by lane; only the explicitly
        # requested reads appear, with I1 ahead of R1..R3
        return ["%s_S%d_L%03d_%s_001.fastq.gz" %
                (sample_id, s_index, lane, read)
                for lane in (1, 2)
                for read in ("I1", "R1", "R2", "R3")]

    sample_sheet = SampleSheet(
        fp=io.StringIO(self.hiseq_sample_sheet_content))
    predictor = SampleSheetPredictor(sample_sheet=sample_sheet)
    # Project-level checks
    self.assertEqual(predictor.nprojects, 1)
    self.assertEqual(predictor.project_names, ["PeterBriggs"])
    project = predictor.get_project("PeterBriggs")
    with self.assertRaises(KeyError):
        predictor.get_project("DoesntExist")
    # Sample-level checks
    self.assertEqual(project.sample_ids, ["PJB1-1579", "PJB2-1580"])
    first = project.get_sample("PJB1-1579")
    second = project.get_sample("PJB2-1580")
    with self.assertRaises(KeyError):
        project.get_sample("DoesntExist")
    # Barcodes, lanes and sample indices
    self.assertEqual(first.barcode_seqs, ["CGATGTAT-TCTTTCCC"])
    self.assertEqual(second.barcode_seqs, ["TGACCAAT-TCTTTCCC"])
    self.assertEqual(first.lanes("CGATGTAT-TCTTTCCC"), [1, 2])
    self.assertEqual(second.lanes("TGACCAAT-TCTTTCCC"), [1, 2])
    self.assertEqual(first.s_index, 1)
    self.assertEqual(second.s_index, 2)
    # Request bcl2fastq2-style prediction for an explicit read list
    # (include_index_reads is passed as well, alongside 'reads')
    predictor.set(package="bcl2fastq2",
                  reads=('R1', 'R2', 'R3', 'I1',),
                  include_index_reads=True)
    self.assertEqual(project.dir_name, "PeterBriggs")
    for sample, sample_id, s_index in ((first, "PJB1-1579", 1),
                                       (second, "PJB2-1580", 2)):
        self.assertEqual(sample.dir_name, None)
        self.assertEqual(sample.fastqs(),
                         expected_fastqs(sample_id, s_index))

class TestMiseqToCasavaConversion(unittest.TestCase):

Expand Down

0 comments on commit 279a120

Please sign in to comment.