/
bsrate.py
127 lines (109 loc) · 3.96 KB
/
bsrate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""Calculate conversion rate for bisulfite sequencing."""
import os
from plumbum import TEE
from resolwe.process import (
BooleanField,
Cmd,
DataField,
FileField,
FloatField,
IntegerField,
Process,
SchedulingClass,
)
class BsConversionRate(Process):
"""Estimate bisulfite conversion rate in a control set.
The program bsrate included in [Methpipe]
(https://github.com/smithlabcode/methpipe) will estimate the bisulfite
conversion rate.
"""
slug = "bs-conversion-rate"
name = "Bisulfite conversion rate"
process_type = "data:wgbs:bsrate"
version = "1.3.1"
scheduling_class = SchedulingClass.BATCH
entity = {"type": "sample"}
requirements = {
"expression-engine": "jinja",
"executor": {
"docker": {"image": "public.ecr.aws/genialis/resolwebio/wgbs:3.0.0"}
},
"resources": {
"cores": 1,
"memory": 16384,
},
}
data_name = "{{ mr|name|default('?') }}"
category = "WGBS"
class Input:
"""Input fields for BsConversionRate."""
mr = DataField(
"alignment:bam:walt",
label="Aligned reads from bisulfite sequencing",
description="Bisulfite specifc alignment such as WALT is required as .mr file type is used. Duplicates"
"should be removed to reduce any bias introduced by incomplete conversion on PCR duplicate"
"reads.",
)
skip = BooleanField(
label="Skip Bisulfite conversion rate step",
description="Bisulfite conversion rate step can be skipped.",
default=False,
)
sequence = DataField(
"seq:nucleotide",
label="Unmethylated control sequence",
description="Separate unmethylated control sequence FASTA file is required to estimate bisulfite"
"conversion rate.",
required=False,
)
count_all = BooleanField(
label="Count all cytosines including CpGs", default=True
)
read_length = IntegerField(label="Average read length", default=150)
max_mismatch = FloatField(
label="Maximum fraction of mismatches", required=False
)
a_rich = BooleanField(label="Reads are A-rich", default=False)
class Output:
"""Output fields."""
report = FileField(label="Bisulfite conversion rate report")
def run(self, inputs, outputs):
"""Run the analysis."""
basename = os.path.basename(inputs.mr.output.mr.path)
assert basename.endswith(".mr.gz")
name = basename[:-6]
report_file = f"{name}_spikein_bsrate.txt"
skip_process = inputs.skip
try:
inputs.mr.output.spikein_mr.path
except AttributeError:
self.warning(
"Selected sample lacks the alignment file for unmethylated control reads."
)
skip_process = True
try:
inputs.sequence.output.fasta.path
except AttributeError:
self.warning("Unmethylated control sequence was not provided.")
skip_process = True
if not skip_process:
(Cmd["pigz"]["-cd", inputs.mr.output.spikein_mr.path] > f"{name}.mr")()
args = [
"-chrom",
inputs.sequence.output.fasta.path,
"-output",
report_file,
]
if inputs.count_all:
args.append("-all")
if inputs.max_mismatch:
args.extend(["-max", inputs.max_mismatch])
if inputs.a_rich:
args.append("-a-rich")
return_code, _, _ = Cmd["bsrate"][args][f"{name}.mr"] & TEE(retcode=None)
if return_code:
self.error("Bsrate analysis failed.")
else:
with open(report_file, "w") as f:
f.write("Bisulfite conversion rate process skipped.")
outputs.report = report_file