/
roary.snakefile
223 lines (195 loc) · 8.09 KB
/
roary.snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# ================
import csv
class Checkpoint_GatherResults:
"""
Define a class a la genome-grist to simplify file specification
from checkpoint (e.g. solve for {acc} wildcard). This approach
is documented at this url:
http://ivory.idyll.org/blog/2021-snakemake-checkpoints.html
"""
def __init__(self, pattern):
self.pattern = pattern
def get_genome_accs(self):
gather_csv = f'outputs/genbank/gather_vita_vars_gtdb_shared_assemblies.x.genbank.gather.csv'
assert os.path.exists(gather_csv)
genome_accs = []
with open(gather_csv, 'rt') as fp:
r = csv.DictReader(fp)
for row in r:
acc = row['name'].split(' ')[0]
genome_accs.append(acc)
print(f'loaded {len(genome_accs)} accessions from {gather_csv}.')
return genome_accs
def __call__(self, w):
global checkpoints
# wait for the results of 'gather_gtdb_rep_to_shared_assemblies';
# this will trigger exception until that rule has been run.
checkpoints.gather_gtdb_rep_to_shared_assemblies.get(**w)
# parse accessions in gather output file
genome_accs = self.get_genome_accs()
p = expand(self.pattern, acc=genome_accs, **w)
return p
#=========================
# this class creates a mapping between acc:roary_acc,
# so it might just autosolve the connection between those wildcards
# and run the right things. If not, I guess try and make a dictionary
# in this class and use it somehow somewhere?
# dictionary would go in `with open(prefetch_csv` and init a dict
# that would be available outside the class
class Checkpoint_RoaryPrefetchResults:
"""
Define a class a la genome-grist to simplify file specification
from checkpoint (e.g. solve for {roary_acc} wildcard). This approach
is documented at this url:
http://ivory.idyll.org/blog/2021-snakemake-checkpoints.html
"""
def __init__(self, pattern):
self.pattern = pattern
def get_roary_accs(self, acc):
prefetch_csv = f'outputs/roary_prefetch/{acc}_prefetch_filtered.csv'
assert os.path.exists(prefetch_csv)
roary_accs = []
with open(prefetch_csv, 'rt') as fp:
r = csv.DictReader(fp)
for row in r:
roary_acc = row['match_name'].split(' ')[0]
roary_accs.append(roary_acc)
print(f'loaded {len(roary_accs)} accessions from {prefetch_csv}.')
return roary_accs
def __call__(self, w):
global checkpoints
# wait for the results of 'touch_roary_filter_prefetch...';
# this will trigger exception until that rule has been run.
checkpoints.touch_roary_filter_prefetch_shared_assemblies_vs_refseq.get(**w)
# parse accessions in gather output file
roary_accs = self.get_roary_accs(**w) # w.accs has a hard time getting passed in here
p = expand(self.pattern, roary_acc=roary_accs, **w)
return p
rule all:
input:
Checkpoint_GatherResults("outputs/roary/{acc}/pan_genome_reference.fa")
#=======================================
checkpoint gather_gtdb_rep_to_shared_assemblies:
output:
gather_grist = "outputs/genbank/gather_vita_vars_gtdb_shared_assemblies.x.genbank.gather.csv",
gather_all_shared = "outputs/genbank/gather_vita_vars_gtdb_shared_assemblies_all.gather.csv"
conda: "envs/tidy.yml"
resources:
mem_mb = 8000
threads: 1
script: "scripts/gather_gtdb_rep_to_shared_assemblies.R"
rule download_shared_assemblies:
input:
gather_grist = "outputs/genbank/gather_vita_vars_gtdb_shared_assemblies.x.genbank.gather.csv",
conf = "inputs/genome-grist-conf.yml"
output: "outputs/genbank_genomes_shared_assemblies/{acc}_genomic.fna.gz"
conda: "envs/genome-grist.yml"
resources:
mem_mb = 8000
threads: 1
shell:'''
genome-grist run {input.conf} --until make_sgc_conf --nolock
'''
#=========================================
rule compute_signatures_shared_assemblies:
input: 'outputs/charcoal/{acc}_genomic.fna.gz.clean.fa.gz'
output: 'outputs/charcoal/{acc}_genomic.fna.gz.clean.sig'
conda: "envs/sourmash.yml"
shell:'''
sourmash compute -k 31 --scaled 2000 -o {output} {input}
'''
rule roary_prefetch_shared_assemblies_vs_refseq:
input:
query='outputs/charcoal/{acc}_genomic.fna.gz.clean.sig',
db = '/home/irber/sourmash_databases/outputs/sbt/refseq-bacteria-x1e6-k31.sbt.zip'
output: "outputs/roary_prefetch/{acc}_prefetch.csv"
conda: "envs/sourmash.yml"
shell:'''
sourmash prefetch -k 31 -o {output} {input.query} {input.db}
'''
rule roary_filter_prefetch_shared_assemblies_vs_refseq:
# filter to jaccard of >= 0.1
input: prefetch="outputs/roary_prefetch/{acc}_prefetch.csv"
output: filt="outputs/roary_prefetch/{acc}_prefetch_filtered.csv"
conda: "envs/tidy.yml"
script: "scripts/roary_filter_prefetch.R"
# input needs to be output from previous rule
# check that they all exist, and then touch a dummy file
checkpoint touch_roary_filter_prefetch_shared_assemblies_vs_refseq:
input: "outputs/roary_prefetch/{acc}_prefetch_filtered.csv"
output: touch("outputs/roary_prefetch/.{acc}_roary_dummy.txt")
rule roary_combine_filtered_prefetch_results:
input: filt = Checkpoint_GatherResults("outputs/roary_prefetch/{acc}_prefetch_filtered.csv")
output: combined = "outputs/genbank/roary_prefetch.x.genbank.gather.csv"
conda: "envs/tidy.yml"
script: "scripts/roary_combine.R"
rule roary_make_conf_file_for_genome_grist:
input: "outputs/genbank/roary_prefetch.x.genbank.gather.csv"
output: conf="outputs/roary_genome_grist_conf/roary_genome_grist_conf.yml"
run:
with open(output.conf, 'wt') as fp:
print(f"""\
sample:
- roary_prefetch
outdir: outputs
""", file=fp)
# in the Checkpoint_Roary class, we can create a dictionary of acc:roary_acc to
# handle stupid genome-grist dumbness. Then can use lambda wildcards to get
# roary_acc values from acc key in dictionary.
rule roary_genome_grist:
input:
conf="outputs/roary_genome_grist_conf/roary_genome_grist_conf.yml"
output:
genome = "outputs/genbank_genomes_roary/{roary_acc}_genomic.fna.gz"
conda: "envs/genome-grist.yml"
resources:
mem_mb = 8000
threads: 1
shell:'''
genome-grist run {input.conf} --until make_sgc_conf --nolock
mv genbank_genomes/ outputs/genbank_genomes_roary
'''
rule roary_prokka:
output:
ffn = 'outputs/roary_prokka/{acc}/{roary_acc}.ffn',
faa = 'outputs/roary_prokka/{acc}/{roary_acc}.faa',
gff = "outputs/roary_prokka/{acc}/{roary_acc}.gff"
input: "outputs/genbank_genomes_roary/{roary_acc}_genomic.fna.gz"
conda: 'envs/prokka.yml'
resources:
mem_mb = 8000
threads: 2
params:
outdir = lambda wildcards: 'outputs/roary_prokka/' + wildcards.acc + "/",
prefix = lambda wildcards: wildcards.roary_acc,
gzip = lambda wildcards: "outputs/genbank_genomes_roary/" + wildcards.roary_acc + "_genomic.fna.gz"
shell:'''
gunzip {input}
prokka {params.gzip} --outdir {params.outdir} --prefix {params.prefix} --metagenome --force --locustag {params.prefix} --cpus {threads} --centre X --compliant
mv {params.prefix}.ffn {output.ffn}
mv {params.prefix}.faa {output.faa}
mv {params.prefix}.gff {output.gff}
gzip {params.gzip}
'''
rule roary:
input:
gff = Checkpoint_RoaryPrefetchResults('outputs/roary_prokka/{acc}/{roary_acc}.gff'),
dummy = "outputs/roary_prefetch/.{acc}_roary_dummy.txt"
output: 'outputs/roary/{acc}/pan_genome_reference.fa'
conda: 'envs/roary.yml'
resources: mem_mb = 32000
threads: 4
params: outdir = lambda wildcards: "outputs/roary/" + wildcards.acc + "/"
shell:'''
roary -e -n -f {params.outdir} -p {threads} -z {input}
'''
rule roary_translate:
input: 'outputs/roary/{acc}/pan_genome_reference.fa'
output: 'outputs/roary/{acc}/pan_genome_reference.faa'
conda: 'envs/emboss.yml'
resources:
mem_mb = 16000
threads: 2
shell:'''
transeq {input} {output}
'''