/
demo.prot.conf
54 lines (43 loc) · 1.58 KB
/
demo.prot.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# General behavior flags for the run.
# NOTE(review): flags are written as integers; presumably 1 = enabled and
# 0 = disabled -- confirm against the workflow that consumes this file.
#
# strict: check and validate config settings & filenames strictly.
strict: 1
# force: continue past survivable errors in decontamination.
force: 0
###
### project specific settings
###
# Location for all generated files.
output_dir: 'output.demo.prot'
# File containing the list of genome filenames to decontaminate.
genome_list: demo/demo.genome-list.txt
# Directory in which the genome files listed in genome_list live.
genome_dir: demo/genomes/
# (Optional) list of lineages for the input genomes. Comment this key out
# or leave its value blank if no lineages are provided.
provided_lineages: demo/provided-lineages.csv
# match_rank is the rank _above_ which cross-lineage matches are considered
# contamination. For example, if set to 'superkingdom', then Archaeal
# matches in Bacterial genomes will be treated as contamination, but
# nothing else will.
#
# Allowed values: superkingdom, phylum, class, order, family, or genus.
# NOTE(review): 'genus' is the last rank in that list, so it is presumably
# the most sensitive setting -- confirm against the tool's documentation.
match_rank: genus
###
### installation/system-wide configuration
###
# sourmash query databases to search for contamination (SBTs, LCAs, or
# signatures). One path per list entry.
gather_db:
- demo/demo.prot-matches.sig.gz
# Alternative databases, currently disabled; uncomment a line to add it.
#- db/gtdb_pep.rep_genus.protein_scaled100_k11.sbt.zip
#- demo/LoombaR_2017__SID1050_bax__bin.11.fa.gz.gather-matches.sig.gz
#- demo/TARA_ANE_MAG_00014.fa.gather-matches.sig.gz
#- demo/TARA_PON_MAG_00084.fa.gather-matches.sig.gz
#- demo/GCA_001593925.sig.gz
# Lineages CSV containing the reference lineages for the query databases.
# Must correspond to the signatures in the query databases (e.g. gtdb.csv).
lineages_csv: demo/demo.prot-lineages.csv
# scaled and ksize at which to construct genome/contig signatures.
# These should match the values used to build the query databases.
# NOTE(review): the commented-out GTDB database above is named "k11" while
# ksize is 33 here; possibly ksize is expressed in nucleotide-equivalent
# units (3 x 11) for protein signatures -- confirm before changing either.
scaled: 100
ksize: 33
# Molecule type of the signatures; set to protein for this demo.
moltype: "protein"
# NOTE(review): the two thresholds below are undocumented in this file.
# Presumably min_f_ident is a minimum fraction of identified content and
# min_f_major a minimum fraction assigned to the majority lineage --
# confirm against the tool's documentation.
min_f_ident: 0.0167
min_f_major: 0.0333