/
meta.sh
147 lines (122 loc) · 4.47 KB
/
meta.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/bin/bash
# scripts for 16S data analysis
#
# originally by Ruben Garrido-Oter
# garridoo@mpipz.mpg.de
# exits whenever a function returns 1
set -e
# get path to scripts
scripts_dir=$(dirname $0)
# load config file
source $scripts_dir/config.sh
# load functions
source $scripts_dir/16s.functions.sh
# activate QIIME, etc.
source $scripts_dir/activate.sh
# # process data from the different studies
#
# log "processing data from Schlaeppi et al., 2014..."
# $scripts_dir/schlaeppi_2014.sh
#
# log "processing data from Bulgarelli et al., 2015..."
# $scripts_dir/bulgarelli_2015.sh
#
# log "processing data from Bai et al., 2015..."
# $scripts_dir/bai_2015_root.sh
# $scripts_dir/bai_2015_leaf.sh
#
# log "processing data from Zgadzaj et al., 2016..."
# $scripts_dir/zgadzaj_2016.sh
#
log "combining pre-processed reads from all studies..."
# load config file
source $scripts_dir/config.sh
rm -f $working_dir/seqs.fasta
cat $working_dir/schlaeppi_2014/seqs.fasta >> $working_dir/seqs.fasta
cat $working_dir/bulgarelli_2015/seqs.fasta >> $working_dir/seqs.fasta
cat $working_dir/bai_2015_root/seqs.fasta >> $working_dir/seqs.fasta
cat $working_dir/bai_2015_leaf/seqs.fasta >> $working_dir/seqs.fasta
cat $working_dir/zgadzaj_2016/seqs.fasta >> $working_dir/seqs.fasta
# dereplication
log "dereplicating..."
usearch -derep_fulllength $working_dir/seqs.fasta \
-fastaout $working_dir/seqs_unique.fasta \
-sizeout \
&>> $output
# abundance sort and discard singletons
log "sorting by abundance and discarding singletons..."
usearch -sortbysize $working_dir/seqs_unique.fasta \
-fastaout $working_dir/seqs_unique_sorted.fasta \
-minsize $min_size \
&>> $output
# OTU clustering
log "OTU clustering using UPARSE..."
usearch -cluster_otus $working_dir/seqs_unique_sorted.fasta \
-otus $working_dir/otus.fasta \
-id $id_threshold \
&>> $output
# chimera detection
log "removing chimeras..."
usearch -uchime_ref $working_dir/otus.fasta \
-db $gold_db \
-strand plus \
-nonchimeras $working_dir/otus_nc.fasta \
-threads $n_cores \
&>> $output
# align sequences to database using PyNAST and remove remaining
log "aligning OTU representative sequences to database..."
align_seqs.py -i $working_dir/otus_nc.fasta \
-t $gg_core_aligned_db \
-p $min_id_aln \
-o $working_dir
sed -i 's/-//g' $working_dir/otus_nc_aligned.fasta
# rename OTUs and remove alignment gaps
log "renaming OTUs..."
cat $working_dir/otus_nc_aligned.fasta | \
awk 'BEGIN {n=1}; />/ {print ">OTU_" n; n++} !/>/ {print}' \
>> $working_dir/rep_seqs.fasta
# generate OTU table
log "generating OTU table..."
usearch -usearch_global $working_dir/seqs.fasta \
-db $working_dir/rep_seqs.fasta \
-strand plus \
-id $id_threshold \
-uc $working_dir/read_mapping.uc \
&>> $output
# convert UC file to txt
log "converting UC OTU table file into text format..."
python $usearch_dir/uc2otutab.py $working_dir/read_mapping.uc \
1> $working_dir/otu_table.txt \
2>> $output
# reference-based OTU clustering using the UPARSE-REF algorithm
# using the rhizobia representative sequences as a reference
log "reference-based OTU clustering using UPARSE-REF..."
usearch -uparse_ref $working_dir/seqs_unique_sorted.fasta \
-db $data_dir/rhizobia_100_rep_seqs.fasta \
-strand plus \
-uparseout $working_dir/otu_clustering_rhizobia_97.up \
-threads $n_cores \
&>> $output
# generate UC-compatible mapping file for USEARCH compatibility at 97%
rm -f $working_dir/otu_clustering_rhizobia_97.uc
cat $working_dir/otu_clustering_rhizobia_97.up | \
awk -v i=0.97 '{if ($3<i*100 || $2=="chimera" || $2=="other") {$5=$2}
print $1, $5}' | \
sed 's/ /\t/g;s/^/H\t\t\t\t\t\t\t\t/g' \
1>> $working_dir/otu_clustering_rhizoia_97.uc \
2>> $output
# generate OTU table and filter sequences
log "generating OTU table..."
usearch -usearch_global $working_dir/seqs.fasta \
-db $data_dir/rhizobia_100_rep_seqs.fasta \
-strand plus \
-id $id_threshold \
-uc $working_dir/otu_clustering_rhizobia_97.uc \
-matched $working_dir/rhizobia_seqs.fasta \
&>> $output
# convert UC file to txt
log "converting UC OTU table file into text format..."
python $usearch_dir/uc2otutab.py $working_dir/otu_clustering_rhizobia_97.uc \
1> $working_dir/otu_table_rhizobia_97.txt \
2>> $output
log "DONE!"