/
CXXCH_search_anvio_import.sh
144 lines (105 loc) · 5.24 KB
/
CXXCH_search_anvio_import.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# Original script by Viola Krukenberg (https://orcid.org/0000-0001-8369-8114)
# With modifications by David Benito Merino (https://orcid.org/my-orcid?orcid=0000-0002-1453-7330, https://github.com/dbenitom)
# This script detects CxxCH (heme-binding) motifs in predicted open reading frames from microbial genomes or metagenomes
# and imports the results in an anvi'o contigs database (https://anvio.org/).
# Path to anvio contigs.db:
CONTIGS_DB=
# Annotated genome name:
GENOME_ANNOTATION=
# Threads:
THREADS=10
# output
WORK_DIR=
#motif to search for
MOTIF="C[A-Z][A-Z]CH"
#short name of the motif
MOTIF_NAME="CXXCH"
####
# Command to extend aliases from bash profile
shopt -s expand_aliases
echo "## Step 1. Generate the fasta file with gene calls for the genome/metagenome:"
date
echo ""
source activate anvio-master; source ~/virtual-envs/anvio-master/bin/activate
anvi-get-sequences-for-gene-calls -c $CONTIGS_DB --get-aa-sequences -o ${WORK_DIR}/${GENOME_ANNOTATION}.faa
deactivate; source deactivate
echo "## Done"
date
echo ""
echo "## Step 2. Search motif in the gene calls:"
date
echo ""
cd $WORK_DIR
mkdir ${MOTIF_NAME}_scan
cp ${WORK_DIR}/${GENOME_ANNOTATION}.faa ${MOTIF_NAME}_scan
cd ${MOTIF_NAME}_scan
#modify annotated .faa file: each sequence in a single line
awk '{ if ($1 ~ /^>/) { printf("\n%s\n", $0) } else { printf("%s", $0) }}' ${GENOME_ANNOTATION}.faa > modified_${GENOME_ANNOTATION}.faa
#find sequences containing the motif and output them to new file
grep -B 1 ${MOTIF} modified_${GENOME_ANNOTATION}.faa > ${MOTIF_NAME}_motif_${GENOME_ANNOTATION}.faa
#make a list with header (including annotation) of the sequences containing the motif
grep '^>' ${MOTIF_NAME}_motif_${GENOME_ANNOTATION}.faa | sed 's/^>//' > motifList_${GENOME_ANNOTATION}.txt
#make a list of only IDs of the motif containing sequences
grep '^>' ${MOTIF_NAME}_motif_${GENOME_ANNOTATION}.faa | sed 's/>//' | sed 's/ .*//' > motifIds_${GENOME_ANNOTATION}.txt
#this extracts sequences using a list of IDs; script from Brandon; make sure script is in path
grep -A 1 -f motifIds_${GENOME_ANNOTATION}.txt modified_${GENOME_ANNOTATION}.faa > Sequences_${MOTIF_NAME}motif_${GENOME_ANNOTATION}.fasta
#make directory for count data
mkdir motifCountTables_${MOTIF_NAME}
#copy the file containing the sequences to the new directory; csplit will create many which should be stored in the directory for count data; could also move files here after cspli
cp Sequences_${MOTIF_NAME}motif_${GENOME_ANNOTATION}.fasta motifCountTables_${MOTIF_NAME}
cd motifCountTables_${MOTIF_NAME}
#split the multifasta file into separate files each containing one sequence; did this to then count in the next step the motifs per sequence
csplit -f ${MOTIF_NAME} Sequences_${MOTIF_NAME}motif_${GENOME_ANNOTATION}.fasta '/>/' '{*}'
#for each fasta file from csplit output make a file with the ORF name, count the number of motifs
for File in ${MOTIF_NAME}*
do
echo "$File"
grep '>' "$File" | sed 's/^>//' > "$File"_name
#grep every motif (-o for only matching patterns) and make a list with all motifs founds
grep -o ${MOTIF} "$File" > "$File"_list
#count the number of lines in the list of motifs found to know the number of motifs in the sequence
wc -l "$File"_list > "$File"_count
#make a table with the number of motifs and the ORF name, separate the columns by a tab
paste "$File"_count "$File"_name | column -c 2 -s $'\t' > CountTable_"$File".tab
done
#concatenate the tables created for each file into one big table
cat CountTable*.tab > CountTableAll_${MOTIF_NAME}.tab
#use tail to remove first line (first file contains no data), use sed to remove file name after count (separated by a space) and add a header to the columns of the table
tail -n +2 CountTableAll_${MOTIF_NAME}.tab | sed 's/ .*\t/\t/' | sed '1i\motif#\tORF' > FinalCountTableAll_${MOTIF_NAME}.tab
#clean the directory by moving the in previous steps generated files to separte folders; could also delete them
mkdir Names
mv *_name Names/
mkdir Lists
mv *_list Lists/
mkdir Counts
mv *_count Counts/
mkdir CountTables
mv CountTable_* CountTables/
mkdir SequFiles
mv ${MOTIF_NAME}* SequFiles/
echo "## Done"
echo ""
echo "## Step 3. Parse and import to contigs database"
date
echo ""
# Column 1: gene calls
cut -f2 FinalCountTableAll_${MOTIF_NAME}.tab | tail --lines=+2 > ${WORK_DIR}/gene_calls.tmp
# Column 2: source
yes "CXXCH_scan" | head -n `cut -f2 ${WORK_DIR}/gene_calls.tmp | wc -l` > ${WORK_DIR}/source.tmp
# Column 3: accession
yes "CXXCH_count" | head -n `cut -f2 ${WORK_DIR}/gene_calls.tmp | wc -l` > ${WORK_DIR}/accession.tmp
# Column 4: function
cut -f1 FinalCountTableAll_${MOTIF_NAME}.tab | tail --lines=+2 > ${WORK_DIR}/function.tmp
# Column 5: e-value
yes "0" | head -n `cut -f2 ${WORK_DIR}/gene_calls.tmp | wc -l` > ${WORK_DIR}/e_value.tmp
# Paste
cd ${WORK_DIR}
paste gene_calls.tmp source.tmp accession.tmp function.tmp e_value.tmp > all.tmp
# header
echo -e "gene_callers_id\tsource\taccession\tfunction\te_value" > header.tmp
cat header.tmp all.tmp > ${WORK_DIR}/${MOTIF_NAME}_scan/gene-annot-CXXCH.tsv
rm *tmp
# Immport to contigs.db
source activate anvio-master; source ~/virtual-envs/anvio-master/bin/activate
anvi-import-functions -c $CONTIGS_DB -i ${WORK_DIR}/${MOTIF_NAME}_scan/gene-annot-CXXCH.tsv
deactivate; source deactivate