-
Notifications
You must be signed in to change notification settings - Fork 0
/
orthorbb
executable file
·163 lines (145 loc) · 7.25 KB
/
orthorbb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env zsh
usage()
{
cat << EOF
orthorbb
Version 2.1 (2018-05-31)
License: GNU GPLv2
To report bugs or errors, please contact Daren Card (dcard@uta.edu).
This script is provided as-is, with no support and no guarantee of proper or
desirable functioning.
This script infers homology between a query and a reference set of protein sequences.
It is designed to annotate new protein annotation from programs like Maker. User supplies
protein sequences for a query (target) and for a reference (in fasta format), and some
relevent settings for the underlying BLAST searches. Query proteins are annotated with
orthologous sequence IDs from a reference protein set inferred using reciprocal best blastp
or stringent one-way best blastp to proteins. Proteins are first annotated using reciprocal
best blastp, followed stringent one-way blastp. Importantly, users can set a more stringent
e-value for the one-way best blastp than what is used in the reciprocal best-blastp.
Output files contain relevent information about how homology was inferred and a summary of the
annotation is also output. The table of homology matches NCBI BLAST output format #6 overall,
with some key notes: (1) "|RBB" or "|OBB" are appended to the reference protein IDs to indicate
whether the match is a reciprocal best blastp or one-way best blastp match and (2) percent match,
coordinates, e-value, and bitscore for the less confident (i.e., higher e-value) alignment are
provided in the case of the reciprocal best blastp.
orthorbb -p <query_proteins> -a <reference_proteins> -q <query_name> -r <reference_name>
[ -e <RBB_e-value> -f <ONEWAY_e-value> -t <threads> -h ]
OPTIONS:
-h usage information and help (this message)
-p query protein sequences to be annotated
-a reference protein sequences to be used in annotation
-q name of query sample (for output file naming)
-r name of reference sample (for output file naming)
-e e-value to be used to filter BLAST hits for RBB [0.001]
-f e-value to be used to filter BLAST hits for ONEWAY [1e-5]
-t number of computer threads to use in BLAST [1]
EOF
}
EVAL=0.001
THREAD=1
while getopts "hp:a:q:r:e:f:t:" OPTION
do
case $OPTION in
help)
usage
exit 1
;;
p)
QPROT=$OPTARG
;;
a)
RPROT=$OPTARG
;;
q)
QRY=$OPTARG
;;
r)
REF=$OPTARG
;;
e)
REVAL=$OPTARG
;;
f)
OEVAL=$OPTARG
;;
t)
THREAD=$OPTARG
;;
?)
usage
exit
;;
esac
done
if [[ -z $QPROT ]] || [[ -z $RPROT ]] || [[ -z $QRY ]] || [[ -z $REF ]]
then
usage
exit 1
fi
echo -e "\n###############################\nCreating BLAST Databases\n###############################\n"
# don't parse sequence IDs
cmd="makeblastdb -dbtype prot -in $QPROT"
# echo $cmd
eval $cmd
cmd="makeblastdb -dbtype prot -in $RPROT"
# echo $cmd
eval $cmd
echo -e "\n###############################\nRunning BLASTP Searches\n###############################\n"
# one-way and reciprocal blastp
cmd="blastp -num_threads $THREAD -max_target_seqs 10 -evalue $EVAL -outfmt 11 \
-db $RPROT -query $QPROT -out "qry-"$QRY"_2_ref-"$REF"_blastp_e"$REVAL".out.asn""
# echo $cmd
eval $cmd
cmd="blastp -num_threads $THREAD -max_target_seqs 10 -evalue $EVAL -outfmt 11 \
-db $QPROT -query $RPROT -out "qry-"$REF"_2_ref-"$QRY"_blastp_e"$REVAL".out.asn""
# echo $cmd
eval $cmd
echo -e "\n###############################\nSummaryizing BLASTP Searches\n###############################\n"
# take top hit for each query sequence from both blastp searches
cmd="blast_formatter -max_target_seqs 1 -outfmt 6 -archive "qry-"$QRY"_2_ref-"$REF"_blastp_e"$REVAL".out.asn" \
-out "qry-"$QRY"_2_ref-"$REF"_blastp_e"$REVAL".top1hits.fmt6.txt""
# echo $cmd
eval $cmd
cmd="blast_formatter -max_target_seqs 1 -outfmt 6 -archive "qry-"$REF"_2_ref-"$QRY"_blastp_e"$REVAL".out.asn" \
-out "qry-"$REF"_2_ref-"$QRY"_blastp_e"$REVAL".top1hits.fmt6.txt""
# echo $cmd
eval $cmd
# RELIABILITY RANK = 1: deduce which protein hits show up in both datasets (i.e., are reciprocal best blast hits), and provide some context
cmd="cat <(cat "qry-"$REF"_2_ref-"$QRY"_blastp_e"$REVAL".top1hits.fmt6.txt" | awk -F \"\\t\" '!_[\$1 FS \$2]++' | awk -v OFS=\"\\t\" '{ print \$2, \$1, \$3, \$4, \$5, \$6, \$9, \$10, \$7, \$8, \$11, \$12 }') \
<(cat "qry-"$QRY"_2_ref-"$REF"_blastp_e"$REVAL".top1hits.fmt6.txt" | awk -F \"\\t\" '!_[\$1 FS \$2]++') | \
sort -k1,1 -k11,11gr -k12,12g | awk -F \"\\t\" '_[\$1 FS \$2]++' \
> $QRY"_"$REF"_RBB.blastp.e"$REVAL".fmt6.txt""
# echo $cmd
eval $cmd
# RELIABILITY RANK = 2: deduce which stringent protein hits exist in reference (i.e., one-way best hits), and provide some context
cmd="cat "qry-"$QRY"_2_ref-"$REF"_blastp_e"$REVAL".top1hits.fmt6.txt" | awk -v oeval=\"\$OEVAL\" '{ if (\$11 < oeval ) print \$0 }' | awk -F \"\\t\" '!_[\$1 FS \$2]++' \
> $QRY"_"$REF"_ONEWAY.blastp.e"$OEVAL".fmt6.txt""
# echo $cmd
eval $cmd
echo -e "\n###############################\nSummarizing all BLAST Information\n###############################\n"
# finally, lets put these together, avoiding redundancy and prioritizing based on reliability of inference of orthology (RBB protein > RBB transcript > ONEWAY protein)
# command dissection:
## two files are concatenated together, sorted by first column (-k1,1), and then unique lines based on the first column are kept (first column corresponds to query annotation IDs)
## importantly, the order in which the files are read in the 'cat' command dictates which type of hit is kept when the unique rows (based on first column) are selected
## therefore, we put the RBB protein first and the ONEWAY protein second, and whenever they overlap in a query annotation ID, the RBB protein hit is taken
## we do this inside a nested shell ("<(cat....), such that the output of that process is directed into a second 'cat' command, which does the same thing as described above
## this has the effect or prioritizing the results in the order we want, without redundancy in the query annotation IDs, giving us our final inference of homology with detailed information on our confidence in that inference
cmd="cat <(cat $QRY"_"$REF"_RBB.blastp.e"$REVAL".fmt6.txt" | awk -v OFS=\"\\t\" -v ref=\"\$REF\" '{ print \$1, \$2\"\|RBB\", \$3, \$4, \$5, \$6, \$7, \$8, \$9, \$10, \$11, \$12 }' ) \
<(cat $QRY"_"$REF"_ONEWAY.blastp.e"$OEVAL".fmt6.txt" | awk -v OFS=\"\\t\" -v ref=\"\$REF\" '{ print \$1, \$2\"\|OBB\", \$3, \$4, \$5, \$6, \$7, \$8, \$9, \$10, \$11, \$12 }') | \
sort -u -k1,1 > $QRY"_"$REF"_homology_annotation.fmt6.txt""
# echo $cmd
eval $cmd
# using context provided in output file, we can summarize how proteins were annotated using the reference
cmd="cat <(cat $QRY"_"$REF"_homology_annotation.fmt6.txt" | \
awk ' \$2 ~ /\\\|RBB\$/ { count1 += 1 }; \
\$2 ~ /\\\|OBB\$/ { count2 += 1 } END \
{ print \"RBB proteins to proteins = \"count1;
print \"ONEWAY proteins to proteins = \"count2;
print \"Total annotated = \"count1+count2 }') \
<(grep -c \"^>\" $QPROT | awk '{ print \"Total input sequences = \"\$1 }') \
> $QRY"_"$REF"_homology_annotation.summary.txt""
# echo $cmd
eval $cmd
echo -e "\n###############################\nAnalysis Completed\n\
Final annotations can be found in $QRY"_"$REF"_homology_annotation.tsv"\n\
A summary can be found in $QRY"_"$REF"_homology_annotation.summary.txt"\n###############################\n"