orthorbb

#!/usr/bin/env zsh

usage()
{
cat << EOF
orthorbb

Version 2.1 (2018-05-31)
License: GNU GPLv2
To report bugs or errors, please contact Daren Card (dcard@uta.edu).
This script is provided as-is, with no support and no guarantee of proper or
desirable functioning.

This script infers homology between a query and a reference set of protein sequences. 
It is designed to annotate new protein annotation from programs like Maker. User supplies 
protein sequences for a query (target) and for a reference (in fasta format), and some 
relevent settings for the underlying BLAST searches. Query proteins are annotated with 
orthologous sequence IDs from a reference protein set inferred using reciprocal best blastp 
or stringent one-way best blastp to proteins. Proteins are first annotated using reciprocal 
best blastp, followed stringent one-way blastp. Importantly, users can set a more stringent 
e-value for the one-way best blastp than what is used in the reciprocal best-blastp. 

Output files contain relevent information about how homology was inferred and a summary of the 
annotation is also output. The table of homology matches NCBI BLAST output format #6 overall, 
with some key notes: (1) "|RBB" or "|OBB" are appended to the reference protein IDs to indicate 
whether the match is a reciprocal best blastp or one-way best blastp match and (2) percent match, 
coordinates, e-value, and bitscore for the less confident (i.e., higher e-value) alignment are 
provided in the case of the reciprocal best blastp.

orthorbb -p <query_proteins> -a <reference_proteins> -q <query_name> -r <reference_name> 
	 [ -e <RBB_e-value> -f <ONEWAY_e-value> -t <threads> -h ]

OPTIONS:
        -h		usage information and help (this message)
        -p		query protein sequences to be annotated
	-a		reference protein sequences to be used in annotation
        -q		name of query sample (for output file naming)
	-r		name of reference sample (for output file naming)
	-e		e-value to be used to filter BLAST hits for RBB [0.001]
	-f		e-value to be used to filter BLAST hits for ONEWAY [1e-5]
	-t		number of computer threads to use in BLAST [1]
EOF
}

EVAL=0.001
THREAD=1

while getopts "hp:a:q:r:e:f:t:" OPTION
do
        case $OPTION in
                help)
                        usage
                        exit 1
                        ;;
		p)
			QPROT=$OPTARG
			;;
		a)
			RPROT=$OPTARG
			;;
		q)
			QRY=$OPTARG
			;;
		r)
			REF=$OPTARG
			;;
		e)
			REVAL=$OPTARG
			;;
		f)
			OEVAL=$OPTARG
			;;
		t)
			THREAD=$OPTARG
			;;
		?)
			usage
			exit
			;;
	esac
done

if [[ -z $QPROT ]] || [[ -z $RPROT ]] || [[ -z $QRY ]] || [[ -z $REF ]]
then
	usage
	exit 1
fi

echo -e "\n###############################\nCreating BLAST Databases\n###############################\n"
# don't parse sequence IDs
cmd="makeblastdb -dbtype prot -in $QPROT"
# echo $cmd
eval $cmd
cmd="makeblastdb -dbtype prot -in $RPROT"
# echo $cmd
eval $cmd

echo -e "\n###############################\nRunning BLASTP Searches\n###############################\n"
# one-way and reciprocal blastp
cmd="blastp -num_threads $THREAD -max_target_seqs 10 -evalue $EVAL -outfmt 11 \
-db $RPROT -query $QPROT -out "qry-"$QRY"_2_ref-"$REF"_blastp_e"$REVAL".out.asn""
# echo $cmd
eval $cmd

cmd="blastp -num_threads $THREAD -max_target_seqs 10 -evalue $EVAL -outfmt 11 \
-db $QPROT -query $RPROT -out "qry-"$REF"_2_ref-"$QRY"_blastp_e"$REVAL".out.asn""
# echo $cmd
eval $cmd

echo -e "\n###############################\nSummaryizing BLASTP Searches\n###############################\n"
# take top hit for each query sequence from both blastp searches
cmd="blast_formatter -max_target_seqs 1 -outfmt 6 -archive "qry-"$QRY"_2_ref-"$REF"_blastp_e"$REVAL".out.asn" \
-out "qry-"$QRY"_2_ref-"$REF"_blastp_e"$REVAL".top1hits.fmt6.txt""
# echo $cmd
eval $cmd
cmd="blast_formatter -max_target_seqs 1 -outfmt 6 -archive "qry-"$REF"_2_ref-"$QRY"_blastp_e"$REVAL".out.asn" \
-out "qry-"$REF"_2_ref-"$QRY"_blastp_e"$REVAL".top1hits.fmt6.txt""
# echo $cmd
eval $cmd

# RELIABILITY RANK = 1: deduce which protein hits show up in both datasets (i.e., are reciprocal best blast hits), and provide some context
cmd="cat <(cat "qry-"$REF"_2_ref-"$QRY"_blastp_e"$REVAL".top1hits.fmt6.txt" | awk -F \"\\t\" '!_[\$1 FS \$2]++' | awk -v OFS=\"\\t\" '{ print \$2, \$1, \$3, \$4, \$5, \$6, \$9, \$10, \$7, \$8, \$11, \$12 }') \
<(cat "qry-"$QRY"_2_ref-"$REF"_blastp_e"$REVAL".top1hits.fmt6.txt" | awk -F \"\\t\" '!_[\$1 FS \$2]++') | \
sort -k1,1 -k11,11gr -k12,12g | awk -F \"\\t\" '_[\$1 FS \$2]++' \
> $QRY"_"$REF"_RBB.blastp.e"$REVAL".fmt6.txt""
# echo $cmd
eval $cmd

# RELIABILITY RANK = 2: deduce which stringent protein hits exist in reference (i.e., one-way best hits), and provide some context
cmd="cat "qry-"$QRY"_2_ref-"$REF"_blastp_e"$REVAL".top1hits.fmt6.txt" | awk -v oeval=\"\$OEVAL\" '{ if (\$11 < oeval ) print \$0 }' | awk -F \"\\t\" '!_[\$1 FS \$2]++' \
> $QRY"_"$REF"_ONEWAY.blastp.e"$OEVAL".fmt6.txt""
# echo $cmd
eval $cmd

echo -e "\n###############################\nSummarizing all BLAST Information\n###############################\n"
# finally, lets put these together, avoiding redundancy and prioritizing based on reliability of inference of orthology (RBB protein > RBB transcript > ONEWAY protein)
# command dissection:
##  two files are concatenated together, sorted by first column (-k1,1), and then unique lines based on the first column are kept (first column corresponds to query annotation IDs)
##  importantly, the order in which the files are read in the 'cat' command dictates which type of hit is kept when the unique rows (based on first column) are selected
##  therefore, we put the RBB protein first and the ONEWAY protein second, and whenever they overlap in a query annotation ID, the RBB protein hit is taken
##  we do this inside a nested shell ("<(cat....), such that the output of that process is directed into a second 'cat' command, which does the same thing as described above
##  this has the effect or prioritizing the results in the order we want, without redundancy in the query annotation IDs, giving us our final inference of homology with detailed information on our confidence in that inference
cmd="cat <(cat $QRY"_"$REF"_RBB.blastp.e"$REVAL".fmt6.txt" | awk -v OFS=\"\\t\" -v ref=\"\$REF\" '{ print \$1, \$2\"\|RBB\", \$3, \$4, \$5, \$6, \$7, \$8, \$9, \$10, \$11, \$12 }' ) \
<(cat $QRY"_"$REF"_ONEWAY.blastp.e"$OEVAL".fmt6.txt" | awk -v OFS=\"\\t\" -v ref=\"\$REF\" '{ print \$1, \$2\"\|OBB\", \$3, \$4, \$5, \$6, \$7, \$8, \$9, \$10, \$11, \$12 }') | \
sort -u -k1,1 > $QRY"_"$REF"_homology_annotation.fmt6.txt""
# echo $cmd
eval $cmd

# using context provided in output file, we can summarize how proteins were annotated using the reference
cmd="cat <(cat $QRY"_"$REF"_homology_annotation.fmt6.txt" | \
awk ' \$2 ~ /\\\|RBB\$/ { count1 += 1 }; \
\$2 ~ /\\\|OBB\$/ { count2 += 1 } END \
{ print \"RBB proteins to proteins = \"count1;
print \"ONEWAY proteins to proteins = \"count2;
print \"Total annotated = \"count1+count2 }') \
<(grep -c \"^>\" $QPROT | awk '{ print \"Total input sequences = \"\$1 }') \
> $QRY"_"$REF"_homology_annotation.summary.txt""
# echo $cmd
eval $cmd

echo -e "\n###############################\nAnalysis Completed\n\
Final annotations can be found in $QRY"_"$REF"_homology_annotation.tsv"\n\
A summary can be found in $QRY"_"$REF"_homology_annotation.summary.txt"\n###############################\n"