/
pipeline.sh
executable file
·84 lines (70 loc) · 3.46 KB
/
pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#! /usr/bin/env bash
# Auth: Jennifer Chang
# Date: 2019/03/13
set -e
set -u
# Input and Output
INPUT=$1
BASENAME=$(basename $1)
# Connect your reference here
REFERENCE=sample_data/reference.fa
# Connect your programs here, can use full path names
BLASTN=/usr/local/bin/blastn
MAKEBLASTDB=/usr/local/bin/makeblastdb
SMOF=/usr/local/bin/smof.py
MAFFT=/usr/local/bin/mafft
FASTTREE=/usr/local/bin/FastTreeMP
NN_CLASS=nn_classifier.R
# Create your Blast Database
${MAKEBLASTDB} -in ${REFERENCE} -parse_seqids -dbtype nucl
# Search your Blast Database
${BLASTN} -db ${REFERENCE} -query $INPUT -num_alignments 1 -outfmt 6 -out ${BASENAME}_output.txt
echo "... results in ${BASENAME}_output.txt"
cat ${BASENAME}_output.txt |awk -F'\t' '$2~/\|H1\|/ {print $1}' > ${BASENAME}_H1.ids
cat ${BASENAME}_output.txt |awk -F'\t' '$2~/\|H3\|/ {print $1}' > ${BASENAME}_H3.ids
cat ${BASENAME}_output.txt |awk -F'\t' '$2~/\|N1\|/ {print $1}' > ${BASENAME}_N1.ids
cat ${BASENAME}_output.txt |awk -F'\t' '$2~/\|N2\|/ {print $1}' > ${BASENAME}_N2.ids
cat ${BASENAME}_output.txt |awk -F'\t' '$2~/\|PB2\|/ {print $1}' > ${BASENAME}_PB2.ids
cat ${BASENAME}_output.txt |awk -F'\t' '$2~/\|PB1\|/ {print $1}' > ${BASENAME}_PB1.ids
cat ${BASENAME}_output.txt |awk -F'\t' '$2~/\|PA\|/ {print $1}' > ${BASENAME}_PA.ids
cat ${BASENAME}_output.txt |awk -F'\t' '$2~/\|NP\|/ {print $1}' > ${BASENAME}_NP.ids
cat ${BASENAME}_output.txt |awk -F'\t' '$2~/\|M\|/ {print $1}' > ${BASENAME}_M.ids
cat ${BASENAME}_output.txt |awk -F'\t' '$2~/\|NS\|/ {print $1}' > ${BASENAME}_NS.ids
ARR=(H1 H3 N1 N2 PB2 PB1 PA NP M NS)
# Fast part, separating out the sequences and adding references
for SEG in "${ARR[@]}"
do
echo "${SEG}"
if [ -s ${BASENAME}_${SEG}.ids ]
then
${SMOF} grep -Xf ${BASENAME}_${SEG}.ids ${INPUT} > ${BASENAME}_${SEG}.fa # pull out by segment
${SMOF} grep "|$SEG|" ${REFERENCE} >> ${BASENAME}_${SEG}.fa # add references
fi
done
# Slow part, building the alignment and tree
for SEG in "${ARR[@]}"
do
echo "${SEG}"
if [ -s ${BASENAME}_${SEG}.ids ]
then
${MAFFT} --thread -1 --auto ${BASENAME}_${SEG}.fa > ${BASENAME}_${SEG}_aln.fa
${FASTTREE} -nt ${BASENAME}_${SEG}_aln.fa > ${BASENAME}_${SEG}.tre
fi
done
# Fast again, pull out clades
touch ${BASENAME}_Final_Output.txt
rm ${BASENAME}_Final_Output.txt
touch ${BASENAME}_Final_Output.txt
[ -s ${BASENAME}_H1.ids ] && Rscript ${NN_CLASS} ${BASENAME}_H1.tre 4 7 8 >> ${BASENAME}_Final_Output.txt
[ -s ${BASENAME}_H3.ids ] && Rscript ${NN_CLASS} ${BASENAME}_H3.tre 4 7 8 >> ${BASENAME}_Final_Output.txt
[ -s ${BASENAME}_N1.ids ] && Rscript ${NN_CLASS} ${BASENAME}_N1.tre 5 1 >> ${BASENAME}_Final_Output.txt
[ -s ${BASENAME}_N2.ids ] && Rscript ${NN_CLASS} ${BASENAME}_N2.tre 5 1 >> ${BASENAME}_Final_Output.txt
[ -s ${BASENAME}_PB2.ids ] && Rscript ${NN_CLASS} ${BASENAME}_PB2.tre 5 1 >> ${BASENAME}_Final_Output.txt
[ -s ${BASENAME}_PB1.ids ] && Rscript ${NN_CLASS} ${BASENAME}_PB1.tre 5 1 >> ${BASENAME}_Final_Output.txt
[ -s ${BASENAME}_PA.ids ] && Rscript ${NN_CLASS} ${BASENAME}_PA.tre 5 1 >> ${BASENAME}_Final_Output.txt
[ -s ${BASENAME}_NP.ids ] && Rscript ${NN_CLASS} ${BASENAME}_NP.tre 5 1 >> ${BASENAME}_Final_Output.txt
[ -s ${BASENAME}_M.ids ] && Rscript ${NN_CLASS} ${BASENAME}_M.tre 5 1 >> ${BASENAME}_Final_Output.txt
[ -s ${BASENAME}_NS.ids ] && Rscript ${NN_CLASS} ${BASENAME}_NS.tre 5 1 >> ${BASENAME}_Final_Output.txt
echo "==== Final results in ${BASENAME}_Final_Output.txt"
echo "Tree files are listed below: "
ls -ltr ${BASENAME}*.tre