-
Notifications
You must be signed in to change notification settings - Fork 0
/
bandp.pl
267 lines (212 loc) · 9.55 KB
/
bandp.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#!/usr/bin/perl -w
#
###################################################################################################
#
# Name: bandp.pl
#
# This script is used to blast the query to non-redundant reads and parse the reads for different
# genes for each subject species.
#
# Datafile: fasta file query, local blast data base of the reads, indexed fastq files
#
# Input: the name of the query file
#
# Output: blastall output file and fastq reads file for different genes under each species
#
# Written by
# Chenhong Li
# Shanghai Ocean University, Shanghai, China.
# Created on Oct 2013
# Last modified on Aug 2016 by Hao Yuan
# New Traits: split query files first, then blast them against subject in parallel
#
###################################################################################################
use strict;
use warnings;
use Getopt::Long; # include the module for input
use Parallel::ForkManager;
my $query; #variable for store query species name
my $subject; #list of subject name
my $E = 0.0001; # E-value for blast, the default is 0.0001
my $max = 2; #for store the maximum number of processes/forks
my $help;
my $opt = GetOptions( 'query:s', \$query,
'subject:s', \$subject,
'E:f', \$E,
'fork:s', \$max,
'help!', \$help); #set command line options
if (!($opt && $query) || $help) {#check for the required inputs
print STDERR "\nExample usage:\n";
print STDERR "\n$0 -query=\"Oreochromis_niloticus\" -subject=\"Rhyacichthys_aspro Odontobutis_potamophila\"\n\n";
print STDERR "Options:\n";
print STDERR " -query = name of the query species \n";
print STDERR " -subject = a list of subject species, use underscore to link genus and\n";
print STDERR " species name; taxa seperated by one space\n\n";
print STDERR "For more optional parameters, see README.txt\n\n";
exit;
}
my @subgenome = split (/\s/, $subject); #split the names of suber genomes
my @sortedsubgenome = sort @subgenome;
my $pm = Parallel::ForkManager->new($max);
my $queryfile = "$query.fas"; #get the name of blast query file
# split query file for parallelisation;
my $QUERYFILE;
open $QUERYFILE ,"<$queryfile" or die "can't open $queryfile";
my $string = do { local $/; <$QUERYFILE>; }; # read all context from $queryfile
my @count_seq = $string =~ />/g; # get the number of seq
my $count_seq = scalar @count_seq;
my (@out) = $string =~ />\S+\n{1}\S+\n{1}/g; # split id and seq into array
my $aver_seq = $count_seq / $max; # average number of seq in each splited file
my $int_aver_seq = int ($aver_seq); # integer of average number
my $last_aver_seq = $count_seq - ($max - 1) * $int_aver_seq; # number of seq in last file
for (my $i = 0; $i < ($max - 1); $i++) { # create $max-1 files (1st - ($max-1)th) and write $int_aver_seq seqs
my $OUT_NEWFILE;
my $out_newfile = $query . "|" . $i . ".fas";
open ($OUT_NEWFILE, ">$out_newfile") or die "can't write $out_newfile ";
for (my $j = 0; $j < $int_aver_seq; $j++) {
my $id_seq = shift @out;
print $OUT_NEWFILE "$id_seq";
}
close ($OUT_NEWFILE);
}
my $OUT_NEWFILE; # create $max th file and type in $last_aver_seq seqs
my $last = $max - 1;
my $last_out_newfile = $query . "|" . $last . ".fas";
open ($OUT_NEWFILE, ">$last_out_newfile") or die "can't write $last_out_newfile ";
for (my $j = 0; $j < $last_aver_seq; $j++) {
my $id_seq = shift @out;
print $OUT_NEWFILE "$id_seq";
}
close ($OUT_NEWFILE);
close ($QUERYFILE);
foreach my $sub (@sortedsubgenome) {
my $dir = $sub . "_results";
`mkdir $dir`;
#open the index file and fastq data file for retrieving late
my $INDEX_FILE1;
my $indexfile1 = $sub . "_rmrep_R1.index";
open ($INDEX_FILE1, "<$indexfile1") or die "Cannot open $indexfile1 for reading ($!)";
#read the index1 file
my %index1;
my $id1;
while (my $line = readline ($INDEX_FILE1)) {
chomp $line;
my ($id1, $position1) = split /\t/, $line;# the id in index file is longer than in embl file
$index1{$id1} = $position1;
}
my $INDEX_FILE2;
my $indexfile2 = $sub . "_rmrep_R2.index";
open ($INDEX_FILE2, "<$indexfile2") or die "Cannot open $indexfile2 for reading ($!)";
#read the index2 file
my %index2;
my $id2;
while (my $line = readline ($INDEX_FILE2)) {
chomp $line;
my ($id2, $position2) = split /\t/, $line;# the id in index file is longer than in embl file
$index2{$id2} = $position2;
}
close ($INDEX_FILE1) or die "Can't close the index file!!!";
close ($INDEX_FILE2) or die "Can't close the index file!!!";
my $newfile = $sub . ".fas";
`makeblastdb -dbtype nucl -in "$newfile" -out "$sub" -max_file_sz 2GB`; #make a blast database for the splited fasta file of sub
# push name of splited files into array
my @split_fas;
for (my $i = 0; $i < $max; $i++){
my $split_fas = $query . "|" . $i;
push @split_fas, $split_fas;
}
# parallelised blast
DATA_LOOP:
foreach my $query (@split_fas) {
$pm->start and next DATA_LOOP;
my $query_file = $query . ".fas";
my $blastout = $query . "." . $sub . ".blast.txt";
print "Blasting, $query against $sub...\n";
`blastn -query "$query_file" -task blastn -db "$sub" -out "$blastout" -word_size 7 -gapopen 5 -gapextend 2 -penalty -1 -reward 1 -evalue $E -outfmt 6 -max_target_seqs 1000000000`; # blast splited fasta file of sub against query
print "$query against $sub is done\n";
$pm->finish;
}
$pm->wait_all_children();
my $merged_blast_out = $query . "." . $sub . ".blast.txt";
`cat $query"|"*.$sub.blast.txt > $merged_blast_out `; # merge the blast results of all splited file
my $blastout = $merged_blast_out;
#now we need parse the blast results and extract original reads and bin them by query gene name for each species
my ($SEQ_FILE1, $SEQ_FILE2);
my $seqfile1 = $sub . "_rmrep_R1.fq";
my $seqfile2 = $sub . "_rmrep_R2.fq";
open ($SEQ_FILE1, "<$seqfile1") or die "Cannot open $seqfile1 for reading ($!)";
open ($SEQ_FILE2, "<$seqfile2") or die "Cannot open $seqfile2 for reading ($!)";
my ($BLASTOUT, %saveid, $OUTFILE);
my $geneidlag = "huanghuanghuhu";
#parse blast output files
open ($BLASTOUT, "<$blastout") or die "Can't open the $blastout file!!!"; #open blastout file for each subject
while (my $line = readline ($BLASTOUT)){
my ($geneid, $hitidfr, $start, $end) = $line =~ /(\S+)\s+(\S+)\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+\S+\s+\S+/;
my ($hitid, $fr) = $hitidfr =~ /(\S+)\.(\w)/;
if (($geneid ne $geneidlag) && ($geneidlag ne "huanghuanghuhu")){ # if all the blast result of $geneidlag are read
close ($OUTFILE);
}
if ($geneid ne $geneidlag){ # if current $geneid differs from $geneidlag, open a new file for current $geneid
my $outfile = $dir . "/" . "$geneid" . "." . "fq";
open ($OUTFILE, ">$outfile") or die "Can't open the $outfile!!!";
}
# if reads id doesn't exist in hash, then retrieve the reads and paired reads, else skip
if (!($saveid{$hitid})){
my $hitidf = $hitid . ".f";
seek $SEQ_FILE1, $index1{$hitidf}, 0; # let’s jump to our seq
my $firstf = readline($SEQ_FILE1); # first line is header, skip
my $secondf = readline($SEQ_FILE1); # second line is the whole sequence
chomp $secondf;
my $thirdf = readline($SEQ_FILE1); # third line is "+"
my $fourthf = readline($SEQ_FILE1); # fourth line is the quality score
chomp $fourthf;
my $hitidr = $hitid . ".r";
seek $SEQ_FILE2, $index2{$hitidr}, 0; # let’s jump to our seq
my $firstr = readline($SEQ_FILE2); # first line is header, skip
my $secondr = readline($SEQ_FILE2); # second line is the whole sequence
chomp $secondr;
my $thirdr = readline($SEQ_FILE2); # third line is "+"
my $fourthr = readline($SEQ_FILE2); # fourth line is the quality score
chomp $fourthr;
if ($fr eq "f" ){ # if front seq is forword then the rear seq is reversed, vice versa
if ($start < $end){
$secondr = reverse $secondr;
$secondr =~ tr/ATCGatcg/TAGCtagc/;
$fourthr = reverse $fourthr;
}else{
$secondf = reverse $secondf;
$secondf =~ tr/ATCGatcg/TAGCtagc/;
$fourthf = reverse $fourthf;
}
}else{
if ($start < $end){
$secondf = reverse $secondf;
$secondf =~ tr/ATCGatcg/TAGCtagc/;
$fourthf = reverse $fourthf;
}else{
$secondr = reverse $secondr;
$secondr =~ tr/ATCGatcg/TAGCtagc/;
$fourthr = reverse $fourthr;
}
}
my $seqf = "$firstf" . "$secondf\n" . "$thirdf" . "$fourthf\n";
print $OUTFILE "$seqf";
my $seqr = "$firstr" . "$secondr\n" . "$thirdr" . "$fourthr\n";
print $OUTFILE "$seqr";
$saveid{$hitid} = "huanghuanghuhu"; # save reads id into hash, for skipping these reads if we meet them later
}
$geneidlag = $geneid; # save current $geneid as $geneidlag
}
close ($OUTFILE);
close ($BLASTOUT);
# my $number = scalar keys %saveid;
close ($SEQ_FILE1) or die "Can't close the new file!!!";
close ($SEQ_FILE2) or die "Can't close the new file!!!";
`rm "$sub"_rmrep_R*.index`; #remove index
`rm "$sub".fas`; # remove database
`rm "$sub".nhr`;
`rm "$sub".nin`;
`rm "$sub".nsq`;
`rm "$query""|"*."$sub".blast.txt` ; # remove splited query and blast output
}
`rm "$query""|"*.fas`;