/
Solexa_get_originalSeq_given_fasta.pl
67 lines (53 loc) · 1.51 KB
/
Solexa_get_originalSeq_given_fasta.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/perl -w
use strict;
use warnings;
my $usage = '
This script will accept a fasta file, get the original sequences for those reads,
output to an output file.
perl script <input FASTAQ file> <fasta file>
<input FASTAQ file1> = full path to FASTAQ file with original data
<fasta file> = full path to the the fasta file
output to a FASTAQ files with the same name except end with .fastq in the same
directory as the input FASTAQ files
';
die $usage unless scalar @ARGV == 2;
my ($in_FASTAQ, $unmapped_reads_file) = @ARGV;
my $outfile = $unmapped_reads_file."stq";
#print "out file is $outfile\n";
open (OUT, ">$outfile") or die $!;
my %read_names = (); # name of the read => 1
open (IN, "<$unmapped_reads_file") or die $!;
while (my $line = <IN>) {
if ($line =~ />/) {
chomp $line;
my @fields = split (/\t/, $line);
my $read_name = $fields[0];
$read_name =~ s/>//;
# print "read = \\$read_name\\\n";
$read_names{$read_name} = 1;
}
}
close IN;
my $keys = keys %read_names;
print "total reads = $keys\n\n";
open(IN, $in_FASTAQ)||die $!;
while (my $line = <IN>) {
# print "line = \\$line\\\n";
chomp $line; # the first line with @seq-name
my @temp = split(/\s+/, $line);
my $temp_seq_name = $temp[0];
my $seq_name = substr($temp_seq_name, 1);
# print ">$seq_name<\n";
if ( exists $read_names{$seq_name}) {
print OUT "@", $seq_name, "\n";
$line = <IN>; # seq line
print OUT $line;
$line = <IN>; # + line
print OUT $line;
$line = <IN>; # quality line
print OUT $line;
}
}
close IN;
close OUT;
exit;