Permalink
Find file
Fetching contributors…
Cannot retrieve contributors at this time
executable file 594 lines (489 sloc) 14.6 KB
#!/usr/bin/perl
###############################################################################
#
#
# Copyright (C) 2010-2013 Connor Skennerton
#
# Support added for the Manotator by Mike Feb 2012
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
###############################################################################
#pragmas
use strict;
use warnings;
#core Perl modules
use IO::Zlib;
use IO::File;
use IO::Uncompress::Bunzip2;
use Data::Dumper;
#CPAN modules
use Getopt::Euclid;
use Bio::Tools::CodonTable;
#locally-written modules
use Class::Struct Seq => {name => '$', seq => '$', comment => '$', qual => '$' };
BEGIN
{
select(STDERR);
$| = 1;
select(STDOUT);
$| = 1;
}
my $query = \*STDIN;
if (defined $ARGV{'-i'}) {
open($query, $ARGV{'-i'}) or die;
}
my $outfile = \*STDOUT;
if (defined $ARGV{'-o'}) {
open($outfile, ">", $ARGV{'-o'}) or die;
}
my %seqs;
my %cluster_map;
#printAtStart();
if(! defined $ARGV{'-r'}) {
if($ARGV{'-f'}) {
my @aux = undef;
while (my $seq = &readfq($query, \@aux)) {
my $name2 = $seq->name;
if($ARGV{'-Ri'}) {
$name2 =~ s/$ARGV{'-Ri'}/$1/;
}
$seqs{$name2} = 1;
}
} elsif (! defined $ARGV{'-n'}) {
if (defined $ARGV{'-c'}){
&mapping($ARGV{'-c'}, \%seqs);
} else {
while (my $line = <$query>)
{
chomp $line;
if ($line =~ /^$/) {
next;
}
my $name = undef;
if($ARGV{'-l'}) {
$name = &list($line);
} elsif ($ARGV{'-b'}) {
$name = &blast($line);
} elsif ($ARGV{'-s'}) {
# skip header lines
next if $line =~ /^@/;
$name =&sam($line);
} elsif ($ARGV{'-m'}) {
$name =&mannotator("UniRef90_".$line);
} else {
next if ($line =~ /^\#/);
last if ($line =~ /\#+FASTA/i);
$name =&gff($line);
}
if(! defined $name ) {
die "Crazy error!\n";
} else {
if($ARGV{'-Ri'}) {
$name =~ s/$ARGV{'-Ri'}/$1/;
}
$seqs{$name} = 1;
}
}
}
} elsif(defined $ARGV{'-n'}) {
foreach my $e (@{$ARGV{'-n'}}) {
if($ARGV{'-Ri'}) {
$e =~ s/$ARGV{'-Ri'}/$1/;
}
$seqs{$e} = 1;
}
}
}
close $query;
my $keys_to_find = scalar keys %seqs;
if(defined $ARGV{'-r'}) {
$keys_to_find = 1;
}
foreach my $database (@{$ARGV{'-d'}}) {
# check to see if there are any keys left
last unless($keys_to_find > 0);
my $dfh;
if($ARGV{'-z'}) {
$dfh = IO::Zlib->new($database,"rb") || die $!;
} elsif($ARGV{'-j'}){
$dfh = IO::Uncompress::Bunzip2->new($database) || die $!;
}else {
$dfh = IO::File->new($database, 'r') || die $!;
}
my @aux = undef;
while (my $seq = readfq($dfh, \@aux))
{
if(defined $ARGV{'-r'}) {
if($seq->name =~ /$ARGV{'-r'}/ || $seq->comment =~ /$ARGV{'-r'}/) {
unless($ARGV{'-v'})
{
print_seq(\$seq, $outfile);
}
}
elsif ($ARGV{'-v'})
{
print_seq(\$seq, $outfile);
}
next;
}
my $name2 = $seq->name;
if($ARGV{'-Rd'}) {
# perform the split according to the user
$name2 =~ s/$ARGV{'-Rd'}/$1/;
}
if (exists $seqs{$name2})
{
unless($ARGV{'-v'})
{
if (defined $ARGV{'-c'}) {
$outfile = $seqs{$name2};
}
print_seq(\$seq, $outfile);
$keys_to_find-- unless ($ARGV{'-Force'});
}
}
elsif ($ARGV{'-v'})
{
print_seq(\$seq, $outfile);
}
# check to see if there are any keys left
last unless($keys_to_find > 0);
}
$dfh->close();
}
if (defined $ARGV{'-c'}) {
while (my ($seq,$fh) = each %seqs) {
close $fh;
}
}
sub format_seq {
my $seq = shift;
if (defined ${$seq}->qual) {
return sprintf "@%s%s\n%s+\n%s\n", ${$seq}->name, (defined ${$seq}->comment ^ defined $ARGV{'-C'}) ? ${$seq}->comment : '', ${$seq}->seq, ${$seq}->qual;
} else {
if (defined ${$seq}->comment ^ defined $ARGV{'-C'}) {
return sprintf ">%s%s\n%s", ${$seq}->name, ${$seq}->comment, ${$seq}->seq;
}
return sprintf ">%s\n%s", ${$seq}->name, ${$seq}->seq;
}
}
sub fastaCut {
#-----
# Cut up a fasta sequence
#
my ($string, $prot, $line_wrap) = @_;
# translate if need be
if(0 != $prot)
{
my $codon_table = Bio::Tools::CodonTable -> new ( -id => $prot );
$string = $codon_table->translate($string);
}
# wrap the line if need be
if(0 != $line_wrap)
{
my $return_str = "";
my $len = length $string;
my $start = 0;
while($start < $len)
{
$return_str .= substr $string, $start, $line_wrap;
$return_str .="\n";
$start += $line_wrap;
}
return $return_str;
}
return "$string\n";
}
sub print_seq{
my ($seq_ref, $fh) = @_;
if(defined $ARGV{'-w'})
{
if(defined $ARGV{'-p'})
{
${$seq_ref}->seq( fastaCut(${$seq_ref}->seq, $ARGV{'-p'}, $ARGV{'-w'}) );
}
else
{
${$seq_ref}->seq( fastaCut(${$seq_ref}->seq, 0, $ARGV{'-w'}) );
}
}
elsif(defined $ARGV{'-p'})
{
${$seq_ref}->seq( fastaCut(${$seq_ref}->seq, $ARGV{'-p'}, 0) );
}
else
{
${$seq_ref}->seq( ${$seq_ref}->seq."\n");
}
print $fh format_seq($seq_ref);
#if (defined ${$seq_ref}->qual ^ defined $ARGV{'-F'})
#{
## fastq file
#print $fh "@".$$name_ref."\n".$seq."+".$$name_ref."\n".$$qual_ref."\n";
#}
#else
#{
#print $fh ">".$$name_ref."\n".$seq;
#}
}
sub readfq {
my ($fh, $aux) = @_;
@$aux = [undef, 0] if (!defined(@$aux));
return if ($aux->[1]);
if (!defined($aux->[0])) {
while (<$fh>) {
chomp;
if (substr($_, 0, 1) eq '>' || substr($_, 0, 1) eq '@') {
$aux->[0] = $_;
last;
}
}
if (!defined($aux->[0])) {
$aux->[1] = 1;
return;
}
}
my $current_seq = Seq->new();
/^.(\S+)(.*)/;
$current_seq->name($1);
$current_seq->comment($2);
my $seq = '';
my $c;
$aux->[0] = undef;
while (<$fh>) {
chomp;
$c = substr($_, 0, 1);
last if ($c eq '>' || $c eq '@' || $c eq '+');
$seq .= $_;
}
$aux->[0] = $_;
$aux->[1] = 1 if (!defined($aux->[0]));
$current_seq->seq($seq);
return $current_seq if ($c ne '+');
my $qual = '';
while (<$fh>) {
chomp;
$qual .= $_;
if (length($qual) >= length($seq)) {
$aux->[0] = undef;
$current_seq->qual($qual);
return $current_seq;
}
}
$aux->[1] = 1;
return $current_seq;
}
sub list{
my ($line) = shift;
return $line;
}
sub blast{
my ($line) = shift;
my @columns = split(/\t/, $line);
if (exists $ARGV{'-S'})
{
return $columns[1];
}
else
{
return $columns[0];
}
}
sub sam {
my ($line) = shift;
my @c = split(/\t/,$line);
# test whether the third bit is set - query unmapped
unless($c[1] & 4)
{
# test whether the read is paired
if (($c[1] & 1 ) && ($c[1] & 128)) {
# test whether the read is the second pair
if($ARGV{'-I'}) {
return $c[0]."/2";
} else {
return $c[0];
}
} else {
if ($ARGV{'-I'}) {
return $c[0]."/1";
} else {
return $c[0];
}
}
}
}
sub gff {
my ($line) = shift;
my @c = split(/\t/, $line);
return $c[0];
}
sub mannotator{
my ($line) = shift;
my @columns = split /\^/, $line;
return $columns[0];
}
sub mapping {
my ($mapfile, $seqs) = @_;
my $fh = IO::File->new($mapfile, 'r') || die $!;
my %seen_file;
my $fh_count = 0;
while(<$fh>) {
chomp;
my @mapping = split /\t/;
if (exists $seen_file{$mapping[1]}) {
$seqs->{$mapping[0]} = $seen_file{$mapping[1]}
} else {
$fh_count++;
if ($fh_count > 255) {
warn "there are more than 255 different groupings specified in the mappings file\n";
warn "please split the mapping file such that there are less groupings then run\n";
warn "contig_extractor using the split mapping files\n";
exit 1;
}
my $new_fh = IO::File->new($mapping[1],'w') || die $!;
$seen_file{$mapping[1]} = $new_fh;
$seqs->{$mapping[0]} = $new_fh;
}
}
}
__END__
=head1 NAME
contig_extractor
=head1 COPYRIGHT
copyright (C) 2010-2013 Connor Skennerton
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=head1 REQUIRED ARGUMENTS
=over
=item -d <file>
file of reads where a subset needs to be extracted (can be FASTA or FASTQ, automatically detected). Option can be specified multiple times.
=for Euclid:
repeatable
file.type: readable
=back
=head1 OPTIONS
=over
=item -h
print a short help msg
=item -i <input_file>
File containing sequences or identifiers to be extracted from the sequence database
=for Euclid:
input_file.type: readable
input_file.excludes: name
input_file.excludes.error: You cannot specify both names on the command line and give an input file
=item -o <output_file>
Output file name
=for Euclid
output_file.type: writable
=item -I
If using read names from a sam input file, append Illumina style (/1 or /2). [Default: false]
=item -l
Input file is a list of identifiers, one per line
=item -b
Input file is in tabular blast format
=item -s
Input is in Sam format
=item -f
Input is in fasta or fastq format
=item -g
Input is in gff3 format
=item -m
Input is a mannotator formated annotations file
=item -r <regex>
Specify a perl regular expression that will be matched against all sequences in the database.
This option is not compatible with other types of input specified with -n or -i. Unlike
other methods this regex will match against both the name and the comment sections of the
fasta record
=for Euclid
regex.excludes: name, input_file
=item -S
Used only when the input is in blast format; sets the subject as the list of identifiers. Default: query
=item -n <name>...
A list of sequence names to extract in the form of a space separated list
=item -c <mapping_file>
File containing a mapping of contigs and their groupings. Each line must contain a single contig name
and the name of an output file for that contig that is TAB sepatated.
Example:
contig0001 cluster_1.fa
contig0002 cluster_2.fa
NOTE: You cannot specify more than 255 different output file names. If you have more groupings than this
you will need to split your job into parts
=for Euclid
mapping_file.type: readable
=item -v
Invert the match. ie extract non-matching reads
=item -z
The database file is gziped
=item -j
The database file is bziped
=item -F
Force output to be in fasta format
=item -Ri <input_regex>
The header in the input file contains additional information that must be removed.
Specify a perl regular expression such that $1 (first capture group) contains the required information.
=item -Rd <database_regex>
The header in the database file contains additional information that can be removed.
Specify a perl regular expression such that $1 (first capture group) contains the required information.
=item -w [<wrap_length>]
Wrap the lines at wrap-length chars
=for Euclid
wrap_length.type: i
=item -p [<protein_code>]
Enter a code if you wish to convert to protein
Specify a number from the following list (Uses: Bio::Tools::CodonTable)
1 Standard
2 Vertebrate Mitochondrial
3 Yeast Mitochondrial
4 Mold, Protozoan,_and_CoelenterateMitochondrial_and_Mycoplasma/Spiroplasma
5 Invertebrate Mitochondrial
6 Ciliate, Dasycladacean_and_Hexamita_Nuclear
9 Echinoderm Mitochondrial
10 Euplotid Nuclear
11 Bacterial
12 Alternative Yeast_Nuclear
13 Ascidian Mitochondrial
14 Flatworm Mitochondrial
15 Blepharisma Nuclear
16 Chlorophycean Mitochondrial
21 Trematode Mitochondrial
22 Scenedesmus obliquus_Mitochondrial
23 Thraustochytrium Mitochondrial
Default: no conversion
=for Euclid
protein_code.type: i
=item -C
Do not print comments in fasta files
=item -Force
Force scanning the entire database file. contig_etractor tries to speed up processing by keeping track
of the patterns that have been found and exiting when they all have been, even if the end of the database
file has not been reached. However the combination of other options such as -Ri or -Rd means that
sometimes the patterns are not unique and so scanning should be forced till the end of the file to
ensure that all matches are found
be unique, however if that is not the case
=back
=head1 VERSION
0.5.9
=head1 DESCRIPTION
Used for extracting whole contigs / sequences from a multiple fasta file that contain
significant matches to reads/sequences/contigs from a variety of list formats
=cut