/
Split.pm
229 lines (190 loc) · 6.45 KB
/
Split.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
use v6;
use Bio::Role::Location;
role Bio::Role::Location::Split does Bio::Role::Location {
has @!subLocations;
has $.splittype is rw = 'JOIN';
method add_sub_Location(*@locations){
for @locations -> $loc {
if ($loc !~~ Bio::Role::Location ) {
#old bioperl5 msg
#self.throw("Trying to add $loc as a sub Location but it doesn't implement Bio::LocationI!");
next;
}
push @!subLocations,$loc;
}
return @!subLocations.elems;
}
multi method each_Location(Int $order = 0){
my @locs;
for self.sub_Location($order) -> $subloc {
# Recursively check to get hierarchical split locations:
push @locs, $subloc.each_Location($order);
}
return @locs;
}
method sub_Location(Int $order = 0) {
#not sure if we really need...
$order = 1 if ($order > 1);
$order = -1 if ($order < -1);
####
my @sublocs = defined @!subLocations ?? @!subLocations !! ();
# return the array if no ordering requested
return @sublocs if ( ($order == 0) || !(defined @sublocs) );
# sort those locations that are on the same sequence as the top (`master')
# if the top seq is undefined, we take the first defined in a sublocation
my $seqid = self.seq_id();
my $i = 0;
while ((! defined($seqid)) && ($i <= @sublocs.end)) {
$seqid = @sublocs[$i++].seq_id();
}
if ((! self.seq_id()) && defined($seqid)) {
#probably want to keep this
# $self->warn("sorted sublocation array requested but ".
# "root location doesn't define seq_id ".
# "(at least one sublocation does!)");
}
my @locs = ($seqid ??
grep { $_.seq_id() eq $seqid; } , @sublocs !!
@sublocs);
if (@locs) {
if ($order == 1) {
# Schwartzian transforms for performance boost
@locs = map { $_.[0] } ,
sort {
(defined $^a && defined $^b) ?? $^a.[1] <=> $^b.[1] !!
$^a ?? -1 !! 1
} , map { [$_, (defined $_.start ?? $_.start !! $_.end)] } , @locs;
} else { # $order == -1
@locs = map { $_.[0] } ,
sort {
(defined $^a && defined $^b) ?? $^b.[1] <=> $^a.[1] !!
$^a ?? -1 !! 1
} , map { [$_, (defined $_.end ?? $_.end !! $_.start)] } ,@locs;
}
}
# push the rest unsorted
if ($seqid) {
push(@locs, grep { $_.seq_id() ne $seqid; } ,@sublocs);
}
# done!
return @locs;
}
method to_FTstring(){
my @strs;
my $strand = self.strand() || 0;
my $stype = lc(self.splittype());
my $guide = self.guide_strand();
if ( $strand < 0 ) {
self.flip_strand; # this will recursively set the strand
# to +1 for all the sub locations
}
# If the split type is join, the order is important;
# otherwise must be 5'->3' regardless
my @locs = ($stype eq 'join' && (!$guide && $strand == -1)) ??
reverse self.sub_Location() !! self.sub_Location() ;
for ( @locs ) -> $loc {
# $loc.verbose(self.verbose);
my $str = $loc.to_FTstring();
# we only append the remote seq_id if it hasn't been done already
# by the sub-location (which it should if it knows it's remote)
# (and of course only if it's necessary)
if ( (! $loc.is_remote) &&
defined(self.seq_id) && defined($loc.seq_id) &&
($loc.seq_id ne self.seq_id) ) {
$str = sprintf("%s:%s", $loc.seq_id, $str);
}
push @strs, $str;
}
self.flip_strand if $strand < 0;
my $str;
if ( @strs == 1 ) {
($str) = @strs;
} elsif ( @strs == 0 ) {
# self.warn("no Sublocations for this splitloc, so not returning anything\n");
} else {
$str = sprintf('%s(%s)',lc(self.splittype), join(",", @strs));
}
if ( $strand < 0 ) { # wrap this in a complement if it was unrolled
$str = sprintf('%s(%s)','complement',$str);
}
return $str;
}
#bioperl5 version had these values return by coord policy but that is over design
#going to need a faster way to determine start other then resorting every time
multi method start(){
my @locs = self.sub_Location(1);
return @locs[0].min_start() if @locs;
return;
}
multi method end(){
# reverse sort locations by largest ending to smallest ending
my @locs = self.sub_Location(-1);
return @locs[0].max_end() if @locs;
return;
}
###
method min_start() {
my @locs = self.sub_Location(1);
return @locs[0].min_start() if @locs;
return;
}
method max_start() {
my @locs = self.sub_Location(1);
return @locs[0].max_start() if @locs;
return;
}
method max_end() {
# reverse sort locations by largest ending to smallest ending
my @locs = self.sub_Location(-1);
return @locs[0].max_end() if @locs;
return;
}
method min_end() {
# reverse sort locations by largest ending to smallest ending
my @locs = self.sub_Location(-1);
return @locs[0].min_end() if @locs;
return;
}
multi method flip_strand() {
for ( self.sub_Location(0) ) -> $loc {
$loc.flip_strand();
if ($loc ~~ Bio::Role::Location::Split) {
my $gs = (self.guide_strand == -1) ?? Mu !! -1;
$loc.guide_strand($gs);
}
}
}
method guide_strand($value?) {
return self.strand = $value if defined($value);
return self.strand;
}
multi method strand($value?) {
if ( defined $value) {
self.strand = $value;
# propagate to all sublocs
for (self.sub_Location(0)) -> $loc {
$loc.strand($value);
}
}
else {
my ($strand, $lstrand);
for (self.sub_Location(0)) -> $loc {
# we give up upon any location that's remote or doesn't have
# the strand specified, or has a differing one set than
# previously seen.
# calling strand() is potentially expensive if the subloc is also
# a split location, so we cache it
$lstrand = $loc.strand();
if ((! $lstrand) ||
($strand && ($strand != $lstrand)) ||
$loc.is_remote()) {
$strand = Any;
last;
} elsif (! $strand) {
$strand = $lstrand;
}
}
return $strand;
}
}
}