Permalink
Browse files

Filter slabs on dist before searching them

  • Loading branch information...
1 parent b4408eb commit 826667eb33fc98bd1ef261308b52d1924c0a34c2 @dgl committed Apr 1, 2012
Showing with 46 additions and 18 deletions.
  1. +1 −0 Makefile.PL
  2. +3 −3 bin/cpangrep-matcher
  3. +42 −15 lib/WWW/CPANGrep/Search.pm
View
@@ -21,6 +21,7 @@ my %opts = (
'IO::AIO' => 0,
'CPAN::DistnameInfo' => 0,
'Starman' => 0,
+ 'Set::Object' => 0,
}
);
View
@@ -56,7 +56,7 @@ sub do_match {
}
my @results;
- while($pm =~ /$re/g) {
+ while($pm =~ /$re/gm) {
if($+[0] - $-[0] > 1e5) {
$redis->publish($channel => encode_json {
error => "Regexp is too greedy"
@@ -100,8 +100,8 @@ while(1) {
last unless $item->[0];
print "$$: processing job: $item->[1]\n";
my $job = decode_json $item->[1];
- my $slabs = [@{$redis->lrange($job->{slablist}, @{$job->{slabs}})->recv}];
- my $max = 500;
+ my $slabs = [map $redis->lindex($job->{slablist}, $_)->recv, @{$job->{slabs}}];
+ my $max = $job->{max} || 500;
my $start = time;
do_match($job->{re}, $max, $job->{notify}, $slabs);
$redis->publish($job->{notify} => encode_json {
View
@@ -5,10 +5,12 @@ use Config::GitLike;
use CPAN::DistnameInfo;
use JSON;
use Moo;
-require re::engine::RE2;
use Scalar::Util qw(blessed);
+use Set::Object qw(set);
use Text::Balanced qw(gen_delimited_pat);
+require re::engine::RE2;
+
# TODO: stick in a module or something
my $config = Config::GitLike->new(confname => "cpangrep")->load_file("etc/config");
use constant MAX => 1_000;
@@ -86,7 +88,7 @@ sub _parse_search {
sub _re2_compiler {
use re::engine::RE2 -strict => 1;
- qr/$_[0]/;
+ qr/$_[0]/m;
}
sub _re2_compile {
@@ -123,10 +125,9 @@ sub search {
state $counter = 0;
my @results;
- my $slab = $config->{"key.slabs"};
- my $len = $redis->llen($slab)->recv;
+ my @slabs = $self->_find_slabs($redis);
my $req = $config->{"matcher.concurrency"};
- $req = $len if $len < $req;
+ $req = @slabs if @slabs < $req;
my $notify = "webfe1." . $$ . "." . ++$counter;
my $redis_other = AnyEvent::Redis->new(host => $config->{"server.slab"});
@@ -199,21 +200,47 @@ sub search {
}
});
- my $c = int $len/$req;
- for(1 .. $req) {
- my @slabs = ($c*($_-1), $_ eq $req ? $len : ($c*$_)-1);
- $redis_other->rpush("queue:cpangrep:slabsearch", encode_json({
- slablist => $slab,
- slabs => \@slabs,
- re => "" . $self->_re,
- notify => $notify
- }));
+ if(@slabs) {
+ my $c = int @slabs/$req;
+ for(1 .. $req) {
+ $redis_other->rpush("queue:cpangrep:slabsearch", encode_json({
+ slablist => $config->{"key.slabs"},
+ slabs => [@slabs[$c*($_-1) .. ($_ eq $req ? @slabs - 1 : ($c*$_) - 1)]],
+ re => "" . $self->_re,
+ notify => $notify,
+ # TODO: tune this
+ max => 10_000 * (1 / log 1 + @slabs)
+ }));
+ }
}
- my @finish = $other_cv->recv;
+ my @finish = @slabs ? $other_cv->recv : ();
return $self->filter_results(\@results), @finish;
}
+# Work out which slabs have to be searched for the current query.
+#
+# Starts from every slab in the list named by the "key.slabs" config entry
+# and, for each non-negated "dist" option, narrows the candidates to the slabs
+# recorded in the "cpangrep:dists" hash for distributions whose name matches
+# the option's regexp. "author" options are not implemented yet and leave the
+# candidate set unchanged.
+#
+# Arguments:
+#   $redis - client used for the lookups; each request is followed by ->recv
+#            to wait for its reply.
+#
+# Returns the positions (indexes into the slab list) of the selected slabs,
+# sorted numerically ascending. The list is empty when no slab qualifies.
+sub _find_slabs {
+  my($self, $redis) = @_;
+  # XXX: This is pretty small with a small number of web workers, but can do
+  # better (make the matcher more intelligent?)
+  # These are "state" variables: they are fetched on the first call only and
+  # reused by every later call in this process.
+  state $dist_slab_map = { @{$redis->hgetall("cpangrep:dists")->recv} };
+  state $all_slabs = $redis->lrange($config->{"key.slabs"}, 0, -1)->recv;
+  state $slab_id_map = { map +($all_slabs->[$_] => $_), 0 .. $#$all_slabs };
+
+  my $slabs = set(@$all_slabs);
+
+  # keys/values applied directly to a reference was an experimental feature of
+  # perl 5.14 and is a fatal error from 5.24 on, so dereference explicitly.
+  # The old form accepted both hash and array references; keep accepting both.
+  my $options = $self->_options;
+  my @options = (Scalar::Util::reftype($options) // '') eq 'HASH'
+    ? values %$options
+    : @{ $options || [] };
+
+  for my $option (@options) {
+    next if $option->{negate};
+
+    if($option->{type} eq 'author') {
+      # TODO
+    } elsif($option->{type} eq 'dist') {
+      # NOTE(review): the map holds a single slab per dist name - confirm that
+      # a distribution can never span more than one slab.
+      my @matching = grep $_ =~ $option->{re}, keys %$dist_slab_map;
+      $slabs = set(map $dist_slab_map->{$_}, @matching)->intersection($slabs);
+    }
+  }
+
+  return sort { $a <=> $b } map $slab_id_map->{$_}, $slabs->members;
+}
+
# This could probably be optimised a lot, but take the lazy approach for now.
sub filter_results {
my($self, $results) = @_;

0 comments on commit 826667e

Please sign in to comment.