Skip to content
This repository has been archived by the owner on Apr 12, 2020. It is now read-only.

Commit

Permalink
About a year worth of misc changes which I should have put into git
Browse files Browse the repository at this point in the history
earlier
  • Loading branch information
dgl committed Mar 30, 2012
1 parent 1d34e92 commit 720e39a
Show file tree
Hide file tree
Showing 14 changed files with 164 additions and 85 deletions.
23 changes: 17 additions & 6 deletions README.pod
Expand Up @@ -6,27 +6,38 @@ something you probably want to use the site, not try and install this ;-)
To index the whole of CPAN takes an hour or two on a good box. For development
I currently build an index of just part of CPAN (all dists under /id/D/). You
can do this with C<zgrep 'D/D./' 02packages.details.txt.gz> and re-adding the
header. Hopefully soon L<fakecpan|http://www.fakecpan.org> will provide a
solution to this.
header. Alternatively the default development configuration will index a sample
CPAN from L<fakecpan|http://www.fakecpan.org>.

=head2 Components

=over 4

=item * cpangrep-index

Run with cpangrep-index --cpan_dir /path/to/cpan --slab_dir /data/cpangrep/extract
Run this to start with:

bin/cpangrep-index [--cpan_dir /path/to/cpan --slab_dir /data/cpangrep/extract]

If you omit the --cpan_dir and --slab_dir options you'll get the fakecpan
configured in etc/config and shipped with this.

=item * WWW::CPANGrep

The web frontend. Run with starman -Ilib lib/WWW/CPANGrep.pm or other plack
server.
The web frontend. Run with:

starman -Ilib lib/WWW/CPANGrep.pm

(or any other Plack server).

There's also a version that wraps this in L<Plack::Debug> in F<debug.psgi>.

=item * cpangrep-matcher

Backend workers that do the matching, based on a redis queue.
Backend workers that do the matching, based on a Redis queue. These should be
running alongside the web frontend at all times:

bin/cpangrep-matcher

=item * Redis

Expand Down
5 changes: 3 additions & 2 deletions TODO
@@ -1,12 +1,13 @@
# -*- mode: org -*-

* Consistent use of config
* Caching (at least for basic paging to avoid searches all the time)

* Stop using Tie::Redis in such a stupid/insane way
* Use Redis::Queue (or similar)?
* Consider using 0MQ or something other than Redis for queueing
* Tests
** Make unit testable
** Using fakecpan for end to end type tests
* Caching (at least for basic paging to avoid searches all the time)
* Complex search
** dist:...
searching a specific dist
Expand Down
4 changes: 4 additions & 0 deletions app.psgi
@@ -0,0 +1,4 @@
# PSGI entry point: loads the WWW::CPANGrep Web::Simple application and
# returns it as a PSGI coderef (Plack servers evaluate this file and use
# the value of the final expression as the app).
use strict;
use warnings;
use lib qw(lib);
use WWW::CPANGrep;

WWW::CPANGrep->new->to_psgi_app;
2 changes: 2 additions & 0 deletions bin/cpangrep-index
@@ -1,5 +1,7 @@
#!/usr/bin/perl
use strict;
use FindBin ();
use lib "$FindBin::RealBin/../lib";
use WWW::CPANGrep::Index;

my $app = WWW::CPANGrep::Index->new_with_options;
Expand Down
20 changes: 13 additions & 7 deletions bin/cpangrep-matcher
@@ -1,6 +1,7 @@
#!perl
#!/usr/bin/perl
use strict;
use 5.012;
use 5.010;
use Time::HiRes qw(time);
use EV;
use AnyEvent::Redis;
use IO::AIO qw(mmap aio_readahead);
Expand All @@ -11,7 +12,7 @@ use Config::GitLike;
my $config = Config::GitLike->new(confname => "cpangrep")->load_file("etc/config");
my $dir = $config->{"location.slabs"};

my $c = 16;
my $c = $config->{"matcher.concurrency"} || 8;

for(1 .. $c) {
my $pid = fork;
Expand Down Expand Up @@ -49,7 +50,9 @@ sub do_match {

if(my $next = $process->[$i++]) {
$fh_next = open_cached($dir . "/" . $next->{file});
aio_readahead $fh_next, 0, -s $next;
# On machines with spare IO bandwidth this seemed to help, however I'm now
# running on VMs and this seems less of a help.
#aio_readahead $fh_next, 0, -s $next;
}

my @results;
Expand Down Expand Up @@ -80,22 +83,25 @@ sub do_match {
snippet => [$previous, $next],
match => [$-[0], $+[0]]
};

last if ++$matches > $max;
}

$redis->publish($channel => encode_json \@results);

return if ++$matches > $max;
return if $matches > $max;
}
}
use Time::HiRes qw(time);

print "$$: ready\n";

while(1) {
while(my $item = $redis->blpop("queue:cpangrep:slabsearch", 60)->recv) {
last unless $item->[0];
print "$$: processing job: $item->[1]\n";
my $job = decode_json $item->[1];
my $slabs = [map decode_json($_), @{$redis->lrange($job->{slablist}, @{$job->{slabs}})->recv}];
my $max = $job->{max} || 3_000;
my $max = 500;
my $start = time;
do_match($job->{re}, $max, $job->{notify}, $slabs);
$redis->publish($job->{notify} => encode_json {
Expand Down
2 changes: 1 addition & 1 deletion debug.psgi
Expand Up @@ -3,5 +3,5 @@ use WWW::CPANGrep;

builder {
enable 'Debug', panels => [ qw(Environment Response Memory Timer) ]; # Profiler::NYTProf
CPANGrep->new->to_psgi_app;
WWW::CPANGrep->new->to_psgi_app;
};
6 changes: 4 additions & 2 deletions etc/config
@@ -1,6 +1,6 @@
[Location]
CPAN = /Users/dgl/cpangrep/fakecpan
Slabs = /Users/dgl/cpangrep/extract
CPAN = fakecpan-sampler-0.001
Slabs = var/slabs

[Server]
Slab = localhost
Expand All @@ -9,3 +9,5 @@
[Key]
Slabs = cpangrep:slabs

[Matcher]
Concurrency = 16
102 changes: 71 additions & 31 deletions lib/WWW/CPANGrep.pm
@@ -1,8 +1,8 @@
#!/usr/bin/perl
use v5.10;
use Web::Simple 'CPANGrep';
use Web::Simple 'WWW::CPANGrep';

package CPANGrep;
package WWW::CPANGrep;
use JSON;
use POSIX qw(ceil);
use AnyEvent::Redis;
Expand Down Expand Up @@ -33,20 +33,39 @@ sub dispatch_request {
[ 'Content-type' => 'text/html' ],
[ join "", <$fh> ]
]
},
sub (GET + /api + ?q=&limit~&exclude_file~) {
my($self, $q, $limit, $exclude_file) = @_;
$limit ||= 100;
my $r = $self->_search($q, $exclude_file);

return [ 200, ['Content-type' => 'application/json' ],
[ encode_json({
count => scalar @{$r->{results}},
duration => $r->{duration},
results => [@{$r->{results}}[0 .. $limit]]
})]
];
}
};

sub search(GET + / + ?q=&page~) {
my($self, $q, $page_number) = @_;
sub search(GET + / + ?q=&page~&exclude_file~) {
my($self, $q, $page_number, $exclude_file) = @_;

state $counter = 0;
my $r = $self->_search($q, $exclude_file);
if(ref $r eq 'HASH') {

return [ 200, ['Content-type' => 'text/html'],
[ render_response($q, $r->{results}, $r->{duration}, $page_number)->to_html ] ];
} else {
return $r;
}
}

print "Search for $q\n";
my $redis = AnyEvent::Redis->new(host => $config->{"server.queue"});
sub _search {
my($self, $q, $exclude_file) = @_;

my $cache = $redis->get("querycache:" . uri_escape_utf8($q))->recv;
if($cache) {
}
my $redis = AnyEvent::Redis->new(host => $config->{"server.queue"});

my $re = eval { re_compiler($q) };

Expand All @@ -58,7 +77,7 @@ sub search(GET + / + ?q=&page~) {
return [ 200, ['Content-type' => 'text/html'],
[ "Please don't use lookbehind or anything else RE2 doesn't understand!" ] ];

} elsif("abcdefgh" x 20 =~ /$re/) {
} elsif("abcdefgh" x 20 =~ $re || $q =~ /^.$/) {
# RE2 is quite happy with most things you throw at it, but really doesn't
# like lots of long matches, this is just a lame check.
return [ 200, ['Content-type' => 'text/html'],
Expand All @@ -67,9 +86,47 @@ sub search(GET + / + ?q=&page~) {

my $start = AE::time;

my $results;
my $response;
my $cache = $redis->get("querycache:" . uri_escape_utf8($q))->recv;
if($cache) {
$results = decode_json($cache);
} else {
my %res = do_search($redis, $q);
if($res{error}) {
$response = "Something went wrong! $res{error}";
return [ 200, ['Content-type' => 'text/html'], [ $response ] ];
} else {
my $redis_cache = AnyEvent::Redis->new(host => $config->{"server.queue"});
$redis_cache->setex("querycache:" . uri_escape_utf8($q), 1800, encode_json($res{results}))->recv;
$results = $res{results};
}
}

if($exclude_file) {
$exclude_file = re_compiler($exclude_file);
$results = [grep $_->{file}->{file} !~ $exclude_file, @{$results}];
}

my $duration = AE::time - $start;
printf "Took %0.2f %s\n", $duration, $cache ? "(cached)" : "";

return { results => $results, duration => $duration };
}

# Compile a user-supplied pattern string into a qr// object using the RE2
# regex engine (linear-time matching, immune to catastrophic backtracking).
# The string eval builds the anonymous sub inside the lexical scope of
# C<use re::engine::RE2>, so the qr// it produces is compiled by RE2 rather
# than the default perl engine, and compilation happens at call time with
# the caller's pattern interpolated.
# NOTE(review): a pattern RE2 cannot handle (e.g. lookbehind) makes this
# die; the visible caller wraps the call in a block eval — confirm all
# callers do the same.
sub re_compiler {
use re::engine::RE2;
eval(q{ sub { qr/$_[0]/ } })->(shift);
}

sub do_search {
my($redis, $q) = @_;
state $counter = 0;

my @results;
my $slab = $config->{"key.slabs"};
my $len = $redis->llen($slab)->recv;
my $req = 6;
my $req = $config->{"matcher.concurrency"};
my $c = int $len/$req;
my $notify = "webfe1." . $$ . "." . ++$counter;
for(1 .. $req) {
Expand All @@ -84,7 +141,6 @@ sub search(GET + / + ?q=&page~) {
}

my $redis_other = AnyEvent::Redis->new(host => $config->{"server.slab"});
my @results;
# cv used to manage lifetime of subscription and zrevrangebyscore results.
my $other_cv = AE::cv;
$other_cv->begin;
Expand Down Expand Up @@ -154,23 +210,7 @@ sub search(GET + / + ?q=&page~) {
}
});

my %res = $other_cv->recv;
my $duration = AE::time - $start;
print "Took $duration\n";

my $response;
if($res{error}) {
$response = "Something went wrong! $res{error}";
} else {
$response = render_response($q, \@results, $duration, $page_number)->to_html;
}

return [ 200, ['Content-type' => 'text/html'], [ $response ] ];
}

sub re_compiler {
use re::engine::RE2;
eval(q{ sub { qr/$_[0]/ } })->(shift);
return results => \@results, $other_cv->recv;
}

sub render_response {
Expand Down Expand Up @@ -235,4 +275,4 @@ sub render_response {
return $output;
}

CPANGrep->run_if_script;
WWW::CPANGrep->run_if_script;
18 changes: 16 additions & 2 deletions lib/WWW/CPANGrep/Index.pm
@@ -1,23 +1,30 @@
package WWW::CPANGrep::Index;
use Config::GitLike;
use Moose;
use namespace::autoclean;
use Parse::CPAN::Packages;
use WWW::CPANGrep::Index::Worker;
use FindBin ();
use Cwd 'abs_path';

with 'MooseX::Getopt';
with 'WWW::CPANGrep::Role::RedisConnection';

my $config = Config::GitLike->new(
confname => "cpangrep"
)->load_file("$FindBin::RealBin/../etc/config");

has cpan_dir => (
is => 'ro',
isa => 'Str',
required => 1,
default => sub { abs_path $config->{"location.cpan"} },
documentation => "Directory where CPAN mirror resides",
);

has slab_dir => (
is => 'ro',
isa => 'Str',
required => 1,
default => sub { abs_path $config->{"location.slabs"} },
documentation => "Directory in which to save 'slabs' extracted from CPAN",
);

Expand All @@ -41,12 +48,19 @@ sub index {
$self->redis->{$queue} = \@queue;
print "Inserted ", scalar(@{$self->redis->{$queue}}), " dists into $queue\n";

if($self->redis->{"cpangrep:indexer"}) {
warn "Semaphore not 0, previous run failed / in progress?";
}

$self->redis->{"cpangrep:indexer"} = 0;

delete $self->redis->{"new-index"};

WWW::CPANGrep::Index::Worker->new(
cpan_dir => $self->cpan_dir,
slab_dir => $self->slab_dir,
redis_server => $self->redis_server,
jobs => $self->jobs,
)->run($queue);
}

Expand Down

0 comments on commit 720e39a

Please sign in to comment.