Skip to content

Commit

Permalink
readded
Browse files Browse the repository at this point in the history
  • Loading branch information
Devin Austin committed Aug 31, 2009
1 parent 2e32f05 commit 0b27cae
Showing 1 changed file with 54 additions and 0 deletions.
54 changes: 54 additions & 0 deletions scrape.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/usr/bin/env perl
use warnings;
use strict;
use Web::Scraper;
use URI;
use YAML qw/ Dump /;
use WWW::Mechanize;

# Crawl the yellowpages.com.au search results for "electrical contractors",
# one result page at a time, and print the listings scraped from each page
# as a YAML document on STDOUT.  Progress messages also go to STDOUT; the
# URL of every page visited is reported on STDERR.
#
# Flow: fetch the first result page, scrape it, then keep moving to the
# page behind the "Next" link (scraping each one) until no such link exists.

my $base_url = "http://yellowpages.com.au/search/postSearchEntry.do?clueType=0&clue=electrical+contractors&locationClue=All+States&x=0&y=0";

# Index of the listing whose website is echoed as a per-page sample.
my $sample_index = 3;

# Seconds to wait between page requests.
my $page_delay = 1;

# autocheck is requested explicitly so that a failed GET aborts the run
# regardless of the default of the installed WWW::Mechanize version.
my $mech = WWW::Mechanize->new( autocheck => 1 );
print "mech object initiated\n";
$mech->get( $base_url );
print "got our url\n";

# The scraper definition does not depend on the page, so it is built once.
# NOTE(review): '@href' is read from the ".links" element itself; if that
# element is a container rather than an anchor the value will be undef --
# confirm against the live markup.
my $want = scraper {
    process "li.gold", "contractors[]" => scraper {
        process ".omnitureListingNameLink", name => 'TEXT';
        process ".address", address => 'TEXT';    # TODO: split this up into address, state, postcode
        process ".phoneNumber", phone => 'TEXT';
        process ".links", website => '@href';
    };
};

print "Entering link following loop\n";

PAGE:
while (1) {
    print "Beginning scrape inside loop\n";
    my $page_url = $mech->uri->as_string;

    # Scrape the page Mechanize is currently on (not the start URL again);
    # the page URL is passed as the base for resolving relative links.
    print "Before scrape is called\n";
    my $names = $want->scrape( $mech->content, $page_url );

    # A page may hold fewer listings than $sample_index (or none at all),
    # so look the sample up step by step instead of dereferencing blindly.
    my $listings = $names->{contractors};
    my $sample   = $listings ? $listings->[$sample_index] : undef;
    my $site     = $sample   ? $sample->{website}         : undef;
    print "Site is: ", ( defined $site ? $site : '' ), "\n";

    print "Saving page info...\n";
    print "Scrape successful\n";
    print "Serializing -> YAML\n";
    print "Dumping info:\n";
    # Emit only this page's record; every page becomes its own YAML
    # document, so nothing is printed twice and nothing is lost if a
    # later request fails.
    print Dump( { contractor => $names, real_website => $site } );
    warn "Page: $page_url\n";

    # find_link returns undef when there is no further page, whereas
    # follow_link would die under autocheck.
    my $next = $mech->find_link( text => "Next" )
        or last PAGE;

    print "Sleep for a bit\n";
    sleep($page_delay);
    $mech->get( $next->url_abs );
}

print "All done!\n";

0 comments on commit 0b27cae

Please sign in to comment.