Skip to content
Browse files

some documentation

  • Loading branch information...
1 parent f65d7fe commit cabf22212cca05b580fec9c622c0cc2442eebf28 @creaktive committed Apr 2, 2012
Showing with 393 additions and 36 deletions.
  1. +38 −10 README.pod
  2. +47 −3 bin/untemplate
  3. +48 −4 bin/xpathify
  4. +51 −9 lib/HTML/Linear.pm
  5. +40 −6 lib/HTML/Linear/Element.pm
  6. +45 −4 lib/HTML/Linear/Path.pm
  7. +54 −0 lib/HTML/Untemplate.pod
  8. +70 −0 t/10-flattern.t
View
48 README.pod
@@ -2,31 +2,59 @@
=head1 NAME
-HTML::Linear - ...
+HTML::Linear - represent HTML::Tree as a flat list
=head1 VERSION
version 0.001
+=head1 ATTRIBUTES
+
+=head2 _list
+
+Internal list representation.
+
+=head2 _strict
+
+Internal strict mode flag.
+
+=head2 _uniq
+
+Used for internal collision detection.
+
=head1 METHODS
-=head2 deparse($node, $path)
+=head2 add_element
-...
+Add an element to the list.
-=head1 SEE ALSO
+=head2 as_list
-=over 4
+Access list as array.
-=item *
+=head2 count_elements
-L<HTML::Similarity>
+Number of elements in list.
-=item *
+=head2 get_element
-L<XML::DifferenceMarkup>
+Element accessor.
+
+=head2 set_strict
+
+Do not group by C<id>, C<class> or C<name> attributes.
+
+=head2 unset_strict
+
+Group by C<id>, C<class> or C<name> attributes.
+
+=head2 eof
+
+Overrides L<HTML::TreeBuilder> C<eof>.
+
+=head2 deparse($node, $path)
-=back
+Recursively scan underlying L<HTML::TreeBuilder> structure.
=head1 AUTHOR
View
50 bin/untemplate
@@ -1,16 +1,57 @@
#!/usr/bin/env perl
-# ABSTRACT: ...
+# ABSTRACT: analyze several HTML documents based on the same template
# PODNAME: untemplate
+use strict;
use common::sense;
+use Getopt::Long;
use HTML::Linear;
use IO::Interactive qw(is_interactive);
+use Pod::Usage;
use Term::ANSIColor qw(:constants);
use Tie::IxHash;
# VERSION
-my $color = is_interactive(*STDOUT);
+=head1 SYNOPSIS
+
+ untemplate [options] HTML1 HTML2 [HTML3] [...]
+
+=head1 DESCRIPTION
+
+Takes multiple HTML documents generated using the same template and attempts to extract only the data inserted into the original template.
+
+=head1 OPTIONS
+
+=over 4
+
+=item --help
+
+This.
+
+=item --[no]color
+
+Enable syntax highlighting for XPath.
+By default, enabled automatically on interactive terminals.
+
+=item --[no]strict
+
+Strict mode disables grouping by C<id>, C<class> or C<name> attributes.
+The grouping is enabled by default.
+
+=back
+
+=cut
+
+GetOptions(
+ q(help) => \my $help,
+ q(color!) => \my $color,
+ q(strict!) => \my $strict,
+) or pod2usage(q(-verbose) => 1);
+pod2usage(q(-verbose) => 1)
+ if $help or $#ARGV < 1;
+
+$color //= is_interactive(*STDOUT);
if ($color) {
# ugly in the morning
@@ -29,7 +70,10 @@ if ($color) {
tie my %elem, 'Tie::IxHash';
for my $file (@ARGV) {
my $hl = HTML::Linear->new;
- #$hl->set_strict;
+
+ $hl->set_strict
+ if $strict // 0;
+
$hl->parse_file($file)
or die "Can't parse $file: $!";
View
52 bin/xpathify
@@ -1,19 +1,63 @@
#!/usr/bin/env perl
-# ABSTRACT: ...
+# ABSTRACT: output HTML document as a flat XPath/content list
# PODNAME: xpathify
+use strict;
use common::sense;
+use Getopt::Long;
use HTML::Linear;
+use Pod::Usage;
# VERSION
+=head1 SYNOPSIS
+
+ xpathify [options] HTML
+
+=head1 DESCRIPTION
+
+Represents a typical HTML document in a very verbose two-column mode.
+The first column is an XPath which locates each element inside the HTML tree.
+The second column is the respective content (if any).
+
+ /html/head/title/text() test 1
+ /html/body/h1/text() test 2
+ /html/body/p[1]/text() Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+
+=head1 OPTIONS
+
+=over 4
+
+=item --help
+
+This.
+
+=item --[no]strict
+
+Strict mode disables grouping by C<id>, C<class> or C<name> attributes.
+The grouping is enabled by default.
+
+=back
+
+=cut
+
+GetOptions(
+ q(help) => \my $help,
+ q(strict!) => \my $strict,
+) or pod2usage(q(-verbose) => 1);
+pod2usage(q(-verbose) => 1)
+ if $help or $#ARGV != 0;
+
my $hl = HTML::Linear->new;
-#$hl->set_strict;
+$hl->set_strict
+ if $strict;
$hl->parse_file($ARGV[0])
or die "Can't parse $ARGV[0]: $!";
for my $el ($hl->as_list) {
my $hash = $el->as_hash;
- say $_ . "\t" . ($hash->{$_} =~ s/\s+/ /grs)
- for sort grep { not m{/\@(?:class|id)$} } keys %{$hash};
+ for (sort grep { not m{/\@(?:class|id)$} } keys %{$hash}) {
+ $hash->{$_} =~ s/\s+/ /gs;
+ say $_ . "\t" . $hash->{$_};
+ }
}
View
60 lib/HTML/Linear.pm
@@ -1,5 +1,6 @@
package HTML::Linear;
-# ABSTRACT: ...
+# ABSTRACT: represent HTML::Tree as a flat list
+use strict;
use common::sense;
use Moose;
@@ -11,6 +12,28 @@ use HTML::Linear::Path;
# VERSION
+=attr _list
+
+Internal list representation.
+
+=method add_element
+
+Add an element to the list.
+
+=method as_list
+
+Access list as array.
+
+=method count_elements
+
+Number of elements in list.
+
+=method get_element
+
+Element accessor.
+
+=cut
+
has _list => (
traits => ['Array'],
is => 'ro',
@@ -24,6 +47,20 @@ has _list => (
},
);
+=attr _strict
+
+Internal strict mode flag.
+
+=method set_strict
+
+Do not group by C<id>, C<class> or C<name> attributes.
+
+=method unset_strict
+
+Group by C<id>, C<class> or C<name> attributes.
+
+=cut
+
has _strict => (
traits => ['Bool'],
is => 'ro',
@@ -35,8 +72,20 @@ has _strict => (
},
);
+=attr _uniq
+
+Used for internal collision detection.
+
+=cut
+
has _uniq => (is => 'ro', isa => 'HashRef[Str]', default => sub { {} });
+=method eof
+
+Overrides L<HTML::TreeBuilder> C<eof>.
+
+=cut
+
after eof => sub {
my ($self) = @_;
@@ -52,7 +101,7 @@ after eof => sub {
=method deparse($node, $path)
-...
+Recursively scan underlying L<HTML::TreeBuilder> structure.
=cut
@@ -112,13 +161,6 @@ sub deparse {
return $level;
}
-=head1 SEE ALSO
-
-=for :list
-* L<HTML::Similarity>
-* L<XML::DifferenceMarkup>
-
-=cut
no Moose;
__PACKAGE__->meta->make_immutable;
View
46 lib/HTML/Linear/Element.pm
@@ -1,5 +1,6 @@
package HTML::Linear::Element;
-# ABSTRACT: ...
+# ABSTRACT: represent elements to populate HTML::Linear
+use strict;
use common::sense;
use Digest::SHA;
@@ -9,7 +10,40 @@ use HTML::Linear::Path;
# VERSION
-has [qw(left right)] => (is => 'rw', isa => 'Int', default => -1);
+=attr attributes
+
+Element attributes.
+
+=attr content
+
+Element content.
+
+=attr depth
+
+Depth level of an element inside a L<HTML::TreeBuilder> structure.
+
+=attr index
+
+Index to preserve elements order.
+
+=attr index_map
+
+Used for internal collision detection.
+
+=attr key
+
+Stringified element representation.
+
+=attr path
+
+Stores representations of paths inside the C<HTML::TreeBuilder> structure (L<HTML::Linear::Path>).
+
+=attr sha
+
+Lazy L<Digest::SHA> (256-bit) representation.
+
+=cut
+
has attributes => (is => 'rw', isa => 'HashRef[Str]', default => sub { {} }, auto_deref => 1);
has content => (is => 'rw', isa => 'Str', default => '');
has depth => (is => 'ro', isa => 'Int', required => 1);
@@ -32,7 +66,7 @@ sub BUILD {
=method as_string
-...
+Stringified signature of an element.
=cut
@@ -49,7 +83,7 @@ sub as_string {
=method as_xpath
-...
+Build a nice XPath representation of a path inside the L<HTML::TreeBuilder> structure.
=cut
@@ -64,7 +98,7 @@ sub as_xpath {
=method as_hash
-...
+Linearize element as an associative array (Perl hash).
=cut
@@ -73,7 +107,7 @@ sub as_hash {
my $hash = {};
my $xpath = $self->as_xpath . HTML::Linear::Path::_wrap(separator => '/');
- for my $key (sort keys $self->attributes) {
+ for my $key (sort keys %{$self->attributes}) {
$hash->{
$xpath
. HTML::Linear::Path::_wrap(sigil => '@')
View
49 lib/HTML/Linear/Path.pm
@@ -1,19 +1,48 @@
package HTML::Linear::Path;
-# ABSTRACT: ...
+# ABSTRACT: represent paths inside HTML::Tree
+use strict;
use common::sense;
use JSON::XS;
use Moose;
# VERSION
+=attr json
+
+Lazy L<JSON::XS> instance.
+
+=cut
+
has json => (
is => 'ro',
isa => 'JSON::XS',
default => sub { JSON::XS->new->ascii->canonical },
lazy => 1,
);
+=attr address
+
+Location inside L<HTML::TreeBuilder> tree.
+
+=attr attributes
+
+Element attributes.
+
+=attr key
+
+Stringified path representation.
+
+=attr strict
+
+Strict mode disables grouping by C<id>, C<class> or C<name> attributes.
+
+=attr tag
+
+Tag name.
+
+=cut
+
has address => (is => 'rw', isa => 'Str', required => 1);
has attributes => (is => 'ro', isa => 'HashRef[Str]', required => 1, auto_deref => 1);
has key => (is => 'rw', isa => 'Str', default => '');
@@ -35,7 +64,7 @@ our %xpath_wrap = (
=method as_string
-...
+Build a quick & dirty string representation of a path inside the L<HTML::TreeBuilder> structure.
=cut
@@ -47,14 +76,14 @@ sub as_string {
_tag => $self->tag,
addr => $self->address,
};
- $ref->{attr} = $self->attributes if keys $self->attributes;
+ $ref->{attr} = $self->attributes if keys %{$self->attributes};
return $self->key($self->json->encode($ref));
}
=method as_xpath
-...
+Build a nice XPath representation of a path inside the L<HTML::TreeBuilder> structure.
=cut
@@ -81,6 +110,12 @@ sub as_xpath {
return $xpath;
}
+=func _quote
+
+Quote attribute values for XPath representation.
+
+=cut
+
sub _quote {
local $_ = $_[0];
@@ -93,6 +128,12 @@ sub _quote {
return "'$_'";
}
+=func _wrap
+
+Help to make a fancy XPath.
+
+=cut
+
sub _wrap {
return
$xpath_wrap{$_[0]}->[0]
View
54 lib/HTML/Untemplate.pod
@@ -0,0 +1,54 @@
+# ABSTRACT: undo what the template engine does
+# PODNAME: HTML::Untemplate
+
+# VERSION
+
+=head1 DESCRIPTION
+
+Despite being named similarly to L<HTML::Template>, this distribution is not directly related to it.
+Instead, it attempts to reverse the templating action, whichever template engine was used.
+
+=head2 Why?
+
+Suppose you have a CMS.
+A typical CMS works roughly like this (data flows from top to bottom):
+
+ RDBMS
+ scripting language
+ HTML
+ HTTP server
+ (...)
+ HTTP agent
+ layout engine
+ screen
+ user
+
+Consider the first 3 steps: C<RDBMS =E<gt> scripting language =E<gt> HTML>
+
+This is "applying template".
+
+Now, consider this: C<HTML =E<gt> scripting language =E<gt> RDBMS>
+
+I would call that "un-applying template", or "untemplate" C<:)>
+
+The practical application of this set of tools is to assist in the creation of web scrapers.
+
+=head2 xpathify
+
+The L<xpathify> tool flattens the HTML tree into a key/value list.
+The keys are in XPath format, while the values are respective content from the HTML tree.
+Theoretically, it could be possible to reassemble the HTML tree from the flat key/value list this tool generates.
+
+=head2 untemplate
+
+The L<untemplate> tool flattens a set of HTML documents using the algorithm from L<xpathify>.
+Then, it strips the shared key/value pairs.
+The "rest" is composed of original values fed into the template engine.
+
+=head1 SEE ALSO
+
+=for :list
+* L<HTML::Similarity>
+* L<XML::DifferenceMarkup>
+
+=cut
View
70 t/10-flattern.t
@@ -0,0 +1,70 @@
+use common::sense;
+
+use Data::Dumper;
+use FindBin qw($Bin);
+use Path::Class;
+use Test::More;
+
+use_ok(q(HTML::Linear));
+
+my $hl = HTML::Linear->new;
+isa_ok($hl, q(HTML::Linear));
+can_ok($hl, qw(
+ eof
+ set_strict
+ parse_file
+ as_list
+));
+
+ok(
+ $hl->set_strict,
+ q(set_strict),
+);
+
+ok(
+ $hl->parse_file(q...file($Bin, q(test.html))),
+ q(parse_file),
+);
+
+my $n = 0;
+my %hash;
+
+for my $el ($hl->as_list) {
+ isa_ok($el, q(HTML::Linear::Element));
+ can_ok($el, qw(as_hash));
+
+ my $hash = $el->as_hash;
+ $hash{$_} .= $hash->{$_}
+ for keys %{$hash};
+
+ ++$n;
+}
+
+my $expect = {
+ '/html/body/h1/text()' => 'test 2',
+ '/html/body/p[1]/text()' => ' Lorem ipsum dolor sit amet, consectetur adipiscing elit. Ut sed scelerisque nulla. Nam sit amet massa ac justo lacinia cursus. Et harum quidem rerum facilis est et expedita distinctio. ',
+ '/html/body/p[1]/ul/li[1]/@id' => 'li1',
+ '/html/body/p[1]/ul/li[1]/text()' => 'Vestibulum ullamcorper eleifend justo.',
+ '/html/body/p[1]/ul/li[2]/text()' => 'Sed id sapien tortor.',
+ '/html/body/p[1]/ul/li[3]/text()' => ' Fusce et volutpat mi. ',
+ '/html/body/p[1]/ul/li[4]/text()' => 'Quisque ullamcorper mauris lacus.',
+ '/html/body/p[1]/ul/li[5]/text()' => 'Nunc in erat sit amet nisi vulputate pharetra.',
+ '/html/body/p[2]/text()' => ' Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur? ',
+ '/html/head/title/text()' => 'test 1',
+};
+
+ok(
+ scalar keys %hash == scalar keys %{$expect},
+ q(result length match),
+);
+
+my $err = 0;
+ok(
+ $hash{$_} eq $expect->{$_},
+ qq(XPath $_)
+) or ++$err for keys %$expect;
+
+$Data::Dumper::Sortkeys = 1;
+$err and diag(Dumper \%hash);
+
+done_testing(6 + $n * 2 + keys %{$expect});

0 comments on commit cabf222

Please sign in to comment.
Something went wrong with that request. Please try again.