diff --git a/lib/XML/Handler/Trees.pm b/lib/XML/Handler/Trees.pm new file mode 100644 index 0000000..33a1b36 --- /dev/null +++ b/lib/XML/Handler/Trees.pm @@ -0,0 +1,837 @@ +use strict; + +package XML::Handler::Trees; +use vars qw/$VERSION/; +$VERSION = '0.02'; + +package XML::Handler::Tree; + +sub new { + my $class = ref($_[0]) || $_[0]; + bless {},$class; +} + +sub start_document { + my $self=shift; + $self->{Lists}=[]; + $self->{Curlist}=$self->{Tree}=[]; +} + +sub start_element { + my ($self,$element)=@_; + my $newlist; + if (exists $element->{LocalName}) { + # namespaces are available! + $newlist = [{}]; + foreach my $attr (values %{$element->{Attributes}}) { + if ($attr->{NamespaceURI}) { + $newlist->[0]{"{$attr->{NamespaceURI}}$attr->{LocalName}"} = $attr->{Value}; + } + else { + $newlist->[0]{$attr->{Name}} = $attr->{Value}; + } + } + } + elsif (ref $element->{Attributes} eq 'HASH') { + $newlist=[{map {$_=>$element->{Attributes}{$_}} keys %{$element->{Attributes}}}]; + } + else { + $newlist=[{map {$_=>$element->{Attributes}{$_}{Value}} keys %{$element->{Attributes}}}]; + } + push @{ $self->{Lists} }, $self->{Curlist}; + if (exists($element->{LocalName}) && $element->{NamespaceURI}) { + push @{ $self->{Curlist} }, "{$element->{NamespaceURI}}$element->{LocalName}" => $newlist; + } + else { + push @{ $self->{Curlist} }, $element->{Name} => $newlist; + } + $self->{Curlist} = $newlist; +} + +sub end_element { + my ($self,$element)=@_; + $self->{Curlist}=pop @{$self->{Lists}}; +} + +sub characters { + my ($self,$text)=@_; + my $clist = $self->{Curlist}; + my $pos = $#$clist; + if ($pos>0 and $clist->[$pos-1] eq '0') { + $clist->[$pos].=$text->{Data}; + } + else { + push @$clist,0=>$text->{Data}; + } +} + +sub comment {} + +sub processing_instruction {} + +sub end_document { + my $self=shift; + delete $self->{Curlist}; + delete $self->{Lists}; + $self->{Tree}; +} + +package XML::Handler::EasyTree; + +sub new { + my $class=shift; + $class=ref($class) || $class; + my $self={Noempty=>0,Latin=>0,Searchable=>0,@_}; + $self->{Noempty}||=$self->{Searchable}; + bless $self,$class; +} + +sub start_document { + my $self = shift; + $self->{Lists} = []; + $self->{Curlist} = $self->{Tree} = []; +} + +sub start_element { + my ($self,$element)=@_; + $self->checkempty(); + my $newlist=[]; + my $newnode; + if ($self->{Searchable}) { + $newnode= XML::Handler::EasyTree::Searchable->new( Name => $self->nsname($element), Content => $newlist ); + } + else { + $newnode={type=>'e',attrib=>{},name=>$self->nsname($element),content=>$newlist}; + } + if (exists $element->{LocalName}) { + while (my ($name,$obj) = each %{$element->{Attributes}}) { + $newnode->{attrib}{$name} = $self->encode($obj->{Value}); + } + } + elsif (ref $element->{Attributes} eq 'HASH') { + while (my ($name,$val)=each %{$element->{Attributes}}) { + $newnode->{attrib}{$self->nsname($name)}=$self->encode($val); + } + } + else { + foreach my $att (keys %{$element->{Attributes}}) { + $newnode->{attrib}{$self->nsname($element->{Attributes}{$att})}=$self->encode($element->{Attributes}{$att}{Value}); + } + } + push @{ $self->{Lists} }, $self->{Curlist}; + push @{ $self->{Curlist} }, $newnode; + $self->{Curlist} = $newlist; +} + +sub end_element { + my $self=shift; + $self->checkempty(); + $self->{Curlist}=pop @{$self->{Lists}}; +} + +sub characters { + my ($self,$text)=@_; + my $clist=$self->{Curlist}; + if (!@$clist || $clist->[-1]{type} ne 't') { + push @$clist,{type=>'t',content=>''}; + } + $clist->[-1]{content}.=$self->encode($text->{Data}); +} + +sub processing_instruction { + my ($self,$pi)=@_; + $self->checkempty(); + my $clist=$self->{Curlist}; + push @$clist,{type=>'p',target=>$self->encode($pi->{Target}),content=>$self->encode($pi->{Data})}; +} + +sub comment {} + +sub end_document { + my $self = shift; + $self->checkempty(); + delete $self->{Curlist}; + delete $self->{Lists}; + if ($self->{Searchable}) { + return XML::Handler::EasyTree::Searchable->new( Name => '__TOPLEVEL__', Content => $self->{Tree} ); + } + $self->{Tree}; +} + +sub nsname { + my ($self,$name)=@_; + if (ref $name) { + if (defined $name->{NamespaceURI}) { + $name="{$name->{NamespaceURI}}$name->{LocalName}"; + } + else { + $name=$name->{Name}; + } + } + return $self->encode($name); +} + +sub encode { + my ($self,$text)=@_; + if ($self->{Latin}) { + $text=~s{([\xc0-\xc3])(.)}{ + my $hi = ord($1); + my $lo = ord($2); + chr((($hi & 0x03) <<6) | ($lo & 0x3F)) + }ge; + } + $text; +} + +sub checkempty() { + my $self=shift; + if ($self->{Noempty}) { + my $clist=$self->{Curlist}; + if (@$clist && $clist->[-1]{type} eq 't' && $clist->[-1]{content}=~/^\s+$/) { + pop @$clist; + } + } +} + +package XML::Handler::EasyTree::Searchable; + +# +# new() returns a new node with the same structure at the `newnode' +# hashref +# +# Usage: XML::Handler::EasyTree::Searchable->new( Name => $name, Content => $content ); +# +sub new { + my $type = shift; + my $class = ref($type) || $type || die "must supply a object type" ; + + my %opts = @_; + + my $name = $opts{Name} || ''; + my $content = $opts{Content} || undef; + + return bless ( { + type => 'e', + attrib => {}, + name => $name, + content => $content, + }, $class); +} + +# +# name() returns the name of the node. Ideally, it should return a +# "fully qualified" name, but it doesn't +# +sub name { + my $self = shift; + return $self->{name}; +} + +# +# value() returns the value associated with an object +# +sub value { + my $self = shift; + + return( undef ) + unless( ( exists $self->{content} ) && ( defined $self->{content} ) ); + + my $possible = $self->{content}; + + die "not an array" unless( "$possible" =~ /ARRAY/ ); + + $possible = $possible->[0]; + + return( undef ) + unless( ( exists $possible->{type} ) && ( $possible->{type} eq 't' ) ); + + return( undef ) + unless( ( exists $possible->{content} ) && ( defined $possible->{content} ) ); + + return $possible->{content}; +} + +# +# usage: $newobj = $obj->child( $name ); +# +# child() returns a child (elements only) of the object with the $name +# +# for the case where there is more than one child that match $name, +# the array context semantics haven't been completely worked out: +# - in an array context, all children are returned. +# - in scalar context, the first child matching $name is returned. +# +# In a scalar context, The XML::Parser::SimpleObj class returns an +# object containing all the children matching $name, unless there is +# only one child in which case it returns that child (see commented +# code). I find that behavior confusing. +# +sub child { + my $self = shift; + my $spec = shift || ''; + + my $array = $self->{content}; + + my @rv; + if( $spec ) { + @rv = grep { $_->{name} eq $spec } grep { $_->{type} eq 'e' } @$array; + } else { + @rv = grep { $_->{type} eq 'e' } @$array; + } + + my $num = scalar( @rv ); + + if( wantarray() ) { + return @rv; + } else { + return '' unless( $num ); + return $rv[0] if( $num == 1 ); + # my $class = ref( $self ); + # return $class->new( Name => "__magic_child_list_object__", Content => [ @rv ] ); + } +} + +# +# usage: @children = $obj->children( $name ); +# +# children() returns a list of all children (elements only) of the +# $obj that match $name -- in the order in which they appeared in the +# original xml text. +# +sub children { + my $self = shift; + my $array = $self->{content}; + my $spec = shift || ''; + + + my @rv; + if( $spec ) { + @rv = grep { $_->{name} eq $spec } grep { $_->{type} eq 'e' } @$array; + } else { + @rv = grep { $_->{type} eq 'e' } @$array; + } + + return @rv; +} + +# +# usage: @children_names = $obj->children_names(); +# +# children_names() returns a list of all the names of the objects +# children (elements only) in the order in which they appeared in the +# original text +# +sub children_names { + my $self = shift; + my $array = $self->{content}; + + return map { $_->{name} } grep { $_->{type} eq 'e' } @$array; +} + +# +# usage: $attrib = $obj->attribute( $att_name ); +# +# attribute() returns the string associated with the attribute of the +# object. If not found returns a null string. +# +sub attribute { + my $self = shift; + my $spec = shift || return ''; + + return '' unless( ( exists $self->{attrib} ) && ( defined $self->{attrib} ) ); + + my $attrib = $self->{attrib}; + return '' unless( ( exists $attrib->{$spec} ) && ( defined $attrib->{$spec} ) ); + + return $attrib->{$spec}; +} + +# +# usage: @attribute_list = $obj->attribute_list(); +# +# attribute_list() returns a list (in no particular order) of the +# attribute names associated with the object +# +sub attribute_list { + my $self = shift; + + return '' unless( ( exists $self->{attrib} ) && ( defined $self->{attrib} ) ); + + my $attrib = $self->{attrib}; + return '' unless( "$attrib" =~ /HASH/ ); + + return keys %$attrib; +} + +# +# usage: $text = $obj->dump_tree(); +# +# dump_tree() returns a textual representation (in xml form) of the +# object's heirarchy. Only elements are processed. +# +# +sub dump_tree { + my $self = shift; + my %opts = @_; + + my $pretty = delete $opts{-pretty}; + + my $name = $self->name(); + my $value = $self->value(); + my @children = $self->children(); + + my $text = ''; + unless( $name eq '__TOPLEVEL__' ) { + $text .= "<$name"; + for my $att ( $self->attribute_list() ) { + $text .= sprintf( " %s=\"%s\"", $att, encode($self->attribute( $att )) ); + } + $text .= ">"; + + if( $value ) { + $text .= encode($value); + } + } + + + for my $child ( @children ) { + $text .= $child->dump_tree(); + } + + unless( $name eq '__TOPLEVEL__' ) { + $text .= ""; + } + + return $text; +} + +# +# usage: $text = $obj->pretty_dump_tree(); +# +# pretty_dump_tree() is identical to dump_tree(), except that newline +# and indentation embellishments are added +# +sub pretty_dump_tree { + my $self = shift; + my $tab = shift || 0; + + my $indent = " " x ( 2 * $tab ); + + my $name = $self->name(); + my $value = $self->value(); + my @children = $self->children(); + + my $text = ''; + unless( $name eq '__TOPLEVEL__' ) { + $text .= "$indent<$name"; + for my $att ( $self->attribute_list() ) { + $text .= sprintf( " %s=\"%s\"", $att, encode($self->attribute( $att )) ); + } + $text .= ">"; + + if( defined $value ) { + $text .= encode($value); + $text .= "\n"; + return $text; + } else { + $text .= "\n"; + } + } + + for my $child ( @children ) { + $text .= $child->pretty_dump_tree( $tab + 1 ); + } + + unless( $name eq '__TOPLEVEL__' ) { + $text .= "$indent\n"; + } + + return $text; +} + +sub encode { + my $encstr=shift; + my %encodings=('&'=>'amp','<'=>'lt','>'=>'gt','"'=>'quot',"'"=>'apos'); + $encstr=~s/([&<>"'])/&$encodings{$1};/g; + $encstr; +} + +package XML::Handler::TreeBuilder; + +use vars qw(@ISA); +@ISA=qw(XML::Element); + +sub new { + require XML::Element; + my $class = ref($_[0]) || $_[0]; + my $self = XML::Element->new('NIL'); + $self->{'_element_class'} = 'XML::Element'; + $self->{'_store_comments'} = 0; + $self->{'_store_pis'} = 0; + $self->{'_store_declarations'} = 0; + $self->{_stack}=[]; + bless $self, $class; +} + +sub start_document {} + +sub start_element { + my ($self,$element)=@_; + my @attlist; + if (exists $element->{LocalName}) { + @attlist=map {$_=>$element->{Attributes}{$_}{Value}} keys %{$element->{Attributes}}; + } + elsif (ref $element->{Attributes} eq 'HASH') { + @attlist=map {$_=>$element->{Attributes}{$_}} keys %{$element->{Attributes}}; + } + else { + @attlist=map {$_=>$element->{Attributes}{$_}{Value}} keys %{$element->{Attributes}}; + } + if(@{$self->{_stack}}) { + push @{$self->{_stack}}, $self->{'_element_class'}->new($element->{Name},@attlist); + $self->{_stack}[-2]->push_content( $self->{_stack}[-1] ); + } + else { + $self->tag($element->{Name}); + while(@attlist) { + $self->attr(splice(@attlist,0,2)); + } + push @{$self->{_stack}}, $self; + } +} + +sub end_element { + my $self=shift; + pop @{$self->{_stack}}; + return +} + +sub characters { + my ($self,$text)=@_; + $self->{_stack}[-1]->push_content($text->{Data}); +} + +sub comment { + my ($self,$comment)=@_; + return unless $self->{'_store_comments'}; + (@{$self->{_stack}} ? $self->{_stack}[-1] : $self)->push_content( + $self->{'_element_class'}->new('~comment', 'text' => $comment->{Data}) + ); + return; +} + +sub processing_instruction { + my ($self,$pi)=@_; + return unless $self->{'_store_pis'}; + (@{$self->{_stack}} ? $self->{_stack}[-1] : $self)->push_content( + $self->{'_element_class'}->new('~pi', 'text' => "$pi->{Target} $pi->{Data}") + ); + return; +} + +sub end_document { + my $self=shift; + return $self; +} + +sub _elem # universal accessor... +{ + my($self, $elem, $val) = @_; + my $old = $self->{$elem}; + $self->{$elem} = $val if defined $val; + return $old; +} + +sub store_comments { shift->_elem('_store_comments', @_); } +sub store_declarations { shift->_elem('_store_declarations', @_); } +sub store_pis { shift->_elem('_store_pis', @_); } + +1; +__END__ + +=head1 NAME + +XML::Handler::Trees - PerlSAX handlers for building tree structures + +=head1 SYNOPSIS + + use XML::Handler::Trees; + use XML::Parser::PerlSAX; + + my $p=XML::Parser::PerlSAX->new(); + my $h=XML::Handler::Tree->new(); + my $tree=$p->parse(Handler=>$h,Source=>{SystemId=>'file.xml'}); + + my $p=XML::Parser::PerlSAX->new(); + my $h=XML::Handler::EasyTree->new(Noempty=>1); + my $easytree=$p->parse(Handler=>$h,Source=>{SystemId=>'file.xml'}); + + my $p=XML::Parser::PerlSAX->new(); + my $h=XML::Handler::TreeBuilder->new(); + $h->store_pis(1); + my $tree=$p->parse(Handler=>$h,Source=>{SystemId=>'file.xml'}); + +=head1 DESCRIPTION + +XML::Handler::Trees provides three PerlSAX handler classes for building +tree structures. XML::Handler::Tree builds the same type of tree as the +"Tree" style in XML::Parser. XML::Handler::EasyTree builds the same +type of tree as the "EasyTree" style added to XML::Parser by +XML::Parser::EasyTree. XML::Handler::TreeBuilder builds the same type +of tree as Sean M. Burke's XML::TreeBuilder. These classes make it +possible to construct these tree structures from sources other than +XML::Parser. + +All three handlers can be driven by either PerlSAX 1 or PerlSAX 2 +drivers. In all cases, the end_document() method returns a reference to +the constructed tree, which normally becomes the return value of the +PerlSAX driver. + +=head1 CLASS XML::Handler::Tree + +This handler builds the same type of tree structure as the "Tree" style +in XML::Parser. Some modules such as Dan Brian's XML::SimpleObject work +with this type of tree. See the documentation for XML::Parser for details. + +=head2 METHODS + +=over 4 + +=item $handler = XML::Handler::Tree->new() + +Creates a handler object. + +=back + +=head1 CLASS XML::Handler::EasyTree + +This handler builds a lightweight tree structure representing the XML +document. This structure is, at least in this author's opinion, easier to +work with than the "standard" style of tree. It is the same type of +structure as built by XML::Parser when using XML::Parser::EasyTree, or +by the get_simple_tree method in XML::Records. + +The tree is returned as a reference to an array of tree nodes, each of +which is a hash reference. All nodes have a 'type' key whose value is +the type of the node: 'e' for element nodes, 't' for text nodes, and 'p' +for processing instruction nodes. All nodes also have a 'content' key +whose value is a reference to an array holding the element's child nodes +for element nodes, the string value for text nodes, and the data value +for processing instruction nodes. Element nodes also have an 'attrib' +key whose value is a reference to a hash of attribute names and values and a 'name' +key whose value is the element's name. Processing instructions also have +a 'target' key whose value is the PI's target. + +EasyTree nodes are ordinary Perl hashes and are not objects. Contiguous +runs of text are always returned in a single node. + +The reason the parser returns an array reference rather than the root +element's node is that an XML document can legally contain processing +instructions outside the root element (the xml-stylesheet PI is commonly +used this way). + +If namespace information is available (only possible with PerlSAX 2), +element and attribute names will be prefixed with their (possibly empty) +namespace URI enclosed in curly brackets, and namespace prefixes will be +stripped from names. + +=head2 METHODS + +=over 4 + +=item $handler = XML::Handler::EasyTree->new([options]) + +Creates a handler object. Options can be provided hash-style: + +=over 4 + +=item Noempty + +If this is set to a true value, text nodes consisting entirely of +whitespace will not be stored in the tree. The default is false. + +=item Latin + +If this is set to a true value, characters with Unicode values in the +Latin-1 range (160-255) will be stored in the tree as Latin-1 rather +than UTF-8. The default is false. + +=item Searchable + +If this is set to a true value, the parser will return a tree of XML::Handler::EasyTree::Searchable +objects rather than bare array references, providing access to the navigation methods +listed below. The top-level node returned will be a dummy element node with a name of "__TOPLEVEL__". +It is false by default. Setting this option automatically enables the Noempty option. + +=back + +=back + +=head2 XML::Handler::EasyTree::Searchable METHODS + +If the Searchable option is set, all nodes in the tree will be XML::Handler::EasyTree::Searchable objects, +which have the same structure as EasyTree nodes but also implement the following methods similar to +those in XML::SimpleObject. + +=over 4 + +=item $name = $node->name() + +Returns the name of the node. Ideally, it should return a +"fully qualified" name, but it doesn't. + +=item $val = $node->value() + +Returns the text value associated with a node object. Returns undef if the node has +no text children or its first child is not a text node. + +=item $newobj = $obj->child( $name ); + +Returns a child (elements only) of the object with the $name. + +For the case where there is more than one child that match $name, +the array context semantics haven't been completely worked out: +- in an array context, all children are returned. +- in scalar context, the first child matching $name is returned. + +In a scalar context, The XML::Parser::SimpleObj class returns an +object containing all the children matching $name, unless there is +only one child in which case it returns that child (see commented +code). I find that behavior confusing. + +=item @children = $obj->children( $name ); + +Returns a list of all children (elements only) of the +$obj that match $name -- in the order in which they appeared in the +original xml text. + +=item @children_names = $obj->children_names(); + +Returns a list of all the names of the objects +children (elements only) in the order in which they appeared in the +original text. + +=item $attrib = $obj->attribute( $att_name ); + +Returns the string associated with the attribute of the +object. If not found returns a null string. + +=item @attribute_list = $obj->attribute_list(); + +Returns a list (in no particular order) of the +attribute names associated with the object + +=item $text = $obj->dump_tree(); + +Returns a textual representation (in xml form) of the +object's hierarchy. Only elements are processed. The result will +be in whatever character encoding the SAX driver delivered (which may +not be the same encoding as the original source). + +=item $text = $obj->pretty_dump_tree(); + +Identical to dump_tree(), except that newline +and indentation embellishments are added + +=back + +=head2 EXAMPLE + + #! /usr/bin/perl -w + + use XML::Handler::Trees; + use XML::Parser::PerlSAX; + use strict; + + my $p=XML::Parser::PerlSAX->new(); + my $h=XML::Handler::EasyTree->new( Searchable=>1 ); + my $easytree=$p->parse( Handler => $h, Source => { SystemId => 'systemB.xml' } ); + + my $vme = $easytree->child( "vmesystem" ); + + print "\n"; + print "vmesystem config: ", $vme->attribute( "configuration_name" ), "\n"; + + print "\n"; + print "vmesystem children: ", join( ', ', $vme->children_names() ), "\n"; + + print "\n"; + print "gps model is ", $vme->child( "gps" )->child( "model" )->value(), "\n"; + my $gps = $vme->child( "gps" ); + print "gps slot is ", $gps->child( "slot" )->value(), "\n"; + + print "\n"; + print "reconstructed XML: \n"; + print $easytree->dump_tree(), "\n"; + + # print "\n"; + # print "recontructed XML (pretty): \n"; + # print $easytree->pretty_dump_tree(), "\n"; + + print "\n"; + exit; + +=head1 CLASS XML::Handler::TreeBuilder + +This handler builds XML document trees constructed of +XML::Element objects (XML::Element is a subclass of HTML::Element +adapted for XML). To use it, XML::TreeBuilder and its prerequisite +HTML::Tree need to be installed. See the documentation for those +modules for information on how to work with these tree structures. + +=head2 METHODS + +=over 4 + +=item $handler = XML::Handler::TreeBuilder->new() + +Creates a handler which builds a tree rooted in an XML::Element. + +=item $root->store_comments(value) + +This determines whether comments will be stored in the tree (not all SAX +drivers generate comment events). Currently, this is off by default. + +=item $root->store_declarations(value) + +This determines whether markup declarations will be stored in the tree. +Currently, this is off by default. The present implementation does not +store markup declarations in any case; this method is provided for future use. + +=item $root->store_pis(value) + +This determines whether processing instructions will be stored in the tree. +Currently, this is off (false) by default. + +=back + +=head1 AUTHOR + +Eric Bohlman (ebohlman@omsdev.com) + +PerlSAX 2 compatibility added by Matt Sergeant (matt@sergeant.org) + +XML::EasyTree::Searchable written by Stuart McDow (smcdow@moontower.org) + +Copyright (c) 2001 Eric Bohlman. + +Portions of this code Copyright (c) 2001 Matt Sergeant. + +Portions of this code Copyright (c) 2001 Stuart McDow. + +All rights reserved. This program is free software; you can redistribute it +and/or modify it under the same terms as Perl itself. + +=head1 SEE ALSO + + L + L + L + L + L + L + L + L + +=cut +