diff --git a/README.md b/README.md index b819529..06176e3 100644 --- a/README.md +++ b/README.md @@ -14,8 +14,11 @@ PICA::Data - PICA record processing use PICA::Parser::XML; use PICA::Writer::Plain; + use PICA::Schema; + $parser = PICA::Parser::XML->new( @options ); $writer = PICA::Writer::Plain->new( @options ); + $schema = PICA::Schema->new({ fields => { '021A' => { unique => 1 } } }); # parse records while ( my $record = $parser->next ) { @@ -44,6 +47,10 @@ PICA::Data - PICA record processing # stringify record my $plain = $record->string; my $xml = $record->string('xml'); + + # check 021A exists and is not repeated + my @errors = $schema->check($record, ignore_unknown_fields => 1); + ... } # parse single record from string diff --git a/lib/PICA/Data.pm b/lib/PICA/Data.pm index d194c6d..b965e1f 100644 --- a/lib/PICA/Data.pm +++ b/lib/PICA/Data.pm @@ -282,11 +282,11 @@ Austauschformat fuer Bibliotheken (MAB). In addition to PICA+ in CBS there is the cataloging format Pica3 which can losslessly be convert to PICA+ and vice versa. -Records in PICA::Data are encoded either as array of arrays, the inner -arrays representing PICA fields, or as an object with two fields, C<_id> and -C, the latter holding the record as array of arrays, and the former -holding the record identifier, stored in field C<003@>, subfield C<0>. For -instance a minimal record with just one field C<003@>: +Records in PICA::Data are encoded either as array of arrays, the inner arrays +representing PICA fields, or as an object with two keys, C<_id> and C, +the latter holding the record as array of arrays, and the former holding the +record identifier, stored in field C<003@>, subfield C<0>. For instance a +minimal record with just one field (having tag C<003@> and no occurrence): { _id => '12345X', @@ -300,7 +300,7 @@ or in short form: [ [ '003@', undef, '0' => '12345X' ] ] PICA path expressions (see L) can be used to facilitate processing -PICA+ records. +PICA+ records and L to validate PICA+ records. =head1 FUNCTIONS @@ -427,7 +427,7 @@ Same as C but only returns the first value. =head2 fields( $path ) -Returns a PICA record limited to fields specified in a L +Returns a PICA record limited to fields specified in a L expression. Always returns an array reference. =head2 holdings diff --git a/lib/PICA/Schema.pm b/lib/PICA/Schema.pm new file mode 100644 index 0000000..d57c647 --- /dev/null +++ b/lib/PICA/Schema.pm @@ -0,0 +1,142 @@ +package PICA::Schema; +use strict; +use warnings; + +our $VERSION = '0.34'; + +use Scalar::Util qw(reftype); + +sub new { + my ($class, $schema) = @_; + bless $schema, $class; +} + +sub check { + my ($self, $record, %options) = @_; + + $record = $record->{record} if reftype $record eq 'HASH'; + + $options{counter} = {}; + return map { $self->check_field($_, %options) } @$record; +} + +sub _error { + my $field = shift; + return { + tag => $field->[0], + ($field->[1] ? (occurrence => $field->[1]) : ()), + @_ + } +} + +sub check_field { + my ($self, $field, %options) = @_; + + my $spec = $self->{fields}{$field->[0]}; + + if (!$spec) { + if (!$options{ignore_unknown_fields}) { + return _error($field, message => 'unknown field') + } else { + return () + } + } + + if ($options{counter} && $spec->{unique}) { + my $tag_occ = join '/', grep { defined } @$field[0,1]; + if ($options{counter}{$tag_occ}++) { + return _error($field, unique => 1, message => 'field is not repeatable') + } + } + + my %errors; + if ($spec->{subfields}) { + my %sfcounter; + my (undef, undef, @subfields) = @$field; + while (@subfields) { + my ($code, undef) = splice @subfields, 0, 2; + my $sfspec = $spec->{subfields}{$code}; + + if ($sfspec) { + if ($sfspec->{unique} && $sfcounter{$code}++) { + $errors{$code} = { + message => 'subfield is not repeatable', + unique => 1 + }; + } + } elsif (!$options{ignore_unknown_subfields}) { + $errors{$code} = { message => 'unknown subfield' }; + } + } + } + + return %errors ? _error($field, subfields => \%errors) : (); +} + +1; +__END__ + +=head1 NAME + +PICA::Schema - Specification of a PICA based format + +=head1 DESCRIPTION + +A PICA Schema defines a set of PICA+ fields and subfields to validate +L records. A schema is given as hash reference such as: + + { + fields => { + '021A' => { }, # field without additional information + '003@' => { # field with additional constraints + unique => 1, + label => 'Pica-Produktionsnummer', + subfields => { + 0 => { unique => 1 } + } + } + } + } + +=head1 METHODS + +=head2 check( $record [, %options ] ) + +Check whether a given L record confirms to the schema and return a +list of detected violations. Possible options include: + +=over + +=item ignore_unknown_fields + +Don't report fields not included in the schema. + +=item ignore_unknown_subfields + +Don't report subfields not included in the schema. + +=back + +Errors are given as list of hash reference with keys C and C +set to tag and (optional) ocurrence of the violated field. If key C is +set, the field was repeated although not repeatable. Otherwise, if key +C is set, the field was defined but contained invalid subfields. + +Additional error field C contains a human-readable error message which +can also be derived from the rest of the error object. + +=head2 check_field( $field [, %options ] ) + +Check whether a PICA field confirms to the schema. Use same options as method C. + +=head1 LIMITATIONS + +The current version can only validate records with tags on level 0. + +=head1 SEE ALSO + +L (support may be added in a future version) + +L + +=cut diff --git a/t/50-schema.t b/t/50-schema.t new file mode 100644 index 0000000..58e25f1 --- /dev/null +++ b/t/50-schema.t @@ -0,0 +1,50 @@ +use strict; +use warnings; +use utf8; +use PICA::Data qw(pica_parser); +use PICA::Schema; +use Test::More; + +my $schema = PICA::Schema->new({ fields => { '021A' => { unique => 1 } } }); + +sub validate(@) { ## no critic + my ($schema, $record, $errors, %options) = @_; + + use Data::Dumper; print Dumper([ $schema->check($record) ]); + + my ($message) = map { $_->{message} } @$errors; + is_deeply $errors, [ $schema->check($record, %options) ], $message; +} + +my $record = [ ['021A', undef, a => 'title'] ]; +validate $schema, $record, []; + +push @$record, ['021A', undef, a => 'title']; +validate $schema, $record, [ { tag => '021A', unique => 1, message => 'field is not repeatable' } ]; + +$record->[1] = ['003@', undef, 0 => '12345']; +validate $schema, $record, [ { tag => '003@', message => 'unknown field' } ]; +validate $schema, $record, [], ignore_unknown_fields => 1; + +$schema->{fields}{'003@'} = { unique => 1, subfields => { } }; +validate $schema, $record, [ { + tag => '003@', + subfields => { + 0 => { message => 'unknown subfield' } + } } ]; +validate $schema, $record, [], ignore_unknown_subfields => 1; + +$schema->{fields}{'003@'} = { unique => 1, subfields => { 0 => { unique => 1 } } }; +validate $schema, $record, []; + +$record->[1] = ['003@', undef, 0 => '12345', 0 => '6789']; +validate $schema, $record, [ { + tag => '003@', + subfields => { + 0 => { message => 'subfield is not repeatable', unique => 1 } + } } ]; + +# TODO: check fields in level 1 and level 2 +# $record = pica_parser( 'Plain' => 't/files/bgb.example' )->next; + +done_testing;