Skip to content

Commit

Permalink
Merge pull request #33 from gbv/schema
Browse files Browse the repository at this point in the history
Add PICA::Schema
  • Loading branch information
jorol committed Jan 29, 2018
2 parents b4cda16 + 06a5d05 commit 459f6f1
Show file tree
Hide file tree
Showing 4 changed files with 206 additions and 7 deletions.
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@ PICA::Data - PICA record processing
use PICA::Parser::XML;
use PICA::Writer::Plain;
use PICA::Schema;

$parser = PICA::Parser::XML->new( @options );
$writer = PICA::Writer::Plain->new( @options );
$schema = PICA::Schema->new({ fields => { '021A' => { unique => 1 } } });

# parse records
while ( my $record = $parser->next ) {
Expand Down Expand Up @@ -44,6 +47,10 @@ PICA::Data - PICA record processing
# stringify record
my $plain = $record->string;
my $xml = $record->string('xml');

# check 021A exists and is not repeated
my @errors = $schema->check($record, ignore_unknown_fields => 1);
...
}

# parse single record from string
Expand Down
14 changes: 7 additions & 7 deletions lib/PICA/Data.pm
Original file line number Diff line number Diff line change
Expand Up @@ -282,11 +282,11 @@ Austauschformat fuer Bibliotheken (MAB). In addition to PICA+ in CBS there is
the cataloging format Pica3 which can losslessly be convert to PICA+ and vice
versa.
Records in PICA::Data are encoded either as array of arrays, the inner
arrays representing PICA fields, or as an object with two fields, C<_id> and
C<record>, the latter holding the record as array of arrays, and the former
holding the record identifier, stored in field C<003@>, subfield C<0>. For
instance a minimal record with just one field C<003@>:
Records in PICA::Data are encoded either as array of arrays, the inner arrays
representing PICA fields, or as an object with two keys, C<_id> and C<record>,
the latter holding the record as array of arrays, and the former holding the
record identifier, stored in field C<003@>, subfield C<0>. For instance a
minimal record with just one field (having tag C<003@> and no occurrence):
{
_id => '12345X',
Expand All @@ -300,7 +300,7 @@ or in short form:
[ [ '003@', undef, '0' => '12345X' ] ]
PICA path expressions (see L<PICA::Path>) can be used to facilitate processing
PICA+ records.
PICA+ records and L<PICA::Schema> to validate PICA+ records.
=head1 FUNCTIONS
Expand Down Expand Up @@ -427,7 +427,7 @@ Same as C<values> but only returns the first value.
=head2 fields( $path )
Returns a PICA record limited to fields specified in a L<PICA::path>
Returns a PICA record limited to fields specified in a L<PICA::Path>
expression. Always returns an array reference.
=head2 holdings
Expand Down
142 changes: 142 additions & 0 deletions lib/PICA/Schema.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
package PICA::Schema;
use strict;
use warnings;

our $VERSION = '0.34';

use Scalar::Util qw(reftype);

sub new {
my ($class, $schema) = @_;
bless $schema, $class;
}

sub check {
my ($self, $record, %options) = @_;

$record = $record->{record} if reftype $record eq 'HASH';

$options{counter} = {};
return map { $self->check_field($_, %options) } @$record;
}

sub _error {
my $field = shift;
return {
tag => $field->[0],
($field->[1] ? (occurrence => $field->[1]) : ()),
@_
}
}

sub check_field {
my ($self, $field, %options) = @_;

my $spec = $self->{fields}{$field->[0]};

if (!$spec) {
if (!$options{ignore_unknown_fields}) {
return _error($field, message => 'unknown field')
} else {
return ()
}
}

if ($options{counter} && $spec->{unique}) {
my $tag_occ = join '/', grep { defined } @$field[0,1];
if ($options{counter}{$tag_occ}++) {
return _error($field, unique => 1, message => 'field is not repeatable')
}
}

my %errors;
if ($spec->{subfields}) {
my %sfcounter;
my (undef, undef, @subfields) = @$field;
while (@subfields) {
my ($code, undef) = splice @subfields, 0, 2;
my $sfspec = $spec->{subfields}{$code};

if ($sfspec) {
if ($sfspec->{unique} && $sfcounter{$code}++) {
$errors{$code} = {
message => 'subfield is not repeatable',
unique => 1
};
}
} elsif (!$options{ignore_unknown_subfields}) {
$errors{$code} = { message => 'unknown subfield' };
}
}
}

return %errors ? _error($field, subfields => \%errors) : ();
}

1;
__END__
=head1 NAME
PICA::Schema - Specification of a PICA based format
=head1 DESCRIPTION
A PICA Schema defines a set of PICA+ fields and subfields to validate
L<PICA::Data> records. A schema is given as hash reference such as:
{
fields => {
'021A' => { }, # field without additional information
'003@' => { # field with additional constraints
unique => 1,
label => 'Pica-Produktionsnummer',
subfields => {
0 => { unique => 1 }
}
}
}
}
=head1 METHODS
=head2 check( $record [, %options ] )
Check whether a given L<PICA::Data> record confirms to the schema and return a
list of detected violations. Possible options include:
=over
=item ignore_unknown_fields
Don't report fields not included in the schema.
=item ignore_unknown_subfields
Don't report subfields not included in the schema.
=back
Errors are given as list of hash reference with keys C<tag> and C<occurrence>
set to tag and (optional) ocurrence of the violated field. If key C<unique> is
set, the field was repeated although not repeatable. Otherwise, if key
C<subfields> is set, the field was defined but contained invalid subfields.
Additional error field C<message> contains a human-readable error message which
can also be derived from the rest of the error object.
=head2 check_field( $field [, %options ] )
Check whether a PICA field confirms to the schema. Use same options as method C<check>.
=head1 LIMITATIONS
The current version can only validate records with tags on level 0.
=head1 SEE ALSO
L<PICA::Path> (support may be added in a future version)
L<MARC::Lint>
=cut
50 changes: 50 additions & 0 deletions t/50-schema.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use strict;
use warnings;
use utf8;
use PICA::Data qw(pica_parser);
use PICA::Schema;
use Test::More;

my $schema = PICA::Schema->new({ fields => { '021A' => { unique => 1 } } });

sub validate(@) { ## no critic
my ($schema, $record, $errors, %options) = @_;

use Data::Dumper; print Dumper([ $schema->check($record) ]);

my ($message) = map { $_->{message} } @$errors;
is_deeply $errors, [ $schema->check($record, %options) ], $message;
}

my $record = [ ['021A', undef, a => 'title'] ];
validate $schema, $record, [];

push @$record, ['021A', undef, a => 'title'];
validate $schema, $record, [ { tag => '021A', unique => 1, message => 'field is not repeatable' } ];

$record->[1] = ['003@', undef, 0 => '12345'];
validate $schema, $record, [ { tag => '003@', message => 'unknown field' } ];
validate $schema, $record, [], ignore_unknown_fields => 1;

$schema->{fields}{'003@'} = { unique => 1, subfields => { } };
validate $schema, $record, [ {
tag => '003@',
subfields => {
0 => { message => 'unknown subfield' }
} } ];
validate $schema, $record, [], ignore_unknown_subfields => 1;

$schema->{fields}{'003@'} = { unique => 1, subfields => { 0 => { unique => 1 } } };
validate $schema, $record, [];

$record->[1] = ['003@', undef, 0 => '12345', 0 => '6789'];
validate $schema, $record, [ {
tag => '003@',
subfields => {
0 => { message => 'subfield is not repeatable', unique => 1 }
} } ];

# TODO: check fields in level 1 and level 2
# $record = pica_parser( 'Plain' => 't/files/bgb.example' )->next;

done_testing;

0 comments on commit 459f6f1

Please sign in to comment.