Skip to content
Permalink
master
Switch branches/tags
Go to file
 
 
Cannot retrieve contributors at this time
package EPrints::Plugin::Sword::Import::RJ_Broker;
use strict;
use XML::LibXML 1.63;
#use Data::Dumper;
use EPrints::Plugin::Sword::Import;
our @ISA = qw/ EPrints::Plugin::Sword::Import /;
our %SUPPORTED_MIME_TYPES = ( "application/zip" => 1, );
our %UNPACK_MIME_TYPES = ( "application/zip" => "Sword::Unpack::Sub_Zip", );
sub new {
my ( $class, %params ) = @_;
my $self = $class->SUPER::new(%params);
$self->{name}
= "SWORD Importer - Repository Junction Broker (Default EPrints)";
# Make it visible on the import menu and elsewhere
$self->{visible} = "all";
$self->{produce} = [ 'dataobj/eprint' ];
return $self;
} ## end sub new
### $opts{file} = $file;
### $opts{mime_type} = $headers->{content_type};
### $opts{dataset_id} = $target_collection;
### $opts{owner_id} = $owner->get_id;
### $opts{depositor_id} = $depositor->get_id if(defined $depositor);
### $opts{no_op} = is this a No-op?
### $opts{verbose} = is this verbosed?
sub input_file {
my ( $plugin, %opts ) = @_;
my $session = $plugin->{session};
my $dir = $opts{dir};
my $mime = $opts{mime_type};
my $file = $opts{file};
my $NO_OP = $opts{no_op};
my $log_dir = $session->config('archiveroot');
my $LOGDIR = "$log_dir/var";
my $logging = "|Broker_Sword_Import";
# barf unless we have been given a .zip file
unless ( defined $SUPPORTED_MIME_TYPES{$mime} ) {
$plugin->add_verbose("[ERROR] unknown MIME TYPE '$mime'.");
$plugin->set_status_code(415);
return undef;
}
my $unpacker = $UNPACK_MIME_TYPES{$mime};
my $tmp_dir;
# This is localised simply so we can have a bunch of variables to deduce the
# METS file I want (I know the file will be called mets.xml, and I know it
# will be at the top of the tree.
# It also barfs if we can't unzip the file given.
{
$tmp_dir = EPrints::TempDir->new( "swordXXXX", UNLINK => 1 );
if ( !defined $tmp_dir ) {
print STDERR
"\n[SWORD-DEPOSIT] [INTERNAL-ERROR] Failed to create the temp directory!";
$plugin->add_verbose("[ERROR] failed to create the temp directory.");
$plugin->set_status_code(500);
return undef;
} ## end if ( !defined $tmp_dir)
my $files = $plugin->unpack_files( $unpacker, $file, $tmp_dir );
unless ( defined $files ) {
$plugin->add_verbose("[ERROR] failed to unpack the files");
return undef;
}
my $candidates = $plugin->get_files_to_import( $files, "text/xml" );
if ( scalar(@$candidates) == 0 ) {
$plugin->add_verbose("[ERROR] could not find the XML file");
$plugin->set_status_code(400);
return undef;
}
elsif ( scalar(@$candidates) > 1 ) {
# We have more than one possible xml file. Take 'mets.xml' if it exists,
# otherwise just take the one with the 'highest' name, alphabetic-wise
my @mets = grep /\bmets\.xml$/, @$candidates;
if ( scalar(@mets) == 1 ) {
$$candidates[0] = $mets[0];
$plugin->add_verbose(
"[WARNING] there were more than one XML file in this archive, however I'm using the one called 'mets.xml'"
);
}
elsif ( scalar(@mets) > 1 ) {
@$candidates = sort { length($a) cmp length($b) } @mets;
$plugin->add_verbose(
"[WARNING] there were more than one mets.XML file in this archive, however I'm using the one closest to the top of the 'tree'"
);
}
else {
@mets = sort @$candidates;
$$candidates[0] = $mets[0];
$plugin->add_verbose(
"[WARNING] there were more than one XML file in this archive. I am using the one with the 'highest' name, alphabetic-wise (closer to 'z' than 'A')"
);
} ## end else [ if ( scalar(@mets) == ...)]
} ## end elsif ( scalar(@$candidates...))
$file = $$candidates[0];
} ## end if ( defined $unpacker)
# OK - so $file now contains the path & filenamne of the mets.xml file we're importing
# $tmp_dir is the directory that the .zip file has been unpacked into.
my $dataset_id = $opts{dataset_id};
my $owner_id = $opts{owner_id};
my $depositor_id = $opts{depositor_id};
my $fh;
if ( !open( $fh, $file ) ) {
$plugin->add_verbose(
"[ERROR] couldnt open the file: '$file' because '$!'");
$plugin->set_status_code(500);
return;
} ## end if ( !open( $fh, $file...))
# next problem: sometimes we end up with an extra directory in the stack
# (as in $tmp_dir/content/stuff)
# we need to pull "content/" off the file names
my $unpack_dir;
my $fntmp;
$fntmp = $file;
if ( $fntmp =~ /^(.*)\/(.*)$/ ) {
$unpack_dir = $1;
$fntmp = $2;
}
# $tmp_dir is the temporary directory
# $file is the name of the mets.xml file in that directory
# $unpack_dir is whatever 'content/' actually turns out to be (if anything)
# $fntmp is the mets.xml file we're going to import (under $tmp_dir/$unpack_dir)
# Finally, we can read the xml from the file:
my $xml;
while ( my $d = <$fh> ) {
$xml .= $d;
}
close $fh;
# $dataset is the EPrints object for where we're putting the deposit
my $dataset = $session->get_archive()->get_dataset($dataset_id);
# again, barf if the dataset is not provided (though as this is through SWORD,
# this shouldn't happen
if ( !defined $dataset ) {
print STDERR
"\n[SWORD-METS] [INTERNAL-ERROR] Failed to open the dataset '$dataset_id'.";
$plugin->add_verbose(
"[INTERNAL ERROR] failed to open the dataset '$dataset_id'");
$plugin->set_status_code(500);
return;
} ## end if ( !defined $dataset)
# convert the xml text to a LibXML dom-object
# What EPrints doesn't do is validate, it just checks for well-formed-ness
# We have it in an eval block in case the xml is not well-formed, as the parser
# throws an exception if the document is invalid
my $dom_doc;
eval { $dom_doc = EPrints::XML::parse_xml_string($xml); };
# No dom? barf
if ( $@ || !defined $dom_doc ) {
$plugin->add_verbose("[ERROR] failed to parse the xml: '$@'");
$plugin->set_status_code(400);
return;
}
if ( !defined $dom_doc ) {
$plugin->{status_code} = 400;
$plugin->add_verbose("[ERROR] failed to parse the xml.");
return;
}
# If the top element is not mets, barf
# This is just in case someone sends us a non METS xml file
my $dom_top = $dom_doc->getDocumentElement;
if ( lc $dom_top->tagName ne 'mets' ) {
$plugin->set_status_code(400);
$plugin->add_verbose(
"[ERROR] failed to parse the xml: no <mets> tag found.");
return;
} ## end if ( lc $dom_top->tagName...)
#####
# Now we dig down to get the xmldata from the METS dmdSec section
#####
# METS Headers (ignored)
my $mets_hdr = ( $dom_top->getElementsByTagName("metsHdr") )[0];
# METS Descriptive Metadata (main section for us)
# need to loop on dmdSec:
my @dmd_sections = $dom_top->getElementsByTagName("dmdSec");
my $dmd_id;
my $md_wrap;
my $found_wrapper = 0;
# METS can have more than one dmdSec, and we can't assume that what's
# pushed in conforms to what we want, so we want to find the stuff
# formally idendtified as containing epdcx
# (I know it doesn't guarentee anything, but its a start!)
foreach my $dmd_sec (@dmd_sections) {
# need to extract xmlData from here
$md_wrap = ( $dmd_sec->getElementsByTagName("mdWrap") )[0];
next if ( !defined $md_wrap );
next
if (
!( lc $md_wrap->getAttribute("MDTYPE") eq 'other'
&& defined $md_wrap->getAttribute("OTHERMDTYPE")
&& lc $md_wrap->getAttribute("OTHERMDTYPE") eq 'epdcx'
)
);
$found_wrapper = 1;
$dmd_id = $dmd_sec->getAttribute("ID");
last;
} ## end foreach my $dmd_sec (@dmd_sections)
unless ($found_wrapper) {
$plugin->set_status_code(400);
$plugin->add_verbose(
"[ERROR] failed to parse the xml: could not find epdcx <mdWrap> section."
);
return;
} ## end unless ($found_wrapper)
#########
# Now we get the (hopefully epdcx) xml from the xmlData section
########
my $xml_data = ( $md_wrap->getElementsByTagName("xmlData") )[0];
if ( !defined $xml_data ) {
$plugin->set_status_code(400);
$plugin->add_verbose(
"[ERROR] failed to parse the xml: no <xmlData> tag found.");
return;
} ## end if ( !defined $xml_data)
########
# and parse it into a hash
########
my $epdata = $plugin->parse_epdcx_xml_data($xml_data);
return unless ( defined $epdata );
# Add some useful (SWORD) info
if ( defined $depositor_id ) {
$epdata->{userid} = $owner_id;
$epdata->{sword_depositor} = $depositor_id;
}
else {
$epdata->{userid} = $owner_id;
}
$epdata->{eprint_status} = $dataset_id;
# We now want to deal with the associated files (if any).
# This is not a straight "every file is a document" import: EPrints has this
# weird thing where a "document" can have multiple files. The Broker deals
# with this by having defining the structure in <structMap>, and putting
# each document into a <div>. If there are multiple files, the main file is
# the first file in the set
# The first thing to do is create a list of all the files & their IDs
# File Section which will contain optional info about files to import:
my %files;
my $file_counter = 0;
foreach my $file_sec ( $dom_top->getElementsByTagName("fileSec") ) {
my $file_grp = ( $file_sec->getElementsByTagName("fileGrp") )[0];
$file_sec = $file_grp
if ( defined $file_grp )
; # this is because the <fileGrp> tag is optional
foreach my $file_div ( $file_sec->getElementsByTagName("file") ) {
my $file_id = $file_div->getAttribute("ID");
my $file_loc = ( $file_div->getElementsByTagName("FLocat") )[0];
if ( defined $file_loc ) { # yeepee we have a file (maybe)
my $fn = $file_loc->getAttribute("href");
unless ( defined $fn ) {
# to accommodate the gdome XML library:
$fn = $file_loc->getAttribute("xlink:href");
}
next unless ( defined $fn );
$files{$file_id} = $fn; # list files to get locally
} ## end if ( defined $file_loc)
} ## end foreach my $file_div ( $file_sec...)
} ## end foreach my $file_sec ( $dom_top...)
unless ( scalar keys %files ) {
$plugin->add_verbose(
"[WARNING] no <fileSec> tag found: no files will be imported.");
}
# we also need to hang onto details about where the importer unpacked stuff.
# I know it's nothing to do with files in the METS manifest, however this is
# probably the best place to keep them :)
$files{unpack_dir} = $unpack_dir;
$files{fntmp} = $fntmp;
# Oh - if the SWORD conenction has specified NO-OP, then don't actually
# go any further
if ($NO_OP) {
# need to send 200 Successful (the deposit handler will generate
# the XML response)
$plugin->add_verbose(
"[OK] Plugin - import successful (but in No-Op mode).");
$plugin->set_status_code(200);
return;
} ## end if ($NO_OP)
my $eprint = $dataset->create_object( $plugin->{session}, $epdata );
unless ( defined $eprint ) {
$plugin->set_status_code(500);
$plugin->add_verbose("[ERROR] failed to create the EPrint object.");
return;
}
$logging .= "|" . $eprint->get_id;
# Having created a base EPrint, and determined what files we have to add
# to it, and we're actually going to deposit it, we need to create a "map"
# of what files go where from the <structMap> element.
# $dmd_id contains the ID for the div we're looking for
#
# Oh, and a complication: the embargo information is kept in the
# amdSec, so we need to find any embargo dates.
# There will be two options: ClosedAccess and RestrictedAccess (RJB doesn't
# record OpenAccess dated)
# The system needs to map the resourceId of the epdcx:description to both the
# rightsAccess and the date
$xml_data = "";
my $amdSec = ( $dom_top->getElementsByTagName("amdSec") )[0];
$xml_data = ( $amdSec->getElementsByTagName("xmlData") )[0];
$files{'embargo_hash'} = $plugin->parse_rights_data($xml_data) if $xml_data;
my ( $embargo, $security );
foreach my $structMap ( $dom_top->getElementsByTagName("structMap") ) {
# we want the [first] div that has a dmdid that matches $dmd_id
my $div;
my @divs = $structMap->getElementsByTagName("div"); #childNodes;
foreach my $d (@divs) {
next unless $d->hasAttribute('DMDID');
my $v = $d->getAttribute('DMDID');
next unless $v eq $dmd_id;
$div = $d;
} ## end foreach my $d (@divs)
if ($div) {
my $doc_data_ref = {};
my @nodes = $div->getElementsByTagName("div"); #childNodes;
# As we need the embargo details later on, we have our own
# process_struct_elements routine that adds embargos and returns
# the embargo date and the security restriction.
my $document;
( $document, $embargo, $security )
= process_struct_elements( \@nodes, \%files, $doc_data_ref, $eprint,
$session );
if ( !defined $document ) {
$plugin->add_verbose("[WARNING] Failed to create Document object.");
}
} ## end if ($div)
} ## end foreach my $structMap ( $dom_top...)
if ( $plugin->keep_deposited_file() ) {
if (
$plugin->attach_deposited_file(
$eprint, $opts{file}, $opts{mime_type}, $embargo, $security
)
)
{
$plugin->add_verbose("[OK] attached deposited file.");
}
else {
$plugin->add_verbose("[WARNING] failed to attach the deposited file.");
}
} ## end if ( $plugin->keep_deposited_file...)
$plugin->add_verbose("[OK] EPrint object created.");
# Keep a track of the logging
my $log_string = scalar(CORE::localtime) . "$logging";
open( LOG, ">>$LOGDIR/transfer_log" )
or warn("could not open log file $LOGDIR/transfer_log");
print LOG "$log_string\n";
close(LOG);
# and let's put it in the error_log too since it's dead useful
$plugin->{session}->log("logging|$log_string");
return $eprint;
} ## end sub input_file
# This is where we actually track & build the stuff that
# adds the files. It also sets any embargo period.
# As we need the embargo details later on, it returns the
# embargo date and the security restriction.
sub process_struct_elements {
my ( $nodes_ref, $files_hashref, $doc_data_ref, $eprint, $session ) = @_;
my $hash_ref = {};
my $document;
my $unpack_dir = $files_hashref->{unpack_dir};
my $embargo_hash = $files_hashref->{embargo_hash};
my ( $embargo, $security );
$security = 'staffonly';
# Go through all the children
foreach my $element ( @{$nodes_ref} ) {
my $ID = $element->getAttribute("ID");
if ( exists $embargo_hash->{$ID} ) {
$embargo = $embargo_hash->{$ID}->{'date'};
}
my @fptrs = $element->getElementsByTagName("fptr"); #childNodes;
# check to see what element it is
if ( scalar @fptrs ) {
my $doc_data = {};
foreach my $fptr (@fptrs) {
my $fileid = $fptr->getAttribute("FILEID");
my $file = $files_hashref->{$fileid};
$file =~ m/^(\w+)/;
my $prot = $1 if $1;
$file =~ m/\/([^\/]+)$/; # find everything after the last slash
my $filename = $1 if $1;
# if the file is not a web reference, add 'file://' and the unpack path
# to the filename
for ($prot) {
/http/ && do { last; }; # $filename stays unchanged
{ # default option
$file =~ s#^#file://$unpack_dir/#;
last;
};
} ## end for ($prot)
$doc_data->{_parent} = $eprint
unless defined $doc_data->{_parent};
$doc_data->{eprintid} = $eprint->get_id
unless defined $doc_data->{eprintid};
$doc_data->{main} = $filename unless $doc_data->{main};
$doc_data->{format}
= $session->get_repository->call( 'guess_doc_type', $session,
$filename )
unless exists $doc_data->{format};
if ($embargo) {
$doc_data->{date_embargo} = $embargo;
$doc_data->{security} = $security;
}
my %file_data;
$file_data{filename} = $filename;
$file_data{url} = $file;
$doc_data->{files} = [] unless exists $doc_data->{files};
push @{ $doc_data->{files} }, \%file_data;
} ## end foreach my $fptr (@fptrs)
my $doc_dataset = $session->get_repository->get_dataset("document");
local $session->get_repository->{config}->{enable_web_imports} = 1;
local $session->get_repository->{config}->{enable_file_imports} = 1;
$document
= EPrints::DataObj::Document->create_from_data( $session, $doc_data,
$doc_dataset );
} ## end if ( scalar @fptrs )
} ## end foreach my $element ( @{$nodes_ref...})
return ( $document, $embargo, $security );
} ## end sub process_struct_elements
sub add_file {
my ( $eprint, $hash_ref, $unpack_dir ) = @_;
my $prot;
my $file = $hash_ref->{'file'};
$file =~ m/^(\w+)/;
$prot = $1 if $1;
$file =~ m/\/([^\/]+)$/; # find everything after the last slash
my $filename = $1 if $1;
return undef unless $filename;
$filename =~ s# #\ #g; # replace spaces with '\ '
# if the file is not a web reference, add 'file://' and the unpack path to the
# filename
for ($prot) {
/http/ && do {
# $filename stays unchanged
last;
};
{
$file =~ s#^#file://$unpack_dir/# if $file;
last;
};
} ## end for ($prot)
my $session = $hash_ref->{'session'};
# Setup: we return a doc_data hash.
# If one is passed in, we update & return that, else create a new one.
my $doc_data = {};
$doc_data->{_parent} = $eprint unless defined $doc_data->{_parent};
$doc_data->{eprintid} = $eprint->get_id
unless defined $doc_data->{eprintid};
$doc_data->{main} = $filename unless $doc_data->{main};
$doc_data->{format}
= $session->get_repository->call( 'guess_doc_type', $session,
$filename )
unless exists $doc_data->{format};
my %file_data;
$file_data{filename} = $filename;
$file_data{url} = "$file";
$doc_data->{files} = [ \%file_data ];
return $doc_data;
} ## end sub add_file
sub keep_deposited_file {
return 1;
}
sub parse_epdcx_xml_data {
my ( $plugin, $xml ) = @_;
my $set = ( $xml->getElementsByTagName("descriptionSet") )[0];
unless ( defined $set ) {
$plugin->set_status_code(400);
$plugin->add_verbose("ERROR: no <descriptionSet> tag found.");
return;
}
my $epdata = {};
my @creators_array = ();
foreach my $desc ( $set->getElementsByTagName("description") ) {
my $att = $desc->getAttribute("epdcx:resourceId");
for ($att) {
# title; abstract; identifer (publisherID);
# creator; affilitated institution (from authors)
# Note: we need to pass in the full $set so we can find
# creator info
m#^sword-mets-epdcx# && do {
$plugin->_parse_epdcx( $desc, $epdata, $set );
last;
};
# type; doi; date (yyyy-mm-dd); status; publisher; copyright_holder;
m#^sword-mets-expr# && do {
$plugin->_parse_expr( $desc, $epdata );
last;
};
# publication; issn; isbn; volume; issue; pages;
m#^sword-mets-manif# && do {
$plugin->_parse_manif( $desc, $epdata, $set );
last;
};
}
} ## end foreach my $desc ( $set->getElementsByTagName...)
return $epdata;
} ## end sub parse_epdcx_xml_data
# Parser for the "work" level
# item type; title; abstract; identifer (publisherID);
# creator; affilitated institution (from authors)
# Funder (a ';' seperated list); Grant codes (also a ';' seperated list)
# epdata is passed in, therefore the routine modifies the actual hash
sub _parse_epdcx {
my ( $plugin, $desc, $epdata, $set ) = @_;
foreach my $stat ( $desc->getElementsByTagName("statement") ) {
my ( $field, $value ) = _parse_statement($stat);
for ($field) {
# Note - for creators, we have two resources of information
# 1) There is the name string here in the 'work' section
# 2) The broker also stores an extended record in a seperate
# description with a known resourceId... a resourceId that
# matches the $attr value returned here.
m#creator# && do {
my ( $name, $attr ) = split /\t/, $value;
my ( $family, $given, $email, $inst );
# can we get the information from Person description?
if ($attr) {
foreach my $d ( $set->getElementsByTagName("description") ) {
next unless $d->getAttribute('epdcx:resourceId') eq $attr;
foreach my $stat2 ( $d->getElementsByTagName("statement") ) {
my ( $field2, $value2 ) = _parse_statement($stat2);
for ($field2) {
m#givenname# && do { $given = $value2; last; };
m#familyname# && do { $family = $value2; last; };
m#email# && do { $email = $value2; last; };
}
}
}
}
# if not, deduce it from the text here
unless ($family) {
if ( $name =~ /(\w+)\,\s?(.*)$/ ) {
$family = $1, $given = $2;
}
else {
$family = $value;
}
}
# now add the information to the eprint
# NOTE: eprints uses the index into arrays to syncronise
# names, email addresses and other fields, so we have to
# add something, even if its NULL
push @{ $epdata->{creators_name} },
{ family => $family, given => $given };
push @{ $epdata->{creators_id} }, $email;
last;
};
m#title# && do {
$epdata->{'title'} = $value;
last;
};
m#abstract# && do {
$epdata->{'abstract'} = $value;
last;
};
m#identifier# && do {
$epdata->{'id_number'} = $value;
last;
};
# In EPrints, there is a "Funders" field, but no grant.
# This is a repeating field, so we can push the items onto an array
# To note grant codes, we are bodging the string:
# <funder>: <grant>, <grant>, <grant>
# ... it makes it human readable, and computer-parseable
m#funders# && do {
if ($value) {
my ( $funder, $gcode );
$funder = $value;
$funder =~ s/^funder //;
my @grants = ();
foreach my $d ( $set->getElementsByTagName("description") ) {
next unless $d->getAttribute('epdcx:resourceId') eq $value;
foreach my $stat2 ( $d->getElementsByTagName("statement") ) {
my ( $field2, $value2 ) = _parse_statement($stat2);
if ( $field2 eq 'grant_code' ) {
push @grants, $value2;
}
}
if ( scalar @grants ) {
$funder .= ': ';
$funder .= join '; ', @grants;
}
}
push @{ $epdata->{'funders'} }, $funder;
}
last;
};
# This becomes a pseudo-list (as EPrints doesn't have this
# as a multiple field
m#affiliatedInstitution# && do {
my $i;
$i = $epdata->{'institution'} if exists $epdata->{'institution'};
if ($i) {
my @i = split /; /, $i;
push @i, $value;
$value = join '; ', @i;
}
$epdata->{'institution'} = $value;
last;
};
}
} ## end foreach my $stat ( $desc->getElementsByTagName...)
} ## end if($att =~ /^sword-mets-/)...
# Parser for the "expression" level
# genre, citation, doi, status, publisher, editor (provenance for
# opendepot), date published
# epdata is passed in, therefore the routine modifies the actual hash
sub _parse_expr {
my ( $plugin, $desc, $epdata ) = @_;
foreach my $stat ( $desc->getElementsByTagName("statement") ) {
my ( $field, $value ) = _parse_statement($stat);
for ($field) {
# This gets used once we have built the full record...
m#type# && do {
$epdata->{'type'} = $value;
last;
};
# date (published)
m#date# && do {
$epdata->{ispublished} = 'pub';
$epdata->{date} = $value;
$epdata->{date_type} = 'published';
last;
};
# This could be either a local identifier
# a local identifier code, or a URI into the broker
# We definitely don't want the latter.
# Of the first two, take a doi in preference to a simple identifier
m#identifier# && do {
last
if ( $value =~ /edina\.ac\.uk/ )
; # skip anything that refers to edina
my $i;
$i = $epdata->{identifier} if exists $epdata->{identifier};
last if ( $i =~ /doi\.org/ ); # skip if we already have a doi
$epdata->{identifier} = $value;
last;
};
# peer reviewed
m#refereed# && do {
$epdata->{'refereed'} = $value;
last;
};
# we need to come back to this: for a thesis, the broker
# understands that thesis are "institution; department"
# whereas everything else is just "publisher"
# stuff it in publisher for now, and sort it later
m#publisher# && do {
$epdata->{'publisher'} = $value;
last;
};
m#provenance# && do {
$epdata->{'provenance'} = $value; # copyright_holder?
last;
};
} ## end if ( defined $field )
} ## end foreach my $stat ( $desc->getElementsByTagName...)
} ## end if($att =~ /^sword-mets-/)...
# Parser for the "manifest" level
# epdata is passed in, therefore the routine modifies the actual hash
sub _parse_manif {
my ( $plugin, $desc, $epdata, $set ) = @_;
foreach my $stat ( $desc->getElementsByTagName("statement") ) {
my ( $field, $value ) = _parse_statement($stat);
for ($field) {
m#cite# && do {
my $cite = clean_text($value);
# now we need to decode the citation
# $publication [$vol[($num)]][, $pps|$pgs]
my ( $pub, $vol, $num, $pps, $pgs );
# Start by pulling the page range or number of pages off the end
if ( $cite =~ /\, (\d+(:?\-\d+))$/ ) {
$cite =~ s/\, (\d+(:?\-\d+))$//;
$num = $1 if $1;
if ( $num =~ /\-/ ) { $pps = $num }
else { $pgs = $num }
$num = undef;
} ## end if ( $cite =~ /\, (\d+(:?\-\d+))$/)
# Do we have volume number ?
if ( $cite =~ /\((.+)\)$/ ) {
$cite =~ s/\((.+)\)$//;
$num = $1 if $1;
}
# Get the volume number
if ( $cite =~ /\s+(\d+)$/ ) {
$cite =~ s/\s+(\d+)$//;
$vol = $1 if $1;
}
$pub = $cite;
$epdata->{publication} = $pub
if ( $pub && not exists $epdata->{publication} );
$epdata->{volume} = $vol if ( $vol && not exists $epdata->{volume} );
$epdata->{number} = $num if ( $num && not exists $epdata->{number} );
$epdata->{pagerange} = $pps
if ( $pps && not exists $epdata->{pagerange} );
# $epdata->{pages} = $pgs if $pgs;
last;
};
# issn & isbn
m#issn# && do {
$epdata->{'issn'} = $value;
last;
};
m#isbn# && do {
$epdata->{'isbn'} = $value;
last;
};
# publication,volume, issue, pages
# these trump anything from the citation
m#publication# && do {
$epdata->{'publication'} = $value;
last;
};
m#volume# && do {
$epdata->{'volume'} = $value;
last;
};
m#issue# && do {
$epdata->{'issue'} = $value;
last;
};
m#pagerange# && do {
$epdata->{'pagerange'} = $value;
last;
};
m#pages# && do {
# Pages has to be a numeric value!
$epdata->{'pages'} = $value + 0;
last;
};
# Available URLS (DOI, official_url, related urls)
m#available_url# && do {
# available-doi-1; available-official_url-1; available-related_url-4; ...
# can we get the information from Person description?
if ($value) {
foreach my $d ( $set->getElementsByTagName("description") ) {
next unless $d->getAttribute('epdcx:resourceId') eq $value;
my $uri;
$uri = $d->getAttribute('epdcx:resourceUrl');
if ($value =~ /official/) {
$epdata->{'official_url'} = $uri;
} elsif ($value =~ /doi/) {
$epdata->{'id_number'} = $uri;
} else {
push @{$epdata->{'related_url'}}, {url => $uri, type=>'pub'} if $uri
}
}
};
last;
};
} ## end if ( defined $field )
} ## end foreach my $stat ( $desc->getElementsByTagName...)
} ## end if($att =~ /^sword-mets-/)...
# Parser for the unknown level
# This is where we try to find the author/creator information
# epdata is passed in, therefore the routine modifies the actual hash
sub _parse_unknown {
my ( $plugin, $desc, $epdata ) = @_;
$epdata->{unknown} = {} unless exists $epdata->{unknown};
my $resourceId;
$resourceId = $desc->getAttribute("resourceId");
# we only parse if we have a resourceId: creators have them
if ($resourceId) {
$epdata->{unknown}->{$resourceId} = {};
foreach my $stat ( $desc->getElementsByTagName("statement") ) {
my ( $field, $value ) = _parse_statement($stat);
for ($field) {
m#givenname# && do {
$epdata->{'pages'} = $value;
last;
};
#familyname
#institution
#email
}
} ## end foreach my $stat ( $desc->getElementsByTagName...)
}
} ## end if($att =~ /^sword-mets-/)...
sub _parse_statement {
my ($stat) = @_;
my $property = $stat->getAttribute("propertyURI");
unless ( defined $property ) {
$property = $stat->getAttribute("epdcx:propertyURI");
}
for ($property) {
m#http://purl.org/dc/elements/1.1/type# && do {
return _get_type($stat);
last;
}; ## end if ($property eq 'http://purl.org/dc/elements/1.1/type'...)
m#http://purl.org/dc/elements/1.1/title# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
my $title
= EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( "title", clean_text($title) );
last;
}; ## end elsif ($property eq 'http://purl.org/dc/elements/1.1/title'...)
m#http://purl.org/dc/terms/abstract# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
my $abstract
= EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( "abstract", clean_text($abstract) );
last;
}; ## end elsif ($property eq 'http://purl.org/dc/terms/abstract'...)
m#http://purl.org/dc/elements/1.1/creator# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value .= "\t"
. $stat->getAttribute("epdcx:valueRef")
; # add the value of the attribute to the creators list
return ( "creator", $value );
last;
}; ## end elsif ($property eq 'http://purl.org/dc/elements/1.1/creator'...)
m#http://purl.org/dc/elements/1.1/identifier# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
my $id = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( "id_number", clean_text($id) );
last;
}; ## end elsif ($property eq 'http://purl.org/dc/elements/1.1/identifier'...)
m#http://purl.org/eprint/terms/status# && do {
if ( $stat->getAttribute("valueURI") =~ /status\/(.*)$/ ) {
my $status = $1;
if ( $status eq 'PeerReviewed' ) {
$status = 'TRUE';
}
elsif ( $status eq 'NonPeerReviewed' ) {
$status = 'FALSE';
}
else {
return;
}
return ( 'refereed', $status ); # is this the proper field?
} ## end if ( $stat->getAttribute...)
return;
last;
}; ## end elsif ($property eq 'http://purl.org/eprint/terms/status'...)
m#http://purl.org/dc/elements/1.1/language# && do {
# LANGUAGE (not parsed)
last;
};
m#http://purl.org/dc/terms/available# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
my $id = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( "date", clean_text($id) );
last;
};
m#http://purl.org/eprint/terms/copyrightHolder# && do {
# COPYRIGHT HOLDER (not parsed)
last;
};
m#http://purl.org/dc/terms/bibliographicCitation# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
my $cite = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'cite', $cite );
last;
};
m#http://purl.org/dc/elements/1.1/publisher# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'publisher', $value );
last;
};
# This lot are OA-RJ extensions to cover blantently missing fields
m#http://opendepot.org/broker/elements/1.0/publication# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'publication', $value );
last;
};
m#http://opendepot.org/broker/elements/1.0/issn# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'issn', $value );
last;
};
m#http://opendepot.org/broker/elements/1.0/isbn# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'isbn', $value );
last;
};
m#http://opendepot.org/broker/elements/1.0/volume# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'volume', $value );
last;
};
m#http://opendepot.org/broker/elements/1.0/issue# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'issue', $value );
last;
};
m#http://opendepot.org/broker/elements/1.0/pages# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'pagerange', $value ) if $value =~ /\-/;
return ( 'pages', $value );
last;
};
## new record metadata
m#http://purl.org/dc/elements/1.1/givenname# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'givenname', $value );
last;
};
m#http://purl.org/dc/elements/1.1/familyname# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'familyname', $value );
last;
};
m#http://xmlns.com/foaf/0.1/mbox# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'email', $value );
last;
};
m#http://purl.org/eprint/terms/affiliatedInstitution# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'institution', $value );
last;
};
m#http://www.loc.gov/loc.terms/relators/EDT# && do {
my $value = ( $stat->getElementsByTagName("valueString") )[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
return ( 'provenance', $value ); # editor?
last;
};
m#http://www.loc.gov/loc.terms/relators/FND# && do {
my $value = $stat->getAttribute("epdcx:valueRef")
; # add the value of the attribute to the creators list
return ( "funders", $value );
last;
};
m#http://purl.org/eprint/terms/grantNumber# && do {
my $value;
if ($stat->hasAttribute("epdcx:valueRef")) {
$value = $stat->getAttribute("epdcx:valueRef");
} else {
my @vs = $stat->getElementsByTagName("valueString");
if (scalar @vs) {
$value = $vs[0];
$value = EPrints::XML::to_string( EPrints::XML::contents_of($value) );
}
}
return ( "grant_code", $value );
last;
};
## end new data
## Alternate sources
m#http://purl.org/eprint/terms/isAvailableAs# && do {
my $value = $stat->getAttribute("epdcx:valueRef") ;
return ( "available_url", $value );
last;
};
## Data for embargoes
m#http://purl.org/dc/terms/accessRights# && do {
my $value = $stat->getAttribute("epdcx:valueRef")
; # add the value of the attribute to the creators list
$value =~ /([^\/]+)$/;
$value = $1 if $1;
return ( "access", $value );
last;
};
} ## end for ($property)
return;
} ## end sub _parse_statement
sub _get_type {
my ($stat) = @_;
if ( $stat->getAttribute("valueURI") =~ /type\/(.*)$/ ) { # then $1 = type
# reference for these mappings is:
# "http://www.ukoln.ac.uk/repositories/digirep/index/Eprints_Type_Vocabulary_Encoding_Scheme"
my $type = $1; # no need to clean_text( $1 ) here
if ( $type eq 'JournalArticle'
|| $type eq 'JournalItem'
|| $type eq 'SubmittedJournalArticle'
|| $type eq 'WorkingPaper' )
{
$type = 'article';
} ## end if ( $type eq 'JournalArticle'...)
elsif ( $type eq 'Book' ) {
$type = 'book';
}
elsif ( $type eq 'BookItem' ) {
$type = 'book_section';
}
elsif ( $type eq 'ConferenceItem'
|| $type eq 'ConferencePoster'
|| $type eq 'ConferencePaper' )
{
$type = 'conference_item';
} ## end elsif ( $type eq 'ConferenceItem'...)
elsif ( $type eq 'Patent' ) {
$type = 'patent';
}
elsif ( $type eq 'Report' ) {
$type = 'monograph'; # I think?
}
elsif ( $type eq 'Thesis' ) {
$type = 'thesis';
}
else {
return
; # problem there! But the user can still correct this piece of data...
}
return ( "type", $type );
} ## end if ( $stat->getAttribute...)
return;
} ## end sub _get_type
sub clean_text {
my ($text) = @_;
my @lines = split( "\n", $text );
foreach (@lines) {
$_ =~ s/\s+$//;
$_ =~ s/^\s+//;
}
return join( " ", @lines );
} ## end sub clean_text
################
#
# We need to replace the default routine so we can apply
# an embargo to the deposited file.
sub attach_deposited_file {
my ( $self, $eprint, $file, $mime, $embargo, $security ) = @_;
my $fn = $file;
if ( $file =~ /^.*\/(.*)$/ ) {
$fn = $1;
}
my %doc_data;
$doc_data{eprintid} = $eprint->get_id;
$doc_data{format} = $mime;
$doc_data{formatdesc}
= $self->{session}->phrase("Sword/Deposit:document_formatdesc");
$doc_data{main} = $fn;
if ( $embargo && $security ) {
$doc_data{date_embargo} = $embargo;
$doc_data{security} = $security;
}
local $self->{session}->get_repository->{config}->{enable_file_imports} = 1;
my %file_data;
$file_data{filename} = $fn;
$file_data{url} = "file://$file";
$doc_data{files} = [ \%file_data ];
$doc_data{_parent} = $eprint;
my $doc_dataset = $self->{session}->get_repository->get_dataset("document");
my $document
= EPrints::DataObj::Document->create_from_data( $self->{session},
\%doc_data, $doc_dataset );
return 0 unless ( defined $document );
$document->make_thumbnails;
$eprint->generate_static;
$self->set_deposited_file_docid( $document->get_id );
return 1;
}
sub parse_rights_data {
my ( $plugin, $xml ) = @_;
my $set = ( $xml->getElementsByTagName("descriptionSet") )[0];
unless ( defined $set ) {
$plugin->set_status_code(400);
$plugin->add_verbose("ERROR: no <descriptionSet> tag found.");
return;
}
my $data = {};
foreach my $desc ( $set->getElementsByTagName("description") ) {
# This is the ID that ties the description to the structMap record
my $att = $desc->getAttribute("epdcx:resourceId");
foreach my $stat ( $desc->getElementsByTagName("statement") ) {
my ( $field, $value ) = _parse_statement($stat);
$data->{$att} = {} unless exists $data->{$att};
$data->{$att}->{$field} = $value;
}
}
return $data;
}
1;