Permalink
Cannot retrieve contributors at this time
1322 lines (1068 sloc)
40.2 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package EPrints::Plugin::Sword::Import::RJ_Broker; | |
| use strict; | |
| use XML::LibXML 1.63; | |
| #use Data::Dumper; | |
| use EPrints::Plugin::Sword::Import; | |
| our @ISA = qw/ EPrints::Plugin::Sword::Import /; | |
| our %SUPPORTED_MIME_TYPES = ( "application/zip" => 1, ); | |
| our %UNPACK_MIME_TYPES = ( "application/zip" => "Sword::Unpack::Sub_Zip", ); | |
| sub new { | |
| my ( $class, %params ) = @_; | |
| my $self = $class->SUPER::new(%params); | |
| $self->{name} | |
| = "SWORD Importer - Repository Junction Broker (Default EPrints)"; | |
| # Make it visible on the import menu and elsewhere | |
| $self->{visible} = "all"; | |
| $self->{produce} = [ 'dataobj/eprint' ]; | |
| return $self; | |
| } ## end sub new | |
| ### $opts{file} = $file; | |
| ### $opts{mime_type} = $headers->{content_type}; | |
| ### $opts{dataset_id} = $target_collection; | |
| ### $opts{owner_id} = $owner->get_id; | |
| ### $opts{depositor_id} = $depositor->get_id if(defined $depositor); | |
| ### $opts{no_op} = is this a No-op? | |
| ### $opts{verbose} = is this verbosed? | |
| sub input_file { | |
| my ( $plugin, %opts ) = @_; | |
| my $session = $plugin->{session}; | |
| my $dir = $opts{dir}; | |
| my $mime = $opts{mime_type}; | |
| my $file = $opts{file}; | |
| my $NO_OP = $opts{no_op}; | |
| my $log_dir = $session->config('archiveroot'); | |
| my $LOGDIR = "$log_dir/var"; | |
| my $logging = "|Broker_Sword_Import"; | |
| # barf unless we have been given a .zip file | |
| unless ( defined $SUPPORTED_MIME_TYPES{$mime} ) { | |
| $plugin->add_verbose("[ERROR] unknown MIME TYPE '$mime'."); | |
| $plugin->set_status_code(415); | |
| return undef; | |
| } | |
| my $unpacker = $UNPACK_MIME_TYPES{$mime}; | |
| my $tmp_dir; | |
| # This is localised simply so we can have a bunch of variables to deduce the | |
| # METS file I want (I know the file will be called mets.xml, and I know it | |
| # will be at the top of the tree. | |
| # It also barfs if we can't unzip the file given. | |
| { | |
| $tmp_dir = EPrints::TempDir->new( "swordXXXX", UNLINK => 1 ); | |
| if ( !defined $tmp_dir ) { | |
| print STDERR | |
| "\n[SWORD-DEPOSIT] [INTERNAL-ERROR] Failed to create the temp directory!"; | |
| $plugin->add_verbose("[ERROR] failed to create the temp directory."); | |
| $plugin->set_status_code(500); | |
| return undef; | |
| } ## end if ( !defined $tmp_dir) | |
| my $files = $plugin->unpack_files( $unpacker, $file, $tmp_dir ); | |
| unless ( defined $files ) { | |
| $plugin->add_verbose("[ERROR] failed to unpack the files"); | |
| return undef; | |
| } | |
| my $candidates = $plugin->get_files_to_import( $files, "text/xml" ); | |
| if ( scalar(@$candidates) == 0 ) { | |
| $plugin->add_verbose("[ERROR] could not find the XML file"); | |
| $plugin->set_status_code(400); | |
| return undef; | |
| } | |
| elsif ( scalar(@$candidates) > 1 ) { | |
| # We have more than one possible xml file. Take 'mets.xml' if it exists, | |
| # otherwise just take the one with the 'highest' name, alphabetic-wise | |
| my @mets = grep /\bmets\.xml$/, @$candidates; | |
| if ( scalar(@mets) == 1 ) { | |
| $$candidates[0] = $mets[0]; | |
| $plugin->add_verbose( | |
| "[WARNING] there were more than one XML file in this archive, however I'm using the one called 'mets.xml'" | |
| ); | |
| } | |
| elsif ( scalar(@mets) > 1 ) { | |
| @$candidates = sort { length($a) cmp length($b) } @mets; | |
| $plugin->add_verbose( | |
| "[WARNING] there were more than one mets.XML file in this archive, however I'm using the one closest to the top of the 'tree'" | |
| ); | |
| } | |
| else { | |
| @mets = sort @$candidates; | |
| $$candidates[0] = $mets[0]; | |
| $plugin->add_verbose( | |
| "[WARNING] there were more than one XML file in this archive. I am using the one with the 'highest' name, alphabetic-wise (closer to 'z' than 'A')" | |
| ); | |
| } ## end else [ if ( scalar(@mets) == ...)] | |
| } ## end elsif ( scalar(@$candidates...)) | |
| $file = $$candidates[0]; | |
| } ## end if ( defined $unpacker) | |
| # OK - so $file now contains the path & filenamne of the mets.xml file we're importing | |
| # $tmp_dir is the directory that the .zip file has been unpacked into. | |
| my $dataset_id = $opts{dataset_id}; | |
| my $owner_id = $opts{owner_id}; | |
| my $depositor_id = $opts{depositor_id}; | |
| my $fh; | |
| if ( !open( $fh, $file ) ) { | |
| $plugin->add_verbose( | |
| "[ERROR] couldnt open the file: '$file' because '$!'"); | |
| $plugin->set_status_code(500); | |
| return; | |
| } ## end if ( !open( $fh, $file...)) | |
| # next problem: sometimes we end up with an extra directory in the stack | |
| # (as in $tmp_dir/content/stuff) | |
| # we need to pull "content/" off the file names | |
| my $unpack_dir; | |
| my $fntmp; | |
| $fntmp = $file; | |
| if ( $fntmp =~ /^(.*)\/(.*)$/ ) { | |
| $unpack_dir = $1; | |
| $fntmp = $2; | |
| } | |
| # $tmp_dir is the temporary directory | |
| # $file is the name of the mets.xml file in that directory | |
| # $unpack_dir is whatever 'content/' actually turns out to be (if anything) | |
| # $fntmp is the mets.xml file we're going to import (under $tmp_dir/$unpack_dir) | |
| # Finally, we can read the xml from the file: | |
| my $xml; | |
| while ( my $d = <$fh> ) { | |
| $xml .= $d; | |
| } | |
| close $fh; | |
| # $dataset is the EPrints object for where we're putting the deposit | |
| my $dataset = $session->get_archive()->get_dataset($dataset_id); | |
| # again, barf if the dataset is not provided (though as this is through SWORD, | |
| # this shouldn't happen | |
| if ( !defined $dataset ) { | |
| print STDERR | |
| "\n[SWORD-METS] [INTERNAL-ERROR] Failed to open the dataset '$dataset_id'."; | |
| $plugin->add_verbose( | |
| "[INTERNAL ERROR] failed to open the dataset '$dataset_id'"); | |
| $plugin->set_status_code(500); | |
| return; | |
| } ## end if ( !defined $dataset) | |
| # convert the xml text to a LibXML dom-object | |
| # What EPrints doesn't do is validate, it just checks for well-formed-ness | |
| # We have it in an eval block in case the xml is not well-formed, as the parser | |
| # throws an exception if the document is invalid | |
| my $dom_doc; | |
| eval { $dom_doc = EPrints::XML::parse_xml_string($xml); }; | |
| # No dom? barf | |
| if ( $@ || !defined $dom_doc ) { | |
| $plugin->add_verbose("[ERROR] failed to parse the xml: '$@'"); | |
| $plugin->set_status_code(400); | |
| return; | |
| } | |
| if ( !defined $dom_doc ) { | |
| $plugin->{status_code} = 400; | |
| $plugin->add_verbose("[ERROR] failed to parse the xml."); | |
| return; | |
| } | |
| # If the top element is not mets, barf | |
| # This is just in case someone sends us a non METS xml file | |
| my $dom_top = $dom_doc->getDocumentElement; | |
| if ( lc $dom_top->tagName ne 'mets' ) { | |
| $plugin->set_status_code(400); | |
| $plugin->add_verbose( | |
| "[ERROR] failed to parse the xml: no <mets> tag found."); | |
| return; | |
| } ## end if ( lc $dom_top->tagName...) | |
| ##### | |
| # Now we dig down to get the xmldata from the METS dmdSec section | |
| ##### | |
| # METS Headers (ignored) | |
| my $mets_hdr = ( $dom_top->getElementsByTagName("metsHdr") )[0]; | |
| # METS Descriptive Metadata (main section for us) | |
| # need to loop on dmdSec: | |
| my @dmd_sections = $dom_top->getElementsByTagName("dmdSec"); | |
| my $dmd_id; | |
| my $md_wrap; | |
| my $found_wrapper = 0; | |
| # METS can have more than one dmdSec, and we can't assume that what's | |
| # pushed in conforms to what we want, so we want to find the stuff | |
| # formally idendtified as containing epdcx | |
| # (I know it doesn't guarentee anything, but its a start!) | |
| foreach my $dmd_sec (@dmd_sections) { | |
| # need to extract xmlData from here | |
| $md_wrap = ( $dmd_sec->getElementsByTagName("mdWrap") )[0]; | |
| next if ( !defined $md_wrap ); | |
| next | |
| if ( | |
| !( lc $md_wrap->getAttribute("MDTYPE") eq 'other' | |
| && defined $md_wrap->getAttribute("OTHERMDTYPE") | |
| && lc $md_wrap->getAttribute("OTHERMDTYPE") eq 'epdcx' | |
| ) | |
| ); | |
| $found_wrapper = 1; | |
| $dmd_id = $dmd_sec->getAttribute("ID"); | |
| last; | |
| } ## end foreach my $dmd_sec (@dmd_sections) | |
| unless ($found_wrapper) { | |
| $plugin->set_status_code(400); | |
| $plugin->add_verbose( | |
| "[ERROR] failed to parse the xml: could not find epdcx <mdWrap> section." | |
| ); | |
| return; | |
| } ## end unless ($found_wrapper) | |
| ######### | |
| # Now we get the (hopefully epdcx) xml from the xmlData section | |
| ######## | |
| my $xml_data = ( $md_wrap->getElementsByTagName("xmlData") )[0]; | |
| if ( !defined $xml_data ) { | |
| $plugin->set_status_code(400); | |
| $plugin->add_verbose( | |
| "[ERROR] failed to parse the xml: no <xmlData> tag found."); | |
| return; | |
| } ## end if ( !defined $xml_data) | |
| ######## | |
| # and parse it into a hash | |
| ######## | |
| my $epdata = $plugin->parse_epdcx_xml_data($xml_data); | |
| return unless ( defined $epdata ); | |
| # Add some useful (SWORD) info | |
| if ( defined $depositor_id ) { | |
| $epdata->{userid} = $owner_id; | |
| $epdata->{sword_depositor} = $depositor_id; | |
| } | |
| else { | |
| $epdata->{userid} = $owner_id; | |
| } | |
| $epdata->{eprint_status} = $dataset_id; | |
| # We now want to deal with the associated files (if any). | |
| # This is not a straight "every file is a document" import: EPrints has this | |
| # weird thing where a "document" can have multiple files. The Broker deals | |
| # with this by having defining the structure in <structMap>, and putting | |
| # each document into a <div>. If there are multiple files, the main file is | |
| # the first file in the set | |
| # The first thing to do is create a list of all the files & their IDs | |
| # File Section which will contain optional info about files to import: | |
| my %files; | |
| my $file_counter = 0; | |
| foreach my $file_sec ( $dom_top->getElementsByTagName("fileSec") ) { | |
| my $file_grp = ( $file_sec->getElementsByTagName("fileGrp") )[0]; | |
| $file_sec = $file_grp | |
| if ( defined $file_grp ) | |
| ; # this is because the <fileGrp> tag is optional | |
| foreach my $file_div ( $file_sec->getElementsByTagName("file") ) { | |
| my $file_id = $file_div->getAttribute("ID"); | |
| my $file_loc = ( $file_div->getElementsByTagName("FLocat") )[0]; | |
| if ( defined $file_loc ) { # yeepee we have a file (maybe) | |
| my $fn = $file_loc->getAttribute("href"); | |
| unless ( defined $fn ) { | |
| # to accommodate the gdome XML library: | |
| $fn = $file_loc->getAttribute("xlink:href"); | |
| } | |
| next unless ( defined $fn ); | |
| $files{$file_id} = $fn; # list files to get locally | |
| } ## end if ( defined $file_loc) | |
| } ## end foreach my $file_div ( $file_sec...) | |
| } ## end foreach my $file_sec ( $dom_top...) | |
| unless ( scalar keys %files ) { | |
| $plugin->add_verbose( | |
| "[WARNING] no <fileSec> tag found: no files will be imported."); | |
| } | |
| # we also need to hang onto details about where the importer unpacked stuff. | |
| # I know it's nothing to do with files in the METS manifest, however this is | |
| # probably the best place to keep them :) | |
| $files{unpack_dir} = $unpack_dir; | |
| $files{fntmp} = $fntmp; | |
| # Oh - if the SWORD conenction has specified NO-OP, then don't actually | |
| # go any further | |
| if ($NO_OP) { | |
| # need to send 200 Successful (the deposit handler will generate | |
| # the XML response) | |
| $plugin->add_verbose( | |
| "[OK] Plugin - import successful (but in No-Op mode)."); | |
| $plugin->set_status_code(200); | |
| return; | |
| } ## end if ($NO_OP) | |
| my $eprint = $dataset->create_object( $plugin->{session}, $epdata ); | |
| unless ( defined $eprint ) { | |
| $plugin->set_status_code(500); | |
| $plugin->add_verbose("[ERROR] failed to create the EPrint object."); | |
| return; | |
| } | |
| $logging .= "|" . $eprint->get_id; | |
| # Having created a base EPrint, and determined what files we have to add | |
| # to it, and we're actually going to deposit it, we need to create a "map" | |
| # of what files go where from the <structMap> element. | |
| # $dmd_id contains the ID for the div we're looking for | |
| # | |
| # Oh, and a complication: the embargo information is kept in the | |
| # amdSec, so we need to find any embargo dates. | |
| # There will be two options: ClosedAccess and RestrictedAccess (RJB doesn't | |
| # record OpenAccess dated) | |
| # The system needs to map the resourceId of the epdcx:description to both the | |
| # rightsAccess and the date | |
| $xml_data = ""; | |
| my $amdSec = ( $dom_top->getElementsByTagName("amdSec") )[0]; | |
| $xml_data = ( $amdSec->getElementsByTagName("xmlData") )[0]; | |
| $files{'embargo_hash'} = $plugin->parse_rights_data($xml_data) if $xml_data; | |
| my ( $embargo, $security ); | |
| foreach my $structMap ( $dom_top->getElementsByTagName("structMap") ) { | |
| # we want the [first] div that has a dmdid that matches $dmd_id | |
| my $div; | |
| my @divs = $structMap->getElementsByTagName("div"); #childNodes; | |
| foreach my $d (@divs) { | |
| next unless $d->hasAttribute('DMDID'); | |
| my $v = $d->getAttribute('DMDID'); | |
| next unless $v eq $dmd_id; | |
| $div = $d; | |
| } ## end foreach my $d (@divs) | |
| if ($div) { | |
| my $doc_data_ref = {}; | |
| my @nodes = $div->getElementsByTagName("div"); #childNodes; | |
| # As we need the embargo details later on, we have our own | |
| # process_struct_elements routine that adds embargos and returns | |
| # the embargo date and the security restriction. | |
| my $document; | |
| ( $document, $embargo, $security ) | |
| = process_struct_elements( \@nodes, \%files, $doc_data_ref, $eprint, | |
| $session ); | |
| if ( !defined $document ) { | |
| $plugin->add_verbose("[WARNING] Failed to create Document object."); | |
| } | |
| } ## end if ($div) | |
| } ## end foreach my $structMap ( $dom_top...) | |
| if ( $plugin->keep_deposited_file() ) { | |
| if ( | |
| $plugin->attach_deposited_file( | |
| $eprint, $opts{file}, $opts{mime_type}, $embargo, $security | |
| ) | |
| ) | |
| { | |
| $plugin->add_verbose("[OK] attached deposited file."); | |
| } | |
| else { | |
| $plugin->add_verbose("[WARNING] failed to attach the deposited file."); | |
| } | |
| } ## end if ( $plugin->keep_deposited_file...) | |
| $plugin->add_verbose("[OK] EPrint object created."); | |
| # Keep a track of the logging | |
| my $log_string = scalar(CORE::localtime) . "$logging"; | |
| open( LOG, ">>$LOGDIR/transfer_log" ) | |
| or warn("could not open log file $LOGDIR/transfer_log"); | |
| print LOG "$log_string\n"; | |
| close(LOG); | |
| # and let's put it in the error_log too since it's dead useful | |
| $plugin->{session}->log("logging|$log_string"); | |
| return $eprint; | |
| } ## end sub input_file | |
| # This is where we actually track & build the stuff that | |
| # adds the files. It also sets any embargo period. | |
| # As we need the embargo details later on, it returns the | |
| # embargo date and the security restriction. | |
| sub process_struct_elements { | |
| my ( $nodes_ref, $files_hashref, $doc_data_ref, $eprint, $session ) = @_; | |
| my $hash_ref = {}; | |
| my $document; | |
| my $unpack_dir = $files_hashref->{unpack_dir}; | |
| my $embargo_hash = $files_hashref->{embargo_hash}; | |
| my ( $embargo, $security ); | |
| $security = 'staffonly'; | |
| # Go through all the children | |
| foreach my $element ( @{$nodes_ref} ) { | |
| my $ID = $element->getAttribute("ID"); | |
| if ( exists $embargo_hash->{$ID} ) { | |
| $embargo = $embargo_hash->{$ID}->{'date'}; | |
| } | |
| my @fptrs = $element->getElementsByTagName("fptr"); #childNodes; | |
| # check to see what element it is | |
| if ( scalar @fptrs ) { | |
| my $doc_data = {}; | |
| foreach my $fptr (@fptrs) { | |
| my $fileid = $fptr->getAttribute("FILEID"); | |
| my $file = $files_hashref->{$fileid}; | |
| $file =~ m/^(\w+)/; | |
| my $prot = $1 if $1; | |
| $file =~ m/\/([^\/]+)$/; # find everything after the last slash | |
| my $filename = $1 if $1; | |
| # if the file is not a web reference, add 'file://' and the unpack path | |
| # to the filename | |
| for ($prot) { | |
| /http/ && do { last; }; # $filename stays unchanged | |
| { # default option | |
| $file =~ s#^#file://$unpack_dir/#; | |
| last; | |
| }; | |
| } ## end for ($prot) | |
| $doc_data->{_parent} = $eprint | |
| unless defined $doc_data->{_parent}; | |
| $doc_data->{eprintid} = $eprint->get_id | |
| unless defined $doc_data->{eprintid}; | |
| $doc_data->{main} = $filename unless $doc_data->{main}; | |
| $doc_data->{format} | |
| = $session->get_repository->call( 'guess_doc_type', $session, | |
| $filename ) | |
| unless exists $doc_data->{format}; | |
| if ($embargo) { | |
| $doc_data->{date_embargo} = $embargo; | |
| $doc_data->{security} = $security; | |
| } | |
| my %file_data; | |
| $file_data{filename} = $filename; | |
| $file_data{url} = $file; | |
| $doc_data->{files} = [] unless exists $doc_data->{files}; | |
| push @{ $doc_data->{files} }, \%file_data; | |
| } ## end foreach my $fptr (@fptrs) | |
| my $doc_dataset = $session->get_repository->get_dataset("document"); | |
| local $session->get_repository->{config}->{enable_web_imports} = 1; | |
| local $session->get_repository->{config}->{enable_file_imports} = 1; | |
| $document | |
| = EPrints::DataObj::Document->create_from_data( $session, $doc_data, | |
| $doc_dataset ); | |
| } ## end if ( scalar @fptrs ) | |
| } ## end foreach my $element ( @{$nodes_ref...}) | |
| return ( $document, $embargo, $security ); | |
| } ## end sub process_struct_elements | |
| sub add_file { | |
| my ( $eprint, $hash_ref, $unpack_dir ) = @_; | |
| my $prot; | |
| my $file = $hash_ref->{'file'}; | |
| $file =~ m/^(\w+)/; | |
| $prot = $1 if $1; | |
| $file =~ m/\/([^\/]+)$/; # find everything after the last slash | |
| my $filename = $1 if $1; | |
| return undef unless $filename; | |
| $filename =~ s# #\ #g; # replace spaces with '\ ' | |
| # if the file is not a web reference, add 'file://' and the unpack path to the | |
| # filename | |
| for ($prot) { | |
| /http/ && do { | |
| # $filename stays unchanged | |
| last; | |
| }; | |
| { | |
| $file =~ s#^#file://$unpack_dir/# if $file; | |
| last; | |
| }; | |
| } ## end for ($prot) | |
| my $session = $hash_ref->{'session'}; | |
| # Setup: we return a doc_data hash. | |
| # If one is passed in, we update & return that, else create a new one. | |
| my $doc_data = {}; | |
| $doc_data->{_parent} = $eprint unless defined $doc_data->{_parent}; | |
| $doc_data->{eprintid} = $eprint->get_id | |
| unless defined $doc_data->{eprintid}; | |
| $doc_data->{main} = $filename unless $doc_data->{main}; | |
| $doc_data->{format} | |
| = $session->get_repository->call( 'guess_doc_type', $session, | |
| $filename ) | |
| unless exists $doc_data->{format}; | |
| my %file_data; | |
| $file_data{filename} = $filename; | |
| $file_data{url} = "$file"; | |
| $doc_data->{files} = [ \%file_data ]; | |
| return $doc_data; | |
| } ## end sub add_file | |
| sub keep_deposited_file { | |
| return 1; | |
| } | |
| sub parse_epdcx_xml_data { | |
| my ( $plugin, $xml ) = @_; | |
| my $set = ( $xml->getElementsByTagName("descriptionSet") )[0]; | |
| unless ( defined $set ) { | |
| $plugin->set_status_code(400); | |
| $plugin->add_verbose("ERROR: no <descriptionSet> tag found."); | |
| return; | |
| } | |
| my $epdata = {}; | |
| my @creators_array = (); | |
| foreach my $desc ( $set->getElementsByTagName("description") ) { | |
| my $att = $desc->getAttribute("epdcx:resourceId"); | |
| for ($att) { | |
| # title; abstract; identifer (publisherID); | |
| # creator; affilitated institution (from authors) | |
| # Note: we need to pass in the full $set so we can find | |
| # creator info | |
| m#^sword-mets-epdcx# && do { | |
| $plugin->_parse_epdcx( $desc, $epdata, $set ); | |
| last; | |
| }; | |
| # type; doi; date (yyyy-mm-dd); status; publisher; copyright_holder; | |
| m#^sword-mets-expr# && do { | |
| $plugin->_parse_expr( $desc, $epdata ); | |
| last; | |
| }; | |
| # publication; issn; isbn; volume; issue; pages; | |
| m#^sword-mets-manif# && do { | |
| $plugin->_parse_manif( $desc, $epdata, $set ); | |
| last; | |
| }; | |
| } | |
| } ## end foreach my $desc ( $set->getElementsByTagName...) | |
| return $epdata; | |
| } ## end sub parse_epdcx_xml_data | |
| # Parser for the "work" level | |
| # item type; title; abstract; identifer (publisherID); | |
| # creator; affilitated institution (from authors) | |
| # Funder (a ';' seperated list); Grant codes (also a ';' seperated list) | |
| # epdata is passed in, therefore the routine modifies the actual hash | |
| sub _parse_epdcx { | |
| my ( $plugin, $desc, $epdata, $set ) = @_; | |
| foreach my $stat ( $desc->getElementsByTagName("statement") ) { | |
| my ( $field, $value ) = _parse_statement($stat); | |
| for ($field) { | |
| # Note - for creators, we have two resources of information | |
| # 1) There is the name string here in the 'work' section | |
| # 2) The broker also stores an extended record in a seperate | |
| # description with a known resourceId... a resourceId that | |
| # matches the $attr value returned here. | |
| m#creator# && do { | |
| my ( $name, $attr ) = split /\t/, $value; | |
| my ( $family, $given, $email, $inst ); | |
| # can we get the information from Person description? | |
| if ($attr) { | |
| foreach my $d ( $set->getElementsByTagName("description") ) { | |
| next unless $d->getAttribute('epdcx:resourceId') eq $attr; | |
| foreach my $stat2 ( $d->getElementsByTagName("statement") ) { | |
| my ( $field2, $value2 ) = _parse_statement($stat2); | |
| for ($field2) { | |
| m#givenname# && do { $given = $value2; last; }; | |
| m#familyname# && do { $family = $value2; last; }; | |
| m#email# && do { $email = $value2; last; }; | |
| } | |
| } | |
| } | |
| } | |
| # if not, deduce it from the text here | |
| unless ($family) { | |
| if ( $name =~ /(\w+)\,\s?(.*)$/ ) { | |
| $family = $1, $given = $2; | |
| } | |
| else { | |
| $family = $value; | |
| } | |
| } | |
| # now add the information to the eprint | |
| # NOTE: eprints uses the index into arrays to syncronise | |
| # names, email addresses and other fields, so we have to | |
| # add something, even if its NULL | |
| push @{ $epdata->{creators_name} }, | |
| { family => $family, given => $given }; | |
| push @{ $epdata->{creators_id} }, $email; | |
| last; | |
| }; | |
| m#title# && do { | |
| $epdata->{'title'} = $value; | |
| last; | |
| }; | |
| m#abstract# && do { | |
| $epdata->{'abstract'} = $value; | |
| last; | |
| }; | |
| m#identifier# && do { | |
| $epdata->{'id_number'} = $value; | |
| last; | |
| }; | |
| # In EPrints, there is a "Funders" field, but no grant. | |
| # This is a repeating field, so we can push the items onto an array | |
| # To note grant codes, we are bodging the string: | |
| # <funder>: <grant>, <grant>, <grant> | |
| # ... it makes it human readable, and computer-parseable | |
| m#funders# && do { | |
| if ($value) { | |
| my ( $funder, $gcode ); | |
| $funder = $value; | |
| $funder =~ s/^funder //; | |
| my @grants = (); | |
| foreach my $d ( $set->getElementsByTagName("description") ) { | |
| next unless $d->getAttribute('epdcx:resourceId') eq $value; | |
| foreach my $stat2 ( $d->getElementsByTagName("statement") ) { | |
| my ( $field2, $value2 ) = _parse_statement($stat2); | |
| if ( $field2 eq 'grant_code' ) { | |
| push @grants, $value2; | |
| } | |
| } | |
| if ( scalar @grants ) { | |
| $funder .= ': '; | |
| $funder .= join '; ', @grants; | |
| } | |
| } | |
| push @{ $epdata->{'funders'} }, $funder; | |
| } | |
| last; | |
| }; | |
| # This becomes a pseudo-list (as EPrints doesn't have this | |
| # as a multiple field | |
| m#affiliatedInstitution# && do { | |
| my $i; | |
| $i = $epdata->{'institution'} if exists $epdata->{'institution'}; | |
| if ($i) { | |
| my @i = split /; /, $i; | |
| push @i, $value; | |
| $value = join '; ', @i; | |
| } | |
| $epdata->{'institution'} = $value; | |
| last; | |
| }; | |
| } | |
| } ## end foreach my $stat ( $desc->getElementsByTagName...) | |
| } ## end if($att =~ /^sword-mets-/)... | |
| # Parser for the "expression" level | |
| # genre, citation, doi, status, publisher, editor (provenance for | |
| # opendepot), date published | |
| # epdata is passed in, therefore the routine modifies the actual hash | |
| sub _parse_expr { | |
| my ( $plugin, $desc, $epdata ) = @_; | |
| foreach my $stat ( $desc->getElementsByTagName("statement") ) { | |
| my ( $field, $value ) = _parse_statement($stat); | |
| for ($field) { | |
| # This gets used once we have built the full record... | |
| m#type# && do { | |
| $epdata->{'type'} = $value; | |
| last; | |
| }; | |
| # date (published) | |
| m#date# && do { | |
| $epdata->{ispublished} = 'pub'; | |
| $epdata->{date} = $value; | |
| $epdata->{date_type} = 'published'; | |
| last; | |
| }; | |
| # This could be either a local identifier | |
| # a local identifier code, or a URI into the broker | |
| # We definitely don't want the latter. | |
| # Of the first two, take a doi in preference to a simple identifier | |
| m#identifier# && do { | |
| last | |
| if ( $value =~ /edina\.ac\.uk/ ) | |
| ; # skip anything that refers to edina | |
| my $i; | |
| $i = $epdata->{identifier} if exists $epdata->{identifier}; | |
| last if ( $i =~ /doi\.org/ ); # skip if we already have a doi | |
| $epdata->{identifier} = $value; | |
| last; | |
| }; | |
| # peer reviewed | |
| m#refereed# && do { | |
| $epdata->{'refereed'} = $value; | |
| last; | |
| }; | |
| # we need to come back to this: for a thesis, the broker | |
| # understands that thesis are "institution; department" | |
| # whereas everything else is just "publisher" | |
| # stuff it in publisher for now, and sort it later | |
| m#publisher# && do { | |
| $epdata->{'publisher'} = $value; | |
| last; | |
| }; | |
| m#provenance# && do { | |
| $epdata->{'provenance'} = $value; # copyright_holder? | |
| last; | |
| }; | |
| } ## end if ( defined $field ) | |
| } ## end foreach my $stat ( $desc->getElementsByTagName...) | |
| } ## end if($att =~ /^sword-mets-/)... | |
| # Parser for the "manifest" level | |
| # epdata is passed in, therefore the routine modifies the actual hash | |
| sub _parse_manif { | |
| my ( $plugin, $desc, $epdata, $set ) = @_; | |
| foreach my $stat ( $desc->getElementsByTagName("statement") ) { | |
| my ( $field, $value ) = _parse_statement($stat); | |
| for ($field) { | |
| m#cite# && do { | |
| my $cite = clean_text($value); | |
| # now we need to decode the citation | |
| # $publication [$vol[($num)]][, $pps|$pgs] | |
| my ( $pub, $vol, $num, $pps, $pgs ); | |
| # Start by pulling the page range or number of pages off the end | |
| if ( $cite =~ /\, (\d+(:?\-\d+))$/ ) { | |
| $cite =~ s/\, (\d+(:?\-\d+))$//; | |
| $num = $1 if $1; | |
| if ( $num =~ /\-/ ) { $pps = $num } | |
| else { $pgs = $num } | |
| $num = undef; | |
| } ## end if ( $cite =~ /\, (\d+(:?\-\d+))$/) | |
| # Do we have volume number ? | |
| if ( $cite =~ /\((.+)\)$/ ) { | |
| $cite =~ s/\((.+)\)$//; | |
| $num = $1 if $1; | |
| } | |
| # Get the volume number | |
| if ( $cite =~ /\s+(\d+)$/ ) { | |
| $cite =~ s/\s+(\d+)$//; | |
| $vol = $1 if $1; | |
| } | |
| $pub = $cite; | |
| $epdata->{publication} = $pub | |
| if ( $pub && not exists $epdata->{publication} ); | |
| $epdata->{volume} = $vol if ( $vol && not exists $epdata->{volume} ); | |
| $epdata->{number} = $num if ( $num && not exists $epdata->{number} ); | |
| $epdata->{pagerange} = $pps | |
| if ( $pps && not exists $epdata->{pagerange} ); | |
| # $epdata->{pages} = $pgs if $pgs; | |
| last; | |
| }; | |
| # issn & isbn | |
| m#issn# && do { | |
| $epdata->{'issn'} = $value; | |
| last; | |
| }; | |
| m#isbn# && do { | |
| $epdata->{'isbn'} = $value; | |
| last; | |
| }; | |
| # publication,volume, issue, pages | |
| # these trump anything from the citation | |
| m#publication# && do { | |
| $epdata->{'publication'} = $value; | |
| last; | |
| }; | |
| m#volume# && do { | |
| $epdata->{'volume'} = $value; | |
| last; | |
| }; | |
| m#issue# && do { | |
| $epdata->{'issue'} = $value; | |
| last; | |
| }; | |
| m#pagerange# && do { | |
| $epdata->{'pagerange'} = $value; | |
| last; | |
| }; | |
| m#pages# && do { | |
| # Pages has to be a numeric value! | |
| $epdata->{'pages'} = $value + 0; | |
| last; | |
| }; | |
| # Available URLS (DOI, official_url, related urls) | |
| m#available_url# && do { | |
| # available-doi-1; available-official_url-1; available-related_url-4; ... | |
| # can we get the information from Person description? | |
| if ($value) { | |
| foreach my $d ( $set->getElementsByTagName("description") ) { | |
| next unless $d->getAttribute('epdcx:resourceId') eq $value; | |
| my $uri; | |
| $uri = $d->getAttribute('epdcx:resourceUrl'); | |
| if ($value =~ /official/) { | |
| $epdata->{'official_url'} = $uri; | |
| } elsif ($value =~ /doi/) { | |
| $epdata->{'id_number'} = $uri; | |
| } else { | |
| push @{$epdata->{'related_url'}}, {url => $uri, type=>'pub'} if $uri | |
| } | |
| } | |
| }; | |
| last; | |
| }; | |
| } ## end if ( defined $field ) | |
| } ## end foreach my $stat ( $desc->getElementsByTagName...) | |
| } ## end if($att =~ /^sword-mets-/)... | |
| # Parser for the unknown level | |
| # This is where we try to find the author/creator information | |
| # epdata is passed in, therefore the routine modifies the actual hash | |
| sub _parse_unknown { | |
| my ( $plugin, $desc, $epdata ) = @_; | |
| $epdata->{unknown} = {} unless exists $epdata->{unknown}; | |
| my $resourceId; | |
| $resourceId = $desc->getAttribute("resourceId"); | |
| # we only parse if we have a resourceId: creators have them | |
| if ($resourceId) { | |
| $epdata->{unknown}->{$resourceId} = {}; | |
| foreach my $stat ( $desc->getElementsByTagName("statement") ) { | |
| my ( $field, $value ) = _parse_statement($stat); | |
| for ($field) { | |
| m#givenname# && do { | |
| $epdata->{'pages'} = $value; | |
| last; | |
| }; | |
| #familyname | |
| #institution | |
| } | |
| } ## end foreach my $stat ( $desc->getElementsByTagName...) | |
| } | |
| } ## end if($att =~ /^sword-mets-/)... | |
| sub _parse_statement { | |
| my ($stat) = @_; | |
| my $property = $stat->getAttribute("propertyURI"); | |
| unless ( defined $property ) { | |
| $property = $stat->getAttribute("epdcx:propertyURI"); | |
| } | |
| for ($property) { | |
| m#http://purl.org/dc/elements/1.1/type# && do { | |
| return _get_type($stat); | |
| last; | |
| }; ## end if ($property eq 'http://purl.org/dc/elements/1.1/type'...) | |
| m#http://purl.org/dc/elements/1.1/title# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| my $title | |
| = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( "title", clean_text($title) ); | |
| last; | |
| }; ## end elsif ($property eq 'http://purl.org/dc/elements/1.1/title'...) | |
| m#http://purl.org/dc/terms/abstract# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| my $abstract | |
| = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( "abstract", clean_text($abstract) ); | |
| last; | |
| }; ## end elsif ($property eq 'http://purl.org/dc/terms/abstract'...) | |
| m#http://purl.org/dc/elements/1.1/creator# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value .= "\t" | |
| . $stat->getAttribute("epdcx:valueRef") | |
| ; # add the value of the attribute to the creators list | |
| return ( "creator", $value ); | |
| last; | |
| }; ## end elsif ($property eq 'http://purl.org/dc/elements/1.1/creator'...) | |
| m#http://purl.org/dc/elements/1.1/identifier# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| my $id = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( "id_number", clean_text($id) ); | |
| last; | |
| }; ## end elsif ($property eq 'http://purl.org/dc/elements/1.1/identifier'...) | |
| m#http://purl.org/eprint/terms/status# && do { | |
| if ( $stat->getAttribute("valueURI") =~ /status\/(.*)$/ ) { | |
| my $status = $1; | |
| if ( $status eq 'PeerReviewed' ) { | |
| $status = 'TRUE'; | |
| } | |
| elsif ( $status eq 'NonPeerReviewed' ) { | |
| $status = 'FALSE'; | |
| } | |
| else { | |
| return; | |
| } | |
| return ( 'refereed', $status ); # is this the proper field? | |
| } ## end if ( $stat->getAttribute...) | |
| return; | |
| last; | |
| }; ## end elsif ($property eq 'http://purl.org/eprint/terms/status'...) | |
| m#http://purl.org/dc/elements/1.1/language# && do { | |
| # LANGUAGE (not parsed) | |
| last; | |
| }; | |
| m#http://purl.org/dc/terms/available# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| my $id = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( "date", clean_text($id) ); | |
| last; | |
| }; | |
| m#http://purl.org/eprint/terms/copyrightHolder# && do { | |
| # COPYRIGHT HOLDER (not parsed) | |
| last; | |
| }; | |
| m#http://purl.org/dc/terms/bibliographicCitation# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| my $cite = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'cite', $cite ); | |
| last; | |
| }; | |
| m#http://purl.org/dc/elements/1.1/publisher# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'publisher', $value ); | |
| last; | |
| }; | |
| # This lot are OA-RJ extensions to cover blantently missing fields | |
| m#http://opendepot.org/broker/elements/1.0/publication# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'publication', $value ); | |
| last; | |
| }; | |
| m#http://opendepot.org/broker/elements/1.0/issn# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'issn', $value ); | |
| last; | |
| }; | |
| m#http://opendepot.org/broker/elements/1.0/isbn# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'isbn', $value ); | |
| last; | |
| }; | |
| m#http://opendepot.org/broker/elements/1.0/volume# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'volume', $value ); | |
| last; | |
| }; | |
| m#http://opendepot.org/broker/elements/1.0/issue# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'issue', $value ); | |
| last; | |
| }; | |
| m#http://opendepot.org/broker/elements/1.0/pages# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'pagerange', $value ) if $value =~ /\-/; | |
| return ( 'pages', $value ); | |
| last; | |
| }; | |
| ## new record metadata | |
| m#http://purl.org/dc/elements/1.1/givenname# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'givenname', $value ); | |
| last; | |
| }; | |
| m#http://purl.org/dc/elements/1.1/familyname# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'familyname', $value ); | |
| last; | |
| }; | |
| m#http://xmlns.com/foaf/0.1/mbox# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'email', $value ); | |
| last; | |
| }; | |
| m#http://purl.org/eprint/terms/affiliatedInstitution# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'institution', $value ); | |
| last; | |
| }; | |
| m#http://www.loc.gov/loc.terms/relators/EDT# && do { | |
| my $value = ( $stat->getElementsByTagName("valueString") )[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| return ( 'provenance', $value ); # editor? | |
| last; | |
| }; | |
| m#http://www.loc.gov/loc.terms/relators/FND# && do { | |
| my $value = $stat->getAttribute("epdcx:valueRef") | |
| ; # add the value of the attribute to the creators list | |
| return ( "funders", $value ); | |
| last; | |
| }; | |
| m#http://purl.org/eprint/terms/grantNumber# && do { | |
| my $value; | |
| if ($stat->hasAttribute("epdcx:valueRef")) { | |
| $value = $stat->getAttribute("epdcx:valueRef"); | |
| } else { | |
| my @vs = $stat->getElementsByTagName("valueString"); | |
| if (scalar @vs) { | |
| $value = $vs[0]; | |
| $value = EPrints::XML::to_string( EPrints::XML::contents_of($value) ); | |
| } | |
| } | |
| return ( "grant_code", $value ); | |
| last; | |
| }; | |
| ## end new data | |
| ## Alternate sources | |
| m#http://purl.org/eprint/terms/isAvailableAs# && do { | |
| my $value = $stat->getAttribute("epdcx:valueRef") ; | |
| return ( "available_url", $value ); | |
| last; | |
| }; | |
| ## Data for embargoes | |
| m#http://purl.org/dc/terms/accessRights# && do { | |
| my $value = $stat->getAttribute("epdcx:valueRef") | |
| ; # add the value of the attribute to the creators list | |
| $value =~ /([^\/]+)$/; | |
| $value = $1 if $1; | |
| return ( "access", $value ); | |
| last; | |
| }; | |
| } ## end for ($property) | |
| return; | |
| } ## end sub _parse_statement | |
| sub _get_type { | |
| my ($stat) = @_; | |
| if ( $stat->getAttribute("valueURI") =~ /type\/(.*)$/ ) { # then $1 = type | |
| # reference for these mappings is: | |
| # "http://www.ukoln.ac.uk/repositories/digirep/index/Eprints_Type_Vocabulary_Encoding_Scheme" | |
| my $type = $1; # no need to clean_text( $1 ) here | |
| if ( $type eq 'JournalArticle' | |
| || $type eq 'JournalItem' | |
| || $type eq 'SubmittedJournalArticle' | |
| || $type eq 'WorkingPaper' ) | |
| { | |
| $type = 'article'; | |
| } ## end if ( $type eq 'JournalArticle'...) | |
| elsif ( $type eq 'Book' ) { | |
| $type = 'book'; | |
| } | |
| elsif ( $type eq 'BookItem' ) { | |
| $type = 'book_section'; | |
| } | |
| elsif ( $type eq 'ConferenceItem' | |
| || $type eq 'ConferencePoster' | |
| || $type eq 'ConferencePaper' ) | |
| { | |
| $type = 'conference_item'; | |
| } ## end elsif ( $type eq 'ConferenceItem'...) | |
| elsif ( $type eq 'Patent' ) { | |
| $type = 'patent'; | |
| } | |
| elsif ( $type eq 'Report' ) { | |
| $type = 'monograph'; # I think? | |
| } | |
| elsif ( $type eq 'Thesis' ) { | |
| $type = 'thesis'; | |
| } | |
| else { | |
| return | |
| ; # problem there! But the user can still correct this piece of data... | |
| } | |
| return ( "type", $type ); | |
| } ## end if ( $stat->getAttribute...) | |
| return; | |
| } ## end sub _get_type | |
| sub clean_text { | |
| my ($text) = @_; | |
| my @lines = split( "\n", $text ); | |
| foreach (@lines) { | |
| $_ =~ s/\s+$//; | |
| $_ =~ s/^\s+//; | |
| } | |
| return join( " ", @lines ); | |
| } ## end sub clean_text | |
| ################ | |
| # | |
| # We need to replace the default routine so we can apply | |
| # an embargo to the deposited file. | |
| sub attach_deposited_file { | |
| my ( $self, $eprint, $file, $mime, $embargo, $security ) = @_; | |
| my $fn = $file; | |
| if ( $file =~ /^.*\/(.*)$/ ) { | |
| $fn = $1; | |
| } | |
| my %doc_data; | |
| $doc_data{eprintid} = $eprint->get_id; | |
| $doc_data{format} = $mime; | |
| $doc_data{formatdesc} | |
| = $self->{session}->phrase("Sword/Deposit:document_formatdesc"); | |
| $doc_data{main} = $fn; | |
| if ( $embargo && $security ) { | |
| $doc_data{date_embargo} = $embargo; | |
| $doc_data{security} = $security; | |
| } | |
| local $self->{session}->get_repository->{config}->{enable_file_imports} = 1; | |
| my %file_data; | |
| $file_data{filename} = $fn; | |
| $file_data{url} = "file://$file"; | |
| $doc_data{files} = [ \%file_data ]; | |
| $doc_data{_parent} = $eprint; | |
| my $doc_dataset = $self->{session}->get_repository->get_dataset("document"); | |
| my $document | |
| = EPrints::DataObj::Document->create_from_data( $self->{session}, | |
| \%doc_data, $doc_dataset ); | |
| return 0 unless ( defined $document ); | |
| $document->make_thumbnails; | |
| $eprint->generate_static; | |
| $self->set_deposited_file_docid( $document->get_id ); | |
| return 1; | |
| } | |
| sub parse_rights_data { | |
| my ( $plugin, $xml ) = @_; | |
| my $set = ( $xml->getElementsByTagName("descriptionSet") )[0]; | |
| unless ( defined $set ) { | |
| $plugin->set_status_code(400); | |
| $plugin->add_verbose("ERROR: no <descriptionSet> tag found."); | |
| return; | |
| } | |
| my $data = {}; | |
| foreach my $desc ( $set->getElementsByTagName("description") ) { | |
| # This is the ID that ties the description to the structMap record | |
| my $att = $desc->getAttribute("epdcx:resourceId"); | |
| foreach my $stat ( $desc->getElementsByTagName("statement") ) { | |
| my ( $field, $value ) = _parse_statement($stat); | |
| $data->{$att} = {} unless exists $data->{$att}; | |
| $data->{$att}->{$field} = $value; | |
| } | |
| } | |
| return $data; | |
| } | |
| 1; | |