Skip to content
Browse files

Merge branch 'smithsonian_ingestion' of github.com:dpla/ingestion into smithsonian_ingestion
  • Loading branch information...
  • Loading branch information...
2 parents 704bffa + 028ff9b commit ca11c5d1360a4c3b47768967d4729b8d7d2b0e87 Szymon Guz committed Mar 19, 2013
Showing with 577 additions and 527 deletions.
  1. +42 −28 couchdb_views/qa_reports.js
  2. +18 −17 lib/akamod/arc-to-dpla.py
  3. +1 −3 lib/akamod/artstor_identify_object.py
  4. +1 −1 lib/akamod/artstor_select_isshownat.py
  5. +1 −1 lib/akamod/bhl_contributor_to_collection.py
  6. +10 −9 lib/akamod/cleanup_value.py
  7. +2 −9 lib/akamod/contentdm_identify_object.py
  8. +7 −9 lib/akamod/edan_to_dpla.py
  9. +2 −2 lib/akamod/enrich-date.py
  10. +50 −37 lib/akamod/enrich-format.py
  11. +1 −1 lib/akamod/enrich-subject.py
  12. +36 −20 lib/akamod/enrich-type.py
  13. +1 −1 lib/akamod/enrich.py
  14. +1 −1 lib/akamod/enrich_language.py
  15. +1 −1 lib/akamod/enrich_location.py
  16. +1 −3 lib/akamod/georgia_identify_object.py
  17. +3 −9 lib/akamod/kentucky_identify_object.py
  18. +1 −1 lib/akamod/mdl-enrich-location.py
  19. +12 −6 lib/akamod/move_date_values.py
  20. +1 −1 lib/akamod/mwdl_enrich_state_located_in.py
  21. +9 −12 lib/akamod/oai-to-dpla.py
  22. +9 −14 lib/akamod/primo-to-dpla.py
  23. +16 −12 lib/akamod/shred.py
  24. +1 −0 lib/oai.py
  25. +11 −12 profiles/artstor.pjs
  26. +9 −12 profiles/bhl.pjs
  27. +10 −11 profiles/clemson.pjs
  28. +32 −0 profiles/digital-commonwealth.pjs
  29. +12 −11 profiles/georgia.pjs
  30. +15 −10 profiles/kentucky.pjs
  31. +8 −9 profiles/minnesota.pjs
  32. +9 −11 profiles/mwdl.pjs
  33. +5 −5 profiles/nara.pjs
  34. +9 −10 profiles/scdl-charleston.pjs
  35. +9 −10 profiles/scdl-usc.pjs
  36. +1 −1 profiles/smithsonian.pjs
  37. +3 −0 scripts/export_reports
  38. +8 −5 scripts/poll_profiles
  39. +9 −23 test/test_artstor.py
  40. +3 −3 test/test_bhl_contributor_to_collection.py
  41. +2 −2 test/test_cleanup_value.py
  42. +8 −18 test/test_contentdm_identify_object.py
  43. +82 −82 test/test_copy_prop.py
  44. +37 −28 test/test_enrich.py
  45. +3 −3 test/test_enrich_date.py
  46. +4 −4 test/test_enrich_language.py
  47. +18 −18 test/test_enrich_location.py
  48. +3 −3 test/test_filtering.py
  49. +6 −10 test/test_georgia_identify_object.py
  50. +28 −22 test/test_move_date_values.py
  51. +6 −6 test/test_sets_prop.py
View
70 couchdb_views/qa_reports.js
@@ -3,80 +3,94 @@
"language": "javascript",
"views": {
"title": {
- "map": "function(doc) { if (doc.ingestType == 'item') { title = doc.aggregatedCHO.title; if (title.constructor.toString().indexOf('Array') == -1) { title = new Array(title); } for (i=0; i<title.length; i++) { emit(doc['id'], title[i]); }}}"
+ "map": "function(doc) { if (doc.ingestType == 'item') { title = doc.sourceResource.title; if (title.constructor.toString().indexOf('Array') == -1) { title = new Array(title); } for (i=0; i<title.length; i++) { emit(doc['id'], title[i]); }}}"
},
"title_count": {
- "map": "function(doc) { if (doc.ingestType == 'item') { title = doc.aggregatedCHO.title; if (title.constructor.toString().indexOf('Array') == -1) { title = new Array(title); } for (i=0; i<title.length; i++) { emit(title[i],1); }}}",
+ "map": "function(doc) { if (doc.ingestType == 'item') { title = doc.sourceResource.title; if (title.constructor.toString().indexOf('Array') == -1) { title = new Array(title); } for (i=0; i<title.length; i++) { emit(title[i],1); }}}",
"reduce": "_count"
},
"creator": {
- "map": "function(doc) { if (doc.ingestType == 'item') { creator = doc.aggregatedCHO.creator; if (creator.constructor.toString().indexOf('Array') == -1) { creator = new Array(creator); } for (i=0; i<creator.length; i++) { emit(doc['id'], creator[i]);}}}"
+ "map": "function(doc) { if (doc.ingestType == 'item') { creator = doc.sourceResource.creator; if (creator.constructor.toString().indexOf('Array') == -1) { creator = new Array(creator); } for (i=0; i<creator.length; i++) { emit(doc['id'], creator[i]);}}}"
},
"creator_count": {
- "map": "function(doc) { if (doc.ingestType == 'item') { creator = doc.aggregatedCHO.creator; if (creator.constructor.toString().indexOf('Array') == -1) { creator = new Array(creator); } for (i=0; i<creator.length; i++) { emit(creator[i],1);}}}",
+ "map": "function(doc) { if (doc.ingestType == 'item') { creator = doc.sourceResource.creator; if (creator.constructor.toString().indexOf('Array') == -1) { creator = new Array(creator); } for (i=0; i<creator.length; i++) { emit(creator[i],1);}}}",
"reduce": "_count"
},
"publisher": {
- "map": "function(doc) {if (doc.ingestType == 'item') {pub = doc.aggregatedCHO.publisher;if (pub.constructor.toString().indexOf('Array') == -1) { pub = new Array(pub); }for (i=0; i<pub.length; i++) {emit(doc['id'], pub[i]);}}}"
+ "map": "function(doc) {if (doc.ingestType == 'item') {pub = doc.sourceResource.publisher;if (pub.constructor.toString().indexOf('Array') == -1) { pub = new Array(pub); }for (i=0; i<pub.length; i++) {emit(doc['id'], pub[i]);}}}"
},
"publisher_count": {
- "map": "function(doc) {if (doc.ingestType == 'item') {pub = doc.aggregatedCHO.publisher;if (pub.constructor.toString().indexOf('Array') == -1) { pub = new Array(pub); }for (i=0; i<pub.length; i++) {emit(pub[i],i);}}}",
+ "map": "function(doc) {if (doc.ingestType == 'item') {pub = doc.sourceResource.publisher;if (pub.constructor.toString().indexOf('Array') == -1) { pub = new Array(pub); }for (i=0; i<pub.length; i++) {emit(pub[i],i);}}}",
"reduce": "_count"
},
"dates": {
- "map": "function(doc) {if (doc.ingestType == 'item') {d = doc.aggregatedCHO.date;if (d.constructor.toString().indexOf('Array') == -1) { d = new Array(d); }for (i=0; i<d.length; i++) {emit(doc['id'], d[i]['displayDate']+' ('+d[i]['begin']+' to '+d[i]['end']+')');}}}"
+ "map": "function(doc) {if (doc.ingestType == 'item') {d = doc.sourceResource.date;if (d.constructor.toString().indexOf('Array') == -1) { d = new Array(d); }for (i=0; i<d.length; i++) {emit(doc['id'], d[i]['displayDate']+' ('+d[i]['begin']+' to '+d[i]['end']+')');}}}"
},
"dates_count": {
- "map": "function(doc) {if (doc.ingestType == 'item') {d = doc.aggregatedCHO.date;if (d.constructor.toString().indexOf('Array') == -1) { d = new Array(d); }for (i=0; i<d.length; i++) {emit(d[i]['displayDate']+' ('+d[i]['begin']+' to '+d[i]['end']+')',1);}}}",
+ "map": "function(doc) {if (doc.ingestType == 'item') {d = doc.sourceResource.date;if (d.constructor.toString().indexOf('Array') == -1) { d = new Array(d); }for (i=0; i<d.length; i++) {emit(d[i]['displayDate']+' ('+d[i]['begin']+' to '+d[i]['end']+')',1);}}}",
"reduce": "_count"
},
"description": {
- "map": "function(doc) {if (doc.ingestType == 'item') {desc = doc.aggregatedCHO.description;if (desc.constructor.toString().indexOf('Array') == -1) { desc = new Array(desc); }for (i=0; i<desc.length; i++) {emit(doc['id'], desc[i]);}}}"
+ "map": "function(doc) {if (doc.ingestType == 'item') {desc = doc.sourceResource.description;if (desc.constructor.toString().indexOf('Array') == -1) { desc = new Array(desc); }for (i=0; i<desc.length; i++) {emit(doc['id'], desc[i]);}}}"
},
"description_count": {
- "map": "function(doc) {if (doc.ingestType == 'item') {desc = doc.aggregatedCHO.description;if (desc.constructor.toString().indexOf('Array') == -1) { desc = new Array(desc); }for (i=0; i<desc.length; i++) {emit(desc[i],1);}}}",
+ "map": "function(doc) {if (doc.ingestType == 'item') {desc = doc.sourceResource.description;if (desc.constructor.toString().indexOf('Array') == -1) { desc = new Array(desc); }for (i=0; i<desc.length; i++) {emit(desc[i],1);}}}",
"reduce": "_count"
},
"format": {
- "map": "function(doc) {if (doc.ingestType == 'item') {format = doc.aggregatedCHO.physicalMedium;if (format.constructor.toString().indexOf('Array') == -1) { format = new Array(format); }for (i=0; i<format.length; i++) {emit(doc['id'], format[i]);}}}"
+ "map": "function(doc) {if (doc.ingestType == 'item') {format = doc.sourceResource.format;if (format.constructor.toString().indexOf('Array') == -1) { format = new Array(format); }for (i=0; i<format.length; i++) {emit(doc['id'], format[i]);}}}"
},
"format_count": {
- "map": "function(doc) {if (doc.ingestType == 'item') {format = doc.aggregatedCHO.physicalMedium;if (format.constructor.toString().indexOf('Array') == -1) { format = new Array(format); }for (i=0; i<format.length; i++) {emit(format[i],1);}}}",
+ "map": "function(doc) {if (doc.ingestType == 'item') {format = doc.sourceResource.format;if (format.constructor.toString().indexOf('Array') == -1) { format = new Array(format); }for (i=0; i<format.length; i++) {emit(format[i],1);}}}",
"reduce": "_count"
},
"type": {
- "map": "function(doc) {if (doc.ingestType == 'item') {t = doc.aggregatedCHO.type;if (t.constructor.toString().indexOf('Array') == -1) { t = new Array(t); }for (i=0; i<t.length; i++) {emit(doc['id'], t[i]);}}}"
+ "map": "function(doc) {if (doc.ingestType == 'item') {t = doc.sourceResource.type;if (t.constructor.toString().indexOf('Array') == -1) { t = new Array(t); }for (i=0; i<t.length; i++) {emit(doc['id'], t[i]);}}}"
},
"type_count": {
- "map": "function(doc) {if (doc.ingestType == 'item') {t = doc.aggregatedCHO.type;if (t.constructor.toString().indexOf('Array') == -1) { t = new Array(t); }for (i=0; i<t.length; i++) {emit(t[i],1);}}}",
+ "map": "function(doc) {if (doc.ingestType == 'item') {t = doc.sourceResource.type;if (t.constructor.toString().indexOf('Array') == -1) { t = new Array(t); }for (i=0; i<t.length; i++) {emit(t[i],1);}}}",
"reduce": "_count"
},
"subject": {
- "map": "function(doc) {if (doc.ingestType == 'item') {sub = doc.aggregatedCHO.subject;if (sub.constructor.toString().indexOf('Array') == -1) { sub = new Array(sub); }for (i=0; i<sub.length; i++) {emit(doc['id'], sub[i].name);}}}"
+ "map": "function(doc) {if (doc.ingestType == 'item') {sub = doc.sourceResource.subject;if (sub.constructor.toString().indexOf('Array') == -1) { sub = new Array(sub); }for (i=0; i<sub.length; i++) {emit(doc['id'], sub[i].name);}}}"
},
"subject_count": {
- "map": "function(doc) {if (doc.ingestType == 'item') {sub = doc.aggregatedCHO.subject;if (sub.constructor.toString().indexOf('Array') == -1) { sub = new Array(sub); }for (i=0; i<sub.length; i++) {emit(sub[i].name,1);}}}",
+ "map": "function(doc) {if (doc.ingestType == 'item') {sub = doc.sourceResource.subject;if (sub.constructor.toString().indexOf('Array') == -1) { sub = new Array(sub); }for (i=0; i<sub.length; i++) {emit(sub[i].name,1);}}}",
"reduce": "_count"
},
"spatial_state": {
- "map": "function(doc) {if (doc.ingestType == 'item') {spatial = doc.aggregatedCHO.spatial;if (spatial.constructor.toString().indexOf('Array') == -1) { spatial = new Array(spatial); }for (i=0; i<spatial.length; i++) {if ('state' in spatial[i]) {emit(doc['id'], spatial[i]['state'])};}}}"
+ "map": "function(doc) {if (doc.ingestType == 'item') {spatial = doc.sourceResource.spatial;if (spatial.constructor.toString().indexOf('Array') == -1) { spatial = new Array(spatial); }for (i=0; i<spatial.length; i++) {if ('state' in spatial[i]) {emit(doc['id'], spatial[i]['state'])};}}}"
},
"spatial_state_count": {
- "map": "function(doc) {if (doc.ingestType == 'item') {spatial = doc.aggregatedCHO.spatial;if (spatial.constructor.toString().indexOf('Array') == -1) { spatial = new Array(spatial); }for (i=0; i<spatial.length; i++) {if ('state' in spatial[i]) {emit(spatial[i]['state'],1)};}}}",
+ "map": "function(doc) {if (doc.ingestType == 'item') {spatial = doc.sourceResource.spatial;if (spatial.constructor.toString().indexOf('Array') == -1) { spatial = new Array(spatial); }for (i=0; i<spatial.length; i++) {if ('state' in spatial[i]) {emit(spatial[i]['state'],1)};}}}",
"reduce": "_count"
},
"spatial_name": {
- "map": "function(doc) {if (doc.ingestType == 'item') {spatial = doc.aggregatedCHO.spatial;if (spatial.constructor.toString().indexOf('Array') == -1) { spatial = new Array(spatial); }for (i=0; i<spatial.length; i++) {if ('state' in spatial[i]) {emit(doc['id'], spatial[i]['name'])};}}}"
+ "map": "function(doc) {if (doc.ingestType == 'item') {spatial = doc.sourceResource.spatial;if (spatial.constructor.toString().indexOf('Array') == -1) { spatial = new Array(spatial); }for (i=0; i<spatial.length; i++) {if ('name' in spatial[i]) {emit(doc['id'], spatial[i]['name'])};}}}"
},
"spatial_name_count": {
- "map": "function(doc) {if (doc.ingestType == 'item') {spatial = doc.aggregatedCHO.spatial;if (spatial.constructor.toString().indexOf('Array') == -1) { spatial = new Array(spatial); }for (i=0; i<spatial.length; i++) {if ('state' in spatial[i]) {emit(spatial[i]['name'],1)};}}}",
+ "map": "function(doc) {if (doc.ingestType == 'item') {spatial = doc.sourceResource.spatial;if (spatial.constructor.toString().indexOf('Array') == -1) { spatial = new Array(spatial); }for (i=0; i<spatial.length; i++) {if ('name' in spatial[i]) {emit(spatial[i]['name'],1)};}}}",
+ "reduce": "_count"
+ },
+ "state_located_in_state": {
+ "map": "function(doc) {if (doc.ingestType == 'item') {sli = doc.sourceResource.stateLocatedIn;if (sli.constructor.toString().indexOf('Array') == -1) { sli = new Array(sli); }for (i=0; i<sli.length; i++) {if ('state' in sli[i]) {emit(doc['id'], sli[i]['state'])};}}}"
+ },
+ "state_located_in_state_count": {
+ "map": "function(doc) {if (doc.ingestType == 'item') {sli = doc.sourceResource.stateLocatedIn;if (sli.constructor.toString().indexOf('Array') == -1) { sli = new Array(sli); }for (i=0; i<sli.length; i++) {if ('state' in sli[i]) {emit(sli[i]['state'],1)};}}}",
+ "reduce": "_count"
+ },
+ "state_located_in_name": {
+ "map": "function(doc) {if (doc.ingestType == 'item') {sli = doc.sourceResource.stateLocatedIn;if (sli.constructor.toString().indexOf('Array') == -1) { sli = new Array(sli); }for (i=0; i<sli.length; i++) {if ('name' in sli[i]) {emit(doc['id'], sli[i]['name'])};}}}"
+ },
+ "state_located_in_name_count": {
+ "map": "function(doc) {if (doc.ingestType == 'item') {sli = doc.sourceResource.stateLocatedIn;if (sli.constructor.toString().indexOf('Array') == -1) { sli = new Array(sli); }for (i=0; i<sli.length; i++) {if ('name' in sli[i]) {emit(sli[i]['name'],1)};}}}",
"reduce": "_count"
},
"rights": {
- "map": "function(doc) {if (doc.ingestType == 'item') {rights = doc.aggregatedCHO.rights;if (rights.constructor.toString().indexOf('Array') == -1) { rights = new Array(rights); }for (i=0; i<rights.length; i++) {emit(doc['id'], rights[i]);}}}"
+ "map": "function(doc) {if (doc.ingestType == 'item') {rights = doc.sourceResource.rights;if (rights.constructor.toString().indexOf('Array') == -1) { rights = new Array(rights); }for (i=0; i<rights.length; i++) {emit(doc['id'], rights[i]);}}}"
},
"rights_count": {
- "map": "function(doc) {if (doc.ingestType == 'item') {rights = doc.aggregatedCHO.rights;if (rights.constructor.toString().indexOf('Array') == -1) { rights = new Array(rights); }for (i=0; i<rights.length; i++) {emit(rights[i],1);}}}",
+ "map": "function(doc) {if (doc.ingestType == 'item') {rights = doc.sourceResource.rights;if (rights.constructor.toString().indexOf('Array') == -1) { rights = new Array(rights); }for (i=0; i<rights.length; i++) {emit(rights[i],1);}}}",
"reduce": "_count"
},
"provider": {
@@ -101,24 +115,24 @@
"reduce": "_count"
},
"contributor": {
- "map": "function(doc) {if (doc.ingestType == 'item') {contributor = doc.aggregatedCHO.contributor;if (contributor.constructor.toString().indexOf('Array') == -1) { contributor = new Array(contributor); }for (i=0; i<contributor.length; i++) {emit(doc['id'], contributor[i]);}}}"
+ "map": "function(doc) {if (doc.ingestType == 'item') {contributor = doc.sourceResource.contributor;if (contributor.constructor.toString().indexOf('Array') == -1) { contributor = new Array(contributor); }for (i=0; i<contributor.length; i++) {emit(doc['id'], contributor[i]);}}}"
},
"contributor_count": {
- "map": "function(doc) {if (doc.ingestType == 'item') {contributor = doc.aggregatedCHO.contributor;if (contributor.constructor.toString().indexOf('Array') == -1) { contributor = new Array(contributor); }for (i=0; i<contributor.length; i++) {emit(contributor[i],1);}}}",
+ "map": "function(doc) {if (doc.ingestType == 'item') {contributor = doc.sourceResource.contributor;if (contributor.constructor.toString().indexOf('Array') == -1) { contributor = new Array(contributor); }for (i=0; i<contributor.length; i++) {emit(contributor[i],1);}}}",
"reduce": "_count"
},
"language": {
- "map": "function(doc) {if (doc.ingestType == 'item') {language = doc.aggregatedCHO.language;if (language.constructor.toString().indexOf('Array') == -1) { language = new Array(language); }for (i=0; i<language.length; i++) {emit(doc['id'], language[i]['name']);}}}"
+ "map": "function(doc) {if (doc.ingestType == 'item') {language = doc.sourceResource.language;if (language.constructor.toString().indexOf('Array') == -1) { language = new Array(language); }for (i=0; i<language.length; i++) {emit(doc['id'], language[i]['name']);}}}"
},
"language_count": {
- "map": "function(doc) {if (doc.ingestType == 'item') {language = doc.aggregatedCHO.language;if (language.constructor.toString().indexOf('Array') == -1) { language = new Array(language); }for (i=0; i<language.length; i++) {emit(language[i]['name'],1);}}}",
+ "map": "function(doc) {if (doc.ingestType == 'item') {language = doc.sourceResource.language;if (language.constructor.toString().indexOf('Array') == -1) { language = new Array(language); }for (i=0; i<language.length; i++) {emit(language[i]['name'],1);}}}",
"reduce": "_count"
},
"temporal": {
- "map": "function(doc) {if (doc.ingestType == 'item') {temporal = doc.aggregatedCHO.temporal;if (temporal.constructor.toString().indexOf('Array') == -1) { temporal = new Array(temporal); }for (i=0; i<temporal.length; i++) {emit(doc['id'], temporal[i]['displayDate']+' ('+d[i]['begin']+' to '+d[i]['end']+')');}}}"
+ "map": "function(doc) {if (doc.ingestType == 'item') {temporal = doc.sourceResource.temporal;if (temporal.constructor.toString().indexOf('Array') == -1) { temporal = new Array(temporal); }for (i=0; i<temporal.length; i++) {emit(doc['id'], temporal[i]['displayDate']+' ('+d[i]['begin']+' to '+d[i]['end']+')');}}}"
},
"temporal_count": {
- "map": "function(doc) {if (doc.ingestType == 'item') {temporal = doc.aggregatedCHO.temporal;if (temporal.constructor.toString().indexOf('Array') == -1) { temporal = new Array(temporal); }for (i=0; i<temporal.length; i++) {emit(temporal[i]['displayDate']+' ('+d[i]['begin']+' to '+d[i]['end']+')',1);}}}",
+ "map": "function(doc) {if (doc.ingestType == 'item') {temporal = doc.sourceResource.temporal;if (temporal.constructor.toString().indexOf('Array') == -1) { temporal = new Array(temporal); }for (i=0; i<temporal.length; i++) {emit(temporal[i]['displayDate']+' ('+d[i]['begin']+' to '+d[i]['end']+')',1);}}}",
"reduce": "_count"
}
},
View
35 lib/akamod/arc-to-dpla.py
@@ -25,7 +25,7 @@
"state": "dpla:state",
"coordinates": "dpla:coordinates",
"stateLocatedIn" : "dpla:stateLocatedIn",
- "aggregatedCHO" : "edm:aggregatedCHO",
+ "sourceResource" : "edm:sourceResource",
"dataProvider" : "edm:dataProvider",
"hasView" : "edm:hasView",
"isShownAt" : "edm:isShownAt",
@@ -80,8 +80,8 @@ def collection_transform(d):
collection = getprop(d, "collection")
items = arc_group_extraction(d, "hierarchy", "hierarchy-item")
for item in (items if isinstance(items, list) else [items]):
- if item["hierarchy-item-id"] == collection["name"]:
- setprop(collection, "name", item["hierarchy-item-title"])
+ if item["hierarchy-item-id"] == collection["title"]:
+ setprop(collection, "title", item["hierarchy-item-title"])
return {"collection": collection} if collection else {}
def creator_transform(d):
@@ -97,7 +97,8 @@ def extent_transform(d):
extent = []
extents = arc_group_extraction(d, "physical-occurrences",
"physical-occurrence", "extent")
- [extent.append(e) for e in extents if e]
+ if extents:
+ [extent.append(e) for e in extents if e]
return {"extent": extent} if extent else {}
@@ -238,14 +239,14 @@ def arc_group_extraction(d, groupKey, itemKey, nameKey=None):
"physical-occurrences" : extent_transform,
"creators" : creator_transform,
"hierarchy" : is_part_of_transform,
- "release-dates" : lambda d: date_transform(d,"release-dates","release-date"),
- "broadcast-dates" : lambda d: date_transform(d,"broadcast-dates","broadcast-date"),
- "production-dates" : lambda d: date_transform(d,"production-dates","production-date"),
- "coverage-dates" : lambda d: date_transform(d,"coverage-dates",["cov-start-date","cov-end-date"]),
- "copyright-dates" : lambda d: date_transform(d,"copyright-dates","copyright-date"),
+ "release-dates" : lambda d: date_transform(d,"release-dates", "release-date"),
+ "broadcast-dates" : lambda d: date_transform(d,"broadcast-dates", "broadcast-date"),
+ "production-dates" : lambda d: date_transform(d,"production-dates", "production-date"),
+ "coverage-dates" : lambda d: date_transform(d,"coverage-dates", ["cov-start-date", "cov-end-date"]),
+ "copyright-dates" : lambda d: date_transform(d,"copyright-dates", "copyright-date"),
"title" : lambda d: {"title": d.get("title-only")},
"scope-content-note" : lambda d: {"description": d.get("scope-content-note")},
- "languages" : lambda d: {"language": arc_group_extraction(d,"languages","language")}
+ "languages" : lambda d: {"language": arc_group_extraction(d, "languages", "language")}
}
AGGREGATION_TRANSFORMER = {
@@ -278,26 +279,26 @@ def arctodpla(body,ctype,geoprop=None):
out = {
"@context": CONTEXT,
- "aggregatedCHO": {}
+ "sourceResource": {}
}
# Apply all transformation rules from original document
for p in data.keys():
if p in CHO_TRANSFORMER:
- out["aggregatedCHO"].update(CHO_TRANSFORMER[p](data))
+ out["sourceResource"].update(CHO_TRANSFORMER[p](data))
if p in AGGREGATION_TRANSFORMER:
out.update(AGGREGATION_TRANSFORMER[p](data))
# Apply transformations that are dependent on more than one
# original document field
- out["aggregatedCHO"].update(type_transform(data))
- out["aggregatedCHO"].update(rights_transform(data))
- out["aggregatedCHO"].update(subject_and_spatial_transform(data))
+ out["sourceResource"].update(type_transform(data))
+ out["sourceResource"].update(rights_transform(data))
+ out["sourceResource"].update(subject_and_spatial_transform(data))
out.update(has_view_transform(data))
- if exists(out, "aggregatedCHO/date"):
- logger.debug("OUTTYPE: %s"%getprop(out, "aggregatedCHO/date"))
+ if exists(out, "sourceResource/date"):
+ logger.debug("OUTTYPE: %s"%getprop(out, "sourceResource/date"))
# Additional content not from original document
if "HTTP_CONTRIBUTOR" in request.environ:
View
4 lib/akamod/artstor_identify_object.py
@@ -62,9 +62,7 @@ def artstor_identify_object(body, ctype, download="True"):
logger.error("Can't find url with '%s' prefix in [%s] for fetching document preview url for Artstor.", artstor_preview_prefix, data[original_document_key][original_sources_key])
return body
- data["object"] = {"@id": preview_url,
- "format": None,
- "rights": selector.getprop(data, "aggregatedCHO/rights", keyErrorAsNone=True)}
+ data["object"] = preview_url
status = IGNORE
if download == "True":
View
2 lib/akamod/artstor_select_isshownat.py
@@ -35,7 +35,7 @@ def artstor_select_source(body, ctype):
original_document_key = u"originalRecord"
original_sources_key = u"handle"
artstor_source_prefix = "Image View"
- source_key = u"isShownAt/@id"
+ source_key = u"isShownAt"
if original_document_key not in data:
logger.error("There is no '%s' key in JSON for doc [%s].", original_document_key, data[u'id'])
View
2 lib/akamod/bhl_contributor_to_collection.py
@@ -6,7 +6,7 @@
@simple_service('POST', 'http://purl.org/la/dp/contributor_to_collection',
'bhl_contributor_to_collection', 'application/json')
-def bhlcontributortocollection(body,ctype,contributor_field="aggregatedCHO/contributor"):
+def bhlcontributortocollection(body,ctype,contributor_field="sourceResource/contributor"):
""" Copies BHL contributor field value to collection field
"""
View
19 lib/akamod/cleanup_value.py
@@ -37,7 +37,7 @@ def cleanup(value, prop):
Converted string.
"""
# Do not remove double quotes from title
- dquote = '' if prop == "aggregatedCHO/title" else '"'
+ dquote = '' if prop == "sourceResource/title" else '"'
# Tags for stripping at beginning and at the end.
TAGS_FOR_STRIPPING = '[\.\' \r\t\n;,%s]*' % dquote
REGEXPS = (' *-- *', '--'), \
@@ -54,21 +54,22 @@ def cleanup(value, prop):
"""
Fields which should not be changed:
--- physicalMedium (there are often dimensions in this field)
+-- format (there are often dimensions in this field)
-- extent (for the same reason)
-- descriptions (full text, includes sentences)
-- rights (full text, includes sentences)
-- place (may end in an abbreviated state name)
"""
DEFAULT_PROP = [
- "aggregatedCHO/language",
- "aggregatedCHO/title",
- "aggregatedCHO/creator",
- "aggregatedCHO/relation",
- "aggregatedCHO/publisher",
- "aggregatedCHO/subject",
- "aggregatedCHO/format",
+ "sourceResource/language",
+ "sourceResource/title",
+ "sourceResource/creator",
+ "sourceResource/relation",
+ "sourceResource/publisher",
+ "sourceResource/subject",
+ "sourceResource/format",
+ "sourceResource/date"
]
View
11 lib/akamod/contentdm_identify_object.py
@@ -11,7 +11,7 @@
@simple_service('POST', 'http://purl.org/la/dp/contentdm_identify_object',
'contentdm_identify_object', 'application/json')
-def contentdm_identify_object(body, ctype, rights_field="aggregatedCHO/rights", download="True"):
+def contentdm_identify_object(body, ctype, download="True"):
"""
Responsible for: adding a field to a document with the URL where we
should expect to the find the thumbnail
@@ -72,16 +72,9 @@ def log_json():
return body
# Thumb url field.
- thumb_url = "%scgi-bin/thumbnail.exe?CISOROOT=%s&CISOPTR=%s" % \
+ data["object"] = "%scgi-bin/thumbnail.exe?CISOROOT=%s&CISOPTR=%s" % \
(base_url, p[0], p[1])
- # Gettings the rights field
- rights = None
- if exists(data, rights_field):
- rights = getprop(data, rights_field)
-
- data["object"] = {"@id": thumb_url, "format": "", "rights": rights}
-
status = IGNORE
if download == "True":
status = PENDING
View
16 lib/akamod/edan_to_dpla.py
@@ -25,7 +25,7 @@
"state": "dpla:state",
"coordinates": "dpla:coordinates",
"stateLocatedIn" : "dpla:stateLocatedIn",
- "aggregatedCHO" : "edm:aggregatedCHO",
+ "sourceResource" : "edm:sourceResource",
"dataProvider" : "edm:dataProvider",
"hasView" : "edm:hasView",
"isShownAt" : "edm:isShownAt",
@@ -570,25 +570,23 @@ def edantodpla(body,ctype,geoprop=None):
out = {
"@context": CONTEXT,
- "aggregatedCHO": {}
+ "sourceResource": {}
}
# Apply all transformation rules from original document
for k, v in CHO_TRANSFORMER.items():
if exists(data, k):
- out["aggregatedCHO"].update(v(data))
+ out["sourceResource"].update(v(data))
for k, v in AGGREGATION_TRANSFORMER.items():
if exists(data, k):
out.update(v(data))
# Apply transformations that are dependent on more than one
# original document field
- #out["aggregatedCHO"].update(type_transform(data))
- out["aggregatedCHO"].update(transform_rights(data))
- out["aggregatedCHO"].update(transform_subject(data))
- out["aggregatedCHO"].update(transform_spatial(data))
- #out["aggregatedCHO"].update(subject_and_spatial_transform(data))
- #out.update(has_view_transform(data))
+ #out["sourceResource"].update(type_transform(data))
+ out["sourceResource"].update(transform_rights(data))
+ out["sourceResource"].update(transform_subject(data))
+ out["sourceResource"].update(transform_spatial(data))
out.update(transform_is_shown_at(data))
View
4 lib/akamod/enrich-date.py
@@ -163,7 +163,7 @@ def convert_dates(data, prop, earliest):
delprop(data, p)
@simple_service('POST', 'http://purl.org/la/dp/enrich_earliest_date', 'enrich_earliest_date', HTTP_TYPE_JSON)
-def enrich_earliest_date(body, ctype, action="enrich_earliest_date", prop="aggregatedCHO/date"):
+def enrich_earliest_date(body, ctype, action="enrich_earliest_date", prop="sourceResource/date"):
"""
Service that accepts a JSON document and extracts the "created date" of the item, using the
following rules:
@@ -183,7 +183,7 @@ def enrich_earliest_date(body, ctype, action="enrich_earliest_date", prop="aggre
@simple_service('POST', 'http://purl.org/la/dp/enrich_date', 'enrich_date', HTTP_TYPE_JSON)
-def enrich_date(body, ctype, action="enrich_date", prop="aggregatedCHO/temporal"):
+def enrich_date(body, ctype, action="enrich_date", prop="sourceResource/temporal"):
"""
Service that accepts a JSON document and extracts the "created date" of the item, using the
following rules:
View
87 lib/akamod/enrich-format.py
@@ -2,29 +2,34 @@
from akara import response
from akara.services import simple_service
from amara.thirdparty import json
-from dplaingestion.selector import getprop, setprop, exists
+from dplaingestion.selector import delprop, getprop, setprop, exists
import re
import os
from amara.lib.iri import is_absolute
-@simple_service('POST', 'http://purl.org/la/dp/enrich-format', 'enrich-format', 'application/json')
-def enrichformat(body,ctype,action="enrich-format",prop="isShownAt/format",alternate="aggregatedCHO/physicalMedium",typefield="aggregatedCHO/type"):
+@simple_service('POST', 'http://purl.org/la/dp/enrich-format', 'enrich-format',
+ 'application/json')
+def enrichformat(body, ctype, action="enrich-format",
+ prop="sourceResource/format",
+ type_field="sourceResource/type"):
"""
- Service that accepts a JSON document and enriches the "format" field of that document
- by:
-
- a) setting the format to be all lowercase
- b) running through a set of cleanup regex's (e.g. image/jpg -> image/jpeg)
- c) checking to see if the field is a valid IMT, and moving it to a separatee field if not
- See http://www.iana.org/assignments/media-types for list of valid media-types.
- We require that a subtype is defined.
- d) Remove any extra text after the IMT
- e) Set type field from format field, if it is not set.
- The format field is taken if it is a string, or the first element if it is a list.
- It is then splitted and the first part of IMT is taken.
-
- By default works on the 'format' field, but can be overridden by passing the name of the field to use
- as the 'prop' parameter. Non-IMT's are moved the field defined by the 'alternate' parameter.
+ Service that accepts a JSON document and enriches the "format" field of
+ that document by:
+
+ a) Setting the format to be all lowercase
+ b) Running through a set of cleanup regex's (e.g. image/jpg -> image/jpeg)
+ c) Checking to see if the field is a valid IMT
+ See http://www.iana.org/assignments/media-types for list of valid
+ media-types. We require that a subtype is defined.
+ d) Removing any extra text after the IMT
+ e) Moving valid IMT values to hasView/format if hasView exists and
+ its format is not set
+ f) Setting type field from format field, if it is not set. The format field
+ is taken if it is a string, or the first element if it is a list. It is
+ then split and the first part of IMT is taken.
+
+ By default works on the 'sourceResource/format' field but can be overridden
+ by passing the name of the field to use as the 'prop' parameter.
"""
FORMAT_2_TYPE_MAPPINGS = {
@@ -42,13 +47,15 @@ def enrichformat(body,ctype,action="enrich-format",prop="isShownAt/format",alter
'multipart', 'text', 'video']
def get_ext(s):
- return os.path.splitext(s)[1].split('.')[1]
+ ext = os.path.splitext(s)[1].split('.')
+
+ return ext[1] if len(ext) == 2 else ""
def cleanup(s):
s = s.lower().strip()
for pattern, replace in REGEXPS:
s = re.sub(pattern, replace, s)
- s = re.sub(r"^([a-z0-9/]+)\s.*",r"\1",s)
+ s = re.sub(r"^([a-z0-9/]+)\s.*",r"\1", s)
return s
def is_imt(s):
@@ -63,34 +70,40 @@ def is_imt(s):
response.add_header('content-type','text/plain')
return "Unable to parse body as JSON"
- if exists(data,prop):
- v = getprop(data,prop)
+ if exists(data, prop):
+ v = getprop(data, prop)
format = []
- physicalFormat = getprop(data,alternate) if exists(data,alternate) else []
- if not isinstance(physicalFormat,list):
- physicalFormat = [physicalFormat]
+ hasview_format = []
for s in (v if not isinstance(v,basestring) else [v]):
if is_absolute(s):
s = get_ext(s)
cleaned = cleanup(s)
if is_imt(cleaned):
- if cleaned not in format:
- format.append(cleaned)
- else:
- if s not in physicalFormat:
- physicalFormat.append(s)
+ if exists(data, "hasView") and not \
+ exists(data, "hasView/format") and \
+ cleaned not in hasview_format:
+ hasview_format.append(cleaned)
+ else:
+ if cleaned not in format:
+ format.append(cleaned)
if format:
- setprop(data,prop,format[0]) if len(format) == 1 else setprop(data,prop,format)
+ if len(format) == 1:
+ setprop(data, prop, format[0])
+ else:
+ setprop(data, prop, format)
else:
- setprop(data,prop,None)
- if physicalFormat:
- setprop(data,alternate,physicalFormat[0]) if len(physicalFormat) == 1 else setprop(data,alternate,physicalFormat)
+ delprop(data, prop)
+ if hasview_format:
+ if len(hasview_format) == 1:
+ setprop(data, "hasView/format", hasview_format[0])
+ else:
+ setprop(data, "hasView/format", hasview_format)
# Setting the type if it is empty.
- f = getprop(data, typefield, True)
- if not f and exists(data, prop):
+ t = getprop(data, type_field, True)
+ if not t and exists(data, prop):
format = getprop(data, prop)
use_format = None
if isinstance(format, list) and len(format) > 0:
@@ -102,6 +115,6 @@ def is_imt(s):
use_format = use_format.split("/")[0]
if use_format in FORMAT_2_TYPE_MAPPINGS:
- setprop(data, typefield, FORMAT_2_TYPE_MAPPINGS[use_format])
+ setprop(data, type_field, FORMAT_2_TYPE_MAPPINGS[use_format])
return json.dumps(data)
View
2 lib/akamod/enrich-subject.py
@@ -6,7 +6,7 @@
import re
@simple_service('POST', 'http://purl.org/la/dp/enrich-subject', 'enrich-subject', 'application/json')
-def enrichsubject(body,ctype,action="enrich-subject",prop="aggregatedCHO/subject"):
+def enrichsubject(body,ctype,action="enrich-subject",prop="sourceResource/subject"):
'''
Service that accepts a JSON document and enriches the "subject" field of that document
by:
View
56 lib/akamod/enrich-type.py
@@ -2,27 +2,34 @@
from akara import response
from akara.services import simple_service
from amara.thirdparty import json
-from dplaingestion.selector import getprop, setprop, exists
+from dplaingestion.selector import delprop, getprop, setprop, exists
import re
-@simple_service('POST', 'http://purl.org/la/dp/enrich-type', 'enrich-type', 'application/json')
-def enrichtype(body,ctype,action="enrich-type", prop="aggregatedCHO/type", alternate="aggregatedCHO/physicalMedium"):
+@simple_service('POST', 'http://purl.org/la/dp/enrich-type', 'enrich-type',
+ 'application/json')
+def enrichtype(body,ctype,action="enrich-type", prop="sourceResource/type",
+ format_field="sourceResource/format"):
"""
- Service that accepts a JSON document and enriches the "type" field of that document
- by:
+ Service that accepts a JSON document and enriches the "type" field of that
+ document by:
a) making the type lowercase
- b) converting "image" to "still image" (TODO: Amy to confirm that this is ok)
+ b) converting "image" to "still image"
+ (TODO: Amy to confirm that this is ok)
c) applying a set of regexps to do data cleanup (remove plural forms)
- d) moving all items that are not standard DC types to the physical format field (http://dublincore.org/documents/resource-typelist/)
+ d) moving all items that are not standard DC types to the
+ sourceResource/format
+ (http://dublincore.org/documents/resource-typelist/)
- By default works on the 'type' field, but can be overridden by passing the name of the field to use
- as a parameter
+ By default works on the 'type' field, but can be overridden by passing the
+ name of the field to use as a parameter
"""
REGEXPS = ('images','image'), ('still image','image')
- DC_TYPES = ['collection', 'dataset', 'event', 'image', 'still image', 'interactive resource', 'model', 'moving image', 'party', 'physical object',
- 'place', 'service', 'software', 'sound', 'text']
+ DC_TYPES = ['collection', 'dataset', 'event', 'image', 'still image',
+ 'interactive resource', 'model', 'moving image', 'party',
+ 'physical object', 'place', 'service', 'software', 'sound',
+ 'text']
def cleanup(s):
s = s.lower().strip()
@@ -41,20 +48,29 @@ def is_dc_type(s):
return "Unable to parse body as JSON"
if exists(data,prop):
- v = getprop(data,prop)
+ v = getprop(data, prop)
dctype = []
- physicalFormat = getprop(data,alternate) if exists(data,alternate) else []
- if not isinstance(physicalFormat,list):
- physicalFormat = [physicalFormat]
+ f = getprop(data, format_field) if exists(data, format_field) else []
+ if not isinstance(f, list):
+ f = [f]
for s in (v if not isinstance(v,basestring) else [v]):
- dctype.append(cleanup(s)) if is_dc_type(cleanup(s)) else physicalFormat.append(s)
+ if is_dc_type(cleanup(s)):
+ dctype.append(cleanup(s))
+ else:
+ f.append(s)
if dctype:
- setprop(data,prop,dctype[0]) if len(dctype) == 1 else setprop(data,prop,dctype)
+ if len(dctype) == 1:
+ setprop(data, prop, dctype[0])
+ else:
+ setprop(data, prop, dctype)
else:
- setprop(data,prop,None)
- if physicalFormat:
- setprop(data,alternate,physicalFormat[0]) if len(physicalFormat) == 1 else setprop(data,alternate,physicalFormat)
+ delprop(data, prop)
+ if f:
+ if len(f) == 1:
+ setprop(data, format_field, f[0])
+ else:
+ setprop(data, format_field, f)
return json.dumps(data)
View
2 lib/akamod/enrich.py
@@ -176,7 +176,7 @@ def enrich(body, ctype):
# Add collection information
record[u'collection'] = {
'@id' : at_id,
- 'name' : enriched_collection.get('title',"")
+ 'title' : enriched_collection.get('title',"")
}
if 'description' in enriched_collection:
record[u'collection']['description'] = enriched_collection.get('description',"")
View
2 lib/akamod/enrich_language.py
@@ -6,7 +6,7 @@
import re
@simple_service('POST', 'http://purl.org/la/dp/enrich_language', 'enrich_language', 'application/json')
-def enrich_language(body, ctype, action="enrich_language", prop="aggregatedCHO/language"):
+def enrich_language(body, ctype, action="enrich_language", prop="sourceResource/language"):
'''
Service that accepts a JSON document and enriches the "language" field of that document
by:
View
2 lib/akamod/enrich_location.py
@@ -8,7 +8,7 @@
REGEXPS = ('\.',''), ('\(',''), ('\)',''), ('-',''), (',','')
@simple_service('POST', 'http://purl.org/la/dp/enrich_location', 'enrich_location', 'application/json')
-def enrichlocation(body,ctype,action="enrich_location", prop="aggregatedCHO/spatial"):
+def enrichlocation(body,ctype,action="enrich_location", prop="sourceResource/spatial"):
"""
Service that accepts a JSON document and enriches the "spatial" field of that document by
iterating through the spatial fields and mapping to the state and iso3166-2, if not already
View
4 lib/akamod/georgia_identify_object.py
@@ -62,9 +62,7 @@ def georgia_identify_object(body, ctype, download="True"):
preview_url = preview_url_pattern % {"repo": repo, "coll": coll, "item": item}
- data["object"] = {"@id": preview_url,
- "format": None,
- "rights": selector.getprop(data, "aggregatedCHO/rights", keyErrorAsNone=True)}
+ data["object"] = preview_url
status = IGNORE
if download == "True":
View
12 lib/akamod/kentucky_identify_object.py
@@ -11,7 +11,7 @@
@simple_service('POST', 'http://purl.org/la/dp/kentucky_identify_object',
'kentucky_identify_object', 'application/json')
-def kentucky_identify_object(body, ctype, rights_field="aggregatedCHO/rights", download="True"):
+def kentucky_identify_object(body, ctype, download="True"):
"""
Responsible for: adding a field to a document with the URL where we
should expect to the find the thumbnail
@@ -33,7 +33,7 @@ def log_json():
response.add_header('content-type', 'text/plain')
return msg
- relation_field = "aggregatedCHO/relation"
+ relation_field = "sourceResource/relation"
if exists(data, relation_field):
url = getprop(data, relation_field)
else:
@@ -42,13 +42,7 @@ def log_json():
return body
base_url, ext = os.path.splitext(url)
- thumb_url = "%s_tb%s" % (base_url, ext)
-
- rights = None
- if exists(data, rights_field):
- rights = getprop(data, rights_field)
-
- data["object"] = {"@id": thumb_url, "format": "", "rights": rights}
+ data["object"] = "%s_tb%s" % (base_url, ext)
status = IGNORE
if download == "True":
View
2 lib/akamod/mdl-enrich-location.py
@@ -5,7 +5,7 @@
from dplaingestion.selector import getprop, setprop, exists
@simple_service('POST', 'http://purl.org/la/dp/mdl-enrich-location', 'mdl-enrich-location', 'application/json')
-def mdlenrichlocation(body,ctype,action="mdl-enrich-location", prop="aggregatedCHO/spatial"):
+def mdlenrichlocation(body,ctype,action="mdl-enrich-location", prop="sourceResource/spatial"):
"""
Service that accepts a JSON document and enriches the "spatial" field of that document by:
View
18 lib/akamod/move_date_values.py
@@ -5,11 +5,13 @@
from dplaingestion.selector import getprop, setprop, delprop, exists
import re
-@simple_service('POST', 'http://purl.org/la/dp/move_date_values', 'move_date_values', 'application/json')
-def movedatevalues(body,ctype,action="move_date_values",prop=None,to_prop="aggregatedCHO/temporal"):
+@simple_service('POST', 'http://purl.org/la/dp/move_date_values',
+ 'move_date_values', 'application/json')
+def movedatevalues(body, ctype, action="move_date_values", prop=None,
+ to_prop="sourceResource/temporal"):
"""
- Service that accepts a JSON document and moves any dates found in the prop field to the
- temporal field.
+ Service that accepts a JSON document and moves any dates found in the prop
+ field to the temporal field.
"""
if not prop:
@@ -19,9 +21,13 @@ def movedatevalues(body,ctype,action="move_date_values",prop=None,to_prop="aggre
REGSUB = ("\(", ""), ("\)", ""), ("\.",""), ("\?","")
REGSEARCH = [
"\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
+ "\d{1,2} *[-/] *\d{4} *[-/] *\d{1,2} *[-/] *\d{4}",
+ "\d{4} *[-/] *\d{1,2} *[-/] *\d{4} *[-/] *\d{1,2}",
"\d{1,4} *[-/] *\d{1,4} *[-/] *\d{1,4}",
"\d{4} *[-/] *\d{4}",
- "\d{4}"
+ "\d{1,2} *[-/] *\d{4}",
+ "\d{4} *[-/] *\d{1,2}",
+ "\d{4}s?"
]
def cleanup(s):
@@ -41,7 +47,7 @@ def cleanup(s):
remove = []
toprop = getprop(data, to_prop) if exists(data, to_prop) else []
- for v in values:
+ for v in (values if isinstance(values, list) else [values]):
c = cleanup(v)
for pattern in REGSEARCH:
m = re.compile(pattern).findall(c)
View
2 lib/akamod/mwdl_enrich_state_located_in.py
@@ -7,7 +7,7 @@
@simple_service('POST', 'http://purl.org/la/dp/mwdl_enrich_state_located_in',
'mwdl_enrich_state_located_in', 'application/json')
def mwdlenrichstatelocatedin(body, ctype, action="mdl_enrich_state_located_in",
- prop="aggregatedCHO/stateLocatedIn"):
+ prop="sourceResource/stateLocatedIn"):
"""
Service that accepts a JSON document and enriches the "stateLocatedIn"
field of that document by:
View
21 lib/akamod/oai-to-dpla.py
@@ -27,7 +27,7 @@
"state": "dpla:state",
"coordinates": "dpla:coordinates",
"stateLocatedIn" : "dpla:stateLocatedIn",
- "aggregatedCHO" : "edm:aggregatedCHO",
+ "sourceResource" : "edm:sourceResource",
"dataProvider" : "edm:dataProvider",
"hasView" : "edm:hasView",
"isShownAt" : "edm:isShownAt",
@@ -45,17 +45,13 @@
}
def is_shown_at_transform(d):
- source = ""
+ source = None
for s in (d["handle"] if not isinstance(d["handle"],basestring) else [d["handle"]]):
if is_absolute(s):
source = s
break
- return {
- "isShownAt" : {
- "@id" : source,
- "format": d.get("format", None)
- }
- }
+
+ return {"isShownAt" : source }
def spatial_transform(d):
spatial = d["coverage"]
@@ -78,7 +74,8 @@ def spatial_transform(d):
"rights" : lambda d: {"rights": d.get("rights",None)},
"subject" : lambda d: {"subject": d.get("subject",None)},
"title" : lambda d: {"title": d.get("title",None)},
- "type" : lambda d: {"type": d.get("type",None)}
+ "type" : lambda d: {"type": d.get("type",None)},
+ "format" : lambda d: {"format": d.get("format",None)}
}
AGGREGATION_TRANSFORMER = {
@@ -116,13 +113,13 @@ def oaitodpla(body,ctype,geoprop=None):
out = {
"@context": CONTEXT,
- "aggregatedCHO" : {}
+ "sourceResource" : {}
}
- # Apply all transformation rules from original document to aggregatedCHO
+ # Apply all transformation rules from original document to sourceResource
for p in data.keys():
if p in CHO_TRANSFORMER:
- out['aggregatedCHO'].update(CHO_TRANSFORMER[p](data))
+ out['sourceResource'].update(CHO_TRANSFORMER[p](data))
if p in AGGREGATION_TRANSFORMER:
out.update(AGGREGATION_TRANSFORMER[p](data))
View
23 lib/akamod/primo-to-dpla.py
@@ -28,7 +28,7 @@
"state": "dpla:state",
"coordinates": "dpla:coordinates",
"stateLocatedIn" : "dpla:stateLocatedIn",
- "aggregatedCHO" : "edm:aggregatedCHO",
+ "sourceResource" : "edm:sourceResource",
"dataProvider" : "edm:dataProvider",
"hasView" : "edm:hasView",
"isShownAt" : "edm:isShownAt",
@@ -44,11 +44,6 @@
}
}
-def web_resource_transform(d, url):
- format_field = RECORD + "display/format"
- format = getprop(d, format_field) if exists(d, format_field) else None
- return {"@id": url, "format": format} if format else {"@id": url}
-
def multi_transform(d, key, props):
values = []
@@ -85,8 +80,8 @@ def multi_transform(d, key, props):
"originalRecord" : lambda d, p: {"originalRecord": getprop(d, p)},
"ingestType" : lambda d, p: {"ingestType": getprop(d, p)},
"ingestDate" : lambda d, p: {"ingestDate": getprop(d, p)},
- RECORD + "control/recordid" : lambda d, p: {"isShownAt": web_resource_transform(d, URL + getprop(d, p))},
- LINKS + "thumbnail" : lambda d, p: {"object": web_resource_transform(d, getprop(d, p))}
+ RECORD + "control/recordid" : lambda d, p: {"isShownAt": URL + getprop(d, p)},
+ LINKS + "thumbnail" : lambda d, p: {"object": getprop(d, p)}
}
@simple_service("POST", "http://purl.org/la/dp/primo-to-dpla", "primo-to-dpla", "application/ld+json")
@@ -109,7 +104,7 @@ def primotodpla(body,ctype,geoprop=None):
out = {
"@context": CONTEXT,
- "aggregatedCHO": {}
+ "sourceResource": {}
}
# For ARC, "data" is the source record so set it here
@@ -118,7 +113,7 @@ def primotodpla(body,ctype,geoprop=None):
# Apply all transformation rules from original document
for p in CHO_TRANSFORMER:
if exists(data, p):
- out["aggregatedCHO"].update(CHO_TRANSFORMER[p](data, p))
+ out["sourceResource"].update(CHO_TRANSFORMER[p](data, p))
for p in AGGREGATION_TRANSFORMER:
if exists(data, p):
out.update(AGGREGATION_TRANSFORMER[p](data, p))
@@ -129,10 +124,10 @@ def primotodpla(body,ctype,geoprop=None):
sp_props = ["display/lds08", "search/lsr14"]
ipo_props = ["display/lds04", "search/lsr13"]
title_props = ["display/title", "display/lds10"]
- out["aggregatedCHO"].update(multi_transform(data, "identifier", id_props))
- out["aggregatedCHO"].update(multi_transform(data, "spatial", sp_props))
- out["aggregatedCHO"].update(multi_transform(data, "isPartOf", ipo_props))
- out["aggregatedCHO"].update(multi_transform(data, "title", title_props))
+ out["sourceResource"].update(multi_transform(data, "identifier", id_props))
+ out["sourceResource"].update(multi_transform(data, "spatial", sp_props))
+ out["sourceResource"].update(multi_transform(data, "isPartOf", ipo_props))
+ out["sourceResource"].update(multi_transform(data, "title", title_props))
dp_props = ["display/lds03", "search/lsr12"]
out.update(multi_transform(data, "dataProvider", dp_props))
View
28 lib/akamod/shred.py
@@ -4,38 +4,42 @@
from amara.thirdparty import json
from dplaingestion.selector import getprop, setprop, exists
-@simple_service('POST', 'http://purl.org/la/dp/shred', 'shred', 'application/json')
-def shred(body,ctype,action="shred",prop=None,delim=';',keepdup=None):
+@simple_service('POST', 'http://purl.org/la/dp/shred', 'shred',
+ 'application/json')
+def shred(body, ctype, action="shred", prop=None, delim=';', keepdup=None):
'''
Service that accepts a JSON document and "shreds" or "unshreds" the value
of the field(s) named by the "prop" parameter
"prop" can include multiple property names, delimited by a comma (the delim
- property is used only for the fields to be shredded/unshredded). This requires
- that the fields share a common delimiter however.
+ property is used only for the fields to be shredded/unshredded). This
+ requires that the fields share a common delimiter however.
'''
try :
data = json.loads(body)
except:
response.code = 500
- response.add_header('content-type','text/plain')
+ response.add_header('content-type', 'text/plain')
return "Unable to parse body as JSON"
for p in prop.split(','):
- if exists(data,p):
- v = getprop(data,p)
+ if exists(data, p):
+ v = getprop(data, p)
if action == "shred":
- if isinstance(v,list):
+ if isinstance(v, list):
v = delim.join(v)
- setprop(data,p,v)
- if delim not in v: continue
+ if delim in v:
+ setprop(data, p, v)
+ else:
+ continue
shredded = []
- [shredded.append(s.strip()) for s in v.split(delim) if keepdup or not shredded.count(s.strip())]
+ [shredded.append(s.strip()) for s in v.split(delim) if
+ keepdup or not shredded.count(s.strip())]
setprop(data, p, shredded)
elif action == "unshred":
if isinstance(v,list):
- setprop(data,p,delim.join(v))
+ setprop(data, p, delim.join(v))
return json.dumps(data)
View
1 lib/oai.py
@@ -267,6 +267,7 @@ def list_records(self, set="", resumption_token = ""):
</header>
<metadata>
<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
+ <dc:subject ak:rel="local-name()" ak:value=".">Nuclear disarmament</dc:subject>
<dc:creator ak:rel="local-name()" ak:value=".">Cohen, Joshua</dc:creator>
<dc:date ak:rel="local-name()" ak:value=".">2004-08-20T19:48:34Z</dc:date>
<dc:date>2004-08-20T19:48:34Z</dc:date>
View
23 profiles/artstor.pjs
@@ -8,22 +8,21 @@
"/select-id?prop=id",
"/oai-to-dpla",
"/artstor_select_isshownat",
- "/shred?prop=aggregatedCHO%2Fcontributor%2CaggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
- "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
- "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
- "/shred?prop=isShownAt%2Fformat",
- "/shred?prop=aggregatedCHO%2FphysicalMedium&delim=%3b",
- "/move_date_values?prop=aggregatedCHO%2Fsubject",
- "/move_date_values?prop=aggregatedCHO%2Fspatial",
- "/shred?prop=aggregatedCHO%2Fspatial&delim=--",
+ "/shred?prop=sourceResource%2Fcontributor%2CsourceResource%2Fcreator%2CsourceResource%2Fdate",
+ "/shred?prop=sourceResource%2Flanguage%2CsourceResource%2Fpublisher%2CsourceResource%2Frelation",
+ "/shred?prop=sourceResource%2Fsubject%2CsourceResource%2Ftype%2CsourceResource%2Fformat",
+ "/shred?prop=sourceResource%2Fformat&delim=%3b",
+ "/cleanup_value",
+ "/move_date_values?prop=sourceResource%2Fsubject",
+ "/move_date_values?prop=sourceResource%2Fspatial",
+ "/shred?prop=sourceResource%2Fspatial&delim=--",
"/enrich_earliest_date",
"/enrich-subject",
"/enrich_date",
- "/cleanup_value",
- "/enrich-type",
- "/enrich-format",
+ "/enrich-type",
+ "/enrich-format",
"/artstor_identify_object",
- "/filter_paths?paths=aggregatedCHO%2Fspatial%2CaggregatedCHO%2Frights%2CisShownAt%2Frights%2Cobject%2Frights",
+ "/filter_paths?paths=sourceResource%2Fspatial%2CsourceResource%2Frights",
"/enrich_location",
"/enrich_language"
],
View
21 profiles/bhl.pjs
@@ -7,28 +7,25 @@
"enrichments_rec": [
"/select-id",
"/oai-to-dpla",
- "/shred?prop=aggregatedCHO%2Fcontributor%2CaggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
- "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
- "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
- "/shred?prop=isShownAt%2Fformat",
- "/move_date_values?prop=aggregatedCHO%2Fsubject",
- "/move_date_values?prop=aggregatedCHO%2Fspatial",
- "/shred?prop=aggregatedCHO%2Fspatial&delim=--",
+ "/shred?prop=sourceResource%2Fcontributor%2CsourceResource%2Fcreator%2CsourceResource%2Fdate",
+ "/shred?prop=sourceResource%2Flanguage%2CsourceResource%2Fpublisher%2CsourceResource%2Frelation",
+ "/shred?prop=sourceResource%2Fsubject%2CsourceResource%2Ftype%2CsourceResource%2Fformat",
+ "/cleanup_value",
+ "/move_date_values?prop=sourceResource%2Fsubject",
+ "/move_date_values?prop=sourceResource%2Fspatial",
+ "/shred?prop=sourceResource%2Fspatial&delim=--",
"/enrich_earliest_date",
"/enrich-subject",
"/enrich_date",
- "http://localhost:8875/cleanup_value",
"/enrich-type",
"/enrich-format",
"/enrich_location",
"/bhl_contributor_to_collection",
- "/copy_prop?prop=aggregatedCHO%2Fcontributor&to_prop=dataProvider&create=True&remove=True",
+ "/copy_prop?prop=sourceResource%2Fcontributor&to_prop=dataProvider&create=True&remove=True",
"/enrich_language"
],
"subresources": [
- "item",
- "articlepdf",
- "title"
+ "item"
],
"contributor": {
"@id": "http://dp.la/api/contributor/bhl",
View
21 profiles/clemson.pjs
@@ -7,24 +7,23 @@
"enrichments_rec": [
"/select-id",
"/oai-to-dpla",
- "/shred?prop=aggregatedCHO%2Fcontributor%2CaggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
- "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
- "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
- "/shred?prop=aggregatedCHO%2Fsubject&delim=%3Cbr%3E",
- "/shred?prop=isShownAt%2Fformat",
- "/move_date_values?prop=aggregatedCHO%2Fsubject",
- "/move_date_values?prop=aggregatedCHO%2Fspatial",
- "/shred?prop=aggregatedCHO%2Fspatial&delim=--",
- "/enrich_earliest_date",
+ "/shred?prop=sourceResource%2Fcontributor%2CsourceResource%2Fcreator%2CsourceResource%2Fdate",
+ "/shred?prop=sourceResource%2Flanguage%2CsourceResource%2Fpublisher%2CsourceResource%2Frelation",
+ "/shred?prop=sourceResource%2Fsubject%2CsourceResource%2Ftype%2CsourceResource%2Fformat",
+ "/shred?prop=sourceResource%2Fsubject&delim=%3Cbr%3E",
"/cleanup_value",
+ "/move_date_values?prop=sourceResource%2Fsubject",
+ "/move_date_values?prop=sourceResource%2Fspatial",
+ "/shred?prop=sourceResource%2Fspatial&delim=--",
+ "/enrich_earliest_date",
"/enrich-subject",
"/enrich_date",
- "/enrich-subject?prop=aggregatedCHO%2Fcreator",
+ "/enrich-subject?prop=sourceResource%2Fcreator",
"/enrich-type",
"/enrich-format",
"/contentdm_identify_object",
"/enrich_location",
- "/copy_prop?prop=aggregatedCHO%2Fpublisher&to_prop=dataProvider&create=True&remove=True",
+ "/copy_prop?prop=sourceResource%2Fpublisher&to_prop=dataProvider&create=True&remove=True",
"/enrich_language"
],
"subresources": [ ],
View
32 profiles/digital-commonwealth.pjs
@@ -0,0 +1,32 @@
+{
+ "list_sets": "http://localhost:8881/oai.listsets.json?endpoint=http://digitalcommonwealth.org/oai-pmh-repository/request",
+ "enrichments_coll": [
+ "http://localhost:8881/oai-set-name?sets_service=http://localhost:8881/oai.listsets.json?endpoint=http://digitalcommonwealth.org/oai-pmh-repository/request"
+ ],
+ "name": "digital-commonwealth",
+ "enrichments_rec": [
+ "http://localhost:8881/select-id",
+ "http://localhost:8881/oai-to-dpla",
+ "http://localhost:8881/shred?prop=sourceResource%2Fcontributor%2CsourceResource%2Fcreator%2CsourceResource%2Fdate",
+ "http://localhost:8881/shred?prop=sourceResource%2Flanguage%2CsourceResource%2Fpublisher%2CsourceResource%2Frelation",
+ "http://localhost:8881/shred?prop=sourceResource%2Fsubject%2CsourceResource%2Ftype%2CsourceResource%2Fformat",
+ "http://localhost:8881/cleanup_value",
+ "http://localhost:8881/move_date_values?prop=sourceResource%2Fsubject",
+ "http://localhost:8881/move_date_values?prop=sourceResource%2Fspatial",
+ "http://localhost:8881/shred?prop=sourceResource%2Fspatial&delim=--",
+ "http://localhost:8881/enrich-subject",
+ "http://localhost:8881/enrich_date",
+ "http://localhost:8881/enrich_earliest_date",
+ "http://localhost:8881/enrich-type",
+ "http://localhost:8881/enrich-format",
+ "http://localhost:8881/enrich_location",
+ "http://localhost:8881/contentdm_identify_object"
+ ],
+ "blacklist": ["3","5","6","8","13","18","19","20","27","41","177"],
+ "contributor": {
+ "@id": "http://dp.la/api/contributor/digital-commonwealth",
+ "name": "Digital Commonwealth"
+ },
+ "type": "oai",
+ "endpoint_URL": "http://localhost:8881/dpla-list-records?endpoint=http://digitalcommonwealth.org/oai-pmh-repository/request&oaiset="
+}
View
23 profiles/georgia.pjs
@@ -8,22 +8,23 @@
"enrichments_rec": [
"/select-id",
"/oai-to-dpla",
- "/sets_prop?prop=aggregatedCHO%2Fdate",
- "/copy_prop?prop=originalRecord%2Fsource&to_prop=aggregatedCHO%2Fdescription&create=True&no_replace=True",
- "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
- "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftitle%2CaggregatedCHO%2Ftype%2CaggregatedCHO%2Fcreator",
- "/shred?prop=isShownAt%2Fformat",
- "/move_date_values?prop=aggregatedCHO%2Fspatial&to_prop=aggregatedCHO%2Fdate",
+ "/sets_prop?prop=sourceResource%2Fdate",
+ "/copy_prop?prop=originalRecord%2Fsource&to_prop=sourceResource%2Fdescription&create=True&no_replace=True",
+ "/shred?prop=sourceResource%2Flanguage%2CsourceResource%2Fpublisher%2CsourceResource%2Frelation%2CsourceResource%2Fformat",
+ "/shred?prop=sourceResource%2Fsubject%2CsourceResource%2Ftitle%2CsourceResource%2Ftype%2CsourceResource%2Fcreator",
+ "/shred?prop=sourceResource%2Fspatial&delim=--",
+ "/cleanup_value",
+ "/move_date_values?prop=sourceResource%2Fspatial&to_prop=sourceResource%2Fdate",
"/enrich_earliest_date",
"/enrich-subject",
"/enrich-type",
"/enrich-format",
"/enrich_location",
- "/copy_prop?prop=aggregatedCHO%2Fpublisher&to_prop=aggregatedCHO%2FstateLocatedIn&create=True&remove=True",
- "/enrich_location?prop=aggregatedCHO%2FstateLocatedIn",
- "/copy_prop?prop=aggregatedCHO%2Fcontributor&to_prop=dataProvider&remove=True",
- "/georgia_identify_object",
- "/cleanup_value"
+ "/enrich_language",
+ "/copy_prop?prop=sourceResource%2Fpublisher&to_prop=sourceResource%2FstateLocatedIn&create=True&remove=True",
+ "/enrich_location?prop=sourceResource%2FstateLocatedIn",
+ "/copy_prop?prop=sourceResource%2Fcontributor&to_prop=dataProvider&remove=True",
+ "/georgia_identify_object"
],
"subresources": [
"dpla"
View
25 profiles/kentucky.pjs
@@ -6,22 +6,27 @@
"/select-id",
"/oai-to-dpla",
"/sets_prop?prop=collection",
- "/sets_prop?prop=aggregatedCHO%2Fcontributor",
- "/shred?prop=aggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
- "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher",
- "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftitle%2CaggregatedCHO%2Ftype",
+ "/sets_prop?prop=sourceResource%2Fcontributor",
+ "/shred?prop=sourceResource%2Fcreator%2CsourceResource%2Fdate",
+ "/shred?prop=sourceResource%2Fsubject%2CsourceResource%2Ftitle%2CsourceResource%2Ftype",
+ "/shred?prop=sourceResource%2Flanguage%2CsourceResource%2Fpublisher%2CsourceResource%2Fformat",
+ "/shred?prop=sourceResource%2Fspatial&delim=--",
+ "/cleanup_value",
"/enrich_earliest_date",
"/enrich_location",
"/enrich-subject",
"/enrich-type",
"/enrich-format",
+ "/enrich_language",
+ "/copy_prop?prop=sourceResource%2Frelation&to_prop=hasView&key=@id&create=True",
+ "/copy_prop?prop=sourceResource%2Frelation&to_prop=hasView&key=format&create=True",
+ "/enrich-format?prop=hasView%2Fformat",
"/kentucky_identify_object",
- "/sets_prop?prop=aggregatedCHO%2Frelation",
- "/copy_prop?prop=aggregatedCHO%2Fpublisher&to_prop=dataProvider&create=True",
- "/copy_prop?prop=aggregatedCHO%2Fpublisher&to_prop=aggregatedCHO%2FstateLocatedIn&create=True",
- "/sets_prop?prop=aggregatedCHO%2Fpublisher",
- "/enrich_location?prop=aggregatedCHO%2FstateLocatedIn",
- "/cleanup_value"
+ "/sets_prop?prop=sourceResource%2Frelation",
+ "/copy_prop?prop=sourceResource%2Fpublisher&to_prop=dataProvider&create=True",
+ "/copy_prop?prop=sourceResource%2Fpublisher&to_prop=sourceResource%2FstateLocatedIn&create=True",
+ "/sets_prop?prop=sourceResource%2Fpublisher",
+ "/enrich_location?prop=sourceResource%2FstateLocatedIn"
],
"subresources": [],
"contributor": {
View
17 profiles/minnesota.pjs
@@ -7,23 +7,22 @@
"enrichments_rec": [
"/select-id",
"/oai-to-dpla",
- "/shred?prop=aggregatedCHO%2Fcontributor%2CaggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
- "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
- "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
- "/shred?prop=isShownAt%2Fformat",
- "/move_date_values?prop=aggregatedCHO%2Fsubject",
- "/move_date_values?prop=aggregatedCHO%2Fspatial",
- "http://localhost:8878/shred?prop=aggregatedCHO%2Fspatial&delim=--",
+ "/shred?prop=sourceResource%2Fcontributor%2CsourceResource%2Fcreator%2CsourceResource%2Fdate",
+ "/shred?prop=sourceResource%2Flanguage%2CsourceResource%2Fpublisher%2CsourceResource%2Frelation",
+ "/shred?prop=sourceResource%2Fsubject%2CsourceResource%2Ftype%2CsourceResource%2Fformat",
+ "/cleanup_value",
+ "/move_date_values?prop=sourceResource%2Fsubject",
+ "/move_date_values?prop=sourceResource%2Fspatial",
+ "/shred?prop=sourceResource%2Fspatial&delim=--",
"/enrich_earliest_date",
"/enrich_date",
"/enrich-subject",
- "http://localhost:8875/cleanup_value",
"/enrich-type",
"/enrich-format",
"/mdl-enrich-location",
"/enrich_location",
"/contentdm_identify_object",
- "/copy_prop?prop=aggregatedCHO%2Fpublisher&to_prop=dataProvider&create=True&remove=True",
+ "/copy_prop?prop=sourceResource%2Fpublisher&to_prop=dataProvider&create=True&remove=True",
"/enrich_language"
],
"subresources": [],
View
20 profiles/mwdl.pjs
@@ -1,27 +1,25 @@
{
- "bulk_size": "500",
+ "bulk_size": "100",
"enrichments_coll": [],
"name": "mwdl",
"enrichments_rec": [
"/select-id?prop=_id",
"/primo-to-dpla",
- "/shred?prop=aggregatedCHO%2Fcontributor%2CaggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
- "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
- "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
- "/shred?prop=isShownAt%2Fformat",
+ "/shred?prop=sourceResource%2Fcontributor%2CsourceResource%2Fcreator%2CsourceResource%2Fdate",
+ "/shred?prop=sourceResource%2Flanguage%2CsourceResource%2Fpublisher%2CsourceResource%2Frelation",
+ "/shred?prop=sourceResource%2Fsubject%2CsourceResource%2Ftype%2CsourceResource%2Fformat",
+ "/cleanup_value",
"/mwdl_enrich_state_located_in",
- "/move_date_values?prop=aggregatedCHO%2Fsubject",
- "/move_date_values?prop=aggregatedCHO%2Fspatial",
+ "/move_date_values?prop=sourceResource%2Fsubject",
+ "/move_date_values?prop=sourceResource%2Fspatial",
"/enrich_earliest_date",
"/enrich_date",
"/enrich-subject",
- "/cleanup_value",
"/enrich-type",
"/enrich-format",
"/enrich_location",
- "/enrich_location?prop=aggregatedCHO%2FstateLocatedIn",
- "/enrich_language",
- "/sets_prop?prop=aggregatedCHO%2FphysicalMedium"
+ "/enrich_location?prop=sourceResource%2FstateLocatedIn",
+ "/enrich_language"
],
"last_checked": "2013-03-05T17:30:21.689809",
"contributor": {
View
10 profiles/nara.pjs
@@ -4,18 +4,18 @@
"enrichments_rec": [
"/select-id?prop=_id",
"/arc-to-dpla",
- "/shred?prop=aggregatedCHO%2Fcontributor%2CaggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
- "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
- "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
+ "/shred?prop=sourceResource%2Fcontributor%2CsourceResource%2Fcreator%2CsourceResource%2Fdate",
+ "/shred?prop=sourceResource%2Flanguage%2CsourceResource%2Fpublisher%2CsourceResource%2Frelation",
+ "/shred?prop=sourceResource%2Fsubject%2CsourceResource%2Ftype%2CsourceResource%2Fformat",
+ "/cleanup_value",
"/enrich_earliest_date",
"/enrich_date",
"/enrich-subject",
- "/cleanup_value",
"/enrich-type",
"/enrich-format",
"/enrich_location",
"/enrich_language",
- "/copy_prop?prop=aggregatedCHO%2Frights&to_prop=aggregatedCHO%2FhasView&key=rights"
+ "/copy_prop?prop=sourceResource%2Frights&to_prop=sourceResource%2FhasView&key=rights"
],
"subresources": [],
"last_checked": "2013-03-09T14:02:39.094687",
View
19 profiles/scdl-charleston.pjs
@@ -7,23 +7,22 @@
"enrichments_rec": [
"/select-id",
"/oai-to-dpla",
- "/shred?prop=aggregatedCHO%2Fcontributor%2CaggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
- "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
- "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
- "/shred?prop=aggregatedCHO%2Fsubject&delim=%3Cbr%3E",
- "/shred?prop=isShownAt%2Fformat",
- "/move_date_values?prop=aggregatedCHO%2Fsubject",
- "/move_date_values?prop=aggregatedCHO%2Fspatial",
- "/shred?prop=aggregatedCHO%2Fspatial&delim=--",
+ "/shred?prop=sourceResource%2Fcontributor%2CsourceResource%2Fcreator%2CsourceResource%2Fdate",
+ "/shred?prop=sourceResource%2Flanguage%2CsourceResource%2Fpublisher%2CsourceResource%2Frelation",
+ "/shred?prop=sourceResource%2Fsubject%2CsourceResource%2Ftype%2CsourceResource%2Fformat",
+ "/shred?prop=sourceResource%2Fsubject&delim=%3Cbr%3E",
+ "/cleanup_value",
+ "/move_date_values?prop=sourceResource%2Fsubject",
+ "/move_date_values?prop=sourceResource%2Fspatial",
+ "/shred?prop=sourceResource%2Fspatial&delim=--",
"/enrich_earliest_date",
"/enrich_date",
"/enrich-subject",
- "/cleanup_value",
"/enrich-type",
"/enrich-format",
"/contentdm_identify_object",
"/enrich_location",
- "/copy_prop?prop=aggregatedCHO%2Fpublisher&to_prop=dataProvider&create=True&remove=True",
+ "/copy_prop?prop=sourceResource%2Fpublisher&to_prop=dataProvider&create=True&remove=True",
"/enrich_language"
],
"subresources": [],
View
19 profiles/scdl-usc.pjs
@@ -7,23 +7,22 @@
"enrichments_rec": [
"/select-id",
"/oai-to-dpla",
- "/shred?prop=aggregatedCHO%2Fcontributor%2CaggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
- "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
- "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
- "/shred?prop=aggregatedCHO%2Fsubject&delim=%3Cbr%3E",
- "/shred?prop=isShownAt%2Fformat",
- "/move_date_values?prop=aggregatedCHO%2Fsubject",
- "/move_date_values?prop=aggregatedCHO%2Fspatial",
- "/shred?prop=aggregatedCHO%2Fspatial&delim=--",
+ "/shred?prop=sourceResource%2Fcontributor%2CsourceResource%2Fcreator%2CsourceResource%2Fdate",
+ "/shred?prop=sourceResource%2Flanguage%2CsourceResource%2Fpublisher%2CsourceResource%2Frelation",
+ "/shred?prop=sourceResource%2Fsubject%2CsourceResource%2Ftype%2CsourceResource%2Fformat",
+ "/shred?prop=sourceResource%2Fsubject&delim=%3Cbr%3E",
+ "/cleanup_value",
+ "/move_date_values?prop=sourceResource%2Fsubject",
+ "/move_date_values?prop=sourceResource%2Fspatial",
+ "/shred?prop=sourceResource%2Fspatial&delim=--",
"/enrich_earliest_date",
"/enrich_date",
"/enrich-subject",
- "/cleanup_value",
"/enrich-type",
"/enrich-format",
"/contentdm_identify_object",
"/enrich_location",
- "/copy_prop?prop=aggregatedCHO%2Fpublisher&to_prop=dataProvider&create=True&remove=True",
+ "/copy_prop?prop=sourceResource%2Fpublisher&to_prop=dataProvider&create=True&remove=True",
"/enrich_language"
],
"subresources": [
View
2 profiles/smithsonian.pjs
@@ -9,7 +9,7 @@
"/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
"/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
"/shred?prop=aggregatedCHO%2Fsubject&delim=%3Cbr%3E",
- "/shred?prop=isShownAt%2Fformat",
+ "/shred?prop=sourceResource%2Fformat",
"/enrich_earliest_date",
"/enrich_date",
"/enrich-subject",
View
3 scripts/export_reports
@@ -33,6 +33,9 @@ report[13]="description"
report[14]="spatial_name"
report[15]="spatial_state"
report[16]="data_provider"
+report[17]="state_located_in_name"
+report[18]="state_located_in_state"
+
dir="${provider}_exports"
rm -r $dir "${dir}.zip"
View
13 scripts/poll_profiles
@@ -74,7 +74,7 @@ def process_profile(uri_base, profile_f):
def process_primo_all(profile, blacklist=None):
# TODO flag to stop requesting
request_more = True
- index = 1
+ index = 0
while request_more:
collection = {}
collection['id'] = 1
@@ -87,17 +87,18 @@ def process_primo_all(profile, blacklist=None):
print >> sys.stderr, ' HTTP error (%s) resolving URL: %s' % (resp[u'status'], endpoint)
request_more = False
- print >> sys.stderr, "Index: %s" % index
-
endpoint_content = ARC_PARSE(content)
+ total_hits = endpoint_content['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']['TOTALHITS']
+ print >> sys.stderr, "%s of %s total documents" % (index, total_hits)
items = endpoint_content['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']['DOC']
+
for item in (items if isinstance(items, list) else [items]):
item['_id'] = item['PrimoNMBib']['record']['control']['recordid']
collection['items'].append(item)
+ index += 1
enrich_coll(profile, collection['name'], json.dumps({'items':collection['items']}))
- index += int(profile[u'bulk_size'])
- if index > 5000:
+ if int(index) == int(total_hits):
request_more = False
return True
@@ -194,6 +195,8 @@ def enrich_coll(profile,subr,content):
def process_oai_rec(profile):
endpoint = profile[u'get_record']
+ if not is_absolute(endpoint):
+ endpoint = URI_BASE + endpoint
print >> sys.stderr, endpoint
resp, content = H.request(endpoint)
View
32 test/test_artstor.py
@@ -19,7 +19,7 @@ def test_artstor_identify_object():
"hasView": "edm:hasView",
"name": "xsd:string",
"object": "edm:object",
- "aggregatedCHO": "edm:aggregatedCHO",
+ "sourceResource": "edm:sourceResource",
"dpla": "http://dp.la/terms/",
"collection": "dpla:aggregation",
"edm": "http://www.europeana.eu/schemas/edm/",
@@ -68,7 +68,7 @@ def test_artstor_identify_object():
"id": "oai:oaicat.oclc.org:AKRESS_10310356237",
"subject": "Annunciation: Mary, Usually Reading, Is Visited by the Angel"
},
- "aggregatedCHO": {
+ "sourceResource": {
"rights": [
"",
"Please note that if this image is under copyright, you may need to contact one or more copyright owners for any use that is not permitted under the ARTstor Terms and Conditions of Use or not otherwise permitted by law. While ARTstor tries to update contact information, it cannot guarantee that such information is always accurate. Determining whether those permissions are necessary, and obtaining such permissions, is your sole responsibility."
@@ -110,14 +110,7 @@ def test_artstor_identify_object():
"@id": "http://dp.la/api/contributor/artstor",
"name": "ARTstor OAICatMuseum"
},
- "isShownAt": {
- "rights": [
- "",
- "Please note that if this image is under copyright, you may need to contact one or more copyright owners for any use that is not permitted under the ARTstor Terms and Conditions of Use or not otherwise permitted by law. While ARTstor tries to update contact information, it cannot guarantee that such information is always accurate. Determining whether those permissions are necessary, and obtaining such permissions, is your sole responsibility."
- ],
- "@id": "Thumbnail: http://media.artstor.net/imgstor/size2/kress/d0001/kress_1103_post.jpg",
- "format": null
- },
+ "isShownAt": "Thumbnail: http://media.artstor.net/imgstor/size2/kress/d0001/kress_1103_post.jpg",
"ingestType": "item",
"@id": "http://dp.la/api/items/6ae54cee603f75c275fd913e04c49a3f",
"id": "6ae54cee603f75c275fd913e04c49a3f"
@@ -131,8 +124,8 @@ def test_artstor_identify_object():
assert str(resp.status).startswith("2")
doc = json.loads(content)
- assert u"object" in doc and u"@id" in doc[u"object"], "object/@id path not found in document"
- FETCHED_PREVIEW = doc[u"object"][u'@id']
+ assert u"object" in doc, "objectpath not found in document"
+ FETCHED_PREVIEW = doc[u"object"]
assert FETCHED_PREVIEW == EXPECTED_PREVIEW, "%s != %s" % (FETCHED_PREVIEW, EXPECTED_PREVIEW)
def test_artstor_source_fetching():
@@ -153,7 +146,7 @@ def test_artstor_source_fetching():
"hasView": "edm:hasView",
"name": "xsd:string",
"object": "edm:object",
- "aggregatedCHO": "edm:aggregatedCHO",
+ "sourceResource": "edm:sourceResource",
"dpla": "http://dp.la/terms/",
"collection": "dpla:aggregation",
"edm": "http://www.europeana.eu/schemas/edm/",
@@ -202,7 +195,7 @@ def test_artstor_source_fetching():
"id": "oai:oaicat.oclc.org:AKRESS_10310356237",
"subject": "Annunciation: Mary, Usually Reading, Is Visited by the Angel"
},
- "aggregatedCHO": {
+ "sourceResource": {
"rights": [
"",
"Please note that if this image is under copyright, you may need to contact one or more copyright owners for any use that is not permitted under the ARTstor Terms and Conditions of Use or not otherwise permitted by law. While ARTstor tries to update contact information, it cannot guarantee that such information is always accurate. Determining whether those permissions are necessary, and obtaining such permissions, is your sole responsibility."
@@ -244,14 +237,7 @@ def test_artstor_source_fetching():
"@id": "http://dp.la/api/contributor/artstor",
"name": "ARTstor OAICatMuseum"
},
- "isShownAt": {
- "rights": [
- "",
- "Please note that if this image is under copyright, you may need to contact one or more copyright owners for any use that is not permitted under the ARTstor Terms and Conditions of Use or not otherwise permitted by law. While ARTstor tries to update contact information, it cannot guarantee that such information is always accurate. Determining whether those permissions are necessary, and obtaining such permissions, is your sole responsibility."
- ],
- "@id": "Thumbnail: http://media.artstor.net/imgstor/size2/kress/d0001/kress_1103_post.jpg",
- "format": null
- },
+ "isShownAt": "Thumbnail: http://media.artstor.net/imgstor/size2/kress/d0001/kress_1103_post.jpg",
"ingestType": "item",
"@id": "http://dp.la/api/items/6ae54cee603f75c275fd913e04c49a3f",
"id": "6ae54cee603f75c275fd913e04c49a3f"
@@ -263,7 +249,7 @@ def test_artstor_source_fetching():
url = server() + "artstor_select_isshownat"
resp, content = H.request(url, "POST", body=INPUT_JSON)
assert str(resp.status).startswith("2")
- FETCHED_SOURCE = json.loads(content)[u"isShownAt"][u"@id"]
+ FETCHED_SOURCE = json.loads(content)[u"isShownAt"]
assert FETCHED_SOURCE == EXPECTED_SOURCE
View
6 test/test_bhl_contributor_to_collection.py
@@ -12,7 +12,7 @@ def test_bhl_contributor_to_collection1():
INPUT = {
"key1": "value1",
- "aggregatedCHO": {
+ "sourceResource": {
"key1" : "value1",
"key2": "value2"
},
@@ -33,7 +33,7 @@ def test_bhl_contributor_to_collection2():
INPUT = {
"key1": "value1",
- "aggregatedCHO": {
+ "sourceResource": {
"key1" : "value1",
"contributor": "Missouri Botanical Garden, Peter H. Raven Library"
},
@@ -44,7 +44,7 @@ def test_bhl_contributor_to_collection2():
}
EXPECTED = {
"key1": "value1",
- "aggregatedCHO": {
+ "sourceResource": {
"key1" : "value1",
"contributor": "Missouri Botanical Garden, Peter H. Raven Library"
},
View
4 test/test_cleanup_value.py
@@ -136,7 +136,7 @@ def test_changes_using_default_prop_value():
"""Should process all default values."""
INPUT = {
"aaa": "bbb...",
- "aggregatedCHO": {
+ "sourceResource": {
"aaa": "bbb...",
"creator": "....a -- b....",
"language": ["...aaa...", "...bbb;;;.;."],
@@ -148,7 +148,7 @@ def test_changes_using_default_prop_value():
}
EXPECTED = {
"aaa": "bbb...",
- "aggregatedCHO": {
+ "sourceResource": {
"aaa": "bbb...",
"creator": "a--b",
"language": ["aaa", "bbb"],
View
26 test/test_contentdm_identify_object.py
@@ -11,10 +11,8 @@
## TODO: move to another file
-def contentdm_url(rights_field="r", download="True"):
- return server() + \
- "contentdm_identify_object?rights_field=%s&download=%s" \
- % (rights_field, download)
+def contentdm_url(download="True"):
+ return server() + "contentdm_identify_object?download=%s" % download
def test_contentdm_identify_object_without_download():
@@ -37,16 +35,12 @@ def test_contentdm_identify_object_without_download():
"handle":
["aaa", "http://repository.clemson.edu/u?/scp,104"]
},
- u"object": {
- "@id": "http://repository.clemson.edu/cgi-bin/" +
- "thumbnail.exe?CISOROOT=/scp&CISOPTR=104",
- "format": "",
- "rights": "right now!"
- },
+ u"object": ("http://repository.clemson.edu/cgi-bin/" +
+ "thumbnail.exe?CISOROOT=/scp&CISOPTR=104"),
u"admin": {u"object_status": 0},
u"left": "right now!"
}
- url = contentdm_url(u"left", "False")
+ url = contentdm_url("False")
resp, content = H.request(url, "POST", body=json.dumps(INPUT))
print_error_log()
assert str(resp.status).startswith("2")
@@ -73,16 +67,12 @@ def test_contentdm_identify_object_with_download():
u"originalRecord": {
"handle": ["aaa", "http://repository.clemson.edu/u?/scp,104"]
},
- u"object": {
- "@id": "http://repository.clemson.edu/cgi-bin/" +
- "thumbnail.exe?CISOROOT=/scp&CISOPTR=104",
- "format": "",
- "rights": "right now!"
- },
+ u"object": ("http://repository.clemson.edu/cgi-bin/" +
+ "thumbnail.exe?CISOROOT=/scp&CISOPTR=104"),
u"admin": {u"object_status": 1},
u"left": "right now!"
}
- url = contentdm_url(u"left", "True")
+ url = contentdm_url("True")
resp, content = H.request(url, "POST", body=json.dumps(INPUT))
View
164 test/test_copy_prop.py
@@ -22,13 +22,13 @@ def _get_server_response(body, prop=None, to_prop=None, create=None, key=None,
def test_copy_prop_rights1():
"""Should do nothing"""
- prop = "aggregatedCHO/rights"
+ prop = "sourceResource/rights"
to_prop = "isShownAt"
key = "rights"
INPUT = {
"key1": "value1",
- "aggregatedCHO": {
+ "sourceResource": {
"key1" : "value1",
"key2": "value2"
},
@@ -42,13 +42,13 @@ def test_copy_prop_rights1():
def test_copy_prop_rights2():
"""Should do nothing"""
- prop = "aggregatedCHO/rights"
+ prop = "sourceResource/rights"
to_prop = "isShownAt"
key = "rights"
INPUT = {
"key1": "value1",
- "aggregatedCHO": {
+ "sourceResource": {
"key1" : "value1",
"key2": "value2",
"rights": "These are the rights"
@@ -62,8 +62,8 @@ def test_copy_prop_rights2():
assert json.loads(content) == INPUT
def test_copy_prop_rights3():
- """Should copy aggregatedCHO/rights to isShownAt"""
- prop = "aggregatedCHO/rights"
+ """Should copy sourceResource/rights to isShownAt"""