Merge branch 'develop' into smithsonian_ingestion

commit 51916603ea0390807a4726488f488f81a612036e (2 parents: f15842f + a85ca81)
Authored by Szymon Guz
8 akara.conf
@@ -129,7 +129,10 @@ MODULES = [
"dplaingestion.akamod.sets_prop",
"dplaingestion.akamod.enrich_language",
"dplaingestion.akamod.arc-to-dpla",
- "dplaingestion.akamod.edan_to_dpla"
+ "dplaingestion.akamod.edan_to_dpla",
+ "dplaingestion.akamod.dpla-get-record",
+ "dplaingestion.akamod.primo-to-dpla",
+ "dplaingestion.akamod.mwdl_enrich_state_located_in"
]
### Section 3: Other module configuration goes here
@@ -180,3 +183,6 @@ class kentucky_identify_object(identify_object):
class artstor_identify_object(identify_object):
pass
+
+class georgia_identify_object(identify_object):
+ pass
11 lib/akamod/cleanup_value.py
@@ -19,15 +19,15 @@ def convert(data, prop):
if exists(data, prop):
v = getprop(data, prop)
if isinstance(v, basestring):
- setprop(data, prop, cleanup(v))
+ setprop(data, prop, cleanup(v, prop))
elif isinstance(v, list):
temp = []
for val in v:
- temp.append(cleanup(val))
+ temp.append(cleanup(val, prop))
setprop(data, prop, temp)
-def cleanup(value):
+def cleanup(value, prop):
""" Performs a cleanup of value using a bunch of regexps.
Arguments:
@@ -36,7 +36,10 @@ def cleanup(value):
Returns:
Converted string.
"""
- TAGS_FOR_STRIPPING = '[\.\' \r\t\n";,]*' # Tags for stripping at beginning and at the end.
+ # Do not remove double quotes from title
+ dquote = '' if prop == "aggregatedCHO/title" else '"'
+ # Tags for stripping at beginning and at the end.
+ TAGS_FOR_STRIPPING = '[\.\' \r\t\n;,%s]*' % dquote
REGEXPS = (' *-- *', '--'), \
('[\t ]{2,}', ' '), \
('^' + TAGS_FOR_STRIPPING, ''), \
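
For reference, here is a minimal standalone sketch of the stripping behaviour with the new prop argument (regular expressions copied from the hunk above; the property names are the only assumptions):

import re

def cleanup(value, prop):
    # Double quotes are preserved only for the title property.
    dquote = '' if prop == "aggregatedCHO/title" else '"'
    TAGS_FOR_STRIPPING = '[\.\' \r\t\n;,%s]*' % dquote
    value = re.sub('^' + TAGS_FOR_STRIPPING, '', value)
    value = re.sub(TAGS_FOR_STRIPPING + '$', '', value)
    return value

print(cleanup('"A Title";', "aggregatedCHO/title"))    # -> "A Title"   (quotes kept)
print(cleanup('"A value";', "aggregatedCHO/subject"))  # -> A value     (quotes stripped)
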
14 lib/akamod/copy_prop.py
@@ -7,7 +7,7 @@
@simple_service('POST', 'http://purl.org/la/dp/copy_prop', 'copy_prop',
'application/json')
def copyprop(body,ctype,prop=None,to_prop=None,create=False,key=None,
- remove=None):
+ remove=None,no_replace=None):
"""Copies value in one prop to another prop.
Keyword arguments:
@@ -18,6 +18,7 @@ def copyprop(body,ctype,prop=None,to_prop=None,create=False,key=None,
create -- creates to_prop if True (default False)
key -- the key to use if to_prop is a dict (default None)
remove -- removes prop if True (default False)
+ no_replace -- appends prop to to_prop (converting to_prop to a list first) if True (default None)
"""
@@ -29,12 +30,19 @@ def copyprop(body,ctype,prop=None,to_prop=None,create=False,key=None,
return "Unable to parse body as JSON"
if exists(data, prop) and create and not exists(data, to_prop):
- setprop(data, to_prop, "")
+ val = {} if key else ""
+ setprop(data, to_prop, val)
if exists(data, prop) and exists(data, to_prop):
val = getprop(data, prop)
to_element = getprop(data, to_prop)
+
if isinstance(to_element, basestring):
+ if no_replace:
+ el = [to_element] if to_element else []
+ el.append(val)
+ # Flatten
+ val = [e for s in el for e in (s if not isinstance(s, basestring) else [s])]
setprop(data, to_prop, val)
else:
# If key is set, assume to_element is dict or list of dicts
@@ -42,7 +50,7 @@ def copyprop(body,ctype,prop=None,to_prop=None,create=False,key=None,
if not isinstance(to_element, list):
to_element = [to_element]
for dict in to_element:
- if exists(dict, key):
+ if exists(dict, key) or create:
setprop(dict, key, val)
else:
msg = "Key %s does not exist in %s" % (key, to_prop)
286 lib/akamod/dpla-get-record.py
@@ -0,0 +1,286 @@
+# -*- encoding: utf-8 -*-
+'''
+@ 2011 by Uche Ogbuji <uche@ogbuji.net>
+
+This file is part of the open source Akara project,
+provided under the Apache 2.0 license.
+See the files LICENSE and NOTICE for details.
+Project home, documentation, distributions: http://wiki.xml3k.org/Akara
+
+ Module name:: freemix_akara.oai
+
+Scrapes collections from an OAI site into JSON form for Freemix
+
+= Defined REST entry points =
+
+http://purl.org/com/zepheira/freemix/services/oai.json (freemix_akara.oai) Handles GET
+
+= Configuration =
+
+None
+
+= Notes on security =
+
+This makes heavy access to remote OAI sites
+
+= Notes =
+
+Adapted 2012 by Jeffrey Licht to support resumption tokens
+
+'''
+
+import sys, time
+
+from amara.thirdparty import json
+
+from akara.services import simple_service
+from akara import logger
+from akara import module_config
+
+from dplaingestion.oai import oaiservice
+import sys
+
+
+GETRECORD_SERVICE_ID = 'http://purl.org/la/dp/dpla-get-record'
+
+@simple_service('GET', GETRECORD_SERVICE_ID, 'dpla-get-record', 'application/json')
+def getrecord(endpoint, id):
+ """
+ e.g.:
+
+ curl "http://localhost:8880/dpla-get-record?endpoint=URL&id=IDENTIFIER"
+ """
+ remote = oaiservice(endpoint, logger)
+ get_record_result = remote.get_record(id=id)
+
+ record = get_record_result['record']
+
+ exhibit_record = []
+ properties_used = set() # track the properties in use
+ for rid, rinfo in record:
+ erecord = {u'id': rid}
+ for k, v in rinfo.iteritems():
+ if len(v) == 1:
+ erecord[k] = v[0]
+ else:
+ erecord[k] = v
+ if u'title' in erecord:
+ erecord[u'label'] = erecord[u'title']
+
+ properties_used.update(erecord.keys())
+ exhibit_record.append(erecord)
+
+ PROFILE["properties"][:] = strip_unused_profile_properties(PROFILE["properties"],properties_used)
+
+ #FIXME: This profile is NOT correct. Dumb copy from CDM endpoint. Please fix up below
+ return json.dumps({'items': exhibit_record, 'data_profile': PROFILE}, indent=4)
+
+# Rebuild the data profile by removing optional, unused properties
+strip_unused_profile_properties = lambda prof_props, used: [ p for p in prof_props if p["property"] in used ]
+
+#FIXME: This profile is NOT correct. Dumb copy from CDM endpoint.
+PROFILE = {
+ #"original_MIME_type": "application/vnd.ms-excel",
+ #"Akara_MIME_type_magic_guess": "application/vnd.ms-excel",
+ #"url": "/data/uche/amculturetest/data.json",
+ #"label": "amculturetest",
+ "properties": [
+ {
+ "property": "handle",
+ "enabled": True,
+ "label": "Handle",
+ "tags": [
+ "property:type=text", "property:type=shredded_list"
+ ]
+ },
+ {
+ "property": "language",
+ "enabled": True,
+ "label": "Language",
+ "types": [
+ "text"
+ ],
+ "tags": [
+ ]
+ },
+ {
+ "property": "creator",
+ "enabled": True,
+ "label": "Creators",
+ "tags": [
+ "property:type=text", "property:type=shredded_list"
+ ]
+ },
+ {
+ "property": "format",
+ "enabled": True,
+ "label": "Formats",
+ "tags": [
+ "property:type=text", "property:type=shredded_list"
+ ]
+ },
+ {
+ "property": "relation",
+ "Enabled": True,
+ "label": "Relations",
+ "tags": [
+ "property:type=text", "property:type=shredded_list"
+ ]
+ },
+ {
+ "property": "id",
+ "enabled": False,
+ "label": "id",
+ "types": [
+ "text"
+ ],
+ "tags": [
+ "property:type=url"
+ ]
+ },
+ {
+ "property": "date",
+ "enabled": True,
+ "label": "Date",
+ "tags": [
+ "property:type=date", "property:type=shredded_list"
+ ]
+ },
+ {
+ "property": "datestamp",
+ "enabled": True,
+ "label": "Date stamp",
+ "tags": ["property:type=date"]
+ },
+ {
+ "property": "title",
+ "enabled": True,
+ "label": "Title",
+ "types": [
+ "text"
+ ],
+ "tags": []
+ },
+ {
+ "property": "description",
+ "enabled": True,
+ "label": "Description",
+ "types": [
+ "text"
+ ],
+ "tags": []
+ },
+ {
+ "property": "subject",
+ "enabled": True,
+ "label": "Subject",
+ "tags": [
+ "property:type=text", "property:type=shredded_list"
+ ]
+ },
+ {
+ "property": "contributor",
+ "enabled": True,
+ "label": "Contributor",
+ "tags": [
+ "property:type=text", "property:type=shredded_list"
+ ]
+ },
+ {
+ "property": "publisher",
+ "enabled": True,
+ "label": "Publisher",
+ "types": [
+ "text"
+ ],
+ "tags": []
+ },
+ {
+ "property": "instructionalmethod",
+ "enabled": True,
+ "label": "Instructional Method",
+ "types": [
+ "text"
+ ],
+ "tags": []
+ },
+ {
+ "property": "accrualmethod",
+ "enabled": True,
+ "label": "Accrual Method",
+ "types": [
+ "text"
+ ],
+ "tags": []
+ },
+ {
+ "property": "source",
+ "enabled": True,
+ "label": "Source",
+ "types": [
+ "text"
+ ],
+ "tags": []
+ },
+ {
+ "property": "provenance",
+ "enabled": True,
+ "label": "Provenance",
+ "tags": [
+ "property:type=text", "property:type=shredded_list"
+ ]
+ },
+ {
+ "property": "rights",
+ "enabled": True,
+ "label": "Rights",
+ "tags": [
+ "property:type=text", "property:type=shredded_list"
+ ]
+ },
+ {
+ "property": "rightsholder",
+ "enabled": True,
+ "label": "Rights Holder",
+ "types": [
+ "text"
+ ],
+ "tags": []
+ },
+ {
+ "property": "coverage",
+ "enabled": True,
+ "label": "Coverage",
+ "tags": [
+ "property:type=text", "property:type=shredded_list"
+ ]
+ },
+ {
+ "property": "audience",
+ "enabled": True,
+ "label": "Audience",
+ "tags": [
+ "property:type=text", "property:type=shredded_list"
+ ]
+ },
+ {
+ "property": "label",
+ "enabled": True,
+ "label": "Label",
+ "types": [
+ "text"
+ ],
+ "tags": []
+ },
+ {
+ "property": "type",
+ "enabled": True,
+ "label": "Document Type",
+ "types": [
+ "text"
+ ],
+ "tags": []
+ },
+ ],
+ #"Akara_MIME_type_guess": "application/vnd.ms-excel"
+}
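
Once the module is listed in akara.conf, the service can be exercised as below; the endpoint URL and identifier are placeholders, and as the FIXME notes say, the returned data_profile is still a copy that needs correcting. A Python 2 sketch mirroring the curl example in the docstring:

import json
import urllib

# Placeholder OAI endpoint and record identifier, for illustration only.
qs = urllib.urlencode({"endpoint": "http://repox.example.org/OAIHandler",
                       "id": "oai:example.org:12345"})
doc = json.loads(urllib.urlopen("http://localhost:8880/dpla-get-record?" + qs).read())
print(sorted(doc.keys()))   # ['data_profile', 'items']
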
10 lib/akamod/enrich-date.py
@@ -7,11 +7,8 @@
from amara.thirdparty import json
from dateutil.parser import parse as dateutil_parse
from zen import dateparser
-
from dplaingestion.selector import getprop, setprop, delprop, exists
-
-
HTTP_INTERNAL_SERVER_ERROR = 500
HTTP_TYPE_JSON = 'application/json'
HTTP_TYPE_TEXT = 'text/plain'
@@ -29,8 +26,8 @@
DEFAULT_DATETIME_SECS = 32503680000.0 # UTC seconds for "3000-01-01"
-DATE_RANGE_RE = r'(\S+)\s*-\s*(\S+)'
-DATE_RANGE_EXT_RE = r'(\S+)\s*[-/]\s*(\S+)'
+DATE_RANGE_RE = r'([0-9-]+)\s*-\s*([0-9-]+)'
+DATE_RANGE_EXT_RE = r'([0-9-]+)\s*[-/]\s*([0-9-]+)'
def split_date(d):
reg = DATE_RANGE_EXT_RE
if len(d.split("/")) == 3: # so the date is like "2001 / 01 / 01"
@@ -144,6 +141,8 @@ def convert_dates(data, prop, earliest):
for s in (v if not isinstance(v, basestring) else [v]):
for part in s.split(";"):
stripped = remove_brackets_and_strip(part)
+ if len(stripped) < 4:
+ continue
a, b = parse_date_or_range(stripped)
if b != '3000-01-01':
dates.append( {
@@ -163,7 +162,6 @@ def convert_dates(data, prop, earliest):
else:
delprop(data, p)
-
@simple_service('POST', 'http://purl.org/la/dp/enrich_earliest_date', 'enrich_earliest_date', HTTP_TYPE_JSON)
def enrich_earliest_date(body, ctype, action="enrich_earliest_date", prop="aggregatedCHO/date"):
"""
15 lib/akamod/enrich-format.py
@@ -4,6 +4,8 @@
from amara.thirdparty import json
from dplaingestion.selector import getprop, setprop, exists
import re
+import os
+from amara.lib.iri import is_absolute
@simple_service('POST', 'http://purl.org/la/dp/enrich-format', 'enrich-format', 'application/json')
def enrichformat(body,ctype,action="enrich-format",prop="isShownAt/format",alternate="aggregatedCHO/physicalMedium",typefield="aggregatedCHO/type"):
@@ -32,9 +34,16 @@ def enrichformat(body,ctype,action="enrich-format",prop="isShownAt/format",alter
"text": "text"
}
- REGEXPS = ('audio/mp3', "audio/mpeg"), ('images/jpeg', 'image/jpeg'), ('image/jpg','image/jpeg'),('image/jp$', 'image/jpeg'), ('img/jpg', 'image/jpeg'), ('\W$','')
- IMT_TYPES = ['application','audio','image','message','model','multipart','text','video']
+ REGEXPS = ('audio/mp3', 'audio/mpeg'), ('images/jpeg', 'image/jpeg'),\
+ ('image/jpg','image/jpeg'),('image/jp$', 'image/jpeg'),\
+ ('img/jpg', 'image/jpeg'), ('^jpeg$','image/jpeg'),\
+ ('^jpg$', 'image/jpeg'), ('\W$','')
+ IMT_TYPES = ['application', 'audio', 'image', 'message', 'model',
+ 'multipart', 'text', 'video']
+ def get_ext(s):
+ return os.path.splitext(s)[1].split('.')[1]
+
def cleanup(s):
s = s.lower().strip()
for pattern, replace in REGEXPS:
@@ -62,6 +71,8 @@ def is_imt(s):
physicalFormat = [physicalFormat]
for s in (v if not isinstance(v,basestring) else [v]):
+ if is_absolute(s):
+ s = get_ext(s)
cleaned = cleanup(s)
if is_imt(cleaned):
if cleaned not in format:
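
When the format value is a URL, the module now extracts the file extension first and then runs it through the extended REGEXPS table (note the new '^jpg$' and '^jpeg$' rules). A small sketch of that path; get_ext as written assumes the URL actually ends in an extension:

import os
import re

def get_ext(s):
    # "http://example.org/photo.jpg" -> "jpg"
    # (a URL without an extension would raise IndexError here)
    return os.path.splitext(s)[1].split('.')[1]

ext = get_ext("http://example.org/images/photo.jpg")
print(ext)                                  # jpg
print(re.sub('^jpg$', 'image/jpeg', ext))   # image/jpeg, via the new REGEXPS rule
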
4 lib/akamod/kentucky_identify_object.py
@@ -38,10 +38,10 @@ def log_json():
url = getprop(data, relation_field)
else:
msg = "Field %s does not exist" % relation_field
- logger.error(msg)
+ logger.debug(msg)
return body
- base_url, ext = os.path.splitext(url)
+ base_url, ext = os.path.splitext(url)
thumb_url = "%s_tb%s" % (base_url, ext)
rights = None
7 lib/akamod/move_date_values.py
@@ -17,7 +17,12 @@ def movedatevalues(body,ctype,action="move_date_values",prop=None,to_prop="aggre
return body
REGSUB = ("\(", ""), ("\)", ""), ("\.",""), ("\?","")
- REGSEARCH = ["\d{1,4} *[-/] *\d{1,4} *[-/] *\d{1,4}", "\d{4} *[-/] *\d{4}", "\d{4}"]
+ REGSEARCH = [
+ "\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}\s*[-/]\s*\d{1,4}",
+ "\d{1,4} *[-/] *\d{1,4} *[-/] *\d{1,4}",
+ "\d{4} *[-/] *\d{4}",
+ "\d{4}"
+ ]
def cleanup(s):
for p,r in REGSUB:
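
The widest new pattern can capture a complete start/end date pair (six numeric components, five separators) in a single value, which the old list broke apart. An equivalent check, with the pattern built by repetition instead of spelled out:

import re

# Equivalent to the first entry of the new REGSEARCH list.
full_range = r"\d{1,4}\s*[-/]\s*" * 5 + r"\d{1,4}"

print(re.search(full_range, "1861-12-30/1862-07-13").group(0))
# 1861-12-30/1862-07-13
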
95 lib/akamod/mwdl_enrich_state_located_in.py
@@ -0,0 +1,95 @@
+from akara import logger
+from akara import response
+from akara.services import simple_service
+from amara.thirdparty import json
+from dplaingestion.selector import getprop, setprop, exists
+
+@simple_service('POST', 'http://purl.org/la/dp/mwdl_enrich_state_located_in',
+ 'mwdl_enrich_state_located_in', 'application/json')
+def mwdlenrichstatelocatedin(body, ctype, action="mwdl_enrich_state_located_in",
+ prop="aggregatedCHO/stateLocatedIn"):
+ """
+ Service that accepts a JSON document and enriches the "stateLocatedIn"
+ field of that document by mapping numeric MWDL repository codes to state
+ abbreviations.
+
+ Intended primarily for MWDL documents.
+ """
+
+ try:
+ data = json.loads(body)
+ except:
+ response.code = 500
+ response.add_header('content-type', 'text/plain')
+ return "Unable to parse body as JSON"
+
+ if exists(data,prop):
+ sli = []
+ values = getprop(data,prop)
+ for v in values.split(";"):
+ if STATE_CODES.get(v):
+ sli.append(STATE_CODES[v])
+ else:
+ sli.append(v)
+ setprop(data, prop, "; ".join(sli))
+
+ return json.dumps(data)
+
+STATE_CODES = {
+ "101": "UT",
+ "102": "UT",
+ "103": "UT",
+ "104": "NV",
+ "105": "NV",
+ "106": "UT",
+ "107": "UT",
+ "108": "ID",
+ "109": "UT",
+ "110": "UT",
+ "111": "NV",
+ "112": "UT",
+ "114": "UT",
+ "115": "UT",
+ "116": "UT",
+ "117": "UT",
+ "118": "UT",
+ "119": "UT",
+ "120": "UT",
+ "121": "UT",
+ "122": "UT",
+ "123": "UT",
+ "124": "UT",
+ "125": "UT",
+ "126": "UT",
+ "127": "ID",
+ "128": "hi",
+ "129": "UT",
+ "131": "UT",
+ "132": "UT",
+ "133": "UT",
+ "135": "UT",
+ "136": "UT",
+ "137": "UT",
+ "138": "UT",
+ "139": "UT",
+ "140": "UT",
+ "141": "NV",
+ "142": "NV",
+ "143": "UT",
+ "144": "UT",
+ "146": "UT",
+ "147": "UT",
+ "149": "UT",
+ "151": "UT",
+ "200": "UT",
+ "201": "UT",
+ "203": "UT",
+ "205": "UT",
+ "206": "UT",
+ "207": "UT",
+ "213": "UT",
+ "215": "UT",
+ "217": "UT",
+ "218": "UT",
+ "287": "ID",
+ "288": "UT"
+}
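
In effect the service swaps numeric MWDL repository codes for state abbreviations and leaves anything it does not recognise untouched. A condensed illustration with an abbreviated code table:

STATE_CODES = {"101": "UT", "104": "NV", "108": "ID"}  # abbreviated for illustration

values = "101;104;Unknown repository"
sli = [STATE_CODES.get(v, v) for v in values.split(";")]
print("; ".join(sli))
# UT; NV; Unknown repository
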
2  lib/akamod/oai-to-dpla.py
@@ -53,7 +53,7 @@ def is_shown_at_transform(d):
return {
"isShownAt" : {
"@id" : source,
- "format" : d.get("format",None)
+ "format": d.get("format", None)
}
}
150 lib/akamod/primo-to-dpla.py
@@ -0,0 +1,150 @@
+from akara import logger
+from akara import request, response
+from akara.services import simple_service
+from amara.lib.iri import is_absolute
+from amara.thirdparty import json
+from functools import partial
+import base64
+import sys
+import re
+from copy import deepcopy
+from dplaingestion.selector import getprop, exists
+
+GEOPROP = None
+RECORD = "PrimoNMBib/record/"
+LINKS = "LINKS/"
+URL = "http://thoth.library.utah.edu:1701/primo_library/libweb/action/dlDisplay.do?vid=MWDL&afterPDS=true&docId="
+
+#FIXME not format specific, move to generic module
+CONTEXT = {
+ "@vocab": "http://purl.org/dc/terms/",
+ "dpla": "http://dp.la/terms/",
+ "edm": "http://www.europeana.eu/schemas/edm/",
+ "LCSH": "http://id.loc.gov/authorities/subjects",
+ "name": "xsd:string",
+ "collection" : "dpla:aggregation",
+ "aggregatedDigitalResource" : "dpla:aggregatedDigitalResource",
+ "originalRecord" : "dpla:originalRecord",
+ "state": "dpla:state",
+ "coordinates": "dpla:coordinates",
+ "stateLocatedIn" : "dpla:stateLocatedIn",
+ "aggregatedCHO" : "edm:aggregatedCHO",
+ "dataProvider" : "edm:dataProvider",
+ "hasView" : "edm:hasView",
+ "isShownAt" : "edm:isShownAt",
+ "object" : "edm:object",
+ "provider" : "edm:provider",
+ "begin" : {
+ "@id" : "dpla:dateRangeStart",
+ "@type": "xsd:date"
+ },
+ "end" : {
+ "@id" : "dpla:end",
+ "@type": "xsd:date"
+ }
+}
+
+def web_resource_transform(d, url):
+ format_field = RECORD + "display/format"
+ format = getprop(d, format_field) if exists(d, format_field) else None
+ return {"@id": url, "format": format} if format else {"@id": url}
+
+def multi_transform(d, key, props):
+ values = []
+
+ for p in props:
+ p = RECORD + p
+ if exists(d, p):
+ v = getprop(d, p)
+ if not v: continue
+ if not isinstance(v, list):
+ v = [v]
+ [values.append(s) for s in v if s not in values]
+
+ return {key: "; ".join(values)} if values else {}
+
+# Structure mapping the original top level property to a function returning a single
+# item dict representing the new property and its value
+CHO_TRANSFORMER = {
+ RECORD + "display/creator" : lambda d, p: {"creator": getprop(d, p)},
+ RECORD + "search/creationdate" : lambda d, p: {"date": getprop(d, p)},
+ RECORD + "search/description" : lambda d, p: {"description": getprop(d, p)},
+ RECORD + "display/lds05" : lambda d, p: {"extent": getprop(d, p)},
+ RECORD + "display/language" : lambda d, p: {"language": getprop(d, p)},
+ RECORD + "display/relation" : lambda d, p: {"relation": getprop(d, p)},
+ RECORD + "display/rights" : lambda d, p: {"rights": getprop(d, p)},
+ RECORD + "display/subject" : lambda d, p: {"subject": getprop(d, p)},
+ RECORD + "display/lds09" : lambda d, p: {"temporal": getprop(d, p)},
+ RECORD + "display/lds18" : lambda d, p: {"type": getprop(d, p)},
+ RECORD + "search/lsr03" : lambda d, p: {"stateLocatedIn": getprop(d, p)}
+}
+
+AGGREGATION_TRANSFORMER = {
+ "id" : lambda d, p: {"id": getprop(d, p), "@id" : "http://dp.la/api/items/"+getprop(d, p)},
+ "_id" : lambda d, p: {"_id": getprop(d, p)},
+ "originalRecord" : lambda d, p: {"originalRecord": getprop(d, p)},
+ "ingestType" : lambda d, p: {"ingestType": getprop(d, p)},
+ "ingestDate" : lambda d, p: {"ingestDate": getprop(d, p)},
+ RECORD + "control/recordid" : lambda d, p: {"isShownAt": web_resource_transform(d, URL + getprop(d, p))},
+ LINKS + "thumbnail" : lambda d, p: {"object": web_resource_transform(d, getprop(d, p))}
+}
+
+@simple_service("POST", "http://purl.org/la/dp/primo-to-dpla", "primo-to-dpla", "application/ld+json")
+def primotodpla(body,ctype,geoprop=None):
+ """
+ Convert output of JSON-ified PRIMO (MWDL) format into the DPLA JSON-LD format.
+
+ Parameter "geoprop" specifies the property name containing lat/long coords
+ """
+
+ try :
+ data = json.loads(body)
+ except:
+ response.code = 500
+ response.add_header("content-type","text/plain")
+ return "Unable to parse body as JSON"
+
+ global GEOPROP
+ GEOPROP = geoprop
+
+ out = {
+ "@context": CONTEXT,
+ "aggregatedCHO": {}
+ }
+
+ # For Primo (MWDL), "data" is the source record so set it here
+ data["originalRecord"] = deepcopy(data)
+
+ # Apply all transformation rules from original document
+ for p in CHO_TRANSFORMER:
+ if exists(data, p):
+ out["aggregatedCHO"].update(CHO_TRANSFORMER[p](data, p))
+ for p in AGGREGATION_TRANSFORMER:
+ if exists(data, p):
+ out.update(AGGREGATION_TRANSFORMER[p](data, p))
+
+ # Apply transformations that are dependent on more than one
+ # original document field
+ id_props = ["control/recordid", "display/identifier"]
+ sp_props = ["display/lds08", "search/lsr14"]
+ ipo_props = ["display/lds04", "search/lsr13"]
+ title_props = ["display/title", "display/lds10"]
+ out["aggregatedCHO"].update(multi_transform(data, "identifier", id_props))
+ out["aggregatedCHO"].update(multi_transform(data, "spatial", sp_props))
+ out["aggregatedCHO"].update(multi_transform(data, "isPartOf", ipo_props))
+ out["aggregatedCHO"].update(multi_transform(data, "title", title_props))
+
+ dp_props = ["display/lds03", "search/lsr12"]
+ out.update(multi_transform(data, "dataProvider", dp_props))
+
+ # Additional content not from original document
+ if "HTTP_CONTRIBUTOR" in request.environ:
+ try:
+ out["provider"] = json.loads(base64.b64decode(request.environ["HTTP_CONTRIBUTOR"]))
+ except Exception as e:
+ logger.debug("Unable to decode Contributor header value: "+request.environ["HTTP_CONTRIBUTOR"]+"---"+repr(e))
+
+ # Strip out keys with None/null values?
+ out = dict((k,v) for (k,v) in out.items() if v)
+
+ return json.dumps(out)
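
multi_transform is the piece that merges several Primo fields into one semicolon-joined DPLA property. A simplified sketch using a flat dict (the real module's getprop/exists walk nested '/'-separated paths, and the field values here are made up):

def multi_transform(d, key, props, prefix="PrimoNMBib/record/"):
    # Collect non-empty values from each source field, skip duplicates,
    # and join them into a single string under the target key.
    values = []
    for p in props:
        v = d.get(prefix + p)
        if not v:
            continue
        if not isinstance(v, list):
            v = [v]
        values.extend(s for s in v if s not in values)
    return {key: "; ".join(values)} if values else {}

record = {"PrimoNMBib/record/display/title": "Beehive House",
          "PrimoNMBib/record/display/lds10": ["Beehive House", "Salt Lake City"]}
print(multi_transform(record, "title", ["display/title", "display/lds10"]))
# {'title': 'Beehive House; Salt Lake City'}
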
17 lib/oai.py
@@ -115,7 +115,22 @@ def receive_nodes(n):
return sets
def get_record(self, id):
- pass
+ params = {'verb': 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': id}
+ qstr = urllib.urlencode(params)
+ url = self.root + '?' + qstr
+ self.logger.debug('OAI request URL: {0}'.format(url))
+ start_t = time.time()
+ resp, content = self.h.request(url)
+ retrieved_t = time.time()
+ self.logger.debug('Retrieved in {0}s'.format(retrieved_t - start_t))
+ doc = bindery.parse(url, model=OAI_GETRECORD_MODEL)
+
+ record, rid = metadata_dict(generate_metadata(doc), nesteddict=False)
+ for id_, props in (record if isinstance(record, list) else [record]):
+ for k, v in props.iteritems():
+ props[k] = [ U(item) for item in v ]
+
+ return {'record' : record}
def search(self, term):
qstr = urllib.urlencode({'verb' : 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': dspace_id})
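
The new get_record issues a standard OAI-PMH GetRecord request; for a given identifier the URL it builds looks like this (Python 2; base URL and identifier are placeholders, and parameter order may vary):

import urllib

root = "http://repox.example.org/OAIHandler"   # placeholder OAI endpoint
params = {'verb': 'GetRecord', 'metadataPrefix': 'oai_dc',
          'identifier': 'oai:example.org:12345'}
print(root + '?' + urllib.urlencode(params))
# e.g. http://repox.example.org/OAIHandler?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai%3Aexample.org%3A12345
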
19 profiles/georgia.pjs
@@ -8,27 +8,26 @@
"enrichments_rec": [
"/select-id",
"/oai-to-dpla",
- "/shred?prop=aggregatedCHO%2Fcontributor%2CaggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
+ "/sets_prop?prop=aggregatedCHO%2Fdate",
+ "/copy_prop?prop=originalRecord%2Fsource&to_prop=aggregatedCHO%2Fdescription&create=True&no_replace=True",
"/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
- "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
+ "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftitle%2CaggregatedCHO%2Ftype%2CaggregatedCHO%2Fcreator",
"/shred?prop=isShownAt%2Fformat",
- "/move_date_values?prop=aggregatedCHO%2Fsubject",
- "/move_date_values?prop=aggregatedCHO%2Fspatial",
- "/shred?prop=aggregatedCHO%2Fspatial&delim=--",
+ "/move_date_values?prop=aggregatedCHO%2Fspatial&to_prop=aggregatedCHO%2Fdate",
"/enrich_earliest_date",
- "/enrich_date",
"/enrich-subject",
- "/cleanup_value",
"/enrich-type",
"/enrich-format",
- "/georgia_identify_object",
"/enrich_location",
- "/enrich_language"
+ "/copy_prop?prop=aggregatedCHO%2Fpublisher&to_prop=aggregatedCHO%2FstateLocatedIn&create=True&remove=True",
+ "/enrich_location?prop=aggregatedCHO%2FstateLocatedIn",
+ "/copy_prop?prop=aggregatedCHO%2Fcontributor&to_prop=dataProvider&remove=True",
+ "/georgia_identify_object",
+ "/cleanup_value"
],
"subresources": [
"dpla"
],
- "last_checked": "2012-11-04T13:37:38.117078",
"contributor": {
"@id": "http://dp.la/api/contributor/georgia",
"name": "Digital Library of Georgia"
24 profiles/kentucky.pjs
@@ -5,23 +5,23 @@
"enrichments_rec": [
"/select-id",
"/oai-to-dpla",
- "/shred?prop=aggregatedCHO%2Fcontributor%2CaggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
- "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
- "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
- "/shred?prop=isShownAt%2Fformat",
- "/move_date_values?prop=aggregatedCHO%2Fsubject",
- "/move_date_values?prop=aggregatedCHO%2Fspatial",
- "/shred?prop=aggregatedCHO%2Fspatial&delim=--",
+ "/sets_prop?prop=collection",
+ "/sets_prop?prop=aggregatedCHO%2Fcontributor",
+ "/shred?prop=aggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
+ "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher",
+ "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftitle%2CaggregatedCHO%2Ftype",
"/enrich_earliest_date",
- "/enrich_date",
+ "/enrich_location",
"/enrich-subject",
- "http://localhost:8875/cleanup_value",
"/enrich-type",
"/enrich-format",
- "/enrich_location",
"/kentucky_identify_object",
- "/copy_prop?prop=aggregatedCHO%2Fpublisher&to_prop=dataProvider&create=True&remove=True",
- "/enrich_language"
+ "/sets_prop?prop=aggregatedCHO%2Frelation",
+ "/copy_prop?prop=aggregatedCHO%2Fpublisher&to_prop=dataProvider&create=True",
+ "/copy_prop?prop=aggregatedCHO%2Fpublisher&to_prop=aggregatedCHO%2FstateLocatedIn&create=True",
+ "/sets_prop?prop=aggregatedCHO%2Fpublisher",
+ "/enrich_location?prop=aggregatedCHO%2FstateLocatedIn",
+ "/cleanup_value"
],
"subresources": [],
"contributor": {
33 profiles/mwdl.pjs
@@ -0,0 +1,33 @@
+{
+ "bulk_size": "500",
+ "enrichments_coll": [],
+ "name": "mwdl",
+ "enrichments_rec": [
+ "/select-id?prop=_id",
+ "/primo-to-dpla",
+ "/shred?prop=aggregatedCHO%2Fcontributor%2CaggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
+ "/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
+ "/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
+ "/shred?prop=isShownAt%2Fformat",
+ "/mwdl_enrich_state_located_in",
+ "/move_date_values?prop=aggregatedCHO%2Fsubject",
+ "/move_date_values?prop=aggregatedCHO%2Fspatial",
+ "/enrich_earliest_date",
+ "/enrich_date",
+ "/enrich-subject",
+ "/cleanup_value",
+ "/enrich-type",
+ "/enrich-format",
+ "/enrich_location",
+ "/enrich_location?prop=aggregatedCHO%2FstateLocatedIn",
+ "/enrich_language",
+ "/sets_prop?prop=aggregatedCHO%2FphysicalMedium"
+ ],
+ "last_checked": "2013-03-05T17:30:21.689809",
+ "contributor": {
+ "@id": "http://dp.la/api/contributor/mwdl",
+ "name": "Mountain West Digital Library"
+ },
+ "type": "primo",
+ "endpoint_URL": "http://thoth.library.utah.edu:1701/PrimoWebServices/xservice/search/brief?institution=MWDL&loc=local,scope:%28mw%29&query=facet_tlevel,exact,online_resources"
+}
42 scripts/build_profile
@@ -1,42 +0,0 @@
-#!/usr/bin/env python
-#
-# Usage: python build_profile.py <endpoint>
-# Outputs a profile that can be used for a given endpoint with poll_profiles, including all the available sets for that endpoint
-
-import sys, os
-from amara.thirdparty import json, httplib2
-
-LIMIT = "9999"
-AKARA_BASE = "http://localhost:8889/"
-AKARA_SETS = AKARA_BASE + "oai.listsets.json?limit=" + LIMIT + "&endpoint="
-AKARA_RECORDS = AKARA_BASE + "oai.listrecords.json?endpoint="
-
-def build_profile(endpoint_URL):
-
- H = httplib2.Http('/tmp/szymon/.pollcache')
- H.force_exception_as_status_code = True
- resp, content = H.request(AKARA_SETS + endpoint_URL)
- if not resp[u'status'].startswith('2'):
- print >> sys.stderr, ' HTTP error ('+resp[u'status']+') resolving URL: '+endpoint_URL
-
- profile = {}
- profile[u'endpoint_URL'] = AKARA_RECORDS + endpoint_URL + u'&oaiset='
- profile[u'subresources'] = []
-
- try:
- oaisets = json.loads(content)
- for oaiset in oaisets:
- profile[u'subresources'].append(oaiset[0])
-
- except Exception as e:
- profile = None
-
- return json.dumps(profile,indent=4)
-
-if __name__ == '__main__':
-
- if not sys.argv[1:]:
- print >> sys.stderr, 'OAI endpoint required. Aborting.'
- sys.exit(1)
-
- print build_profile(sys.argv[1])
8 scripts/dpla-thumbs.ini
@@ -1,8 +0,0 @@
-[thumbs]
-AKARA_SERVER = http://localhost:8868
-GET_DOCUMENTS_URL = dpla-thumbs-list-for-downloading
-GET_DOCUMENTS_LIMIT = 1
-
-UPDATE_DOCUMENT_URL = dpla-thumbs-update-doc
-
-THUMBS_ROOT_PATH = /home/szymon/tmp/thumbs_dir/
22 scripts/nara-parse.py
@@ -1,22 +0,0 @@
-import sys, os
-from amara import bindery
-
-item_f = open(sys.argv[1],'r')
-item = bindery.parse(item_f)
-
-hier_items = item.archival_description.hierarchy.hierarchy_item
-for hi in hier_items:
- htype = unicode(hi.hierarchy_item_lod).replace(' ','')
- hid = hi.hierarchy_item_id
-
- if hid:
- hier_fname = os.path.join(os.path.dirname(sys.argv[1]),"%s_%s.xml"%(htype,hid))
- print hier_fname
- hier_f = open(hier_fname,'r')
-
- hier = bindery.parse(hier_f)
- print "... belongs to "+str(hier.archival_description.title)
-
- hier_f.close()
-
-item_f.close()
396 scripts/poll_images.py
@@ -1,396 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-# Usage: python poll_images.py <profiles-glob> <enrichment-service-URI.
-
-from amara.thirdparty import json, httplib2
-from amara.lib.iri import join
-import logging
-import logging.handlers
-import logging.config
-from StringIO import StringIO
-import pprint
-import sys
-import re
-import hashlib
-import os
-import os.path
-import urllib
-
-
-# Used by the logger.
-SCRIPT_NAME = "thumbnails downloader"
-
-# Used for logging nice json in the error logs.
-# This is used for debugging as well.
-pp = pprint.PrettyPrinter(indent=4)
-
-# Used for searching for the thumbnail URL.
-URL_FIELD_NAME = u"preview_source_url"
-
-# Used for storing the path to the local filename.
-URL_FILE_PATH = u"preview_file_path"
-
-
-def generate_file_path(id, file_number, file_extension):
- """
- Generates and returns the file path based in provided params.
-
- Algorithm:
-
- The file path is generated using the following algorithm:
-
- - convert all not allowed characters from the document id to "_"
- - to the above string add number and extension getting FILE_NAME
- - calculate md5 from original id
- - convert to uppercase
- - insert "/" between each to characters of this hash getting CALCULATED_PATH
- - join the MAIN_PATH, CALCULATED_PATH and FILE_NAME
-
- Arguments:
- id - document id from couchdb
- file_number - the number of the file added just before the extension
- file_extension - extension of the file
-
- Returns:
- filepath - path, without file name
- full_filepath - path, with file name
-
- Example:
- Function call:
- generate_file_path('clemsontest--hcc001-hcc016', 1, "jpg")
-
- Generated values for the algorithm steps:
-
- CLEARED_ID: clemsontest__hcc001_hcc016
- FILE_NAME: clemsontest__hcc001_hcc016_1.jpg
- HASHED_ID: 8E393B3B5DA0E0B3A7AEBFB91FE1278A
- PATH: 8E/39/3B/3B/5D/A0/E0/B3/A7/AE/BF/B9/1F/E1/27/8A/
- FULL_NAME: /tmp/szymon/main_pic_dir/8E/39/3B/3B/5D/A0/E0/B3/A7/AE/BF/B9/1F/E1/27/8A/clemsontest__hcc001_hcc016_1.jpg
- """
-
- logging.debug("Generating filename for document")
-
- cleared_id = re.sub(r'[-]', '_', id)
- logging.debug("Cleared id: " + cleared_id)
-
- fname = "%s_%s.%s" % (cleared_id, file_number, file_extension)
- logging.debug("File name: " + fname)
-
- md5sum = hashlib.md5(id).hexdigest().upper()
- logging.debug("Hashed id: " + md5sum)
-
- path = re.sub("(.{2})", "\\1" + os.sep, md5sum, re.DOTALL)
- logging.debug("PATH: " + path)
-
- path = os.path.join(conf['THUMBS_ROOT_PATH'], path)
- full_fname = os.path.join(path, fname)
- logging.debug("FULL PATH: " + full_fname)
-
- return (path, full_fname)
-
-
-def download_image(url, id, file_number=1):
- """
- Downloads the thumbnail from the given url and stores it on disk.
-
- Current implementation stores the file on disk
-
- Params:
- url - the url of the file for downloading
- id - document id, used for the file name generation
- file_number - number of the file for this document
-
- Returns:
- Name of the file where the image was stored - if everything is OK
- False - otherwise
- """
-
- # Get the thumbnail extension from the URL, needed for storing the
- # file on disk with proper extension.
- fileName, fileExtension = os.path.splitext(url)
- file_extension = fileExtension[1:]
-
- # Get the directory path and file path for storing the image.
- (path, fname) = generate_file_path(id, file_number, file_extension)
-
- # Let's create the directory for storing the file name.
- if not os.path.exists(path):
- logging.info("Creating directory: " + path)
- os.makedirs(path)
- else:
- logging.debug("Path exists")
-
- # Open connection to the image using provided URL.
- conn = urllib.urlopen(url)
- if not conn.getcode() / 100 == 2:
- msg = "Got %s from url: [%s] for document: [%s]" % (conn.getcode(), url, id)
- logging.error(msg)
- return False
-
- # Download the image.
- try:
- logging.info("Downloading file to: " + fname)
- local_file = open(fname, 'w')
- local_file.write(conn.read())
- except Exception as e:
- msg = traceback.format_exception(*sys.exc_info())
- logging.error(msg)
- return False
- else:
- conn.close()
- local_file.close()
- logging.debug("File downloaded")
- return fname
-
-
-def parse_documents(documents):
- """
- Parses the provided string with json into object.
-
- Arguments:
- documents String - documents from couchdb in string format
-
- Returns:
- Object with parsed json.
- """
- io = StringIO(documents)
- return json.load(io)
-
-
-def process_document(document):
- """
- Processes one document.
-
- * gets the image url from document
- * downloads the thumbnail
- * updates the document in couchdb
-
- Arguments:
- document Object - document already parsed
-
- Returns:
- None
- """
- id = document[u"id"]
- url = document[u'value'][URL_FIELD_NAME]
- logging.info("Processing document id = " + document["id"])
- logging.info("Found thumbnail URL = " + url)
-
- filepath = download_image(url, id)
- if filepath:
- # so everything is OK and the file is on disk
- doc = update_document(document, filepath)
- save_document(doc)
-
-
-def update_document(document, filepath):
- """
- Updates the document setting a filepath to a proper variable.
-
- Arguments:
- document Object - document for updating (decoded by json module)
- filepath String - filepath to insert
-
- Returns:
- The document from parameter with additional field containing the filepath.
- """
- document[u'value'][URL_FILE_PATH] = filepath
- return document
-
-
-def save_document(document):
- """
- Saves the document in the couchdb.
-
- Arguments:
- document - document to save
-
- Returns:
- If saving succeeded: the value returned by akara.
- If saving failed: a bunch of error logs is written - returns False.
-
- """
- logging.info("Updating document in database")
- h = httplib2.Http()
- h.force_exception_as_status_code = True
- url = join(conf['AKARA_SERVER'], conf['UPDATE_DOCUMENT_URL'], document[u'id'])
- logging.debug("Calling url: " + url)
- doc = json.dumps(document[u'value'])
- resp, content = h.request(url, 'POST', body=doc)
- if str(resp.status).startswith('2'):
- return content
- else:
- logging.error("Couldn't update document [id=%s]" % (document[u'id']))
- logging.error(" … with data: %s" % (pp.pformat(document)))
- logging.error(" … with raw data: %s" % (doc,))
- return False
-
-
-def configure_logger(config_file):
- """
- Configures logging for the script.
-
-
- Currently this is a very simple imeplemtation,
- it just reads the configuration from a file.
-
- Arguments:
- config_file String - path to the config file.
-
- Returns:
- Nothing, however there is an exception thrown if the file is missing,
- or there is something wrong with it.
- """
- logging.config.fileConfig(config_file)
-
-
-def process_config(config_file):
- """
- Reads the config file and parses options.
-
- Arguments:
- config_file String - path to the config file
-
- Returns:
- Dictionary with values read from the config file.
- """
- import ConfigParser
- config = ConfigParser.ConfigParser()
- config.read(config_file)
- res = {}
- # the names of config settings expected to be in the config file
- names = ['AKARA_SERVER', 'GET_DOCUMENTS_URL', 'GET_DOCUMENTS_LIMIT', \
- 'THUMBS_ROOT_PATH', 'UPDATE_DOCUMENT_URL', \
- ]
- for name in names:
- res[name] = config.get('thumbs', name)
- return res
-
-
-def get_documents():
- """
- Downloads a set of documents from couchdb. If there is an error with
- downloading the docuemtns, the script exits.
-
- Arguments:
- None
-
- Returns:
- None
- """
- logging.info('Getting documents from akara.')
- h = httplib2.Http()
- h.force_exception_as_status_code = True
- url = join(conf['AKARA_SERVER'], conf['GET_DOCUMENTS_URL']) + "?limit=%s" % conf['GET_DOCUMENTS_LIMIT']
- logging.debug('Using akara url: ' + url)
- resp, content = h.request(url, 'GET')
- if str(resp.status).startswith('2'):
- return content
- else:
- logging.error("Couldn't get documents using: " + url)
- logging.error("Emergency exit…")
- exit(1)
-
-
-def download_thumbs():
- """
- This is the main script function.
-
- * Downloads documents from couchdb.
- * Downloads images.
- * Updates the documents.
-
- Arguments:
- None
-
- Returns:
- None
- """
- # Get documents from couchdb
- documents = get_documents()
-
- # Convert couchdb reply to json.
- documents = parse_documents(documents)
- logging.info("Got %d documents from akara." % len(documents["rows"]))
-
- # Process all documents.
- for doc in documents["rows"]:
- process_document(doc)
-
-
-def parse_cmd_params():
- """
- Parses options for the script.
-
- Arguments:
- None
-
- Returns:
- (options, args) - pure output from parser.parse_args()
- """
- from optparse import OptionParser
- parser = OptionParser()
- DEFAULT_CONFIG_FILE = 'dpla-thumbs.ini'
- DEFAULT_LOGGER_CONFIG_FILE = 'thumbs.logger.config'
- parser.add_option("-c", "--config",
- dest="config_file",
- help="Config file, if nothing provided, then '%s' will be used." % DEFAULT_CONFIG_FILE,
- default=DEFAULT_CONFIG_FILE)
- parser.add_option("-l", "--logger",
- dest="logger_file",
- help="File with logger configuration, if nothing provided, then %s is used." % DEFAULT_LOGGER_CONFIG_FILE,
- default=DEFAULT_LOGGER_CONFIG_FILE)
- return parser.parse_args()
-
-
-def validate_params(options, args):
- """
- Validates if provided paramters are OK.
-
- Checks if the provided params exist.
- Checks if the config files exist.
-
- Exits program if any of the rules is violated.
-
- Arguments:
- options - object returned by the parse_cmd_params()
- args - object returned by the parse_cmd_params()
-
- Returns:
- None
-
- """
- # Logger is not yet configured:
- print ("Using configuration file: %s" % (options.config_file,))
- print ("Using logger configuration file: %s" % (options.logger_file,))
-
- def check_file_exists(filename):
- from os.path import isfile
- if not isfile(filename):
- print "There is no file %s" % filename
- print "exiting, good bye…"
- exit(1)
-
- check_file_exists(options.config_file)
- check_file_exists(options.logger_file)
-
-#################################################################################
-if __name__ == '__main__':
-
- # Parse program params.
- (options, args) = parse_cmd_params()
-
- # Validate the params.
- validate_params(options, args)
-
- # Process the script config file.
- conf = process_config(options.config_file)
-
- # Set up the logger.
- configure_logger(options.logger_file)
-
- logging.info("Script started.")
-
- # Start processing thumbnails.
- download_thumbs()
52 scripts/poll_profiles
@@ -49,10 +49,14 @@ def process_profile(uri_base, profile_f):
if not is_absolute(ENRICH):
ENRICH = URI_BASE + ENRICH
+ getRecord = profile.get(u'get_record', None)
subResources = profile.get(u'subresources')
blacklist = profile.get(u'blacklist',[])
ptype = profile.get(u'type').lower()
- if not subResources: # i.e. all subresources
+ if getRecord:
+ process = TYPE_PROCESSORS.get((ptype,'rec'))
+ process(profile)
+ elif not subResources: # i.e. all subresources
process = TYPE_PROCESSORS.get((ptype,'all'))
process(profile,blacklist)
else:
@@ -67,6 +71,37 @@ def process_profile(uri_base, profile_f):
return True
+def process_primo_all(profile, blacklist=None):
+ # TODO flag to stop requesting
+ request_more = True
+ index = 1
+ while request_more:
+ collection = {}
+ collection['id'] = 1
+ collection['name'] = "mwdl"
+ collection['items'] = []
+ endpoint = "%s&bulkSize=%s&indx=%s" % (profile[u'endpoint_URL'], profile[u'bulk_size'], index)
+
+ resp, content = H.request(endpoint)
+ if not resp[u'status'].startswith('2'):
+ print >> sys.stderr, ' HTTP error (%s) resolving URL: %s' % (resp[u'status'], endpoint)
+ request_more = False
+
+ print >> sys.stderr, "Index: %s" % index
+
+ endpoint_content = ARC_PARSE(content)
+ items = endpoint_content['SEGMENTS']['JAGROOT']['RESULT']['DOCSET']['DOC']
+ for item in (items if isinstance(items, list) else [items]):
+ item['_id'] = item['PrimoNMBib']['record']['control']['recordid']
+ collection['items'].append(item)
+ enrich_coll(profile, collection['name'], json.dumps({'items':collection['items']}))
+ index += int(profile[u'bulk_size'])
+
+ if index > 5000:
+ request_more = False
+
+ return True
+
ARC_PARSE = lambda doc: xmltodict.parse(doc,xml_attribs=True,attr_prefix='',force_cdata=False,ignore_whitespace_cdata=True)
#def skip_cdata(path,key,data):
@@ -157,6 +192,18 @@ def enrich_coll(profile,subr,content):
if not str(resp.status).startswith('2'):
print >> sys.stderr, ' HTTP error with enrichment service: '+repr(resp)
+def process_oai_rec(profile):
+ endpoint = profile[u'get_record']
+ print >> sys.stderr, endpoint
+
+ resp, content = H.request(endpoint)
+ if not str(resp.status).startswith('2'):
+ print >> sys.stderr, ' HTTP error ('+resp[u'status']+') resolving URL: ' + endpoint
+ return False
+
+ subr = profile[u'name']
+ enrich_coll(profile,subr,content)
+
def process_oai_coll(profile,subr):
# For now, a simplifying assumption that string concatenation produces a
# full URI from the combination of the endpoint URL and each subresource id.
@@ -352,6 +399,9 @@ TYPE_PROCESSORS = {
('oai','all'): process_oai_all,
('edan','coll'): None,
('edan','all'): process_edan_all,
+ ('oai','rec'): process_oai_rec,
+ ('primo','coll'): None,
+ ('primo','all'): process_primo_all
}
def define_arguments():
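
process_primo_all pages through the Primo X-Service by advancing indx by bulk_size until a hard cap is reached. A condensed sketch of the paging loop (the endpoint value is illustrative; in practice it comes from profiles/mwdl.pjs):

profile = {u'endpoint_URL': 'http://primo.example.edu/xservice/search/brief?query=...',
           u'bulk_size': '500'}

index = 1
while index <= 5000:   # same effective cap as the hunk above
    endpoint = "%s&bulkSize=%s&indx=%s" % (profile[u'endpoint_URL'],
                                           profile[u'bulk_size'], index)
    # ...fetch endpoint, pull SEGMENTS/JAGROOT/RESULT/DOCSET/DOC items,
    # set each item's _id from control/recordid, then enrich the batch...
    print(endpoint)
    index += int(profile[u'bulk_size'])
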
50 scripts/thumbs.logger.config
@@ -1,50 +0,0 @@
-[loggers]
-keys=root
-
-[logger_root]
-handlers=screen,errorFile,infoFile,debugFile
-level=NOTSET
-
-[handlers]
-keys=screen,errorFile,infoFile,debugFile
-
-[handler_errorFile]
-class=handlers.TimedRotatingFileHandler
-interval=midnight
-backupCount=7
-formatter=simple
-args=('logs/thumbs.error.log',)
-level=ERROR
-
-[handler_infoFile]
-class=handlers.TimedRotatingFileHandler
-interval=midnight
-backupCount=7
-formatter=simple
-args=('logs/thumbs.info.log',)
-level=INFO
-
-[handler_debugFile]
-class=handlers.TimedRotatingFileHandler
-interval=midnight
-backupCount=7
-formatter=simple
-args=('logs/thumbs.debug.log',)
-level=DEBUG
-
-[handler_screen]
-class=StreamHandler
-formatter=simple
-level=INFO
-args=(sys.stdout,)
-
-[formatters]
-keys=simple
-
-[formatter_simple]
-format=%(asctime)s - %(name)s - %(levelname)s [%(lineno)d] - %(message)s
-datefmt=
-
-
-
-
2  setup.py
@@ -10,5 +10,5 @@
url='http://dp.la',
package_dir={'dplaingestion':'lib'},
packages=['dplaingestion','dplaingestion.akamod'],
- scripts=['scripts/poll_profiles','scripts/build_profile'],
+ scripts=['scripts/poll_profiles'],
)
3  test/server_support.py
@@ -117,7 +117,8 @@ class Akara:
"dplaingestion.akamod.copy_prop",
"dplaingestion.akamod.cleanup_value",
"dplaingestion.akamod.sets_prop",
- "dplaingestion.akamod.enrich_language"
+ "dplaingestion.akamod.enrich_language",
+ "dplaingestion.akamod.dpla-get-record"
]
class download_preview:
155 test/test_copy_prop.py
@@ -8,7 +8,7 @@
H = httplib2.Http()
def _get_server_response(body, prop=None, to_prop=None, create=None, key=None,
- remove=None):
+ remove=None, no_replace=None):
url = server() + "copy_prop?prop=%s&to_prop=%s" % (prop, to_prop)
if create:
url = "%s&create=%s" % (url, create)
@@ -16,6 +16,8 @@ def _get_server_response(body, prop=None, to_prop=None, create=None, key=None,
url = "%s&key=%s" % (url, key)
if remove:
url = "%s&remove=%s" % (url, remove)
+ if no_replace:
+ url = "%s&no_replace=%s" % (url, no_replace)
return H.request(url, "POST", body=body, headers=CT_JSON)
def test_copy_prop_rights1():
@@ -537,5 +539,156 @@ def test_copy_prop_to_prop_dict_no_key():
assert resp.status == 200
assert json.loads(content) == EXPECTED
+def test_copy_prop_no_replace1():
+ """Should create list of prop string and append to_prop"""
+ prop = "aggregatedCHO/source"
+ to_prop = "aggregatedCHO/description"
+ no_replace = True
+
+ INPUT = {
+ "aggregatedCHO": {
+ "description" : "Description string.",
+ "source": "Source string."
+ }
+ }
+ EXPECTED = {
+ "aggregatedCHO": {
+ "description": [
+ "Description string.",
+ "Source string."
+ ],
+ "source": "Source string."
+ }
+ }
+
+ resp,content = _get_server_response(json.dumps(INPUT), prop=prop,
+ to_prop=to_prop, no_replace=no_replace)
+ assert resp.status == 200
+ assert json.loads(content) == EXPECTED
+
+def test_copy_prop_no_replace2():
+ """Should create list of prop string and append to_prop"""
+ prop = "aggregatedCHO/source"
+ to_prop = "aggregatedCHO/description"
+ no_replace = True
+
+ INPUT = {
+ "aggregatedCHO": {
+ "description" : "Description string.",
+ "source": ["Source string1.", "Source string2."]
+ }
+ }
+ EXPECTED = {
+ "aggregatedCHO": {
+ "description": [
+ "Description string.",
+ "Source string1.",
+ "Source string2."
+ ],
+ "source": ["Source string1.", "Source string2."]
+ }
+ }
+
+ resp,content = _get_server_response(json.dumps(INPUT), prop=prop,
+ to_prop=to_prop, no_replace=no_replace)
+ assert resp.status == 200
+ assert json.loads(content) == EXPECTED
+
+def test_copy_prop_no_replace3():
+ """Should create list of prop string and append to_prop"""
+ prop1 = "aggregatedCHO/source1"
+ prop2 = "aggregatedCHO/source2"
+ to_prop = "aggregatedCHO/description"
+ no_replace = True
+
+ INPUT = {
+ "aggregatedCHO": {
+ "description" : "Description string.",
+ "source1": "Source1 string1.",
+ "source2": ["Source2 string1.", "Source2 string2."]
+ }
+ }
+ EXPECTED1 = {
+ "aggregatedCHO": {
+ "description": [
+ "Description string.",
+ "Source1 string1."
+ ],
+ "source1": "Source1 string1.",
+ "source2": ["Source2 string1.", "Source2 string2."]
+ }
+ }
+ EXPECTED2 = {
+ "aggregatedCHO": {
+ "description": [
+ "Description string.",
+ "Source1 string1.",
+ "Source2 string1.",
+ "Source2 string2."
+ ],
+ "source1": "Source1 string1.",
+ "source2": ["Source2 string1.", "Source2 string2."]
+ }
+ }
+
+ resp,content = _get_server_response(json.dumps(INPUT), prop=prop1,
+ to_prop=to_prop, no_replace=no_replace)
+ assert resp.status == 200
+ assert json.loads(content) == EXPECTED1
+
+ resp,content = _get_server_response(json.dumps(EXPECTED1), prop=prop2,
+ to_prop=to_prop, no_replace=no_replace)
+ assert resp.status == 200
+ assert json.loads(content) == EXPECTED2
+
+def test_copy_prop_to_prop_create_dict_key1():
+ """Should copy to_prop into new dict with key"""
+ prop1 = "key1"
+ prop2 = "aggregatedCHO/key2"
+ to_prop = "aggregatedCHO/to_dict"
+ key1 = "key1"
+ key2 = "key2"
+ create = True
+
+ INPUT = {
+ "key1": "value1",
+ "aggregatedCHO": {
+ "key2": "value2",
+ "key3": "value3"
+ },
+ "key4": "value4"
+ }
+ EXPECTED1 = {
+ "key1": "value1",
+ "aggregatedCHO": {
+ "key2": "value2",
+ "key3": "value3",
+ "to_dict" : {"key1": "value1"}
+ },
+ "key4": "value4"
+ }
+ EXPECTED2 = {
+ "key1": "value1",
+ "aggregatedCHO": {
+ "key2": "value2",
+ "key3": "value3",
+ "to_dict" : {
+ "key1": "value1",
+ "key2": "value2"
+ }
+ },
+ "key4": "value4"
+ }
+
+ resp,content = _get_server_response(json.dumps(INPUT), prop=prop1,
+ to_prop=to_prop, key=key1, create=create)
+ assert resp.status == 200
+ assert json.loads(content) == EXPECTED1
+
+ resp,content = _get_server_response(json.dumps(EXPECTED1), prop=prop2,
+ to_prop=to_prop, key=key2, create=create)
+ assert resp.status == 200
+ assert json.loads(content) == EXPECTED2
+
if __name__ == "__main__":
raise SystemExit("Use nosetest")
1  test/test_enrich_date.py
@@ -263,6 +263,7 @@ def test_range_with_brackets():
("[ 1960-05-01 - 1960-05-15 ]", "1960-05-01 - 1960-05-15"),
("[1960-05-01 - 1960-05-15]", "1960-05-01 - 1960-05-15"),
("[1960-05-01 / 1960-05-15]", "1960-05-01 / 1960-05-15"),
+ ("[1960-05-01/1960-05-15]", "1960-05-01/1960-05-15"),
]
for r in ranges:
31 test/test_move_date_values.py
@@ -2,10 +2,12 @@
from server_support import server, print_error_log, H
from amara.thirdparty import json
-def _get_server_response(body, prop=None):
+def _get_server_response(body, prop=None, to_prop=None):
url = server() + "move_date_values"
if prop:
url = "%s?prop=%s" % (url, prop)
+ if to_prop:
+ url = "%s&to_prop=%s" % (url, to_prop)
return H.request(url,"POST",body=body)
def test_move_date_values_no_prop():
@@ -202,5 +204,32 @@ def test_move_date_values_subject3():
assert resp.status == 200
assert json.loads(content) == EXPECTED
+def test_move_date_values_to_date():
+ """
+ Should move date values from the spatial field to the date field, removing spatial when all of its values are dates.
+ """
+ prop = "aggregatedCHO/spatial"
+ to_prop = "aggregatedCHO/date"
+ INPUT = {
+ "aggregatedCHO": {
+ "spatial" : [
+ "1861-12-30/1862-07-13",
+ "(1862/12/30 - 1863/07/13)"
+ ]
+ }
+ }
+ EXPECTED = {
+ "aggregatedCHO": {
+ "date": [
+ "1861-12-30/1862-07-13",
+ "1862/12/30 - 1863/07/13"
+ ]
+ }
+ }
+
+ resp,content = _get_server_response(json.dumps(INPUT),prop,to_prop)
+ assert resp.status == 200
+ assert json.loads(content) == EXPECTED
+
if __name__ == "__main__":
raise SystemExit("Use nosetest")