Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP

Comparing changes

Choose two branches to see what’s changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
...
  • 6 commits
  • 3 files changed
  • 0 commit comments
  • 1 contributor
View
296 lib/akamod/edan_to_dpla.py
@@ -46,13 +46,27 @@ def transform_description(d):
description = None
items = arc_group_extraction(d, "freetext", "notes")
for item in (items if isinstance(items, list) else [items]):
- if "@label" in item and item["@label"] == "Notes":
+ if "@label" in item and item["@label"] == "Description":
if "#text" in item:
description = item["#text"]
break;
return {"description": description} if description else {}
+def transform_date(d):
+ logger.debug("DATE")
+ date = None
+ dates = arc_group_extraction(d, "freetext", "date")
+ for item in dates:
+ logger.debug(item)
+ if "@label" in item and "#text" in item:
+ logger.debug("A")
+ if item["@label"] == "Date":
+ date = item["#text"]
+ break
+ logger.debug("END DATE")
+ return {"date": date} if date else {}
+
def extract_date(d, group_key, item_key):
dates = []
items = arc_group_extraction(d, group_key, item_key)
@@ -80,9 +94,13 @@ def is_part_of_transform(d):
items = arc_group_extraction(d, "freetext", "setName")
for item in (items if isinstance(items, list) else [items]):
if "#text" in item:
- is_part_of.append(item["#text"])
+ is_part_of.append({"title": item["#text"]})
- return {"isPartOf": "; ".join(is_part_of)} if is_part_of else {}
+ res = is_part_of
+ if len(res) == 1:
+ res = res[0]
+
+ return {"isPartOf": res} if res else {}
def source_transform(d):
@@ -94,29 +112,38 @@ def source_transform(d):
return {"source": source} if source else {}
-def is_shown_at_transform(d):
- object = "http://research.archives.gov/description/%s" % d["arc-id-desc"]
-
- return {"isShownAt": object}
+def transform_is_shown_at(d):
+ propname = "descriptiveNonRepeating/record_link"
+ obj = getprop(d, propname, False)
+ return {"isShownAt": obj} if obj else {}
def collection_transform(d):
+ import re
collections = []
items = arc_group_extraction(d, "freetext", "setName")
for item in (items if isinstance(items, list) else [items]):
if "#text" in item:
- collections.append(item["#text"])
+ c = item["#text"]
+ c = re.sub(r'[,]', '', c)
+ c = re.sub(r'\s+', '--', c)
+ collections.append(c)
return {"collection": collections} if collections else {}
def creator_transform(d):
- creator = None
+ creator = []
creators = arc_group_extraction(d, "freetext", "name")
for c in (creators if isinstance(creators, list) else [creators]):
if c["@label"] in creator_field_names:
- creator = c["#text"]
- break;
- return {"creator": creator} if creator else {}
+ if "#text" in c:
+ creator.append(c["#text"])
+
+ res = creator
+ if len(creator) == 1:
+ res = res[0]
+
+ return {"creator": res} if res else {}
def transform_format(d):
@@ -125,20 +152,27 @@ def transform_format(d):
formats = arc_group_extraction(d, "freetext", "physicalDescription")
[f.append(e["#text"]) for e in formats if e["@label"] in labels]
- return {"format": f} if f else {}
+ res = f
+ if len(res) == 1:
+ res = res[0]
+ return {"format": res} if res else {}
def transform_rights(d):
p = []
ps = arc_group_extraction(d, "freetext", "creditLine")
if ps != [None]:
- [p.append(e["#text"]) for e in ps if "@label" in e and e["@label"] == "Credit line"]
+ [p.append(e["#text"]) for e in ps if "@label" in e and e["@label"] == "Credit Line"]
ps = arc_group_extraction(d, "freetext", "objectRights")
if ps != [None]:
[p.append(e["#text"]) for e in ps if "@label" in e and e["@label"] == "Rights"]
- return {"rights": p} if p else {}
+ res = p
+ if len(p) == 1:
+ res = p[0]
+
+ return {"rights": res} if res else {}
def transform_publisher(d):
@@ -150,26 +184,167 @@ def transform_publisher(d):
return {"publisher": p} if p else {}
-def transform_place(d):
+def transform_spatial(d):
+ result = []
place = []
- labels = ["Place", "Country", "Site"]
+ location_states = []
+
places = arc_group_extraction(d, "freetext", "place")
- [place.append(e["#text"]) for e in places if e["@label"] in labels]
+ for p in places:
+ if isinstance(p, dict):
+ if "#text" in p:
+ place.append(p["#text"])
+
+ if len(place) == 1:
+ place = place[0]
+
+
+ def convert_location(location, name):
+ """Converts one location to a spatial record."""
+ city_keys = ["City", "Town"]
+ state_keys = ["State", "Province", "Department", "Country", "District", "Republic", "Sea", "Gulf", "Bay"]
+ county_keys = ["County", "Island"]
+ country_L1_keys = ["Continent", "Ocean"]
+ country_L2_keys = ["Country", "Nation", "Sea", "Gulf", "Bay", "Sound"]
+ cities = []
+ states = []
+ counties = []
+ countries = []
+ regions = []
+ points = []
+
+ res = {}
+ def update(res, name, val):
+ if not val:
+ return
+ if len(val) == 1:
+ res.update({name: val[0]})
+ elif val:
+ res.update({name: val})
+
+ for k, v in location.items():
+ logger.debug("k:%s, v:%s" % (k,v))
+ if not ("#text" in v and "@type" in v):
+ continue
+ tp = v["@type"]
+ tx = v["#text"]
+
+ if k == "L5" and tp in city_keys:
+ cities.append(tx)
+ elif k == "L3" and tp in state_keys:
+ states.append(tx)
+ location_states.append(tx)
+ elif k == "L4" and tp in county_keys:
+ counties.append(tx)
+ #elif k == "L1" and tp in country_L1_keys:
+ # countries.append(tx)
+ elif k == "L2" and tp in country_L2_keys:
+ countries.append(tx)
+ elif k in [ "L2", "L3", "L4", "L5"]:
+ regions.append(tx)
+ elif k == "points":
+ logger.debug("POINTS: " + str(v))
+
+
+ update(res, "name", place)
+ update(res, "city", cities)
+ update(res, "state", states)
+ update(res, "county", counties)
+ update(res, "country", countries)
+ update(res, "region", regions)
+ update(res, "lat_long", points)
+
+ return res
+
+ logger.debug("ID:" + d["_id"])
+ geo = arc_group_extraction(d, "indexedStructured", "geoLocation")
+ logger.debug("GEO SIZE: " + str(len(geo)))
+
+ for g in geo:
+
+ if not g:
+ continue
+
+ logger.debug("GEO: " + str(g))
+ loc = convert_location(g, place)
+ if loc:
+ result.append(loc)
+
+
+ logger.debug(result)
+
+ ret = {}
+ if len(result) == 1:
+ ret = {"spatial": result[0]}
+ elif result:
+ ret = {"spatial": result}
+
+ # Also add currentLocation
+ l = list(set(location_states))
+ if len(l) == 1:
+ l = l[0]
+
+ if l:
+ ret.update({"currentLocation": l})
- return {"place": place} if place else {}
+ logger.debug("RESULT: " + str(ret))
+ return ret
+def transform_online_media(d):
+
+
+ media = arc_group_extraction(d, "descriptiveNonRepeating", "online_media")
+ if media == [None]:
+ return {}
+
+ media = media[0]
+ c = 0
+ if "@mediaCount" in media:
+ c = media["@mediaCount"]
+ try:
+ c = int(c)
+ except ValueError as e:
+ logger.error("Couldn't convert %s to int" % c)
+ return {}
+ if not "media" in media:
+ return {}
+
+ m = media
+ if c == 1:
+ m = [media["media"]]
+
+ res = []
+ for mm in m:
+ item = {}
+ if "@type" in mm:
+ item["format"] = mm["@type"]
+ if "rights" in mm:
+ item["rights"] = mm["rights"]
+ if item.keys():
+ res.append(item)
+
+ if len(res) == 1:
+ return {"hasView": res[0]}
+ if len(res) > 1:
+ return {"hasView": res}
+ return {}
+
def transform_title(d):
- p = []
+ p = None
labels = ["Title", "Object Name"]
- ps = arc_group_extraction(d, "title")
+ ps = arc_group_extraction(d, "descriptiveNonRepeating", "title")
+ logger.debug("TITLE")
if ps != [None]:
- [p.append(e["#text"]) for e in ps if e["@label"] in labels]
+ for e in ps:
+ if e["@label"] in labels:
+ p = e["#text"]
+ logger.debug("TITLE " + str(ps))
return {"title": p} if p else {}
-
def transform_subject(d):
+
p = []
ps = arc_group_extraction(d, "freetext", "topic")
if ps != [None]:
@@ -179,21 +354,50 @@ def transform_subject(d):
if ps != [None]:
[p.append(e["#text"]) for e in ps if e["@label"] == "Nationality"]
- return {"subject": p} if p else {}
+ fields = ["topic","name","culture","tax_kingdom","tax_phylum",
+ "tax_division","tax_class","tax_order","tax_family",
+ "tax_sub-family","scientific_name","common_name","strat_group",
+ "strat_formation","strat_member"]
+ if "freetext" in d:
+ for key, item in d["freetext"].items():
+ if key in fields:
+ if "#text" in item:
+ p.append(item["#text"])
+
+ res = list(set(p))
+ #logger.debug("SUBJECT:" + str(res))
+ if len(res) == 1:
+ res = res[0]
+
+ #logger.debug("SUBJECT:" + str(res))
+ return {"subject": res} if res else {}
def transform_identifier(d):
- extent = []
- extents = arc_group_extraction(d, "freetext", "identifier")
- [extent.append(e) for e in extents if e["@label"].startswith("Catalog") or e["@label"].startswith("Accession")]
+ identifier = []
+ ids = arc_group_extraction(d, "freetext", "identifier")
+ [identifier.append(e["#text"]) for e in ids if e["@label"].startswith("Catalog") or e["@label"].startswith("Accession")]
+
+ id = identifier
+ if len(identifier) == 1:
+ id = identifier[0]
+
+ return {"identifier": id} if id else {}
- return {"extent": extent} if extent else {}
+
+def transform_data_provider(d):
+ ds = None
+ dss = arc_group_extraction(d, "descriptiveNonRepeating", "dataProvider")
+ if dss != [None]:
+ ds = dss[0]
+
+ return {"data_source": ds} if ds else {}
def extent_transform(d):
extent = []
extents = arc_group_extraction(d, "freetext", "physicalDescription")
- [extent.append(e) for e in extents if e["@label"] == "Dimensions"]
+ [extent.append(e["#text"]) for e in extents if e["@label"] == "Dimensions"]
return {"extent": extent} if extent else {}
@@ -230,6 +434,22 @@ def subject_and_spatial_transform(d):
return v
+def slugify_field(data, fieldname):
+ if exists(data, fieldname):
+ import re
+ p = getprop(data, fieldname)
+ parts = p.split("/")
+ c = parts[-1:]
+ if c:
+ c = c[0]
+ c = re.sub(r'[,]', '', c)
+ c = re.sub(r'\s+', '--', c)
+ parts[len(parts)-1] = c
+ slugged = "/".join(parts)
+ logger.debug("SLUG:[%s][%s]" % (p, slugged))
+ setprop(data, fieldname, slugged)
+
+
def type_transform(d):
type = []
@@ -326,14 +546,15 @@ def arc_group_extraction(d, groupKey, itemKey, nameKey=None):
"freetext/physicalDescription" : extent_transform,
"freetext/name" : creator_transform,
"freetext/setName" : is_part_of_transform,
- "freetext/date" : lambda d: extract_date(d,"freetext","date"),
+ "freetext/date" : transform_date,
"freetext/notes" : transform_description,
"freetext/identifier" : transform_identifier,
"language" : lambda d: {"language": d.get("language") },
"freetext/physicalDescription" : transform_format,
- "freetext/place" : transform_place,
"freetext/publisher" : transform_publisher,
- "title" : transform_title,
+ "descriptiveNonRepeating/title" : transform_title,
+ "descriptiveNonRepeating/data_source" : transform_data_provider,
+ #"descriptiveNonRepeating/online_media" : transform_online_media,
}
AGGREGATION_TRANSFORMER = {
@@ -345,7 +566,7 @@ def arc_group_extraction(d, groupKey, itemKey, nameKey=None):
"collection" : lambda d: {"collection": d.get("collection")},
}
-@simple_service("POST", "http://purl.org/la/dp/edan-to-dpla", "edan-to-dpla", "application/ld+json")
+@simple_service("POST", "http://purl.org/la/dp/edan_to_dpla", "edan_to_dpla", "application/ld+json")
def edantodpla(body,ctype,geoprop=None):
"""
Convert output of JSON-ified EDAN (Smithsonian) format into the DPLA JSON-LD format.
@@ -382,9 +603,18 @@ def edantodpla(body,ctype,geoprop=None):
#out["aggregatedCHO"].update(type_transform(data))
out["aggregatedCHO"].update(transform_rights(data))
out["aggregatedCHO"].update(transform_subject(data))
+ out["aggregatedCHO"].update(transform_spatial(data))
+ logger.debug(out["aggregatedCHO"])
#out["aggregatedCHO"].update(subject_and_spatial_transform(data))
#out.update(has_view_transform(data))
+ out.update(transform_is_shown_at(data))
+
+
+ logger.debug(out)
+
+ slugify_field(out, "collection/@id")
+
logger.debug("x"*60)
if exists(out, "aggregatedCHO/date"):
View
2  lib/akamod/enrich-format.py
@@ -79,7 +79,7 @@ def is_imt(s):
# Setting the type if it is empty.
f = getprop(data, typefield, True)
- if not f:
+ if not f and exists(data, prop):
format = getprop(data, prop)
use_format = None
if isinstance(format, list) and len(format) > 0:
View
10 profiles/smithsonian.pjs
@@ -4,30 +4,26 @@
"list_sets": "",
"enrichments_rec": [
"/select-id?prop=descriptiveNonRepeating%2Frecord_link",
- "/edan-to-dpla",
+ "/edan_to_dpla",
"/shred?prop=aggregatedCHO%2Fcontributor%2CaggregatedCHO%2Fcreator%2CaggregatedCHO%2Fdate",
"/shred?prop=aggregatedCHO%2Flanguage%2CaggregatedCHO%2Fpublisher%2CaggregatedCHO%2Frelation",
"/shred?prop=aggregatedCHO%2Fsubject%2CaggregatedCHO%2Ftype",
"/shred?prop=aggregatedCHO%2Fsubject&delim=%3Cbr%3E",
"/shred?prop=isShownAt%2Fformat",
- "/move_date_values?prop=aggregatedCHO%2Fsubject",
- "/move_date_values?prop=aggregatedCHO%2Fspatial",
- "/shred?prop=aggregatedCHO%2Fspatial&delim=--",
"/enrich_earliest_date",
"/enrich_date",
"/enrich-subject",
"/cleanup_value",
"/enrich-type",
"/enrich-format",
- "/enrich_location",
"/copy_prop?prop=aggregatedCHO%2Fpublisher&to_prop=dataProvider&create=True&remove=True",
"/enrich_language"
],
"subresources": [],
"contributor": {
"@id": "http://dp.la/api/contributor/smithsonian",
- "name": "Smithsonian"
+ "name": "Smithsonian Institutions"
},
"type": "edan",
- "endpoint_URL": "file:/home/szymon/smithsonian_demo/"
+ "endpoint_URL": "file:/home/szymon/smithsonian/"
}

No commit comments for this range

Something went wrong with that request. Please try again.