Permalink
Browse files

First working version of ingesting Smithsonian.

  • Loading branch information...
1 parent 4ebe16a commit dcf408db80bece91ffd50dd5d8006490f09e7882 Szymon Guz committed Mar 12, 2013
Showing with 86 additions and 44 deletions.
  1. +86 −44 scripts/poll_profiles
View
@@ -218,82 +218,124 @@ def process_oai_all(profile,blacklist=[]):
process_oai_coll(profile,subr)
time.sleep(sleep)
+
def get_current_username():
    """Return the login name of the user owning the current process.

    Looks the user up in the password database by real uid, so the result
    cannot be spoofed by setting the $USER environment variable.
    """
    import os
    import pwd
    # pw_name is the first field of the passwd entry; use the named
    # attribute instead of the magic index [0].
    return pwd.getpwuid(os.getuid()).pw_name
+
+
def create_temp_dir(operation=""):
    """Create and return the path of a fresh private temporary directory.

    The directory name starts with "<username>_<operation>" so stray cache
    directories are easy to attribute to a user and ingest run by hand.
    """
    import tempfile
    return tempfile.mkdtemp(
        prefix="%s_%s" % (get_current_username(), operation))
+
+
def normalize_collection_name(collection_name):
    """Lower-case *collection_name*, replacing runs of characters that are
    awkward in file names (slash, parentheses, spaces, tabs) and runs of
    underscores with a single underscore."""
    import re
    # A single pass is equivalent to substituting each bad character with
    # "_" and then collapsing "_+" runs: adjacent bad characters and
    # pre-existing underscores fold into one replacement together.
    return re.sub(r'[/() \t_]+', '_', collection_name).lower()
+
+
# Used for Smithsonian data
+collections = {}
+
def process_edan_all(profile, blacklist=None):
src_URL = profile.get('endpoint_URL')
assert src_URL.startswith('file:/') # assumes no authority and the non-broken use of //
src_dir = src_URL[5:]
-
+
+ global collections
collections = {}
- os.system("rm /tmp/coll_*")
+ cache_dir = create_temp_dir("ingest_edan")
+ print "Using cache dir: " + cache_dir
print "Walking directory: " + src_dir
- for (root, dirs, files) in os.walk(src_dir):
- for filename in fnmatch.filter(files, '*_DPLA.xml'):
- item_fn = os.path.join(root,filename)
- item_f = open(item_fn,'r')
- documents = ARC_PARSE(item_f)['response']['result']
- item_f.close()
- for item in documents["doc"]:
- desc_non = item["descriptiveNonRepeating"]
- item["_id"] = desc_non["record_ID"]
+ def cache_file_name(cache_dir, collection):
+ f = os.path.join(cache_dir, "coll_" + normalize_collection_name(collection))
+ #print "Using %s for collection: [%s] and cache_dir: [%s]" % (f, collection, cache_dir)
+ return f
- hier_items = item["freetext"]
+ def handle_document(_, item):
+ global collections
- for k, v in hier_items.items():
- x = (v if isinstance(v, list) else [v])
+ desc_non = item["descriptiveNonRepeating"]
+ item["_id"] = desc_non["record_ID"]
- for hi in x:
- if not 'label' in hi:
- continue
+ hier_items = item["freetext"]
- hid = hi['label']
- htitle = hi['#text']
+ for k, v in hier_items.items():
+ x = (v if isinstance(v, list) else [v])
- if hid not in collections:
- # Grab series information from item
- coll = {}
- coll['id'] = hid
- coll['title'] = htitle
- coll['items'] = []
- collections[hid] = coll
- else:
- coll = collections[hid]
+ for hi in x:
+ if not '@label' in hi:
+ continue
+ hid = hi['@label']
+ htitle = hi['#text']
- # Create tmp file to hold collections items
- coll_fn = "/tmp/coll_%s" % coll['id']
- coll_f = open(coll_fn,'a')
- coll_f.write(str(item)+"\n")
- coll_f.close()
-
+ if hid not in collections:
+ # Grab series information from item
+ coll = {}
+ coll['id'] = hid
+ coll['title'] = htitle
+ coll['items'] = []
+ collections[hid] = coll
+ else:
+ coll = collections[hid]
+ # Create tmp file to hold collections items
+ coll_fn = cache_file_name(cache_dir, coll['id'])
+ coll_f = open(coll_fn,'a')
+ coll_f.write(str(item)+"\n")
+ coll_f.close()
+
+ return True
+
+ for (root, dirs, files) in os.walk(src_dir):
+ for filename in fnmatch.filter(files, '*_DPLA.xml'):
+ item_fn = os.path.join(root,filename)
+ print "Processing file: " + item_fn
+ try:
+ item_f = open(item_fn,'r')
+ xmltodict.parse(item_f, item_depth=3, item_callback=handle_document)
+ except Exception as e:
+ print >> sys.stderr, '[ERROR]', e.message
+ else:
+ item_f.close()
+
from collections import OrderedDict
limit = 1000
for cid in collections:
# Open tmp collection file and append items
- coll_fn = "/tmp/coll_%s" % cid
+ coll_fn = cache_file_name(cache_dir, cid)
coll_f = open(coll_fn, 'r')
- lines = coll_f.readlines()
- coll_f.close()
- os.remove(coll_fn)
- step = 0
i = 0
- for line in lines:
+ step = 0
+ for line in coll_f:
collections[cid]['items'].append(eval(line))
i += 1
- if i == limit or line == lines[-1]:
- print >> sys.stderr, "Enriching collection " + cid
-
+ if i == limit:
+ print >> sys.stderr, "Enriching collection [%s]" % cid
enrich_coll(profile,cid,json.dumps(collections[cid]))
del collections[cid]['items'][:]
i = 0
- step += 1
+ if collections[cid]['items']:
+ print >> sys.stderr, "Enriching collection [%s]" % cid
+ enrich_coll(profile,cid,json.dumps(collections[cid]))
+ del collections[cid]['items'][:]
del collections[cid]['items']
+ coll_f.close()
+ print >> sys.stderr, "Removing cache dir [%s]" % cache_dir
+ os.system("rm -rf " + cache_dir)
TYPE_PROCESSORS = {
('arc','coll'): None,

0 comments on commit dcf408d

Please sign in to comment.