Combination of the following steps & notebooks:
1. Download SI dataset
2. Unzip (from Matt Miller)
3. Find Units (map codes to names)
4. Split by unit
5. Meta-analysis (from Matt Miller)
6. Find Outdoor Sculpture

In [33]:
from datetime import datetime
import bz2
import collections
import glob
import json
import re
import ujson

In [21]:
from datetime import datetime
startTime = datetime.now()
print("Started: " + str(startTime))

stepTime = datetime.now()

Started: 2020-03-04 09:50:01.780202


In [27]:
def printStepTime(text):
    global stepTime
    endTime = datetime.now()
    print("Finished " + text + ": " + str(endTime) + ", took " + str(endTime - stepTime))
    stepTime = endTime

In [3]:
# Download SI data
#!curl -L https://github.com/Smithsonian/OpenAccess/archive/master.zip --output data.zip

printStepTime("downloading")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   127  100   127    0     0    158      0 --:--:-- --:--:-- --:--:--   158
100 2627M    0 2627M    0     0   465k      0 --:--:--  1:36:17 --:--:--  318k   0   508k      0 --:--:--  0:00:28 --:--:--  510k   520k      0 --:--:--  0:00:52 --:--:--  552k 0     0   506k      0 --:--:--  0:01:29 --:--:--  298k   0   508k      0 --:--:--  0:01:38 --:--:--  524k--:--  0:03:58 --:--:--  308kk      0 --:--:--  0:05:14 --:--:--  440k1k      0 --:--:--  0:05:43 --:--:--  390k    0     0   493k      0 --:--:--  0:05:55 --:--:--  566k   0   492k      0 --:--:--  0:05:57 --:--:--  431k-:--:--  0:07:26 --:--:--  405kk      0 --:--:--  0:07:34 --:--:--  655k 0 --:--:--  0:08:31 --:--:--  460kk      0 --:--:--  0:08:54 --:--:--  509k--  0:09:01 --:--:--  549k-:--  0:09:11 --:--:--  471k   0   493k      0 --:--:--  0:09:45 --:--:--  487k--:--:-

In [None]:
# Unzip repo
#!unzip -q data.zip

printStepTime("unzipping repo archive")

In [14]:
# Iterate through data in repo and uncompress it
# Put in one giant file; we'll process better later
with open('all_data.ndjson','w') as out:
  for div in glob.glob('./OpenAccess-master/metadata/objects/*'):
    print('Working on: ',div)
    for file in glob.glob(f'{div}/*'):
      with bz2.open(file, "rb") as f:
        out.write(f.read().decode())

printStepTime("uncompressing data")

Working on:  ./OpenAccess-master/metadata/objects/NAA
Working on:  ./OpenAccess-master/metadata/objects/HAC
Working on:  ./OpenAccess-master/metadata/objects/HSFA
Working on:  ./OpenAccess-master/metadata/objects/SIL
Working on:  ./OpenAccess-master/metadata/objects/SIA
Working on:  ./OpenAccess-master/metadata/objects/CHNDM
Working on:  ./OpenAccess-master/metadata/objects/NPG
Working on:  ./OpenAccess-master/metadata/objects/NMNHFISHES
Working on:  ./OpenAccess-master/metadata/objects/NASM
Working on:  ./OpenAccess-master/metadata/objects/FSG
Working on:  ./OpenAccess-master/metadata/objects/FSA
Working on:  ./OpenAccess-master/metadata/objects/NMNHBOTANY
Working on:  ./OpenAccess-master/metadata/objects/FBR
Working on:  ./OpenAccess-master/metadata/objects/NMAAHC
Working on:  ./OpenAccess-master/metadata/objects/ACM
Working on:  ./OpenAccess-master/metadata/objects/NMNHHERPS
Working on:  ./OpenAccess-master/metadata/objects/NMNHINV
Working on:  ./OpenAccess-master/metadata/objects/N

In [35]:
# Get unit codes and names; count records while at it

count = 0
unitMap = {}
unitCount = {}

# Find units in data
with open('all_data.ndjson') as infile:
    for line in infile:
        data = ujson.loads(line)
        
        count += 1
        
        unitCode = data['unitCode']
        dataSource = data['content']['descriptiveNonRepeating']['data_source']
        if not unitCode in unitMap:
            unitMap[unitCode] = []
        if not dataSource in unitMap[unitCode]:
            unitMap[unitCode].append(dataSource)
        
        if not unitCode in unitCount:
            unitCount[unitCode] = 0
        unitCount[unitCode] += 1

In [36]:
# Sort & output results
print("Found " + str(count) + " records")
od = collections.OrderedDict(sorted(unitMap.items()))
with open('units.txt','w') as out:
    out.write("Total: " + str(count) + " records\n")
    out.write("")
    for k, v in od.items():
        unitStr = k + " (" + str(unitCount[k]) + " records)" + ": " + ";".join(v)
        print(unitStr)
        out.write(unitStr + "\n")


printStepTime("finding units and counting")

Found 11355839 records
ACAH (13 records): Archives Center, National Museum of American History
ACM (249 records): Anacostia Community Museum
CHNDM (40196 records): Cooper Hewitt, Smithsonian Design Museum
FBR (1517 records): Smithsonian Field Book Project
FSA (29 records): Freer Gallery of Art and Arthur M. Sackler Gallery Archives
FSG (3133 records): Freer Gallery of Art and Arthur M. Sackler Gallery
HAC (252 records): Smithsonian Gardens
HMSG (502 records): Hirshhorn Museum and Sculpture Garden
HSFA (78 records): Human Studies Film Archives
NAA (15 records): National Anthropological Archives
NASM (141 records): National Air and Space Museum
NMAAHC (2388 records): National Museum of African American History and Culture
NMAH (1293132 records): National Museum of American History
NMAfA (136 records): National Museum of African Art
NMNHANTHRO (479775 records): NMNH - Anthropology Dept.
NMNHBIRDS (555055 records): NMNH - Vertebrate Zoology - Birds Division
NMNHBOTANY (3675609 records): NM

In [None]:
# Split by units
# TODO: Move into decompress?
#   PRO: Split early
#   CON: Adds more overhead to decompress at that point

with open('all_data.ndjson') as infile:
    for line in infile:
        data = ujson.loads(line)
        unitCode = data['unitCode']
        
        with open('data_unit_' + unitCode + '.ndjson','a') as out:
            out.write(line)

printStepTime("splitting by unit")

In [25]:
# Matt Miller's meta

# this just holds the over all record count
count = 0

# these hold all of the lookup counts we are going to be making
has_media_loopup = {}
has_no_media_loopup = {}
freetext_fields = {}
indexedStructured_fields = {}
date_index = {}
topic_index = {}
topic_index_by_dept = {}
country_index = {}
width_height = {}

# these are the regular expression patterns to look for the mesurments in cm
cmx4 = re.compile(r"([0-9]+\.*[0-9]*)\s+c*m*[×x ]*\s([0-9]+\.*[0-9]*)\s+c*m*[×x ]*([0-9]+\.*[0-9]*)\s+c*m*[×x ]*([0-9]+\.*[0-9]*)\s*cm")
cmx3 = re.compile(r"([0-9]+\.*[0-9]*)\s+c*m*[×x ]*([0-9]+\.*[0-9]*)\s+c*m*[×x ]*([0-9]+\.*[0-9]*)\s*cm")
cmx2 = re.compile(r"([0-9]+\.*[0-9]*)\s+c*m*[×x ]*([0-9]+\.*[0-9]*)\s*c*m")

with open('all_data.ndjson') as infile:

	# ope the big json file and loop through line by line
	for line in infile:

		# parse the line of json into data
		data = ujson.loads(line)	

		count+=1
#		# every 100K records let us know where we are at
#		if count % 100000 == 0:
#			print(count)


		# the department is always there
		dept = data['content']['descriptiveNonRepeating']['data_source']

		# look through each date field and add it to the lookup count
		if 'indexedStructured' in data['content']:
			if 'date' in data['content']['indexedStructured']:
				for date in data['content']['indexedStructured']['date']:
					if date not in date_index:
						date_index[date] = 0
					date_index[date]+=1

		# look through each topic field and add it to the lookup count
		if 'indexedStructured' in data['content']:
			if 'topic' in data['content']['indexedStructured']:
				for topic in data['content']['indexedStructured']['topic']:
					if topic not in topic_index:
						topic_index[topic] = 0
					topic_index[topic]+=1

					if dept not in topic_index_by_dept:
						topic_index_by_dept[dept] = {}

					if topic not in topic_index_by_dept[dept]:
						topic_index_by_dept[dept][topic] = 0

					topic_index_by_dept[dept][topic]+=1



		# look through each geoLocation field and add it to the lookup count
		if 'indexedStructured' in data['content']:
			if 'geoLocation' in data['content']['indexedStructured']:
				for x in data['content']['indexedStructured']['geoLocation']:
					if 'L2' in x:
						if isinstance(x['L2'], str):
							k =x['L2']
						else:
							k = x['L2']['content']

						if k not in country_index:
							country_index[k] = 0

						country_index[k]+=1

		# look through each physicalDescription field see if it has the string "cm" in it, if it does try our 3 reg ex patterns to try and pull out the height and width
		if 'freetext' in data['content']:
			if 'physicalDescription' in data['content']['freetext']:
				for x in data['content']['freetext']['physicalDescription']:
					if 'cm' in x['content']:

						height = None
						width = None

						s = cmx4.search(x['content'])
						if s:
							# print(s)
							# print(s.group(1), s.group(2),s.group(3),s.group(4))
							width = s.group(1)
							height = s.group(2)
						else:
							s = cmx3.search(x['content'])
							if s:
								# print(s)
								# print(s.group(1), s.group(2),s.group(3))
								width = s.group(1)
								height = s.group(2)

							else:
								s = cmx2.search(x['content'])
								if s:

									width = s.group(1)
									height = s.group(2)									
									# print(s)
									# print(s.group(1), s.group(2))
								# else:
								# 	print("SHHHHITTT")
								# 	print(x['content'])


						if width:
							wh_key = width + 'x' + height
							if wh_key not in width_height:
								width_height[wh_key] = {'count':0, 'hw': [width,height]}

							width_height[wh_key]['count']+=1




		
		# Build the basic, has media and field counts
		if 'online_media' in data['content']['descriptiveNonRepeating']:
			if data['content']['descriptiveNonRepeating']['data_source'] not in has_media_loopup:
				has_media_loopup[data['content']['descriptiveNonRepeating']['data_source']] = 0
			

			has_media_loopup[data['content']['descriptiveNonRepeating']['data_source']]+=1
		else:
			if data['content']['descriptiveNonRepeating']['data_source'] not in has_no_media_loopup:
				has_no_media_loopup[data['content']['descriptiveNonRepeating']['data_source']] = 0


			has_no_media_loopup[data['content']['descriptiveNonRepeating']['data_source']]+=1

		if 'freetext' in data['content']:
			for key in data['content']['freetext'].keys():
				if key not in freetext_fields:
					freetext_fields[key] = 0
				freetext_fields[key]+=1


		if 'indexedStructured' in data['content']:
			for key in data['content']['indexedStructured'].keys():
				if key not in indexedStructured_fields:
					indexedStructured_fields[key] = 0
				indexedStructured_fields[key]+=1

In [26]:
# write it out 
ujson.dump(has_media_loopup,open('stats/has_media_loopup.json','w'),indent=2)
ujson.dump(has_no_media_loopup,open('stats/has_no_media_loopup.json','w'),indent=2)
ujson.dump(indexedStructured_fields,open('stats/indexedStructured_fields.json','w'),indent=2)
ujson.dump(freetext_fields,open('stats/freetext_fields.json','w'),indent=2)
ujson.dump(width_height,open('stats/width_height.json','w'),indent=2)
ujson.dump(date_index,open('stats/date_index.json','w'),indent=2)
ujson.dump(topic_index,open('stats/topic_index.json','w'),indent=2)
ujson.dump(country_index,open('stats/country_index.json','w'),indent=2)
ujson.dump(topic_index_by_dept,open('stats/topic_index_by_dept.json','w'),indent=2)

printStepTime("Matt Miller's meta")

Finished Matt Miller's meta: 2020-03-04 11:37:20.455714, took 1:47:18.675512


In [28]:
# Search for Outdoor Sculpture in SI records; output subset

!grep -i "outdoor sculpture" all_data.ndjson > osculpt_grep.ndjson

# Convert ndjson to json
!echo "[" > osculpt_grep.json
!head --lines=-1 osculpt_grep.ndjson | awk '{ print $0 "," }' >> osculpt_grep.json
!tail --lines=1 osculpt_grep.ndjson >> osculpt_grep.json
!echo "]" >> osculpt_grep.json

In [34]:
# Pretty-print json
with open('osculpt_grep.json') as infile:
    parsed = json.load(infile)
    print("Found " + str(len(parsed)) + " Outdoor Sculpture records")
    
    with open('osculpt_grep_pretty.json','w') as out:
        out.write(json.dumps(parsed, indent=4))

printStepTime("searching for Outdoor Sculpture")

Found 17 Outdoor Sculpture records
Finished searching for Outdoor Sculpture: 2020-03-04 11:43:36.673748, took 0:02:23.634237


In [31]:
endTime = datetime.now()
print("Finished: " + str(endTime))
print("Total Time: " + str(endTime - startTime))

Finished: 2020-03-04 11:41:34.127095
Total Time: 1:51:32.346893
