# SI Collection Stats - stats.json generation

(`ujson` reads and writes faster, but it does not escape text properly, so `json` is used to write the final output.)

In [1]:
import datetime
import json
import re
import ujson

Key for variable naming:
`stats` is the JSON object that will be output at the end

We're looking for the following properties overall and per dept:
* Count (total)
* Departments
* Country
* On exhibit
* Age (by decade)
* Taxonomy (kingdom and phylum only; tracked overall, not per dept)
* ~Type (general & specific)~ (TODO)
* ~Public Domain~ (TODO: all metadata here is CC0, so the total should come from collections.si.edu, or from checking that a record has online media and that the media is CC0)
* ~Culture~ (TODO)

In [2]:
# Accumulator for everything written to stats.json. Overall counters live at
# the top level; per-department counters are added under "depts" as
# departments are encountered.
stats = {
    "total": 0,
    "depts": {},
    # "type_general": {},   # TODO
    # "type_specific": {},  # TODO
    # "public_domain": 0,   # TODO
    "on_exhibit": 0,
    "country": {},
    "age": {},
    "taxonomy": {
        "count": 0,
        "kingdom": {},
    },
}

In [3]:
# Current calendar year (local time); used as the upper bound when
# sanity-checking record dates.
current_year = datetime.date.today().year

In [4]:
# Country names scraped from the topojson data.
# Read explicitly as UTF-8: the names include non-ASCII characters (e.g.
# "Côte d'Ivoire"), and the platform default encoding is not UTF-8 everywhere.
with open('countries.json', 'r', encoding='utf-8') as cin:
    valid_countries = json.load(cin)


# Mapping of the following to topojson country names:
# * Typos
# * Other country name formattings (e.g. common vs historical vs formal vs topojson names)
# * Abbreviations
# * Accented letters
# Assumes "Islands" has been replaced with "Is." (since that fixes many topojson nuance cases).
# Every value must itself be a topojson name: sanitize_country returns the
# mapped value as-is and does not resolve it a second time.
c_map = {
    "England": "United Kingdom",
    "Britain": "United Kingdom",
    "Great Britain": "United Kingdom",
    "United States": "United States of America",
    "US": "United States of America",
    "USA": "United States of America",
    # Typo seen in the data; must point at the topojson name, not at another alias.
    "United Staes": "United States of America",
    'Antigua and Barbuda': 'Antigua and Barb.',
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Bosnia and Herzegovina': 'Bosnia and Herz.',
    'Brunei Darussalam': 'Brunei',
    'Central African Republic': 'Central African Rep.',
    "Democratic People's Republic of Korea": 'North Korea',
    'Democratic Republic of the Congo': 'Dem. Rep. Congo',
    'Dominican Republic': 'Dominican Rep.',
    'Equatorial Guinea': 'Eq. Guinea',
    'Iran (Islamic Republic of)': 'Iran',
    "Lao People's Democratic Republic": 'Laos',
    'Micronesia (Federated States of)': 'Micronesia',
    'Republic of Korea': 'South Korea',
    'Republic of Moldova': 'Moldova',
    'Russian Federation': 'Russia',
    'Saint Kitts and Nevis': 'St. Kitts and Nevis',
    'Saint Vincent and the Grenadines': 'St. Vin. and Gren.',
    'Sao Tome and Principe': 'São Tomé and Principe',
    'South Sudan': 'S. Sudan',
    'Swaziland': 'eSwatini',
    'Syrian Arab Republic': 'Syria',
    'The former Yugoslav Republic of Macedonia': 'Macedonia',
    'United Republic of Tanzania': 'Tanzania',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Viet Nam': 'Vietnam',
    "People's Republic of China": "China",
    "Republic of the Congo": "Congo",
    "Burma": "Myanmar",
    "New Guinea": "Papua New Guinea",
    "Colonial America": "United States of America",
    'Micronesia, Federated States of': 'Micronesia',
    "Czech Republic": "Czechia",
    "French Canada": "Canada",
    "U.S. Virgin Islands": "U.S. Virgin Is.",
    "Cape Verde": "Cabo Verde",
    "Curacao": "Curaçao",
    # NOTE(review): French Guiana is not Guyana; confirm which topojson
    # feature (if any) these records should be counted under.
    "French Guiana": "Guyana",
    "French Polynesia": "Fr. Polynesia",
    "Northern Mariana Is.": "N. Mariana Is.",
    "México": "Mexico",
    "British Virgin Islands": "British Virgin Is.",
    "Virgin Islands of the United States": "U.S. Virgin Is.",
    "Congo, Democratic Rep. Of": "Dem. Rep. Congo",
    "Burma (Myanmar)": "Myanmar",
    "Federated States of Micronesia": "Micronesia",
    "British Indian Ocean Territory": "Br. Indian Ocean Ter.",
    "Cote d'Ivoire": "Côte d'Ivoire",
    "Congo, Democratic Republic of": "Dem. Rep. Congo",
    "Western Samoa": "Samoa",
    "Belau": "Palau",
    "Belau [Palau]": "Palau",
    "Republic of Palau": "Palau",
    "St. Lucia": "Saint Lucia",
    "Ivory Coast": "Côte d'Ivoire",
    "United States Minor Outlying Is.": "U.S. Minor Outlying Is.",
    "Trinidad": "Trinidad and Tobago",
    "Tobago": "Trinidad and Tobago",
    "Republic of Ireland": "Ireland",
    "Virgin Is. of the United States": "U.S. Virgin Is.",
    "Northwestern Hawaiian Is.": "United States of America",
    "Kingdom of Tonga": "Tonga",
    "Republic of Fiji": "Fiji",
    "Pitcairn Island": "Pitcairn Is.",
    "West Germany": "Germany",
    # Kinshasa is the capital of the Democratic Republic of the Congo;
    # "Congo" on its own is the Republic of the Congo (see above).
    "Congo (Kinshasa)": "Dem. Rep. Congo",
    "Main Hawaiian Is.": "United States of America",
    "Holland": "Netherlands",
    "Antigua": "Antigua and Barb.",
    "St. Helena": "Saint Helena",
    "Saint Helena Island": "Saint Helena",
    "Hawaiian Is.": "United States of America",
    "Myanmar [Burma]": "Myanmar",
    "Kazakstan": "Kazakhstan",
    "South Sandwich Is.": "S. Geo. and the Is.",
    "Vanuatu (Republic of)": "Vanuatu",
    "Tahiti": "Fr. Polynesia",
    "Zaire": "Dem. Rep. Congo",
    "Virgin Is. (U.S.)": "U.S. Virgin Is.",
    "Johnston Atoll": "U.S. Minor Outlying Is.",
    "Navassa Island": "U.S. Minor Outlying Is.",
    "Sicily": "Italy",
    "Etruria": "Italy",
}

# Substrings that mark a value as uncertain, unstated, or naming alternative
# origins; such values are dropped rather than guessed at.
# " and " is deliberately NOT listed: several real country names and aliases
# contain it (e.g. "Antigua and Barbuda", "Trinidad and Tobago"). A value that
# joins two separate countries with " and " still fails the lookup below and
# is rejected that way.
_UNCERTAIN_MARKERS = ("?", "/", " or ", "not certain", "Unknown", "Not Stated", "Not Given")

# Lower-cased name or alias -> topojson name. Built on the first call to
# sanitize_country (instead of once per record) and reset whenever this cell
# is re-run, so edits to c_map / valid_countries in this cell are picked up.
_country_lookup = None


def sanitize_country(i):
    """Map a raw country string to its topojson country name.

    Resolution order:
      1. exact match against ``valid_countries`` (returned unchanged);
      2. values containing an uncertainty marker are rejected;
      3. a leading "now " / "The " / "the " is stripped and "Islands" is
         abbreviated to "Is.";
      4. case-insensitive lookup against ``c_map`` aliases and
         ``valid_countries`` (a valid name wins over an alias with the same
         lower-cased spelling).

    Args:
        i: raw country value from a record.

    Returns:
        The topojson country name, or "" if the value is empty, not a string,
        or cannot be sanitized.
    """
    global _country_lookup

    if not isinstance(i, str) or i == "":
        return ""

    if i in valid_countries:
        return i

    # remove entries with uncertainty and multiple origins
    if any(marker in i for marker in _UNCERTAIN_MARKERS):
        return ""

    # Remove qualifiers and apply the "Islands" -> "Is." abbreviation that the
    # c_map keys assume
    c = re.sub(r"^now ", "", i)
    c = re.sub(r"^[Tt]he ", "", c)
    c = c.replace("Islands", "Is.")

    if _country_lookup is None:
        lookup = {alias.lower(): name for alias, name in c_map.items()}
        # Added second so that a valid topojson name overrides any alias that
        # lower-cases to the same key.
        lookup.update((name.lower(), name) for name in valid_countries)
        _country_lookup = lookup

    # "" when the value cannot be sanitized
    return _country_lookup.get(c.lower(), "")

In [5]:
# Summarize: stream the NDJSON export one record per line and fold each record
# into `stats` (overall counters plus per-department counters).
# NOTE: this cell only ever adds to `stats`. Re-run the cell that initializes
# `stats` before re-running this one, otherwise every count is accumulated twice.
with open('data/all_data.ndjson', encoding='utf-8') as infile:
    for line in infile:
        data = ujson.loads(line)

        # Get object properties
        unitCode = data['unitCode']
        dataSource = data['content']['descriptiveNonRepeating']['data_source']
        #type_general = data['type']
        #public_domain = False
        #if "online_media_rights" in data['content']['indexedStructured']:
        #    if data['content']['indexedStructured']['online_media_rights'] == 'No Restrictions':
        #        public_domain = True
        on_exhibit = False
        country = ""
        age = ""

        # Check b/c not all do -_-
        if 'indexedStructured' in data['content']:
            indexed = data['content']['indexedStructured']

            # Check if on exhibit
            if indexed.get("onPhysicalExhibit") == "Yes":
                on_exhibit = True

            # Check if we can get a country; prefer most accurate field (Matt Miller method)
            # The last geoLocation entry that has an 'L2' value decides the
            # country, even when it sanitizes to "".
            if 'geoLocation' in indexed:
                for x in indexed['geoLocation']:
                    if 'L2' in x:
                        if isinstance(x['L2'], str):
                            country = sanitize_country(x['L2'])
                        else:
                            country = sanitize_country(x['L2']['content'])

            # Check if we can get an age (most accurate level of detail is decade)
            # e.g. "1990s" -> "1990", "BCE 1000s" -> "-1000"; the last usable value wins.
            if "date" in indexed:
                for x in indexed['date']:
                    age_tmp = x.replace("BCE ", "-").replace("s", "")
                    try:
                        year = int(age_tmp)
                    except ValueError:
                        # Not a plain year/decade: skip this value instead of
                        # aborting the whole pass over the file.
                        continue
                    if year <= current_year: # sanity check for typos
                        age = age_tmp

            # Check for taxonomy data; this is on its own b/c it's across several units and so I'm tracking it as a special case
            if "tax_kingdom" in indexed:
                # Kingdom; assume only one value in list
                kingdom = indexed["tax_kingdom"][0]

                taxonomy = stats["taxonomy"]
                taxonomy["count"] += 1

                if kingdom not in taxonomy["kingdom"]:
                    taxonomy["kingdom"][kingdom] = {"count": 0, "phylum": {}}
                kingdom_stats = taxonomy["kingdom"][kingdom]
                kingdom_stats["count"] += 1

                # Phylum; assume only one value in list. Records with a
                # kingdom but no phylum are counted under "Unknown".
                if "tax_phylum" in indexed:
                    phylum = indexed["tax_phylum"][0]
                else:
                    phylum = "Unknown"

                if phylum not in kingdom_stats["phylum"]:
                    kingdom_stats["phylum"][phylum] = {"count": 0}
                kingdom_stats["phylum"][phylum]["count"] += 1

        # If department does not exist, create it
        if dataSource not in stats["depts"]:
            stats["depts"][dataSource] = {
                "count": 0,
                #"public_domain": 0,
                #"type_general": {},
                #"type_specific": {},
                "on_exhibit": 0,
                "country": {},
                "age": {},
                # unit code of the first record seen for this department
                "unit_code": unitCode,
            }
        dept = stats["depts"][dataSource]

        # If general type does not exist, create it
        #if not type_general in stats["type_general"]:
        #    stats["type_general"][type_general] = {}
        #    stats["type_general"][type_general]["count"] = 0
        #    stats["type_general"][type_general]["depts"] = {}
        #    stats["type_general"][type_general]["public_domain"] = 0

        # If specific type does not exist, create it
        # ...

        # Create cross-references one level deep
        #if not type_general in stats["depts"][dataSource]["type_general"]:
        #    stats["depts"][dataSource]["type_general"][type_general] = 0
        #if not dataSource in stats["type_general"][type_general]["depts"]:
        #    stats["type_general"][type_general]["depts"][dataSource] = 0

        # Add stats for this object
        stats["total"] += 1
        dept["count"] += 1
        #stats["depts"][dataSource]["type_general"][type_general] += 1
        #stats["type_general"][type_general]["count"] += 1
        #stats["type_general"][type_general]["depts"][dataSource] += 1
        #if public_domain:
        #    stats["public_domain"] += 1
        #    stats["depts"][dataSource]["public_domain"] += 1
        #    stats["type_general"][type_general]["public_domain"] += 1
        if on_exhibit:
            stats["on_exhibit"] += 1
            dept["on_exhibit"] += 1
        if country != "":
            if country not in stats["country"]:
                stats["country"][country] = 0
            if country not in dept["country"]:
                dept["country"][country] = 0
            stats["country"][country] += 1
            dept["country"][country] += 1
        if age != "":
            if age not in stats["age"]:
                stats["age"][age] = 0
            if age not in dept["age"]:
                dept["age"][age] = 0
            stats["age"][age] += 1
            dept["age"][age] += 1

print("Done")

Done


In [8]:
# Display the accumulated stats dict as this cell's output. The repr is large:
# it contains one nested entry per department.
stats

{'total': 11355839,
 'depts': {'National Anthropological Archives': {'count': 15,
   'on_exhibit': 0,
   'country': {'United States of America': 2,
    'Italy': 1,
    'Pakistan': 1,
    'Slovakia': 1,
    'Venezuela': 1},
   'age': {'1870': 1, '1880': 5, '1890': 3, '1950': 1, '1970': 5},
   'unit_code': 'NAA'},
  'Smithsonian Gardens': {'count': 252,
   'on_exhibit': 0,
   'country': {'United Kingdom': 136,
    'France': 33,
    'China': 5,
    'Italy': 4,
    'Germany': 3,
    'United States of America': 5,
    'India': 3},
   'age': {'1830': 245, '1860': 1, '1910': 2, '1980': 1, '1870': 1},
   'unit_code': 'HAC'},
  'Human Studies Film Archives': {'count': 78,
   'on_exhibit': 0,
   'country': {'United States of America': 54, 'Canada': 22, 'El Salvador': 1},
   'age': {'1970': 43, '1960': 26, '1980': 3, '1950': 3, '1940': 1, '1930': 1},
   'unit_code': 'HSFA'},
  'Smithsonian Libraries': {'count': 15218,
   'on_exhibit': 0,
   'country': {'Norway': 7,
    'Italy': 26,
    'Panama': 

In [7]:
# Serialize the accumulated stats and save them where the site reads them.
with open('docs/stats.json', 'w') as stats_file:
    # Compact output; use json.dumps(stats, indent=4) to pretty-print instead.
    serialized = json.dumps(stats)
    stats_file.write(serialized)