In [None]:
import json
import operator
import pandas as pd
import re

In [None]:
# Source data: https://github.com/metmuseum/openaccess/blob/master/MetObjects.csv
# Remember to update the "Last Updated" date in the HTML page after refreshing it.
# Last commit pulled: 2019-11-18

# Flag columns are parsed as booleans; every other listed column is read as
# text so that IDs, dates and free-form fields are not coerced to numbers.
_BOOL_COLUMNS = ['Is Highlight', 'Is Public Domain']
_TEXT_COLUMNS = [
    'Object Number', 'Object ID', 'Department', 'Object Name', 'Title',
    'Culture', 'Period', 'Dynasty', 'Reign', 'Portfolio',
    'Artist Role', 'Artist Prefix', 'Artist Display Name',
    'Artist Display Bio', 'Artist Suffix', 'Artist Alpha Sort',
    'Artist Nationality', 'Artist Begin Date', 'Artist End Date',
    'Object Date', 'Object Begin Date', 'Object End Date',
    'Medium', 'Dimensions', 'Credit Line', 'Geography Type',
    'City', 'State', 'County', 'Country', 'Region', 'Subregion',
    'Locale', 'Locus', 'Excavation', 'River', 'Classification',
    'Rights and Reproduction', 'Link Resource', 'Metadata Date',
    'Repository', 'Tags',
]

_column_dtypes = {name: bool for name in _BOOL_COLUMNS}
_column_dtypes.update({name: str for name in _TEXT_COLUMNS})

df = pd.read_csv("MetObjects.csv", dtype=_column_dtypes)

In [None]:
# Quick sanity check on the load: summary statistics of the raw table.
df.describe()

In [None]:
# Only these columns feed the statistics below; everything else is dropped.
_KEPT_COLUMNS = [
    'Is Highlight',
    'Is Public Domain',
    'Object ID',
    'Department',
    'Title',
    'Culture',
    'Medium',
    'Link Resource',
]

# Missing departments/cultures are grouped under an explicit 'Unknown' label
# instead of being left as NaN.
_MISSING_LABELS = {'Department': 'Unknown', 'Culture': 'Unknown'}

trimmed = df[_KEPT_COLUMNS].fillna(_MISSING_LABELS)
trimmed

In [None]:
# Summary statistics of the trimmed table (after the 'Unknown' fill).
trimmed.describe()

In [None]:
# Collection-wide headline numbers. Counts are converted to plain Python ints
# so the dictionary can be serialised with json later on.
_highlight_mask = trimmed['Is Highlight'] == True
_public_domain_mask = trimmed['Is Public Domain'] == True

stats = {
    "total": trimmed.shape[0],
    "isHighlight": int(_highlight_mask.sum()),
    "isPublicDomain": int(_public_domain_mask.sum()),
    "cultures": len(trimmed["Culture"].unique()),
    "numDepts": len(trimmed["Department"].unique()),
}

stats

In [None]:
# Distinct department labels, in order of first appearance in the table.
depts = pd.unique(trimmed["Department"])
len(depts)

In [None]:
# Show the distinct department labels themselves.
depts

In [None]:
# Per-department statistics: object counts, the object IDs of public-domain
# highlights, and a culture breakdown keyed by cleaned-up culture names.

ARMS_AND_ARMOR = "Arms and Armor"
RARE_BUCKET = "<1% ea."
RARE_SHARE = 0.01  # cultures below this share of a department are pooled

# Arms and Armor culture strings describe individual object parts
# ("hilt, ...; blade, ..."); these part names and filler words are removed.
_ARMS_PART_RE = re.compile(r"(; )?(crossbow|belt|[bB]lade|[hH]ilt|saddle|collar|bowl|plate|scabbard|staff|flint|guard|gorget|horn|helmet|greaves|sabatons|lever|quiver|[bB]arrels?|[kK]nife|cuirass|armor|spurs|sheath| made | bit |^bit | j |04\.3\.326|cranequin|mail shirt|harness|fittings|lock|gun|spearhead|[mM]ounts?|[gG]rip|restorations|in the style of [a-zA-Z]+|decoration on|decorated)( and)?")

# Hedging words, "for ... market" phrases and N/A markers removed everywhere.
_QUALIFIER_RE = re.compile(r"((for (((the )?[a-zA-Z]+)|export) market)|(((,)? )?([pP]ossibl[ey]|[pP]robably|provincial|[pP]ossilby|[nN]\.[pP]\.|frame|fabric|unknown|Archaeological Complex)(,)?)|([nN]/[aA]\.?))")

# Everything from the first occurrence of each marker onwards is discarded.
_CUT_MARKERS = (' (', ':', ';', 'with', 'district', 'upon')

_SPACE_RUN_RE = re.compile(r" {2,}")
# Word-bounded so that names merely ending in "and" ("Thailand and ...")
# are not mistaken for a duplicated conjunction.
_REPEATED_AND_RE = re.compile(r"\band(?: and)+\b")
_LEADING_AND_RE = re.compile(r"^and\b")


def _object_id_from_link(url):
    """Reduce a collection-search URL to its trailing object ID."""
    return url.replace('https://', 'http://').replace('http://www.metmuseum.org/art/collection/search/', '')


def _simplify_culture(raw, dept):
    """Return a cleaned culture label used to merge near-duplicate spellings.

    raw  -- the culture string as it appears in the data.
    dept -- the department the row belongs to; Arms and Armor gets extra
            part-name stripping and keeps multi-culture labels joined by
            "and", while other departments keep only the first culture.

    Returns 'Unknown' when nothing is left after cleaning.
    """
    is_arms = dept == ARMS_AND_ARMOR
    name = raw

    if is_arms:
        name = _ARMS_PART_RE.sub('', name)
        name = name.replace(';', ' and')

    for marker in _CUT_MARKERS:
        name = name.split(marker)[0]
    name = name.replace('?', '').strip()
    name = _QUALIFIER_RE.sub('', name)
    name = name.replace('china', 'Chinese').replace('So. German', 'German').replace('Germany', 'German')

    if is_arms:
        name = name.strip(',').strip().replace(',', ' and')

    # Tidy up debris left by the removals above: empty parentheses, runs of
    # spaces and doubled conjunctions.
    name = name.replace('()', '')
    name = _SPACE_RUN_RE.sub(' ', name)
    name = _REPEATED_AND_RE.sub('and', name)
    name = _SPACE_RUN_RE.sub(' ', name)
    name = name.strip().strip(',').strip()

    if not is_arms:
        name = name.split(',')[0].strip()
        name = _LEADING_AND_RE.sub('', name).strip()

    return name or 'Unknown'


def _bucket_rare_cultures(counts, dept_total):
    """Pool cultures that are singletons or below RARE_SHARE of a department.

    counts     -- dict of simplified culture name -> object count.
    dept_total -- number of objects in the department.

    'Unknown' is never pooled. If the pooled bucket would hold at most one
    object, the original counts are returned unchanged.
    """
    bucketed = {RARE_BUCKET: 0}
    for name, count in counts.items():
        if name == "Unknown" or (count != 1 and count / dept_total >= RARE_SHARE):
            bucketed[name] = count
        else:
            bucketed[RARE_BUCKET] += count
    return bucketed if bucketed[RARE_BUCKET] > 1 else counts


stats["depts"] = {}

for d in depts:
    dept_rows = trimmed[trimmed['Department'] == d]
    dept_total = len(dept_rows.index)
    is_highlight = dept_rows['Is Highlight'] == True
    is_public_domain = dept_rows['Is Public Domain'] == True

    # Rows without a link are skipped (a missing value has no .replace), and
    # IDs are de-duplicated after normalisation so that http/https variants
    # of the same URL collapse into one entry. First-appearance order is kept.
    pd_highlight_links = dept_rows.loc[is_public_domain & is_highlight, "Link Resource"].dropna()
    pd_highlight_ids = list(dict.fromkeys(_object_id_from_link(url) for url in pd_highlight_links))

    # Count every raw culture once up front instead of re-filtering the
    # department frame for each culture.
    raw_counts = dept_rows["Culture"].value_counts()
    raw_cultures = dept_rows["Culture"].unique()

    simplified_counts = {}
    for c in raw_cultures:
        label = _simplify_culture(c, d)
        # int() keeps the values JSON-serialisable (no numpy integers).
        simplified_counts[label] = simplified_counts.get(label, 0) + int(raw_counts[c])

    # Combine cultures with <1% of objects in department b/c not stat significant
    simplified_counts = _bucket_rare_cultures(simplified_counts, dept_total)

    stats["depts"][d] = {
        "total": dept_total,
        "isHighlight": int(is_highlight.sum()),
        "isPublicDomain": int(is_public_domain.sum()),
        "pdHighlights": pd_highlight_ids,
        "cultures": len(raw_cultures),
        "mediums": len(dept_rows["Medium"].unique()),
        # Largest cultures first; the sort is stable, so ties keep
        # first-appearance order.
        "culturesSimplified": dict(
            sorted(simplified_counts.items(), key=operator.itemgetter(1), reverse=True)
        ),
    }

stats

In [None]:
# Persist the collected statistics as a single line of JSON, newline-terminated.
serialized_stats = json.dumps(stats)
with open('stats.json', 'w') as out_file:
    out_file.write(serialized_stats + "\n")