# Met Collections Stats

Data should be downloaded from: https://github.com/metmuseum/openaccess/blob/master/MetObjects.csv

Remember to update the "Last Updated" date on the HTML page.

Last commit pulled: 2020-03-24

In [1]:
import json
import operator
import pandas as pd
import re

In [2]:
# Print only the first line of the CSV (its header row) to see the available column names
!head -n 1 MetObjects.csv

﻿Object Number,Is Highlight,Is Public Domain,Is Timeline Work,Object ID,Department,AccessionYear,Object Name,Title,Culture,Period,Dynasty,Reign,Portfolio,Artist Role,Artist Prefix,Artist Display Name,Artist Display Bio,Artist Suffix,Artist Alpha Sort,Artist Nationality,Artist Begin Date,Artist End Date,Artist Gender,Artist ULAN URL,Artist Wikidata URL,Object Date,Object Begin Date,Object End Date,Medium,Dimensions,Credit Line,Geography Type,City,State,County,Country,Region,Subregion,Locale,Locus,Excavation,River,Classification,Rights and Reproduction,Link Resource,Object Wikidata URL,Metadata Date,Repository,Tags,Tags AAT URL


In [3]:
# Columns to load from the CSV, each mapped to the dtype pandas should use for it.
# The same mapping drives both `usecols` and `dtype`, so the two cannot drift apart.
COLUMN_DTYPES = {
    'Is Highlight': bool,
    'Is Public Domain': bool,
    'Object ID': str,
    'Department': str,
    'Title': str,
    'Culture': str,
    'Medium': str,
    'Link Resource': str,
}

df = pd.read_csv(
    "MetObjects.csv",
    usecols=list(COLUMN_DTYPES),
    dtype=COLUMN_DTYPES,
)

In [4]:
# Display the loaded frame; the rendered table elides the middle rows
df

Unnamed: 0,Is Highlight,Is Public Domain,Object ID,Department,Title,Culture,Medium,Link Resource
0,False,False,1,The American Wing,One-dollar Liberty Head Coin,,Gold,http://www.metmuseum.org/art/collection/search/1
1,False,False,2,The American Wing,Ten-dollar Liberty Head Coin,,Gold,http://www.metmuseum.org/art/collection/search/2
2,False,False,3,The American Wing,Two-and-a-Half Dollar Coin,,Gold,http://www.metmuseum.org/art/collection/search/3
3,False,False,4,The American Wing,Two-and-a-Half Dollar Coin,,Gold,http://www.metmuseum.org/art/collection/search/4
4,False,False,5,The American Wing,Two-and-a-Half Dollar Coin,,Gold,http://www.metmuseum.org/art/collection/search/5
...,...,...,...,...,...,...,...,...
474127,False,False,843932,Drawings and Prints,"Naval Scenes–or Sketches Afloat, No. 3: Cooking",,"Etching, hand-colored",http://www.metmuseum.org/art/collection/search...
474128,False,False,843933,Drawings and Prints,Dining on Air–Premeditated Dining–Dining by Ch...,,"Etching, hand-colored",http://www.metmuseum.org/art/collection/search...
474129,False,False,843934,Drawings and Prints,Pie-ty!!!,,"Etching, hand-colored",http://www.metmuseum.org/art/collection/search...
474130,False,False,843935,Drawings and Prints,"Rural Enjoyment, No. 1: ""I Say Bob, How Your M...",,"Etching, hand-colored",http://www.metmuseum.org/art/collection/search...


In [5]:
# Label missing departments and cultures as 'Unknown'; the other columns keep
# their missing values untouched.
missingLabel = 'Unknown'
df = df.assign(
    Department=df['Department'].fillna(missingLabel),
    Culture=df['Culture'].fillna(missingLabel),
)

# Per-column summary (count / unique / top / freq)
df.describe()

Unnamed: 0,Is Highlight,Is Public Domain,Object ID,Department,Title,Culture,Medium,Link Resource
count,474132,474132,474132,474132,444173,474132,466976,474132
unique,2,2,474132,19,235229,7065,63480,474132
top,False,False,648925,Drawings and Prints,Terracotta fragment of a kylix (drinking cup),Unknown,Terracotta,http://www.metmuseum.org/art/collection/search...
freq,472140,466162,1,165995,6415,268053,23506,1


In [6]:
# Collection-wide headline figures.
# The boolean sums are cast to plain int so the dict stays JSON-serialisable.
stats = {
    "total": len(df.index),
    "isHighlight": int(df['Is Highlight'].eq(True).sum()),
    "isPublicDomain": int(df['Is Public Domain'].eq(True).sum()),
    "cultures": len(df["Culture"].unique()),
    "numDepts": len(df["Department"].unique()),
}

stats

{'total': 474132,
 'isHighlight': 1992,
 'isPublicDomain': 7970,
 'cultures': 7065,
 'numDepts': 19}

In [7]:
# Distinct department names; the per-department statistics iterate over this array
depts = pd.unique(df["Department"])
len(depts)

19

In [8]:
# Show the distinct department names held in `depts`
depts

array(['The American Wing', 'European Sculpture and Decorative Arts',
       'Modern and Contemporary Art', 'Arms and Armor', 'Medieval Art',
       'Asian Art', 'Islamic Art', 'Costume Institute',
       'Arts of Africa, Oceania, and the Americas', 'Drawings and Prints',
       'Greek and Roman Art', 'Photographs', 'Ancient Near Eastern Art',
       'European Paintings', 'Robert Lehman Collection', 'The Cloisters',
       'Musical Instruments', 'Egyptian Art', 'The Libraries'],
      dtype=object)

In [9]:
# --- Culture-name clean-up -------------------------------------------------
# The raw 'Culture' strings contain many near-duplicates (qualifiers such as
# "probably", parenthesised notes, market annotations, ...). simplify_culture()
# collapses them so objects can be grouped under one label per culture.

ARMS_AND_ARMOR = "Arms and Armor"

# Words removed from Arms and Armor culture strings only. They look like names of
# object parts ("blade", "hilt", "scabbard", ...) mixed into the culture field.
# An optional leading "; " and trailing " and" are removed with the word.
ARMS_PART_WORDS = re.compile(r"(; )?(crossbow|belt|[bB]lade|[hH]ilt|saddle|collar|bowl|plate|scabbard|staff|flint|guard|gorget|horn|helmet|greaves|sabatons|lever|quiver|[bB]arrels?|[kK]nife|cuirass|armor|spurs|sheath| made | bit |^bit | j |04\.3\.326|cranequin|mail shirt|harness|fittings|lock|gun|spearhead|[mM]ounts?|[gG]rip|restorations|in the style of [a-zA-Z]+|decoration on|decorated)( and)?")

# Qualifiers removed for every department: "for the ... market", hedging words
# (including the misspelling "possilby"), "n.p.", "N/A" and a few stray nouns.
CULTURE_QUALIFIERS = re.compile(r"((for (((the )?[a-zA-Z]+)|export) market)|(((,)? )?([pP]ossibl[ey]|[pP]robably|provincial|[pP]ossilby|[nN]\.[pP]\.|frame|fabric|unknown|Archaeological Complex)(,)?)|([nN]/[aA]\.?))")

# Everything from the first occurrence of each marker onwards is discarded.
CULTURE_CUT_MARKERS = (' (', ':', ';', 'with', 'district', 'upon')


def simplify_culture(culture, dept):
    """Return a simplified culture label for grouping.

    Parameters
    ----------
    culture : str
        Raw value of the 'Culture' column.
    dept : str
        Department the object belongs to. "Arms and Armor" gets extra handling:
        part words are stripped and ';' / ',' separators become " and", so
        multi-culture strings are kept whole. For every other department only
        the text before the first comma is kept.

    Returns
    -------
    str
        The cleaned label, or 'Unknown' when nothing is left after cleaning.
    """
    isArms = dept == ARMS_AND_ARMOR
    name = culture

    if isArms:
        name = ARMS_PART_WORDS.sub('', name)
        name = name.replace(';', ' and')

    # Keep only the text in front of the first cut marker, then drop '?'
    for marker in CULTURE_CUT_MARKERS:
        name = name.split(marker)[0]
    name = name.replace('?', '').strip()

    name = CULTURE_QUALIFIERS.sub('', name)
    name = name.replace('china', 'Chinese').replace('So. German', 'German').replace('Germany', 'German')

    if isArms:
        name = name.strip(',').strip().replace(',', ' and')

    # Tidy up what the removals left behind: empty parentheses, runs of spaces
    # and doubled "and". The fixed pass counts are deliberate: each pass only
    # halves a run of spaces.
    name = name.replace('()', '')
    for _ in range(3):
        name = name.replace('  ', ' ')
    for _ in range(2):
        name = name.replace('and and', 'and')
    for _ in range(3):
        name = name.replace('  ', ' ')
    name = name.strip().strip(',').strip()

    if not isArms:
        name = name.split(',')[0].strip()
        name = re.sub(r"^and", '', name).strip()

    return name or 'Unknown'


# --- Per-department statistics ---------------------------------------------
stats["depts"] = {}

for d in depts:
    dStats = df.loc[df['Department'] == d]
    deptTotal = len(dStats.index)

    # unique() fixes the iteration order; value_counts() tallies every culture in
    # a single pass instead of re-filtering the department frame once per culture.
    cultures = dStats["Culture"].unique()
    cultureCounts = dStats["Culture"].value_counts().to_dict()

    # Sum object counts under the simplified culture names.
    # int() keeps the values plain Python ints so json can serialise them.
    merged = {}
    for c in cultures:
        simpleC = simplify_culture(c, d)
        merged[simpleC] = merged.get(simpleC, 0) + int(cultureCounts[c])

    # Fold cultures holding <1% of the department's objects (or exactly one
    # object) into a single "<1% ea." bucket; 'Unknown' always stays separate.
    grouped = {"<1% ea.": 0}
    for simpleC, count in merged.items():
        if simpleC == "Unknown" or (count != 1 and count / deptTotal >= 0.01):
            grouped[simpleC] = count
        else:
            grouped["<1% ea."] += count
    # Only use the bucketed version when the bucket holds more than one object
    if grouped["<1% ea."] > 1:
        merged = grouped

    stats["depts"][d] = {
        "total": deptTotal,
        "isHighlight": int(dStats['Is Highlight'].sum()),
        "isPublicDomain": int(dStats['Is Public Domain'].sum()),
        "cultures": len(cultures),
        "mediums": len(dStats["Medium"].unique()),
        # Largest groups first; the stable sort keeps ties in insertion order
        "culturesSimplified": dict(sorted(merged.items(), key=lambda kv: kv[1], reverse=True)),
    }

stats

{'total': 474132,
 'isHighlight': 1992,
 'isPublicDomain': 7970,
 'cultures': 7065,
 'numDepts': 19,
 'depts': {'The American Wing': {'total': 18027,
   'isHighlight': 175,
   'isPublicDomain': 770,
   'cultures': 168,
   'mediums': 3607,
   'culturesSimplified': {'American': 13591,
    'Unknown': 1900,
    'British': 811,
    'Chinese': 658,
    '<1% ea.': 573,
    'French': 260,
    'Mexican': 234}},
  'European Sculpture and Decorative Arts': {'total': 42800,
   'isHighlight': 82,
   'isPublicDomain': 671,
   'cultures': 2189,
   'mediums': 5675,
   'culturesSimplified': {'French': 12064,
    'British': 7031,
    'Italian': 6728,
    'German': 4971,
    '<1% ea.': 4855,
    'European': 1389,
    'Dutch': 1181,
    'Spanish': 1062,
    'Chinese': 894,
    'Flemish': 764,
    'Russian': 746,
    'Austrian': 520,
    'Swiss': 468,
    'Unknown': 127}},
  'Modern and Contemporary Art': {'total': 13911,
   'isHighlight': 116,
   'isPublicDomain': 371,
   'cultures': 34,
   'mediums': 393

In [10]:
# Serialise the collected statistics to stats.json as one line of JSON,
# terminated by a newline.
with open('stats.json', 'w') as outFile:
    outFile.write(json.dumps(stats) + "\n")