In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import time

In [2]:
api_endpoint = "https://collectionapi.metmuseum.org/public/collection/v1/objects"

In [3]:
params = {
        "departmentIds": 1,  # American Wing department ID
        "isOnView": True,
        "hasImages": True,
        "artistGender":"Female",
        "q": "American"
    }

In [4]:
response = requests.get(api_endpoint, params)

In [5]:
response_data = response.json()

In [6]:
count = response_data['total']
count

18558

In [7]:
response_data.keys()

dict_keys(['total', 'objectIDs'])

In [8]:
all_ids = pd.Series((response_data['objectIDs']))
groups = pd.Series(np.floor(all_ids.index.get_level_values(0) / 80))

In [9]:
id_df = pd.concat([all_ids, groups], axis = 1)
id_df.columns = ['objectID','group']

In [10]:
groups = id_df.groupby('group')

In [11]:
data = []

In [12]:
from tqdm import tqdm

In [13]:
for i,group in tqdm(groups):
    new_data = [requests.get('{}/{}'.format(api_endpoint, oid)).json() for oid in group['objectID']]
    data.append(new_data)
    time.sleep(0.25)    

100%|██████████| 232/232 [19:40<00:00,  5.09s/it]


In [14]:
data_cleaned = np.concatenate(data).tolist()

In [15]:
cleaned_df = pd.DataFrame(data_cleaned)

In [16]:
cleaned_df.to_csv('american_wing.csv', header = True)

In [20]:
cleaned_df.to_pickle('american_wing.pkl')

In [22]:
df_in = pd.read_pickle('american_wing.pkl')

In [24]:
df_in.shape

(18558, 57)

In [26]:
df_in.columns

Index(['objectID', 'isHighlight', 'accessionNumber', 'accessionYear',
       'isPublicDomain', 'primaryImage', 'primaryImageSmall',
       'additionalImages', 'constituents', 'department', 'objectName', 'title',
       'culture', 'period', 'dynasty', 'reign', 'portfolio', 'artistRole',
       'artistPrefix', 'artistDisplayName', 'artistDisplayBio', 'artistSuffix',
       'artistAlphaSort', 'artistNationality', 'artistBeginDate',
       'artistEndDate', 'artistGender', 'artistWikidata_URL', 'artistULAN_URL',
       'objectDate', 'objectBeginDate', 'objectEndDate', 'medium',
       'dimensions', 'measurements', 'creditLine', 'geographyType', 'city',
       'state', 'county', 'country', 'region', 'subregion', 'locale', 'locus',
       'excavation', 'river', 'classification', 'rightsAndReproduction',
       'linkResource', 'metadataDate', 'repository', 'objectURL', 'tags',
       'objectWikidata_URL', 'isTimelineWork', 'GalleryNumber'],
      dtype='object')

In [33]:
highlight_count = df_in['isHighlight'].value_counts()
highlight_count

isHighlight
False    18124
True       434
Name: count, dtype: int64

In [37]:
highlights  = df_in.loc[df_in['isHighlight']]
highlights.shape

(434, 57)

In [39]:
highlights.groupby('artistGender').count() / highlight_count[True]

Unnamed: 0_level_0,objectID,isHighlight,accessionNumber,accessionYear,isPublicDomain,primaryImage,primaryImageSmall,additionalImages,constituents,department,...,classification,rightsAndReproduction,linkResource,metadataDate,repository,objectURL,tags,objectWikidata_URL,isTimelineWork,GalleryNumber
artistGender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,0.889401,0.889401,0.889401,0.889401,0.889401,0.889401,0.889401,0.889401,0.746544,0.889401,...,0.889401,0.889401,0.889401,0.889401,0.889401,0.889401,0.612903,0.889401,0.889401,0.889401
Female,0.110599,0.110599,0.110599,0.110599,0.110599,0.110599,0.110599,0.110599,0.110599,0.110599,...,0.110599,0.110599,0.110599,0.110599,0.110599,0.110599,0.087558,0.110599,0.110599,0.110599


In [42]:
non_highlights = df_in.loc[df_in['isHighlight'] == False]

In [44]:
non_highlights['artistGender'].value_counts() / non_highlights['artistGender'].count()

artistGender
          0.966343
Female    0.033657
Name: count, dtype: float64