In [1]:
import re

import numpy as np
import pandas as pd

# API requests to the MET website 

These requests retrieve the dimensions of the art pieces as well as the URLs of their images. 

The first piece of code performs the requests. However, because it takes a long time to run, it is kept here as a 'raw' cell so that it is not executed. Instead, the output files are provided. 

The code was run twice, because an error occurred during the first run. An error log was created after that first run and, to save time, the procedure was not restarted from the beginning but from a little before the point where the error occurred. 

Below, I join the two output files together and treat the acquired table to extract the dimensions I want.

In [2]:
# Load both API output files (line-delimited JSON), one frame per file.
API_file1, API_file2 = (
    pd.read_json(path, orient='values', lines=True)
    for path in ('API_output1.txt', 'API_output2.txt')
)

In [3]:
# Stack the two outputs into one table with a fresh 0..n-1 index.
frames = [API_file1, API_file2]
API_file = pd.concat(frames, ignore_index=True)

In [4]:
# Drop repeated objectIDs (the first occurrence is kept) and show the
# table size before and after.
print(API_file.shape)
API_file = API_file.drop_duplicates(subset='objectID')
print(API_file.shape)

(216477, 58)
(214481, 58)


In [5]:
# Print every entry of the error log, numbered from 1.
with open('API_error_log.txt') as elog:
    for count, line in enumerate(elog, start=1):
        print('Error', count)
        print(line.strip())

Error 1
Error opening url for ID 242189
Error 2
Error opening url for ID 306661


In [6]:
# Columns of the API output that are used in the rest of the notebook.
cols_keep = [
    'objectID',
    'measurements',
    'primaryImageSmall',
]

In [7]:
# Every column of the table that is not in cols_keep (order is irrelevant).
cols_drop = list(set(API_file.columns).difference(cols_keep))

In [8]:
# Discard everything except the columns of interest, then show the new size.
API_file = API_file.drop(columns=cols_drop)
API_file.shape

(214481, 3)

In [9]:
# Preview the first five rows of the reduced table.
API_file.head(n=5)

Unnamed: 0,measurements,objectID,primaryImageSmall
0,"[{'elementName': 'Overall', 'elementDescriptio...",34.0,https://images.metmuseum.org/CRDImages/ad/web-...
1,"[{'elementName': 'Overall', 'elementDescriptio...",37.0,https://images.metmuseum.org/CRDImages/ad/web-...
2,"[{'elementName': 'Overall', 'elementDescriptio...",38.0,https://images.metmuseum.org/CRDImages/ad/web-...
3,"[{'elementName': 'Overall', 'elementDescriptio...",39.0,https://images.metmuseum.org/CRDImages/ad/web-...
4,"[{'elementName': 'Other', 'elementDescription'...",40.0,https://images.metmuseum.org/CRDImages/ad/web-...


In [10]:
# Several kinds of dimensions are stored per piece. Turn the column into text
# and split it on '{' so that each fragment lands in its own column.
measurements_text = API_file['measurements'].astype(str)
API_file['measurements'] = measurements_text
measurements = measurements_text.str.split(pat='{', expand=True)
print(measurements.shape)
measurements.head()

(214481, 115)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,105,106,107,108,109,110,111,112,113,114
0,[,"'elementName': 'Overall', 'elementDescription'...","'Depth': 13.0175, 'Height': 61.9, 'Width': 37....",,,,,,,,...,,,,,,,,,,
1,[,"'elementName': 'Overall', 'elementDescription'...",'Height': 30.5}}],,,,,,,,...,,,,,,,,,,
2,[,"'elementName': 'Overall', 'elementDescription'...",'Height': 31.4}}],,,,,,,,...,,,,,,,,,,
3,[,"'elementName': 'Overall', 'elementDescription'...","'Height': 27.9, 'Width': 22.86}}]",,,,,,,,...,,,,,,,,,,
4,[,"'elementName': 'Other', 'elementDescription': ...","'Diameter': 7.62, 'Height': 16.6688}}]",,,,,,,,...,,,,,,,,,,


In [11]:
# Focus on the dimensions of 'Overall': for every split column, report how many
# rows mention 'Overall' and the share of missing values, and accumulate the
# total number of 'Overall' mentions in cnt.
# The loop walks the actual columns of the table instead of a hard-coded
# range(114), which skipped the last column (114) of the 115-column table.
cnt = 0
n_rows = measurements.shape[0]
for i in measurements.columns:
    # Count the matches once and reuse the value for the total and the report.
    n_overall = measurements[i].str.contains('Overall').sum()
    cnt = cnt + n_overall
    print('col',i,
          ', count"Overall": ', n_overall,
          ', NULL values: ',round((measurements[i].isna().sum())/n_rows*100,1),' %')
cnt

col 0 , count"Overall":  0 , NULL values:  0.0  %
col 1 , count"Overall":  105910 , NULL values:  19.8  %
col 2 , count"Overall":  0 , NULL values:  19.8  %
col 3 , count"Overall":  9857 , NULL values:  86.6  %
col 4 , count"Overall":  0 , NULL values:  86.6  %
col 5 , count"Overall":  2036 , NULL values:  97.0  %
col 6 , count"Overall":  0 , NULL values:  97.0  %
col 7 , count"Overall":  444 , NULL values:  99.2  %
col 8 , count"Overall":  0 , NULL values:  99.2  %
col 9 , count"Overall":  212 , NULL values:  99.6  %
col 10 , count"Overall":  0 , NULL values:  99.6  %
col 11 , count"Overall":  144 , NULL values:  99.8  %
col 12 , count"Overall":  0 , NULL values:  99.8  %
col 13 , count"Overall":  93 , NULL values:  99.9  %
col 14 , count"Overall":  0 , NULL values:  99.9  %
col 15 , count"Overall":  57 , NULL values:  99.9  %
col 16 , count"Overall":  0 , NULL values:  99.9  %
col 17 , count"Overall":  49 , NULL values:  99.9  %
col 18 , count"Overall":  0 , NULL values:  99.9  %
col

118986

In [12]:
# Columns 61 to 114 do not contain dimensions of 'Overall', so remove them.
measurements = measurements.drop(columns=np.arange(61, 115))

In the following cells, I retrieve the dimensions of 'Overall' for each piece, and when no 'Overall' dimensions are given, I take the dimensions given first.

In [13]:
# Build a boolean table with the same shape as `measurements` that is True
# wherever a cell does NOT mention "Overall" (missing cells count as "no mention").
#
# The mask is created on the index of `measurements` itself. `measurements`
# inherits the index of API_file, which has gaps after drop_duplicates; building
# the mask on a fresh 0..n-1 index makes pandas align on labels, so rows whose
# label is >= the number of rows would be missing from the mask and lose their
# 'Overall' selection.
mask_overall = pd.DataFrame(
    {
        col: measurements[col].str.contains("Overall", na=False)
        for col in measurements.columns
    },
    index=measurements.index,
)
mask_overall = ~mask_overall.astype(bool)

In [14]:
# Remove the last mask column (60) and the first measurements column (0).
mask_overall = mask_overall.drop(columns=[60])
measurements = measurements.drop(columns=[0])

In [15]:
# Relabel mask column k as k + 1, for k = 0..59.
dict_shift = {k: k + 1 for k in range(60)}
mask_overall = mask_overall.rename(columns=dict_shift)

In [16]:
# Keep a cell only where the mask is False; everything else becomes NaN.
meas_overall = measurements.where(~mask_overall)

In [17]:
# For each piece take the first non-missing value among the even columns
# 2, 4, ..., 60 of meas_overall; if all of them are missing, fall back to
# column 2 of the unmasked table (the dimensions given first).
#
# fillna is used without inplace=True: the starting Series is pulled straight
# out of meas_overall, and filling it in place would modify (or alias) a column
# of that frame as a side effect.
sel_measurements = meas_overall[2]
for col in range(4, 61, 2):
    sel_measurements = sel_measurements.fillna(meas_overall[col])

sel_measurements = sel_measurements.fillna(measurements[2])

In [18]:
# Share of pieces for which no dimensions could be selected, in percent.
missing_share = sel_measurements.isna().sum() / sel_measurements.shape[0] * 100
print('% of missing values:', round(missing_share, 1), '%')

% of missing values: 19.8 %


In [19]:
# Keep only the text in front of the first closing brace.
brace_parts = sel_measurements.str.split(pat='}', expand=True)
sel_measurements = brace_parts[0]

In [20]:
# Replace the raw measurements text with the selected block (aligned on the index).
API_file = API_file.assign(measurements=sel_measurements)

Now that I have selected the first "block" of dimensions, I want to extract them into numerical columns.

In [21]:
# Split each selected block on ',' into one fragment per column, then pile all
# non-missing fragments into a single Series with a fresh index.
dim_fragments = API_file['measurements'].str.split(',', expand=True)
print('Size of dataframe after expanding: ', dim_fragments.shape)
uniqueDim = dim_fragments.stack(dropna=True).reset_index(drop=True)
print('Size of dataframe after stacking: ', uniqueDim.shape)

Size of dataframe after expanding:  (214481, 6)
Size of dataframe after stacking:  (335084,)


In [22]:
# Unique names of dimensions: keep the part before ':' and strip all spaces,
# then count how often each name occurs.
name_part = uniqueDim.str.split(':', expand=True)[0]
uniqueDim = name_part.str.replace(' ', '')
uniqueDim.value_counts()

'Height'            145515
'Width'             120896
'Depth'              26789
'Diameter'           18113
'Length'             16379
'Weight'              3662
'Thickness'           3126
'Rim'                  167
'LengthatCB'           121
'Foot'                 105
'Body'                  54
'(notspecified)'        40
'Other'                 35
'Unknown'               25
'Circumference'         18
'Base'                  16
'RimExterior'           10
'Caliber'                4
'Capacity'               4
'RimInterior'            3
'Neck'                   2
Name: 0, dtype: int64

In [23]:
# One new column per unique dimension name, except for the names that are
# folded into another column later on.
folded_names = (
    "'(notspecified)'",  # --> Other
    "'Unknown'",         # --> Other
    "'LengthatCB'",      # --> Length
    "'RimExterior'",     # --> Rim
    "'RimInterior'",     # --> Rim
)
new_col = list(uniqueDim.unique())
for folded in folded_names:
    new_col.remove(folded)
new_col

["'Depth'",
 "'Height'",
 "'Width'",
 "'Diameter'",
 "'Length'",
 "'Weight'",
 "'Thickness'",
 "'Caliber'",
 "'Other'",
 "'Foot'",
 "'Rim'",
 "'Base'",
 "'Body'",
 "'Circumference'",
 "'Capacity'",
 "'Neck'"]

In [24]:
# Add every new column to the main table, initialised with zeros on each row.
n_rows = len(API_file)
for col in new_col:
    API_file[col] = np.zeros(n_rows)

In [25]:
# Fill each new column with the value found for that dimension in the selected
# measurements text; rows without that dimension become NaN.
# The extracted text is converted with pd.to_numeric so that the columns are
# numeric, as intended, instead of holding strings such as '61.9'.
for col in new_col:
    print(col)
    # The text looks like "'Height': 27.9, 'Width': 22.86": take what follows
    # "<name>: " and cut it at the next comma.
    col_split = col + ": "
    after_name = API_file['measurements'].str.split(pat=col_split,expand=True)[1]
    value_text = after_name.str.split(pat=',',expand=True)[0]
    API_file[col] = pd.to_numeric(value_text, errors='coerce')

'Depth'
'Height'
'Width'
'Diameter'
'Length'
'Weight'
'Thickness'
'Caliber'
'Other'
'Foot'
'Rim'
'Base'
'Body'
'Circumference'
'Capacity'
'Neck'


In [26]:
# For a few dimensions, the numerical information is stored in one of the
# previously listed new columns (only where that column is still empty).
renamed_dim = ['(notspecified): ', 'Unknown: ', 'LengthatCB: ', 'RimExterior: ', 'RimInterior: ']
to_col = ["'Other'","'Other'","'Length'","'Rim'","'Rim'"]

# The names above are the space-free forms produced when the unique names were
# listed, so they are matched against a space-free copy of the measurements
# text, where every entry reads 'name':value.
compact_measurements = API_file['measurements'].str.replace(' ', '', regex=False)

for dim, col in zip(renamed_dim, to_col):
    print(dim)
    # Search for this dimension's own quoted name. The original code split on
    # the pattern left over from the previous loop, so it copied the values of
    # the last regular dimension instead. re.escape keeps the parentheses of
    # '(notspecified)' literal.
    dim_name = dim.rstrip(': ')
    pattern = re.escape("'" + dim_name + "':") + r"([^,]*)"
    selection = compact_measurements.str.extract(pattern, expand=False)
    # Assign the result back: an in-place fillna on API_file[col] is a chained
    # operation and is not guaranteed to update API_file.
    API_file[col] = API_file[col].fillna(pd.to_numeric(selection, errors='coerce'))

(notspecified): 
Unknown: 
LengthatCB: 
RimExterior: 
RimInterior: 


Finally, the null values are replaced by 0 and the columns are renamed without the surrounding quotes.

The table is then stored in a csv file to be used on the main script.

In [27]:
# Remove rows that are entirely empty and put 0 in every remaining gap.
API_file = API_file.dropna(axis='index', how='all').fillna(0)

# Column names without the quote characters around them.
new_col_rename = [quoted.split("'")[1] for quoted in new_col]

dict_col_rename = {old: new for old, new in zip(new_col, new_col_rename)}
API_file = API_file.rename(columns=dict_col_rename)

API_file.head(10)

Unnamed: 0,measurements,objectID,primaryImageSmall,Depth,Height,Width,Diameter,Length,Weight,Thickness,Caliber,Other,Foot,Rim,Base,Body,Circumference,Capacity,Neck
0,"'Depth': 13.0175, 'Height': 61.9, 'Width': 37.1",34.0,https://images.metmuseum.org/CRDImages/ad/web-...,13.0175,61.9,37.1,0.0,0,0,0,0,0,0,0,0,0,0,0,0
1,'Height': 30.5,37.0,https://images.metmuseum.org/CRDImages/ad/web-...,0.0,30.5,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0
2,'Height': 31.4,38.0,https://images.metmuseum.org/CRDImages/ad/web-...,0.0,31.4,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0
3,"'Height': 27.9, 'Width': 22.86",39.0,https://images.metmuseum.org/CRDImages/ad/web-...,0.0,27.9,22.86,0.0,0,0,0,0,0,0,0,0,0,0,0,0
4,"'Diameter': 7.62, 'Height': 16.6688",40.0,https://images.metmuseum.org/CRDImages/ad/web-...,0.0,16.6688,0.0,7.62,0,0,0,0,0,0,0,0,0,0,0,0
5,"'Diameter': 7.4613, 'Height': 15.875",41.0,https://images.metmuseum.org/CRDImages/ad/web-...,0.0,15.875,0.0,7.4613,0,0,0,0,0,0,0,0,0,0,0,0
6,"'Depth': 35.2426, 'Height': 78.4, 'Width': 63.5",42.0,https://images.metmuseum.org/CRDImages/ad/web-...,35.2426,78.4,63.5,0.0,0,0,0,0,0,0,0,0,0,0,0,0
7,"'Depth': 35.2, 'Height': 78.4227, 'Width': 63.5",43.0,https://images.metmuseum.org/CRDImages/ad/web-...,35.2,78.4227,63.5,0.0,0,0,0,0,0,0,0,0,0,0,0,0
8,'Height': 68.5801,44.0,https://images.metmuseum.org/CRDImages/ad/web-...,0.0,68.5801,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0
9,'Height': 68.5801,45.0,https://images.metmuseum.org/CRDImages/ad/web-...,0.0,68.5801,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0


In [28]:
# Write the final table as CSV, without the index column.
output_path = 'API_file.txt'
API_file.to_csv(output_path, index=False)