# Metadata: Prepare to Apply Manual Metadata to Trade Statistics of the Treaty Ports (1863-1872)


## About
- This script prepares the volume: `Trade statistics of the treaty ports, for the period 1863-1872` for manual metadata application. 
    - **HOLLIS:** https://id.lib.harvard.edu/alma/990058255570203941/catalog
    - **DRS:** https://iiif.lib.harvard.edu/manifests/view/drs:44319007$1i
- Note that this script is tailored specifically for use with the Trade Statistics volume and is not suitable for generalization.
- **Created:** 2023/03/02
- **Updated:** 2023/03/03

## Globals

In [2]:
# path to local util code module
g_util_module_path = '../util'

# path to current mapped inventory file
g_mapped_inventory = './mapped_vendor_inventory.csv'

# path to output file
g_output_file = './manual_metadata_inventory.csv'

Add local path to Jupyter system path

In [3]:
import sys
if g_util_module_path not in sys.path:
    sys.path.append(g_util_module_path)

## Modules

In [4]:
import pandas as pd

## Prepare Inventory

### Read mapped inventory

In [6]:
# read the mapped inventory into a dataframe
mapped_inventory_df = pd.read_csv(g_mapped_inventory)

display(mapped_inventory_df)

Unnamed: 0,file_type,mimetype,filename,filepath,path,drs_id,filename_osn,filepath_osn
0,image,image/jpeg,44319541.jpg,data/image/44319541.jpg,data,44319541,005825557_pt1_00001.innodata.jpg,data/image/005825557_pt1_00001.innodata.jpg
1,image,image/jpeg,44319542.jpg,data/image/44319542.jpg,data,44319542,005825557_pt1_00002.innodata.jpg,data/image/005825557_pt1_00002.innodata.jpg
2,image,image/jpeg,44319543.jpg,data/image/44319543.jpg,data,44319543,005825557_pt1_00003.innodata.jpg,data/image/005825557_pt1_00003.innodata.jpg
3,image,image/jpeg,44319544.jpg,data/image/44319544.jpg,data,44319544,005825557_pt1_00004.innodata.jpg,data/image/005825557_pt1_00004.innodata.jpg
4,image,image/jpeg,44319545.jpg,data/image/44319545.jpg,data,44319545,005825557_pt1_00005.innodata.jpg,data/image/005825557_pt1_00005.innodata.jpg
...,...,...,...,...,...,...,...,...
1581,csv,text/csv,44319948_a.csv,data/csv/44319948_a.csv,data,44319948,005825557_pt3_00138_a.innodata.csv,data/csv/005825557_pt3_00138_a.innodata.csv
1582,csv,text/csv,44319948_b.csv,data/csv/44319948_b.csv,data,44319948,005825557_pt3_00138_b.innodata.csv,data/csv/005825557_pt3_00138_b.innodata.csv
1583,csv,text/csv,44319949.csv,data/csv/44319949.csv,data,44319949,005825557_pt3_00139.innodata.csv,data/csv/005825557_pt3_00139.innodata.csv
1584,csv,text/csv,44319950_a.csv,data/csv/44319950_a.csv,data,44319950,005825557_pt3_00140_a.innodata.csv,data/csv/005825557_pt3_00140_a.innodata.csv


### Create two `DataFrames`: one containing JPEGs and the other containing CSVs

In [7]:
image_df = mapped_inventory_df.loc[mapped_inventory_df['file_type'] == 'image']
print('Num images: {}'.format(len(image_df)))

csv_df = mapped_inventory_df.loc[mapped_inventory_df['file_type'] == 'csv']
print('Num csvs: {}'.format(len(csv_df)))

Num images: 412
Num csvs: 350


### Create new `DataFrame` combining image paths with csv paths
- This routine uses the DRS id to link a specific image with a CSV file

In [37]:
df = pd.DataFrame()
df['drs_id'] = None
df['image_filename_osn'] = None
df['image_path'] = None
df['csv_filename_osn'] = None
df['csv_path'] = None
df['hierarchical_columns'] = False
df['hierarchical_row'] = False
df['computation_ready'] = False
df['table_group'] = False
df['table_group_members'] = None
table_group = False
members = None
for row in image_df.iterrows():
    # get image informtation
    drs_id = row[1].get('drs_id')
    image_filename_osn = row[1].get('filename_osn')
    image_path = row[1].get('filepath_osn')
    image_path_html = '=HYPERLINK("http://localhost/~ceilynboyd/hlhistd/{}")'.format(image_path)
    # get csvs with matching drs_id
    matching_csvs = csv_df.loc[csv_df['drs_id'] == drs_id]
    if (len(matching_csvs)>1):
        table_group = True
        members = ';'.join(list(matching_csvs['filename_osn']))
    for csv_row in matching_csvs.iterrows():
        csv_filename_osn = csv_row[1].get('filename_osn')
        csv_path = csv_row[1].get('filepath_osn')
        csv_path_html = '=HYPERLINK("http://localhost/~ceilynboyd/hlhistd/{}")'.format(csv_path)
        # add a row to the dataframe we're creating
        df.loc[len(df.index)] = [drs_id, image_filename_osn, image_path_html,
                                 csv_filename_osn, csv_path_html,
                                 False, False, False, table_group, members]
display(df)
print('Num rows: {}'.format(len(df)))
df.to_csv(g_output_file,index=False)
    

Unnamed: 0,drs_id,image_filename_osn,image_path,csv_filename_osn,csv_path,hierarchical_columns,hierarchical_row,computation_ready,table_group,table_group_members
0,44319547,005825557_pt1_00007.innodata.jpg,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",005825557_pt1_00007_iii-iv.innodata.csv,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",False,False,False,False,
1,44319552,005825557_pt1_00012.innodata.jpg,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",005825557_pt1_00012_a.innodata.csv,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",False,False,False,True,005825557_pt1_00012_a.innodata.csv;005825557_p...
2,44319552,005825557_pt1_00012.innodata.jpg,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",005825557_pt1_00012_b.innodata.csv,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",False,False,False,True,005825557_pt1_00012_a.innodata.csv;005825557_p...
3,44319553,005825557_pt1_00013.innodata.jpg,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",005825557_pt1_00013_a.innodata.csv,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",False,False,False,True,005825557_pt1_00013_a.innodata.csv;005825557_p...
4,44319553,005825557_pt1_00013.innodata.jpg,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",005825557_pt1_00013_b.innodata.csv,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",False,False,False,True,005825557_pt1_00013_a.innodata.csv;005825557_p...
...,...,...,...,...,...,...,...,...,...,...
345,44319948,005825557_pt3_00138.innodata.jpg,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",005825557_pt3_00138_a.innodata.csv,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",False,False,False,True,005825557_pt3_00138_a.innodata.csv;005825557_p...
346,44319948,005825557_pt3_00138.innodata.jpg,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",005825557_pt3_00138_b.innodata.csv,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",False,False,False,True,005825557_pt3_00138_a.innodata.csv;005825557_p...
347,44319949,005825557_pt3_00139.innodata.jpg,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",005825557_pt3_00139.innodata.csv,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",False,False,False,True,005825557_pt3_00138_a.innodata.csv;005825557_p...
348,44319950,005825557_pt3_00140.innodata.jpg,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",005825557_pt3_00140_a.innodata.csv,"=HYPERLINK(""http://localhost/~ceilynboyd/hlhis...",False,False,False,True,005825557_pt3_00140_a.innodata.csv;005825557_p...


Num rows: 350


### Add additional experimental summary metadata to the `DataFrame`


In [8]:
display(csv_df)

Unnamed: 0,file_type,mimetype,filename,filepath,path,drs_id,filename_osn,filepath_osn
1236,csv,text/csv,44319547_iii-iv.csv,data/csv/44319547_iii-iv.csv,data,44319547,005825557_pt1_00007_iii-iv.innodata.csv,data/csv/005825557_pt1_00007_iii-iv.innodata.csv
1237,csv,text/csv,44319552_a.csv,data/csv/44319552_a.csv,data,44319552,005825557_pt1_00012_a.innodata.csv,data/csv/005825557_pt1_00012_a.innodata.csv
1238,csv,text/csv,44319552_b.csv,data/csv/44319552_b.csv,data,44319552,005825557_pt1_00012_b.innodata.csv,data/csv/005825557_pt1_00012_b.innodata.csv
1239,csv,text/csv,44319553_a.csv,data/csv/44319553_a.csv,data,44319553,005825557_pt1_00013_a.innodata.csv,data/csv/005825557_pt1_00013_a.innodata.csv
1240,csv,text/csv,44319553_b.csv,data/csv/44319553_b.csv,data,44319553,005825557_pt1_00013_b.innodata.csv,data/csv/005825557_pt1_00013_b.innodata.csv
...,...,...,...,...,...,...,...,...
1581,csv,text/csv,44319948_a.csv,data/csv/44319948_a.csv,data,44319948,005825557_pt3_00138_a.innodata.csv,data/csv/005825557_pt3_00138_a.innodata.csv
1582,csv,text/csv,44319948_b.csv,data/csv/44319948_b.csv,data,44319948,005825557_pt3_00138_b.innodata.csv,data/csv/005825557_pt3_00138_b.innodata.csv
1583,csv,text/csv,44319949.csv,data/csv/44319949.csv,data,44319949,005825557_pt3_00139.innodata.csv,data/csv/005825557_pt3_00139.innodata.csv
1584,csv,text/csv,44319950_a.csv,data/csv/44319950_a.csv,data,44319950,005825557_pt3_00140_a.innodata.csv,data/csv/005825557_pt3_00140_a.innodata.csv


In [48]:
tmp_df = pd.read_csv('~/Desktop/005825557_pt1_00030_b.innodata.csv',sep=',',index_col=0)
#tmp_df.info()
#print(tmp_df.columns)
#print(tmp_df.index)
#print(tmp_df.values)
print(tmp_df.shape)
print(tmp_df.size)
#print(tmp_df.axes)


Index(['1863.', 'Unnamed: 2', '1864.', 'Unnamed: 4', '1865.', 'Unnamed: 6',
       '1866.', 'Unnamed: 8', '1867.', 'Unnamed: 10', '1868.', 'Unnamed: 12',
       '1869.', 'Unnamed: 14', '1870.', 'Unnamed: 16', '1871.', 'Unnamed: 18',
       '1872.', 'Unnamed: 20'],
      dtype='object')
(8, 20)
160


In [50]:
tmp2_df = pd.read_csv('~/Desktop/44319604_b.csv',sep=',',index_col=0)
print(tmp2_df.columns)
print(tmp2_df.shape)
print(tmp2_df.size)


Index(['1863.', '1864.', '1865.', '1866.', '1867.', '1868.', '1869.', '1870.',
       '1871.', '1872.'],
      dtype='object')
(12, 10)
120


----------

### Merge Manual File QC with Manual Metadata 

In [1]:
#
# load required files
#

# load full vendor inventory
vendor_inventory_df = pd.read_csv('./mapped_vendor_inventory.csv',index_col=None)

# load manually reviewed qc file 
manual_qc_df = pd.read_csv('../tmp/manual_file_qc.csv',index_col=None)

# load manually updated metadata file (metadata about csv files)
manual_metadata_df = pd.read_csv('../tmp/manual_metadata.csv',index_col=None)

#
# create new inventory dataframe
#

# new columns
columns = ['drs_id','url','filename_osn', 'filepath_osn', 'file_type','related_tables',
           'table_title','table_type','multilevel_columns','multilevel_rows','computation_ready',
           'table_group','table_group_members','shape','size',
           'image_handwriting','image_two_page','related_image']

# scope notes for the new columns
column_scope_notes = {
    'drs_id':'DRS id associated with the file',
    'url':'IIIF url to resource in DRS',
    'filename_osn':'Name of file mapped to owner-supplied name',
    'filepath_osn':'Full path to file using owner-supplied name',
    'file_type':'One of: image, txt, alto, or csv',
    'related_tables':'List of csv tables related to this file',
    'table_title':'Title of the csv table, if file_type = csv',
    'table_type':'One of Categorical, Comparison, Empty, Summary, Missing, or Other',
    'multilevel_columns':'True or False',
    'multilevel_rows':'True or False',
    'computation_ready':'True or False',
    'table_group':'True or False',
    'table_group_members':'List of filenames of related tables',
    'shape':'Array: [width, height]',
    'size':'Total number of cells in the table, width x height',
    'image_handwriting':'True or False',
    'image_two_page':'True or False',
    'related_image':'Filename of related image'
}

# create new dataframe with specififed columns
combined_df = pd.DataFrame(columns=columns)

# gather some information keyed on DRS id as we traverse vendor_inventory_df, manual_qc_df, and manual_metadata_df
drs_urls = {}
drs_image_filesnames = {}
drs_csv_filenames = {}
drs_handwriting = {}
drs_two_page = {}
drs_related_tables = {}

# process the vendor_inventory_df
for row in vendor_inventory_df.iterrows():
    # collect variables
    drs_id = row[1].get('drs_id')
    file_type = row[1].get('file_type')
    filename_osn = row[1].get('filename_osn')
    filepath_osn = row[1].get('filepath_osn')
    # set conditional variable values
    url = ''
    table_title = ''
    table_type = pd.NA
    multilevel_columns = pd.NA
    multilevel_rows = pd.NA
    computation_ready = pd.NA
    related_tables = []
    table_group = False
    table_group_members = []
    image_handwriting = False
    image_two_page = False
    shape = []
    size = pd.NA
    related_image = ''
    
    # append the row to the combined dataframe
    combined_df.loc[len(combined_df.index)] = [drs_id,url,filename_osn,filepath_osn,file_type,related_tables,
                                                table_title,table_type,multilevel_columns,multilevel_rows,computation_ready,
                                                table_group,table_group_members,shape,size,
                                                image_handwriting,image_two_page,related_image]

display(combined_df)

NameError: name 'pd' is not defined

In [10]:
# load the files
manual_qc_df = pd.read_csv('../tmp/manual_file_qc.csv',index_col=None)
manual_metadata_df = pd.read_csv('../tmp/manual_metadata.csv',index_col=None)
# create new dataframe
columns = ['drs_id','url','image_filename_osn','csv_filename_osn','is_table','is_image','is_text',
           'title','table_type','multilevel_columns','multilevel_rows','computation_ready',
           'table_group','table_group_members',
           'handwriting','two_page','shape','size']
combined_df = pd.DataFrame(columns=columns)

# gather some information as we traverse the two dataframes
drs_urls = {}
drs_image_filesnames = {}
drs_csv_filenames = {}
drs_handwriting = {}
drs_two_page = {}

# add data to dataframe
for row in manual_qc_df.iterrows():
    drs_id = row[1].get('drs_id')
    image_filename_osn = ''
    csv_filename_osn = ''
    is_table = False
    is_image = False
    is_text = False
    title = ''
    table_type = pd.NA
    multilevel_columns = pd.NA
    multilevel_rows = pd.NA
    computation_ready = pd.NA
    table_group = False
    table_group_members = []
    handwriting = False
    two_page = False
    shape = []
    size = pd.NA
    url = row[1].get('url')
    drs_urls[drs_id] = url
    note = row[1].get('note')
    if note == '2-page tables':
        two_page = True
    elif note == 'handwriting':
        handwriting = True
    drs_handwriting[drs_id] = handwriting
    drs_two_page[drs_id] = two_page
    combined_df.loc[len(combined_df.index)] = [drs_id,url,image_filename_osn,csv_filename_osn,
                                               is_table,is_image,is_text,
                                               title,table_type,
                                               multilevel_columns,multilevel_rows,computation_ready,
                                               table_group,table_group_members,
                                               handwriting,two_page,shape,size]

# add data to dataframe
for row in manual_metadata_df.iterrows():
    drs_id = row[1].get('drs_id')
    image_filename_osn = row[1].get('image_filename_osn')
    drs_image_filesnames[drs_id] = image_filename_osn
    csv_filename_osn = row[1].get(csv_filename_osn)
    drs_csv_filenames[drs_id] = csv_filename_osn
    title = row[1].get('title')
    table_type = row[1].get('table_type')
    multilevel_columns = row[1].get('multilevel_columns')
    multilevel_rows = row[1].get('multilevel_rows')
    computation_ready = row[1].get('computation_ready')
    table_group = row[1].get('table_group')
    table_group_members = row[1].get('table_group_members')
    handwriting = False
    two_page = False
    shape = []
    size = pd.NA
    url = drs_urls[drs_id]
    combined_df.loc[len(combined_df.index)] = [drs_id,url,image_filename_osn,csv_filename_osn,
                                               is_table,title,table_type,
                                               multilevel_columns,multilevel_rows,computation_ready,
                                               table_group,table_group_members,
                                               handwriting,two_page,shape,size]

# go back and reprocess the dataframe, filling in stored information
for row in combined_df.iterrows():
    drs_id = row[1].get('drs_id')
    matching_df = combined_df.loc[combined_df['drs_id'] == drs_id]
    for r in matching_df.iterrows():
        # populate missing fields, if needed
        index = row[0]
        



**End document.**