# QC: Trade Statistics of the Treaty Ports (1863-1872)


## About
- Interactive QC workflow for the volume: `Trade statistics of the treaty ports, for the period 1863-1872`
    - **HOLLIS:** https://id.lib.harvard.edu/alma/990058255570203941/catalog
    - **DRS:** https://iiif.lib.harvard.edu/manifests/view/drs:44319007$1i
- **Created:** 2023/01/25
- **Updated:** 2023/01/26

## Globals

In [None]:
# path to local util code module
g_util_module_path = '../util'

# valid mets files, keyed by volume name
mets = {
    'trade_statistics': '../data/trade_statistics/trade_statistics.xml',
}

# valid iiif json manifest files, keyed by volume name
iiif_json = {
    'trade_statistics': '../data/trade_statistics/trade_statistics_iiif_manifest.json',
}

# valid output directories, keyed by volume name
outputs = {
    'trade_statistics': './outputs/trade_statistics',
}

# mets file for the volume under qc
g_qc_mets_file = mets.get('trade_statistics')
# iiif json manifest file for the volume under qc
g_qc_iiif_json_file = iiif_json.get('trade_statistics')
# data file directory
g_qc_data_directory = ''
# output directory
g_qc_output_dir = outputs.get('trade_statistics')
# qc output filenames; every report key appears exactly once
# (a repeated key in a dict literal silently overrides the earlier entry)
g_qc_outputs = {
    'digital_object_inventory': g_qc_output_dir + '/do_inventory.csv',
    'vendor_inventory': g_qc_output_dir + '/vendor_inventory.csv',
    'csv_inventory_report': g_qc_output_dir + '/csv_inventory_report.csv',
    'txt_inventory_report': g_qc_output_dir + '/txt_inventory_report.csv',
    'missing_drs_ids_report': g_qc_output_dir + '/missing_drs_ids.csv',
    'csv_drs_ids_report': g_qc_output_dir + '/csv_drs_ids_report.csv',
    'txt_drs_ids_report': g_qc_output_dir + '/txt_drs_ids_report.csv',
    'full_drs_ids_report': g_qc_output_dir + '/full_drs_ids_report.csv',
}

Add the local `util` module directory to Python's module search path (`sys.path`) so the notebook can import it

In [None]:
import sys
# make the local util module directory importable; the membership check keeps
# re-runs of this cell from appending the same entry to sys.path again
if g_util_module_path not in sys.path:
    sys.path.append(g_util_module_path)

## Modules

In [None]:
import pandas as pd
import pprint
import util # local module

## Metadata Analysis

### 1. Download and Process IIIF Manifest (`JSON` format)

In [None]:
# print the docstring of util.iiif_to_dataframe so its usage shows in the cell output
print('{}'.format(util.iiif_to_dataframe.__doc__))

# load the iiif manifest file configured in g_qc_iiif_json_file
iiif_df = util.iiif_to_dataframe(g_qc_iiif_json_file)

# print number of files (len of the loaded result; presumably one entry per file -- confirm in util)
print('Num files: {}'.format(len(iiif_df)))

# display result
# NOTE(review): display is not imported in this file; presumably the notebook (IPython) builtin
display(iiif_df)

### 2. Download and Process METS File (`XML` format)

In [None]:
# print the docstring of util.mets_to_dataframe so its usage shows in the cell output
print('{}'.format(util.mets_to_dataframe.__doc__))

# load the mets file configured in g_qc_mets_file
mets_df = util.mets_to_dataframe(g_qc_mets_file)

# print number of files (len of the loaded result; presumably one entry per file -- confirm in util)
print('Num files: {}'.format(len(mets_df)))

# display result
# NOTE(review): display is not imported in this file; presumably the notebook (IPython) builtin
display(mets_df)

### 3. Create Digital Object and Vendor File Inventories

#### Create digital object inventory
- Based upon IIIF manifest

In [None]:
import os

# print function documentation
print('{}'.format(util.create_digital_object_inventory.__doc__))

# create the digital object inventory based upon the iiif_df
do_inventory_df = util.create_digital_object_inventory(iiif_df, itype='iiif')

# print the number of inventory files
print('Num files: {}'.format(len(do_inventory_df)))

# this is the first cell that writes into g_qc_output_dir; create the
# directory if it is missing, because to_csv does not create parent
# directories (assumes do_inventory_df is a pandas DataFrame -- confirm in util)
os.makedirs(g_qc_output_dir, exist_ok=True)

# write inventory to file
filename = g_qc_outputs['digital_object_inventory']
do_inventory_df.to_csv(filename, index=False)

# display inventory
display(do_inventory_df)

#### Create vendor inventory
- Based upon METS file output. Assumes that vendor filenames are based upon DRS id.

In [None]:
# show the docstring of the vendor inventory helper in the cell output
print('{}'.format(util.create_vendor_inventory.__doc__))

# build the vendor inventory from the mets dataframe, using drs ids and
# the data directory configured in g_qc_data_directory
vendor_inventory_df = util.create_vendor_inventory(
    mets_df,
    drsids=True,
    path=g_qc_data_directory,
)

# report how many entries the vendor inventory holds
print('Num files: {}'.format(len(vendor_inventory_df)))

# save the vendor inventory as csv, without the index column
filename = g_qc_outputs['vendor_inventory']
vendor_inventory_df.to_csv(filename, index=False)

# render the inventory in the notebook
display(vendor_inventory_df)

### Create Transcription Inventories

#### Create `csv` transcription inventory

In [None]:
# show the docstring of the transcription extraction helper in the cell output
print('{}'.format(util.extract_transcription_inventory.__doc__))

# extract the csv transcription files from the vendor inventory
csv_inventory_df = util.extract_transcription_inventory(
    vendor_inventory_df,
    ttype='csv',
    path=False,
)

# save the csv transcription inventory as csv, without the index column
filename = g_qc_outputs['csv_inventory_report']
csv_inventory_df.to_csv(filename, index=False)

# display the results
display(csv_inventory_df)

Generate `csv` transcription inventory report

In [None]:
# print the docstring of util.generate_transcription_report so its usage shows in the cell output
print('{}'.format(util.generate_transcription_report.__doc__))

# create the csv transcription report from the csv transcription inventory
csv_inventory_report_df = util.generate_transcription_report(csv_inventory_df)

# display the results (the report is only shown here; this cell does not write it to disk)
display(csv_inventory_report_df)

#### Create `txt` transcription inventory

In [None]:
# show the docstring of the transcription extraction helper in the cell output
print('{}'.format(util.extract_transcription_inventory.__doc__))

# extract the txt transcription files from the vendor inventory
txt_inventory_df = util.extract_transcription_inventory(
    vendor_inventory_df,
    ttype='txt',
    path=False,
)

# save the txt transcription inventory as csv, without the index column
filename = g_qc_outputs['txt_inventory_report']
txt_inventory_df.to_csv(filename, index=False)

# display the results
display(txt_inventory_df)

Generate `txt` transcription inventory report

In [None]:
# print the docstring of util.generate_transcription_report so its usage shows in the cell output
print('{}'.format(util.generate_transcription_report.__doc__))

# generate the txt transcription report from the txt transcription inventory
txt_inventory_report_df = util.generate_transcription_report(txt_inventory_df)

# display the results (the report is only shown here; this cell does not write it to disk)
display(txt_inventory_report_df)

### Compare Digital Object Files to Vendor Files

#### Check for missing DRS ids

In [None]:
# show the docstring of the comparison helper in the cell output
print('{}'.format(util.find_missing_reference_ids.__doc__))

# drs id columns of the two inventories being compared
do_drs_ids = do_inventory_df['drs_id']
vendor_drs_ids = vendor_inventory_df['drs_id']

# drs ids in the digital object inventory that are missing from the vendor inventory
missing_drs_ids_df = util.find_missing_reference_ids(do_drs_ids, vendor_drs_ids)

# report how many drs ids are missing, then show them (possibly an empty table)
print('Num missing DRS ids: {}'.format(len(missing_drs_ids_df)))
display(missing_drs_ids_df)

# a report file is written only when at least one drs id is missing
if len(missing_drs_ids_df) > 0:
    filename = g_qc_outputs['missing_drs_ids_report']
    missing_drs_ids_df.to_csv(filename, index=False)

#### Look for DRS ids missing from transcription inventories
- Note: In some cases, the missing transcriptions may be valid. For instance, a blank page will not have a `.csv` or `.txt` transcription associated with it.

Find missing DRS ids in `csv` transcription report

In [None]:
# compare the digital object inventory with the csv transcription report by drs id;
# the combined-report cell later merges this frame on 'drs_id'
csv_missing_drs_ids_df = util.find_missing_transcription_reference_ids(
    do_inventory_df,
    csv_inventory_report_df,
    reftype='drs',
)

# save the result as csv, without the index column
filename = g_qc_outputs['csv_drs_ids_report']
csv_missing_drs_ids_df.to_csv(filename, index=False)

# render the result in the notebook
display(csv_missing_drs_ids_df)

Find missing DRS ids in `txt` transcription report

In [None]:
# compare the digital object inventory with the txt transcription report by drs id;
# the combined-report cell later merges this frame on 'drs_id'
txt_missing_drs_ids_df = util.find_missing_transcription_reference_ids(
    do_inventory_df,
    txt_inventory_report_df,
    reftype='drs',
)

# save the result as csv, without the index column
filename = g_qc_outputs['txt_drs_ids_report']
txt_missing_drs_ids_df.to_csv(filename, index=False)

# render the result in the notebook
display(txt_missing_drs_ids_df)

#### Generate a transcription report for combined `csv` and `txt` transcriptions
- TO DO: Define special-purpose function in `util.py` if needed.

In [None]:
# local function: get the small files (size < max_size, 10 by default) in a directory
def get_small_files(path, max_size=10):
    """Return the entries of ``util.get_file_info(path)`` smaller than ``max_size``.

    Args:
        path: location handed unchanged to ``util.get_file_info``.
        max_size: exclusive upper bound on an entry's ``'size'`` value.
            Defaults to 10, the threshold this function has always applied
            (the previous header comment said "<7bytes", which did not
            match the code).

    Returns:
        list: keys of the file-info mapping whose ``'size'`` is below
        ``max_size``, in the mapping's iteration order.

    NOTE(review): presumably the keys are filenames and ``'size'`` is in
    bytes -- confirm against ``util.get_file_info``.
    """
    info = util.get_file_info(path)
    sizes = ((key, details.get('size')) for key, details in info.items())
    # an entry without a recorded size cannot be classified as small, and
    # comparing None with a number would raise TypeError, so it is skipped
    return [key for key, size in sizes if size is not None and size < max_size]
    
# local function: test for presence/absence of transcription based upon count
def has_transcription(csv_count, txt_count):
    """Return True when the csv and txt counts add up to more than zero.

    A count that is None or NaN is treated as 0. An outer merge leaves NaN
    in the count column of a row that only one side contributed; without
    this, ``NaN + n > 0`` evaluates to False and an existing transcription
    would be reported as absent.

    Args:
        csv_count: number of csv transcriptions (number, None or NaN).
        txt_count: number of txt transcriptions (number, None or NaN).

    Returns:
        bool: True if the usable counts sum to a value greater than zero.
    """
    total = 0
    for count in (csv_count, txt_count):
        # NaN is the only value that is not equal to itself
        if count is None or count != count:
            continue
        total += count
    # bool() keeps the return a plain Python bool even for numpy counts
    return bool(total > 0)

# merge the csv and txt transcription reports on drs id; the outer join keeps
# drs ids that appear in only one of the two reports
df = txt_missing_drs_ids_df.merge(
    csv_missing_drs_ids_df,
    on='drs_id',
    how='outer',
    suffixes=['_txt', '_csv'],
)

# rename a column in place
df.rename(columns={'url_txt': 'url'}, inplace=True)

# drop some duplicated and/or unneeded columns
df.drop(
    ['url_csv', 'mimetype_txt', 'mimetype_csv', 'file_type_txt', 'file_type_csv'],
    axis=1,
    inplace=True,
)

# reorder remaining columns for readability
df = df.loc[:, ['drs_id', 'url', 'filename_csv', 'filename_txt', 'count_csv', 'count_txt']]

# add a bool small_txt column
df['has_small_txt'] = False

# get the list of small txt files
path = 'TODO:REPLACE WITH LOCAL PATH TO TXT FILES'
small_txt_files = get_small_files(path)
# set copy for constant-time membership tests inside the loop
small_txt_names = set(small_txt_files)

# remove small text files from the count
for index, row in df.iterrows():
    filename_txt = row['filename_txt']
    # a row without txt filenames holds a non-string value (e.g. NaN from
    # the outer merge) and has nothing to subtract
    if not isinstance(filename_txt, str):
        continue
    # filename_txt may list several names separated by ';'
    num_small = sum(1 for name in filename_txt.split(';') if name in small_txt_names)
    if num_small:
        df.at[index, 'has_small_txt'] = True
        # subtract one per small file; previously `count_txt - 1` was written
        # from the row's original value each time, so a row listing several
        # small files was only reduced by one
        df.at[index, 'count_txt'] = row['count_txt'] - num_small

# derive a new has_transcription column; a plain zip avoids the row-wise
# apply, which returns a DataFrame instead of a Series when df is empty
df['has_transcription'] = [
    has_transcription(csv_count, txt_count)
    for csv_count, txt_count in zip(df['count_csv'], df['count_txt'])
]

# write the results to a file
filename = g_qc_outputs['full_drs_ids_report']
df.to_csv(filename, index=False)

display(df)


**End document.**