# Test: QC Workflow Functions

## About
- Interactive tests of QC workflow
- **Created:** 2023/01/11
- **Updated:** 2023/01/16

## Globals

In [None]:
# path to local util code module (appended to sys.path before `import util`)
g_util_module_path = '../util'
# test mets file
g_qc_mets_file = '../data/trade_statistics/trade_statistics.xml'
# test iiif json manifest file
g_qc_iiif_json_file = '../data/trade_statistics/trade_statistics_iiif_manifest.json'
# data file directory (second argument to util.create_vendor_inventory)
g_qc_data_directory = ''
# output directory that every path in g_qc_outputs is built from
# NOTE(review): no cell in this notebook creates this directory -- confirm it exists before running the write cells
g_qc_output_dir = './outputs'
# qc output filenames, keyed by the name each cell uses to look up its target file
g_qc_outputs = {
    'digital_object_inventory': g_qc_output_dir + '/do_inventory.csv',
    'vendor_inventory': g_qc_output_dir + '/vendor_inventory.csv',
    'csv_inventory_report': g_qc_output_dir + '/csv_inventory_report.csv',
    'txt_inventory_report': g_qc_output_dir + '/txt_inventory_report.csv',
    'missing_drs_ids_report': g_qc_output_dir + '/missing_drs_ids.csv',
    'csv_drs_ids_report': g_qc_output_dir + '/csv_drs_ids_report.csv',
    'txt_drs_ids_report': g_qc_output_dir + '/txt_drs_ids_report.csv'
}

Add the local `util` module directory to Python's module search path (`sys.path`) so that it can be imported

In [None]:
import sys

# Register the local util directory on the module search path.
# The membership check keeps re-runs of this cell from appending duplicates.
if not (g_util_module_path in sys.path):
    sys.path.append(g_util_module_path)

## Modules

In [None]:
import pandas as pd
import pprint
import util # local module

## Metadata Analysis

### Download and Process IIIF Manifest (`JSON` format)

In [None]:
# show the docstring of the loader so the reader sees its contract
print('{}'.format(util.iiif_to_dataframe.__doc__))

# parse the test IIIF manifest (path comes from the globals cell)
iiif_df = util.iiif_to_dataframe(
    g_qc_iiif_json_file
)

# report how many entries were loaded
print('Num files: {}'.format(len(iiif_df)))

# rich display of the parsed manifest
display(iiif_df)

### Download and Process METS File (`XML` format)

In [None]:
# show the docstring of the loader so the reader sees its contract
print('{}'.format(util.mets_to_dataframe.__doc__))

# parse the test METS file (path comes from the globals cell)
mets_df = util.mets_to_dataframe(
    g_qc_mets_file
)

# report how many entries were loaded
print('Num files: {}'.format(len(mets_df)))

# rich display of the parsed METS data
display(mets_df)

### Create Digital Object and Vendor File Inventories

#### Create digital object inventory
- Based upon IIIF manifest

In [None]:
# show the docstring of the inventory builder
print('{}'.format(util.create_digital_object_inventory.__doc__))

# build the digital object inventory based upon the iiif_df
do_inventory_df = util.create_digital_object_inventory(iiif_df)

# report the size of the inventory
print('Num files: {}'.format(len(do_inventory_df)))

# persist the inventory (no index column) to its configured output file
do_inventory_df.to_csv(g_qc_outputs['digital_object_inventory'], index=False)

# rich display of the inventory
display(do_inventory_df)

#### Create vendor inventory
- Based upon METS file output. Assumes that vendor filenames are based upon DRS id.

In [None]:
# show the docstring of the inventory builder
print('{}'.format(util.create_vendor_inventory.__doc__))

# build the vendor inventory from the mets dataframe and the data directory global
vendor_inventory_df = util.create_vendor_inventory(
    mets_df,
    g_qc_data_directory,
)

# report the size of the inventory
print('Num files: {}'.format(len(vendor_inventory_df)))

# persist the inventory (no index column) to its configured output file
vendor_inventory_df.to_csv(g_qc_outputs['vendor_inventory'], index=False)

# rich display of the inventory
display(vendor_inventory_df)

### Create Transcription Inventories

#### Create `csv` transcription inventory

In [None]:
# show the docstring of the extraction helper
print('{}'.format(util.extract_transcription_inventory.__doc__))

# pull the csv transcription entries out of the vendor inventory
csv_inventory_df = util.extract_transcription_inventory(
    vendor_inventory_df,
    ttype='csv',
    path=False,
)

# persist the csv inventory (no index column) to its configured output file
csv_inventory_df.to_csv(g_qc_outputs['csv_inventory_report'], index=False)

# display the results
display(csv_inventory_df)

Generate `csv` transcription inventory report

In [None]:
# show the docstring of the report generator
print('{}'.format(util.generate_transcription_report.__doc__))

# build the report from the csv transcription inventory
csv_inventory_report_df = util.generate_transcription_report(
    csv_inventory_df
)

# rich display of the report
display(csv_inventory_report_df)

#### Create `txt` transcription inventory

In [None]:
# show the docstring of the extraction helper
print('{}'.format(util.extract_transcription_inventory.__doc__))

# pull the txt transcription entries out of the vendor inventory
txt_inventory_df = util.extract_transcription_inventory(
    vendor_inventory_df,
    ttype='txt',
    path=False,
)

# persist the txt inventory (no index column) to its configured output file
txt_inventory_df.to_csv(g_qc_outputs['txt_inventory_report'], index=False)

# rich display of the inventory
display(txt_inventory_df)

Generate `txt` transcription inventory report

In [None]:
# show the docstring of the report generator
print('{}'.format(util.generate_transcription_report.__doc__))

# build the report from the txt transcription inventory
txt_inventory_report_df = util.generate_transcription_report(
    txt_inventory_df
)

# rich display of the report
display(txt_inventory_report_df)

### Compare Digital Object Files to Vendor Files

#### Check for missing DRS ids

In [None]:
# show the docstring of the comparison helper
print('{}'.format(util.find_missing_drs_ids.__doc__))

# drs id columns of the two inventories being compared
do_drs_ids = do_inventory_df['drs_id']
vendor_drs_ids = vendor_inventory_df['drs_id']

# drs ids in the digital object inventory that are missing from the vendor inventory
missing_drs_ids_df = util.find_missing_drs_ids(
    do_drs_ids,
    vendor_drs_ids,
)

# report how many drs ids are missing
print('Num missing DRS ids: {}'.format(len(missing_drs_ids_df)))

# always show the result; only write the report file when something is missing
display(missing_drs_ids_df)
if len(missing_drs_ids_df):
    missing_drs_ids_df.to_csv(g_qc_outputs['missing_drs_ids_report'], index=False)

#### Look for DRS ids missing from transcription inventories
- Note: In some cases, the missing transcriptions may be valid. For instance, a blank page will not have a `.csv` or `.txt` transcription associated with it.

Find missing DRS ids in `csv` transcription report

In [None]:
# compare the digital object inventory against the csv transcription report
csv_missing_drs_ids_df = util.find_missing_transcription_drs_ids(
    do_inventory_df,
    csv_inventory_report_df,
)

# persist the result (no index column) to its configured output file
csv_missing_drs_ids_df.to_csv(g_qc_outputs['csv_drs_ids_report'], index=False)

# rich display of the result
display(csv_missing_drs_ids_df)

Find missing DRS ids in `txt` transcription report

In [None]:
# get missing transcription drs ids by comparing the digital object inventory
# against the txt transcription report (computed once and displayed once below,
# the same shape as the `csv` cell)
txt_missing_drs_ids_df = util.find_missing_transcription_drs_ids(do_inventory_df, txt_inventory_report_df)

# write results to file (no index column)
filename = g_qc_outputs['txt_drs_ids_report']
txt_missing_drs_ids_df.to_csv(filename, index=False)

# display results
display(txt_missing_drs_ids_df)

**End document.**