# Resolve PIDS

Ian Thomas
Research Capability Unit, 
RMIT University


## Introduction

The script takes a list of library persistent identifiers (e.g., ISBN, LCCN) and returns the corresponding
data from [openlibrary.org](http://openlibrary.org).  It creates a CSV file with selected fields, with blank lines for missing data.

## Instructions

Replace the list of PIDs in the next cell with your own, one PID per line, each with the correct prefix (e.g., `ISBN:` or `LCCN:`).

In [31]:

# PIDs to resolve: one per line, each with its scheme prefix (e.g. ISBN:, LCCN:).
# The text is later split on whitespace, so indentation and blank lines are ignored.
pids = """
    ISBN:9780980200447
    LCCN:93005405
    ISBN:123141244124
    """

When you are ready, select the above cell and press the run button in the toolbar to run it and advance to the next cell. Press the run button again to run that cell, then wait for the output.

When it completes you will see a new file in the left-hand side panel. Select that CSV file and download it to your desktop.


In [None]:
import csv
import sys
import json
from jsonpath_rw import jsonpath, parse
from time import sleep
import requests
import random
import logging
from pprint import pformat
from IPython.display import display

# Log at INFO by default; change the level to logging.DEBUG for verbose output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Open Library "books" API endpoint. The {} placeholder receives the PID
# (including its prefix) being looked up.
query_template = "https://openlibrary.org/api/books?bibkeys={}&jscmd=data&format=json"

# (JSONPath pattern, CSV column heading) pairs selecting which response fields
# are written to the CSV file, in column order.
# JSONPath syntax: http://goessner.net/articles/JsonPath/
paths = [
    ("$.*.title", "Title"),
    ("$.*.authors[*].name", "Author(s) Name"),
    ("$.*.publish_date", "Publish Date"),
    ("$.*.url", "URL"),
]

# Name of the CSV file produced, and the heading of its first column,
# which holds the PID that was queried.
output_file = "final.csv"
pid_column_name = "pid"

# Fail fast: compile every JSONPath pattern and evaluate it against an empty
# document, so a malformed pattern is reported before any request is made.
for pattern, _heading in paths:
    try:
        compiled = parse(pattern)
        _matches = [hit.value for hit in compiled.find([])]
    except Exception as err:
        logger.error(f"Possible path error in {pattern} gave error: {err!r}")
        raise err

# Split the user-supplied text on any whitespace: each token is one PID.
# str.split() already discards surrounding whitespace, so no extra strip is needed.
pids_list = pids.split()
logger.info(f"Pids to Check: {', '.join(pids_list)}")
logger.info("checking...")
# TODO: Check api encoding
# One (pid, response_json) tuple per requested PID, in input order.
results = []
for index, pid in enumerate(pids_list):
    if index:
        # Pause between consecutive requests to avoid flooding the API;
        # there is nothing to wait for before the first or after the last one.
        sleep(random.randint(5, 10))
    query = query_template.format(pid)
    logger.info(f"query = {query}")
    # TODO: add retries for this request
    try:
        # Without a timeout a stalled connection would hang the notebook forever.
        res = requests.get(query, timeout=30)
        res.raise_for_status()
        payload = res.json()
    except (requests.RequestException, ValueError) as e:
        # Network failure, HTTP error status, or a body that is not valid JSON.
        # Keep the PID with an empty payload so it still gets a (blank) CSV row
        # instead of silently disappearing from the output.
        logger.warning(f"Request for {pid} failed: {e!r}")
        payload = {}
    results.append((pid, payload))

logger.debug(f"query results: {pformat(results)}")

# Compile each JSONPath pattern once, rather than once per result row.
compiled_paths = [(pname, parse(p)) for p, pname in paths]

# newline='' lets the csv module control line endings (otherwise extra blank
# rows appear on Windows); an explicit encoding keeps non-ASCII titles and
# author names from failing on platforms whose default encoding is not UTF-8.
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=[pid_column_name] + [pname for pname, _ in compiled_paths],
    )
    writer.writeheader()
    # One row per queried PID; fields with no match are left blank.
    for pid, v in results:
        logger.debug(f"v={v}")
        row = {pid_column_name: pid}
        for pname, jsonpath_expr in compiled_paths:
            try:
                value = [match.value for match in jsonpath_expr.find(v)]
            except Exception as e:
                logger.warning(f"Parse error in entry {pid} gave error: {e}")
                value = []
            # Multiple matches (e.g. several authors) share one cell; str()
            # guards against non-string values in the response.
            row[pname] = ', '.join(str(item) for item in value)
        writer.writerow(row)

logger.info("Done")