In [None]:
# Turn off Jedi-based tab completion for this IPython session.
%config Completer.use_jedi = False

In [24]:
import pandas as pd
import numpy as np
import requests
import glob, os
import random
from datetime import datetime
import json
import re
from json import JSONDecodeError
from json_repair import repair_json
from openai import OpenAI
from islandora7_rest import IslandoraClient
import cv2
import torch
from PIL import Image
import io
import base64
import layoutparser as lp
import time
import sys
from tqdm.notebook import tqdm

# Import prompts
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
import prompts

In [2]:
# Setup Islandora client
# REST endpoint of the Islandora repository; the processing loop queries Solr
# through this client.
isURL = "https://digital.lib.ku.edu/islandora/rest"
is_client = IslandoraClient(isURL)

# Smoke test: run one Solr query now so a connection problem is reported here.
# A failure is only printed, so the cells below still run.
try:
    is_client.solr_query('PID:*root')
    print('Islandora client working okay')
except Exception as e:
    print(f'Islandora client not connecting to REST: {str(e)}')
    

Islandora client working okay


In [3]:
# Read the LLM API key without echoing it, so the secret does not end up in the
# notebook's saved cell output (input() prints the typed value back).
import getpass

key = getpass.getpass('Enter LLM key')

Enter LLM key [REDACTED]


In [4]:
# OpenAI-compatible client for the endpoint at ellm.nrp-nautilus.io, authenticated
# with the key entered above.
# NOTE(review): max_retries=0 presumably disables the client's built-in retries so
# that llm_query's own retry loop is the only retry layer - confirm.
client = OpenAI(api_key=key, base_url="https://ellm.nrp-nautilus.io/v1", max_retries=0)
llm_model = 'glm-v'

# Test LLM connection
# Sends one minimal text-only request; a failure is printed rather than raised,
# so the cells below still run.
try:
    completion = client.chat.completions.create(
        model=llm_model,
        messages=[{"role": "system", "content": ""},
                 {"role": "user", "content": "Just checking to see if you're awake."}])
    print('LLM connection successful')
except Exception as e:
    print(f'LLM connection failed: {str(e)}')

LLM connection successful


In [5]:
# Load layoutparser model
def load_newspaper_navigator():
    """Build the Newspaper Navigator Detectron2 layout model.

    The model is first requested from the layoutparser catalog
    (``lp://NewspaperNavigator/...``). If that attempt raises ``ValueError``,
    the config and weights are read from ``~/newspaper_navigator_model``
    instead (downloading has failed in some cases).

    Returns:
        The ``lp.models.Detectron2LayoutModel`` instance, configured with
        ``MODEL.ROI_HEADS.SCORE_THRESH_TEST`` = 0.5 and placed on ``'cuda'``
        when ``torch.cuda.is_available()`` is true, otherwise on ``'cpu'``.

    Raises:
        FileNotFoundError: the catalog attempt failed and ``config.yml`` or
            ``model_final.pth`` is missing from the local model directory.
    """

    def _shared_kwargs():
        # Built fresh for each attempt; both code paths use the same settings.
        return {
            'extra_config': ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],  # limit confidence score
            'device': 'cuda' if torch.cuda.is_available() else 'cpu',
        }

    try:
        return lp.models.Detectron2LayoutModel(
            config_path='lp://NewspaperNavigator/faster_rcnn_R_50_FPN_3x/config',
            **_shared_kwargs(),
        )
    except ValueError:
        # Fall back to the locally stored copy of the model.
        local_dir = os.path.expanduser("~/newspaper_navigator_model")
        local_config = os.path.join(local_dir, "config.yml")
        local_weights = os.path.join(local_dir, "model_final.pth")
        fallback_kwargs = _shared_kwargs()

        if not os.path.exists(local_config) or not os.path.exists(local_weights):
            raise FileNotFoundError(f"Model files not found in {local_dir}. Run the download script first.")

        return lp.models.Detectron2LayoutModel(
            config_path=local_config,
            model_path=local_weights,
            **fallback_kwargs,
        )

# Load the layout model once. A failure is only printed, so lp_model stays
# undefined in that case and run_lp(..., lp=True) would fail when it is used.
print("Loading layoutparser model...")
try:
    lp_model = load_newspaper_navigator()
    print("Layoutparser model loaded successfully")
except Exception as e:
    print(f"Failed to load layoutparser model: {str(e)}")

Loading layoutparser model...
Layoutparser model loaded successfully


In [6]:
def filter_lp(results):
    """Collapse detections that share an identical bounding box.

    Args:
        results: iterable of dicts with 'x_1', 'y_1', 'x_2', 'y_2' and 'score'.

    Returns:
        A list with one dict per distinct (x_1, y_1, x_2, y_2) box: the one
        with the highest score. On a tie the earliest entry is kept. Boxes
        appear in the order in which they were first seen.
    """
    best_by_box = {}
    for entry in results:
        box = tuple(entry[corner] for corner in ('x_1', 'y_1', 'x_2', 'y_2'))
        incumbent = best_by_box.get(box)
        # Strictly greater: an equal score does not displace the first entry.
        if incumbent is None or entry['score'] > incumbent['score']:
            best_by_box[box] = entry
    return list(best_by_box.values())

def run_lp(pid, identifier, lp=True):
    """Fetch the page image and, optionally, run layout detection on it.

    Args:
        pid: object PID, passed to get_image and copied into every result row.
        identifier: local identifier copied into every result row.
        lp: when falsy, layout detection is skipped and no rows are returned.
            (Inside this function the name refers to this flag, not to the
            layoutparser module.)

    Returns:
        (rows, image): rows is the filter_lp-deduplicated list of detection
        dicts (empty when lp is falsy); image is whatever get_image returned.
    """
    image = get_image(pid)
    if not lp:
        return [], image

    detections = lp_model.detect(np.array(image))
    rows = [
        {
            'x_1': det.block.x_1, 'y_1': det.block.y_1,
            'x_2': det.block.x_2, 'y_2': det.block.y_2,
            'score': det.score, 'type': det.type,
            'identifier': identifier, 'pid': pid,
        }
        for det in detections
    ]

    rows = filter_lp(rows)
    print(f'Layout Parser complete with {len(rows)} items')
    return rows, image

In [7]:
def get_image(pid, max_retries=5):
    """Download an object's OBJ datastream and return it as an RGB PIL image.

    Args:
        pid: object PID, interpolated into the datastream URL.
        max_retries: total number of download attempts; must be at least 1.

    Returns:
        A fully decoded PIL image in 'RGB' mode.

    Raises:
        ValueError: if max_retries is less than 1 (previously this silently
            returned None).
        Exception: whatever the last attempt raised (HTTP error, timeout,
            undecodable image data, ...).
    """
    if max_retries < 1:
        raise ValueError(f'max_retries must be at least 1, got {max_retries}')

    url = f'https://digital.lib.ku.edu/islandora/object/{pid}/datastream/OBJ/view'

    # Retry loop for GET request
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=60)
            response.raise_for_status()
            image = Image.open(io.BytesIO(response.content))
            # Image.open only reads the header; decode now so that truncated or
            # corrupt downloads fail inside the retry loop instead of later.
            image.load()
            if image.mode != 'RGB':
                image = image.convert('RGB')
            return image
        except Exception:
            if attempt == max_retries - 1:  # Last attempt
                raise
            # Exponential backoff: 3 ** attempt seconds -> 1s, 3s, 9s, 27s, ...
            time.sleep(3 ** attempt)

In [8]:
def parse_dates(s):
    """Extract a (start, end) date pair from an identifier fragment.

    Any 'udk_' / 'udk-' text is removed first. The remaining string is then
    matched, in this order, against:

      1. ``MM-DD-YYYY_MM-DD-YYYY``            (two '_'-separated parts)
      2. ``M_D_YYYY_M_D_YYYY``                (six '_'-separated parts)
      3. ``M-D-YYYY-M-D-YYYY``                (six '-'-separated parts)
      4. ``M_D_YYYY_to_M_D_YYYY``             (contains '_to_')
      5. ``M-D-YYYY_M_D_YYYY``                (four '_'-separated parts)
      6. ``<prefix>-YYYYMMDD-YYYYMMDD``       (anything else)

    NOTE(review): because 'udk-' is stripped up front, an input such as
    'udk-19500102-19500105' no longer has three '-' parts and falls through
    to the error path - confirm whether that is intended.

    Returns:
        (start, end) as 'YYYY-MM-DD' strings, or (None, None) after printing a
        message when a ValueError occurs while parsing.
    """
    s = s.replace('udk_', '').replace('udk-', '')

    def _from_mdy(month, day, year):
        return datetime(int(year), int(month), int(day))

    try:
        by_underscore = s.split('_')
        by_hyphen = s.split('-')

        if len(by_underscore) == 2:
            first, second = by_underscore
            start = datetime.strptime(first, '%m-%d-%Y')
            end = datetime.strptime(second, '%m-%d-%Y')
        elif len(by_underscore) == 6:
            start = _from_mdy(*by_underscore[:3])
            end = _from_mdy(*by_underscore[3:])
        elif len(by_hyphen) == 6:
            start = _from_mdy(*by_hyphen[:3])
            end = _from_mdy(*by_hyphen[3:])
        elif '_to_' in s:
            halves = s.split('_to_')
            start = datetime.strptime(halves[0].replace('_', '/'), '%m/%d/%Y')
            end = datetime.strptime(halves[1].replace('_', '/'), '%m/%d/%Y')
        elif len(by_underscore) == 4:
            first, end_m, end_d, end_y = by_underscore
            # Tuple unpacking keeps a wrong part count a ValueError.
            start_m, start_d, start_y = first.split('-')
            start = _from_mdy(start_m, start_d, start_y)
            end = _from_mdy(end_m, end_d, end_y)
        else:
            _, first, second = by_hyphen
            start = datetime.strptime(first, '%Y%m%d')
            end = datetime.strptime(second, '%Y%m%d')

        return start.strftime('%Y-%m-%d'), end.strftime('%Y-%m-%d')
    except ValueError as e:
        print(f'Unknown date format: {s}, error: {str(e)}')
        return None, None

In [9]:
def encode_img(image):
    """Serialize an image to JPEG and return the bytes as a base64 string.

    Args:
        image: object with a PIL-style ``save(fp, format=..., ...)`` method.

    Returns:
        str: base64 text of the JPEG written with quality=95, optimize=True
        and subsampling=0.
    """
    jpeg_bytes = io.BytesIO()
    image.save(jpeg_bytes, format='JPEG', quality=95, optimize=True, subsampling=0)
    encoded = base64.b64encode(jpeg_bytes.getvalue())
    return encoded.decode("utf-8")

In [10]:
def crop_and_encode(image, header=False, coords=None):
    """Optionally crop an image, then base64-encode it under a size budget.

    Args:
        image: PIL image to encode.
        header: when true the whole image is used and ``coords`` is ignored.
        coords: optional dict with 'x_1', 'y_1', 'x_2', 'y_2' giving the crop
            box; used only when ``header`` is false.

    Returns:
        str: base64 JPEG text from encode_img. The image is downscaled step by
        step until the encoded text is at most ``max_file_size`` characters.
        If the longest side would have to drop below 100 px, the last
        (still oversized) encoding is returned as a best effort.
    """
    if coords and not header:
        img = image.crop((coords['x_1'], coords['y_1'], coords['x_2'], coords['y_2']))
    else:
        img = image
    if img.mode in ('RGBA', 'LA', 'P'):
        img = img.convert('RGB')

    # The budget is compared with the length of the base64 text, not with the
    # raw JPEG byte count.
    max_file_size = 3355443  # 3.2MB
    max_size = 4000 # pixel length

    # Try original image first
    image_encode = encode_img(img)
    image_encode_size = len(image_encode)

    if image_encode_size <= max_file_size:
        print(f"Image size OK: {image_encode_size / (1024 * 1024):.2f}MB")
        return image_encode

    while max_size >= 100:
        # Calculate new dimensions
        width, height = img.size
        scale = max_size / max(width, height)

        if scale >= 1:
            resized_img = img
        else:
            # Clamp to 1 px so that a very elongated image cannot round down
            # to a zero-sized dimension, which resize() rejects.
            new_width = max(1, int(width * scale))
            new_height = max(1, int(height * scale))
            resized_img = img.resize((new_width, new_height), Image.LANCZOS)

        image_encode = encode_img(resized_img)
        print(f'Resized image: {len(image_encode)/(1024*1024):.2f}MB')

        # Check size
        if len(image_encode) <= max_file_size:
            return image_encode

        # Calculate next size: encoded size scales roughly with pixel area, so
        # shrink the side by the square root of the overshoot, plus a 7% margin.
        size_ratio = max_file_size / len(image_encode)
        max_size = int(max_size * (size_ratio ** 0.5) * 0.93)

    return image_encode


In [11]:
def fix_json_values(text):
    """Try to turn almost-JSON text into parseable JSON text.

    ``repair_json`` is tried first. If it raises, two regex fixes are applied
    instead: bare values that start with digits and continue with letters
    (e.g. ``12ab``) are quoted, and trailing commas before ``}`` or ``]`` are
    removed.

    Args:
        text: candidate JSON string.

    Returns:
        str: the repaired text. It is not guaranteed to be valid JSON.
    """
    try:
        return repair_json(text)
    except Exception as e:
        # Fallback to manual fixes if json-repair fails.
        # No logger is defined in this notebook, so report with print like the
        # rest of the code (the previous logger.debug call raised NameError).
        print(f"json-repair failed: {e}, trying manual fixes")
        text = re.sub(r'("[^"]+"):\s*([0-9]+[A-Za-z][A-Za-z0-9]*)', r'\1: "\2"', text)
        text = re.sub(r',(\s*[}\]])', r'\1', text)
        return text

def _first_balanced_braces(s):
    """Return the first substring of ``s`` that starts at a '{' and ends at its
    matching '}', or None when no opening brace is ever balanced.

    Braces are counted character by character, without regard to JSON string
    literals.
    """
    for start, ch in enumerate(s):
        if ch != '{':
            continue
        depth = 0
        for end in range(start, len(s)):
            if s[end] == '{':
                depth += 1
            elif s[end] == '}':
                depth -= 1
                if depth == 0:
                    return s[start:end + 1]
    return None


def decode_message(message):
    """Extract a JSON object from an LLM reply that is not clean JSON.

    Args:
        message: either a string, or an object whose text is available as
            ``message.content[0].text``.

    Returns:
        dict: the parsed object, or ``{"error": "Badly formed JSON response"}``
        when no parseable object is found (including non-string input).
    """
    try:
        text = message.content[0].text
    except (AttributeError, IndexError, KeyError, TypeError):
        text = message

    # Anything that is not text cannot be cleaned or parsed; report it the
    # same way as unparseable text instead of failing with AttributeError.
    if not isinstance(text, str):
        print(f'JSON decode error: {str(text)[:200]}')
        return {"error": "Badly formed JSON response"}

    # Wrapper tokens and code fences removed from the reply.
    # Note that r'json\n' is a raw string: it matches a literal backslash-n.
    to_strip = [r'json\n', '<|end_of_box|>', '<|start_of_box|>','<|begin_of_box|>',
                '<think>', '</think>', '```json', '```']

    for t in to_strip:
        text = text.strip().replace(t, '')

    cleaned = text.replace('\n', '').strip()

    # Add the outer braces back when the reply lacks them.
    if cleaned and cleaned[0] != '{':
        cleaned = '{' + cleaned
    if cleaned and not cleaned.endswith('}'):
        cleaned = cleaned + '}'

    # Only the first balanced {...} span is tried; its outcome is final.
    candidate = _first_balanced_braces(cleaned)
    if candidate is not None:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            repaired = fix_json_values(candidate)
            try:
                return json.loads(repaired)
            except json.JSONDecodeError as e:
                print(f'JSON decode error at position {e.pos}: {e.msg}')
                # e.pos indexes the repaired text, so slice that text.
                print(f'Problematic text: {repaired[max(0, e.pos-50):e.pos+50]}')
                return {"error": "Badly formed JSON response"}

    print(f'JSON decode error: {cleaned[:200]}')
    return {"error":"Badly formed JSON response"}

In [12]:
def llm_query(pid, identifier, date, image, header=False, coords=None, max_retries=5):
    """Send one image to the LLM and return its parsed JSON reply.

    Args:
        pid: object PID; used only in retry log messages.
        identifier: local identifier; accepted for call-site symmetry and not
            used inside this function.
        date: optional date-range text appended to the user prompt when truthy.
        image: PIL image of the page.
        header: when true, use the page prompt on the whole image.
        coords: optional ``(kind, box)`` pair; ``kind == 'ads'`` selects the ad
            prompt, anything else the editorial-comics prompt, and ``box`` is
            the crop dict passed to crop_and_encode. Ignored when header is true.
        max_retries: total number of attempts; must be at least 1.

    Returns:
        dict: the parsed reply with an added 'model' key. When the reply is
        not valid JSON, the result of decode_message (possibly an
        ``{"error": ...}`` dict) with the same added key.

    Raises:
        ValueError: if max_retries is less than 1 (previously this silently
            returned None).
        Exception: whatever the final attempt raised.
    """
    if max_retries < 1:
        raise ValueError(f'max_retries must be at least 1, got {max_retries}')

    # Determine prompt and image based on query type
    if header:
        sys_prompt = prompts.page_prompt()
        img_enc = crop_and_encode(image, header=True)
    elif coords:
        if coords[0] == 'ads':
            sys_prompt = prompts.ad_prompt()
        else:
            sys_prompt = prompts.ed_comics_prompt()
        img_enc = crop_and_encode(image, coords=coords[1])
    else:
        sys_prompt = prompts.item_prompt()
        img_enc = crop_and_encode(image)
    # The image is always sent inline as a base64 data URL.
    url = f"data:image/jpeg;base64,{img_enc}"

    text = "Process this image according to system directions."
    if date:
        # Leading space keeps the two sentences from running together.
        text += f" Likely date range for this item is {date}."

    base_delay = 2  # seconds; doubled on every failed attempt

    # Retry loop with exponential backoff
    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=llm_model,
                messages=[
                    {"role": "system", "content": sys_prompt},
                    {
                        "role": "user",
                        "content": [{
                            "type": "text",
                            "text": text
                        },
                        {"type": "image_url",
                         "image_url": {"url": url}}]
                    },
                    # Prefill the reply with an opening brace.
                    {"role": "assistant", "content": "{"}
                ],
            )

            msg = completion.choices[0].message.content

            # test for valid json
            try:
                result = json.loads(msg)
                result['model'] = completion.model
                return result
            except JSONDecodeError:
                decoded_msg = decode_message(msg)
                decoded_msg['model'] = completion.model
                return decoded_msg

        except Exception as e:
            # Any failure in this attempt (request, empty content, decoding)
            # is retried until the attempts are used up.
            if attempt < max_retries - 1:
                delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
                print(f"LLM error for {pid} (attempt {attempt+1}/{max_retries}), retrying in {delay:.1f}s: {str(e)}")
                time.sleep(delay)
                continue
            # Out of retries
            raise

In [13]:
# Ensure output directory exists
# Strip stray whitespace from the typed path and expand a leading "~" so the
# directory is not created under a literal "~" or " path" name.
outdir = os.path.expanduser(input('Path to data directory: ').strip())
os.makedirs(outdir, exist_ok=True)

Path to data directory:  /Users/e996w533/Documents/collections/udk-microfilm/scripts/workflow/nrp-jobs/production_1/data


In [28]:
# Filename templates per result type. Every template takes exactly
# (outdir, timestamp, num); three of them used to have a fourth placeholder,
# which made .format() raise IndexError as soon as those lists had content.
output_files = {
    'lp_items': '{}/lp_items_{}_{}.csv',
    'pages': '{}/pages_{}_{}.csv',
    'llm_items': '{}/llm_items_{}_{}.csv',
    'ads': '{}/ads_{}_{}.csv',
    'ed_comics': '{}/ed_comics_{}_{}.csv',
    'errors': '{}/errors_{}_{}.csv',
}

def save_results(num='16'):
    """Save current results to CSV.

    Each non-empty module-level result list is written to its own file in
    ``outdir``, named from ``output_files`` with the current timestamp and
    ``num``.

    Args:
        num: suffix placed at the end of every filename.
    """

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S%f')

    result_sets = [(lp_results, 'lp_items'), (page_results, 'pages'),
                   (llm_item_results, 'llm_items'), (ad_results, 'ads'),
                   (edc_results, 'ed_comics'), (error_results, 'errors')]

    saved_any = False
    for rows, label in result_sets:
        if rows:
            fn = output_files[label].format(outdir, timestamp, num)
            pd.DataFrame(rows).to_csv(fn, index=False)
            print(f"Saved {len(rows)} {fn}")
            saved_any = True

    if saved_any:
        print("Results saved successfully")
    else:
        print("No results to save")

In [30]:
# Initialize result lists
# Re-running this cell discards everything collected so far. save_results()
# writes each non-empty list to its own CSV.
lp_results = []        # layout-parser detections, extended by the processing loop
page_results = []      # page-level LLM results
llm_item_results = []  # item-level LLM results, one dict per item
ad_results = []        # LLM results for ad regions
edc_results = []       # LLM results for editorial-comic regions
error_results = []     # error records

In [31]:
# Pages to process, as a list of 1-tuples that hold only the PID: [(pid,)].
# The processing loop reads task[0] and looks the identifier up in Solr, so no
# identifier is supplied here.
tasks = [
 ('ku-udk:200538',),
 ('ku-udk:199673',),
        ]

In [32]:
lp_process = False

# df_items only exists once the review cell further down has been run in this
# kernel; otherwise start from an empty frame so that no PID is skipped.
try:
    df_items.shape
except (NameError, AttributeError):
    df_items = pd.DataFrame(columns=['pid'])

# PIDs already present in df_items are treated as done and skipped.
processed_pids = set(df_items['pid']) if 'pid' in df_items.columns else set()

# Failures since the last fully processed page; the run stops at 10. It is
# initialised here so the except handlers can never hit an undefined name.
consecutive_errors = 0

for task in tqdm(tasks):
    pid = task[0]

    if pid in processed_pids:
        continue

    try:
        # Look up the local identifier for this PID. This is inside the try so
        # that a failed or empty Solr response skips the page instead of
        # aborting the whole run.
        res = is_client.solr_query(f'PID:"{pid}"', fl=['PID','mods_identifier_local_displayLabel_ms'])
        identifier = res['response']['docs'][0]['mods_identifier_local_displayLabel_ms'][0]

        # layout parser
        lp_data, image = run_lp(pid, identifier, lp=lp_process)
        print("Image retrieved successfully")

        # Store results
        if lp_data:
            lp_results.extend(lp_data)
            print("LP data added")

    except Exception as e:
        print(e)
        # consecutive_errors = log_error(pid, identifier, e, task, error_count, consecutive_errors)
        consecutive_errors += 1
        if consecutive_errors >= 10:
            print("Too many consecutive errors, exiting")
            break
        continue

    # llm queries
    try:
        # LLM data
        start_date, end_date = parse_dates(identifier.split('/')[0])
        date_range = f"{start_date} to {end_date}" if start_date and end_date else "unknown"

        # # START - comment out to skip page-level LLM (1 of 1)
        # # Page metadata - header
        # page_query = llm_query(pid, identifier, date_range, image, header=True)
        # page_results.append({'pid': pid, "identifier": identifier, **page_query})
        # print("Page processed successfully")
        # # END - comment out to skip page-level LLM (1 of 1)

        # START - comment out to skip item-level LLM (1 of 1)
        # LLM items
        llm_item_query = llm_query(pid, identifier, date_range, image)
        if len(llm_item_query.get('items', [])) > 0:
            for item in llm_item_query['items']:
                llm_item_results.append({'pid': pid, "identifier": identifier, **item})
        print("Items processed successfully")
        # END - comment out to skip item-level LLM

        # # START - comment out to skip ads via LLM (requires layoutparser) (1 of 1)
        # # # Ads
        # lp_ads = [d for d in lp_data if d['type'] == 6]
        # xy_coords = ['x_1', 'x_2', 'y_1', 'y_2']

        # if len(lp_ads) == 0:
        #     ad_results.append({'pid': pid, 'identifier': identifier, 'error': 'No ads found by LLM'})
        # else:
        #     for ad_dict in lp_ads:
        #         ad_coords = {k: ad_dict[k] for k in xy_coords if k in ad_dict}
        #         ad_query = llm_query(pid, identifier, date_range, image, coords=('ads',ad_coords))
        #         ad_results.append({'pid': pid, "identifier": identifier, **ad_coords, **ad_query})
        # print("Ads processed successfully")
        # # END - comment out to skip ads

        # # START - comment out to skip editorial comics via LLM (requires layoutparser) (1 of 1)
        # # editorial comics

        # # OPTION A - set lp_edc from existing lp_df
        # # # lp_data = lp_df[(lp_df.pid==pid) & (lp_df.type==4)]
        # # # lp_edc = lp_data.to_dict('records')
        # #
        # # # OPTION B - set lp_edc from just-run lp_data
        # lp_edc = [d for d in lp_data if d['type'] == 4]
        # xy_coords = ['x_1', 'x_2', 'y_1', 'y_2']

        # if len(lp_edc) == 0:
        #     pass
        #     # edc_results.append({'pid': pid, 'identifier': identifier, 'error': 'No editorial comics found by LP'})
        # else:
        #     for edc_dict in lp_edc:
        #         edc_coords = {k: edc_dict[k] for k in xy_coords if k in edc_dict}
        #         edc_query = llm_query(pid, identifier, date_range, image, coords=('edc',edc_coords))
        #         edc_results.append({'pid': pid, "identifier": identifier, **edc_coords, **edc_query})
        #     print("Editorial cartoons processed successfully")
        # # END - comment out to skip editorial comics

        consecutive_errors = 0  # Reset error counter on success
        print(f"Successfully processed {pid}")

        # optional logging to keep running count
        for data in [(lp_results, 'lp_items'),(page_results,'pages'),
            (llm_item_results,'llm_items'),(ad_results,'ads'),
            (edc_results,'ed_comics'),(error_results,'errors')]:
            if data[0]:
                print(f"  -- Current count: {len(data[0])} {data[1]}")

    except Exception as e:
        # consecutive_errors = log_error(pid, identifier, e, task, error_count, consecutive_errors)
        print(e)
        consecutive_errors += 1
        if consecutive_errors >= 10:
            print("Too many consecutive errors, exiting")
            break
        continue

  0%|          | 0/2 [00:00<?, ?it/s]

Image retrieved successfully
Resized image: 6.89MB
Resized image: 3.04MB
Items processed successfully
Successfully processed ku-udk:200538
  -- Current count: 36 llm_items
Image retrieved successfully
Resized image: 6.84MB
Resized image: 3.04MB
Items processed successfully
Successfully processed ku-udk:199673
  -- Current count: 55 llm_items


In [34]:
# Inspect the page-level results collected so far; the list stays empty while
# the page-level LLM step in the processing loop is commented out.
page_results

[]

In [35]:
# Write the collected results to CSV; num becomes the suffix of each filename.
save_results(num='17')

Saved 55 /Users/e996w533/Documents/collections/udk-microfilm/scripts/workflow/nrp-jobs/production_1/data/llm_items_20251116_145121608180_17.csv
Results saved successfully


In [None]:
# review as needed
# Build a frame from the item-level results and show the non-null count per
# column. The processing loop also uses df_items to skip PIDs already in it.
df_items = pd.DataFrame(llm_item_results)
df_items.count()

In [None]:
# List the distinct PIDs that have at least one item row.
df_items.pid.unique()

In [None]:
# Export the item table to a merged CSV; the path is relative to the working directory.
# NOTE(review): index=None appears to be used as a falsy value to omit the row
# index; index=False is the documented form - confirm.
df_items.to_csv('../../production_1/data/merged_data_llm_14.csv', index=None)