# core

>Core routines, and also where the main `formalyzer` workflow is defined.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

## Basic File I/O

In [None]:
#| export
import os 

def read_text_file(filename:str, encoding:str="utf-8") -> str:
    "generic, read any text file (`~` in `filename` is expanded) and return its whole contents as one string"
    # Explicit encoding: the platform default is not UTF-8 everywhere, and the inputs here
    # (names, addresses, HTML) can easily contain non-ASCII characters.
    with open(os.path.expanduser(filename), encoding=encoding) as f:
        return f.read()

In [None]:
#| export
def read_recc_info(info_file:str) -> str:
    "read a text file of info on the recommender and return its contents as a single string"
    return read_text_file(info_file)

In [None]:
# Example: load the sample recommender-info file and show its raw text.
recc_info = read_recc_info("../example/recc_info.txt") 
print(recc_info)

Reccomender Name: Teacher Person 
Title: Professor of Cleverness 

Address: 
Department of Curiosities
Generic University 
1337 Generic Pl. 
Springfield, WA 31416 USA

Phone: 555-123-1337
Email: teacher.person@generic.edu



In [None]:
#| export
def read_urls_file(urls_file:str) -> list:
    "read a text file where each line is a url of a submission site; returns the non-blank lines, stripped"
    text = read_text_file(urls_file)
    # Strip each line and drop whitespace-only ones, so stray spaces/tabs never become a "url".
    stripped = (line.strip() for line in text.splitlines())
    return [line for line in stripped if line]

In [None]:
# Example: read the sample URL list and print each entry with its 1-based position.
urls = read_urls_file("../example/sample_urls.txt") 
print(f"{len(urls)} urls in list")
for i, url in enumerate(urls): 
    print(f"{i+1} of {len(urls)}: {url}")

1 urls in list
1 of 1: http://localhost:8000/sample_form.html


In [None]:
#| export
from pypdf import PdfReader
import logging
logging.getLogger("pypdf").setLevel(logging.ERROR)

def read_pdf_text(pdf_file):
    "extract the text of every page of a PDF (`~` in `pdf_file` is expanded) and join the pages with newlines"
    pdf_path = os.path.expanduser(pdf_file)
    page_texts = [page.extract_text() for page in PdfReader(pdf_path).pages]
    return "\n".join(page_texts)

In [None]:
# Example: extract the text of the sample recommendation letter and print it.
letter_text = read_pdf_text("../example/sample_letter.pdf")
print(letter_text)

   Dear Graduate Admissions Committee,  I am writing to recommend Student Person for admission to your graduate program. Having worked closely with them for two years in both teaching and research capacities, I can say they are among the strongest students I have encountered in over a decade of academic work.  Student Person took several of my advanced courses — Quantum Rollercoasters, Physics of Impossible Machines, and a seminar on Neural Networks for Curious Minds. They also worked with me on an independent research project. In every setting, they showed sharp intellectual ability, creative thinking, and real persistence. Their coursework went beyond surface-level competence; they clearly grasped the deeper principles at play. As a researcher, they brought fresh perspectives while staying receptive to guidance.  What stands out most is their dependability. They consistently met deadlines and produced high-quality work. During our independent project, they actually moved ahead of sch

## Parsing HTML (Form) Page

In [None]:
#| export
from bs4 import BeautifulSoup
import json, re


def scrape_form_fields(html):
    """Extract all fillable form fields from HTML.

    Returns a list of dicts, one per `<input>`, `<select>` or `<textarea>` that has an
    `id` or `name`, with keys:
      - 'id':        the element's `id`, falling back to its `name`
      - 'label':     text of the associated `<label>` ('' when none is found)
      - 'type':      lower-cased `type` attribute, or the tag name when there is none
      - 'options':   non-empty option texts for a `<select>`, otherwise None
      - 'prefilled': True when a text-like field already carries a value
    """
    soup = BeautifulSoup(html, 'html.parser')
    fields = []
    for inp in soup.find_all(['input', 'select', 'textarea']):
        field_id = inp.get('id') or inp.get('name', '')
        if not field_id: continue
        # HTML `type` values are case-insensitive, so normalise before comparing.
        field_type = (inp.get('type') or inp.name).lower()
        # Buttons and hidden inputs are never something to fill in.
        if field_type in ['hidden', 'submit', 'button', 'reset', 'image']: continue

        # Prefer an explicit <label for="...">, then fall back to a <label> wrapping the control.
        label = soup.find('label', {'for': field_id}) or inp.find_parent('label')
        label_text = label.get_text(strip=True) if label else ''

        # A textarea keeps its current content as element text, not in a `value` attribute.
        if inp.name == 'textarea':
            current_value = inp.get_text().strip()
        else:
            current_value = inp.get('value', '')

        options = None
        if inp.name == 'select':
            option_texts = (opt.get_text(strip=True) for opt in inp.find_all('option'))
            options = [text for text in option_texts if text]

        # For radio/checkbox the `value` attribute names the choice and for <select> there is
        # no `value` attribute to speak of, so none of those count as "prefilled".
        is_text_like = field_type not in ['radio', 'checkbox'] and inp.name != 'select'
        fields.append({
            'id': field_id, 'label': label_text, 'type': field_type,
            'options': options, 'prefilled': bool(current_value and is_text_like)
        })
    return fields

In [None]:
# Example: scrape the sample form and list the id of every field that was found.
html = read_text_file("../example/sample_form.html") 
fields = scrape_form_fields(html) 
[f['id'] for f in fields]

['applicant_name',
 'applicant_message',
 'ferpa_waiver',
 'ferpa_waiver',
 'program',
 'discipline',
 'prefix',
 'first_name',
 'middle_name',
 'last_name',
 'organization',
 'title',
 'phone',
 'email',
 'addr1',
 'addr2',
 'city',
 'state',
 'zip',
 'country',
 'months_known',
 'years_range',
 'capacity',
 'rating_intellectual',
 'rating_scientific',
 'rating_research',
 'rating_prev_work',
 'rating_lab',
 'rating_oral_1',
 'rating_oral_2',
 'rating_oral_3',
 'rating_oral_4',
 'rating_oral_5',
 'rating_oral_6',
 'rating_writing_1',
 'rating_writing_2',
 'rating_writing_3',
 'rating_writing_4',
 'rating_writing_5',
 'rating_writing_6',
 'rating_originality_1',
 'rating_originality_2',
 'rating_originality_3',
 'rating_originality_4',
 'rating_originality_5',
 'rating_originality_6',
 'rating_perseverance_1',
 'rating_perseverance_2',
 'rating_perseverance_3',
 'rating_perseverance_4',
 'rating_perseverance_5',
 'rating_perseverance_6',
 'rating_independence_1',
 'rating_independence_

## LLM Usage
Next we prompt the LLM to figure out which form fields apply, and how: 

In [None]:
#| export
from claudette import Chat

def get_field_mappings(fields, recc_info, letter_text, model="claude-sonnet-4-20250514", debug=False):
    """Use LLM to map recommender info and letter to form fields.

    `fields` is the list of field dicts from `scrape_form_fields`; entries flagged
    'prefilled' are left out of the prompt. Returns the parsed JSON from the model's reply,
    expected to be a list of `{"id": ..., "value": ...}` dicts.

    Raises `ValueError` when the reply contains no JSON array, or when the candidate text is
    not valid JSON (`json.JSONDecodeError`, itself a `ValueError`).
    """
    prompt = f"""You are filling out a graduate school recommendation form.

RECOMMENDER INFO:
{recc_info}

RECOMMENDATION LETTER:
{letter_text}

FORM FIELDS TO FILL:
{json.dumps([f for f in fields if not f['prefilled']], indent=2)}

For each field, provide the field ID and value to fill. For dropdowns, pick from the options listed.
Pay attention to groups of radio buttons (grouped via div or similar id prefixes) as they may form likert scales.
Return as JSON array: [{{"id": "form_xxx", "value": "..."}}]
"""
    chat = Chat(model=model)
    if debug: print(f"  Prompt length is {len(prompt)} characters")
    response = chat(prompt)
    reply = response.content[0].text

    # The prompt does not ask for a code fence, so the reply may be fenced (with or without a
    # language tag) or bare JSON, possibly surrounded by prose. Handle all of these.
    fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', reply, re.DOTALL)
    if fenced:
        payload = fenced.group(1)
    else:
        start, end = reply.find('['), reply.rfind(']')
        if start == -1 or end < start:
            raise ValueError(f"No JSON array found in LLM response: {reply[:200]!r}")
        payload = reply[start:end + 1]
    return json.loads(payload)

## Filling in the Form

In [None]:
#| export
async def get_element_info(page, field_id):
    """Given an id or a name, find the element on the page and get its info.

    Returns `(elem, tag, input_type)`: the Playwright locator, the element's lower-cased tag
    name, and its DOM `type` property.
    """
    # Use quoted attribute selectors rather than `#id`: an id such as `user.name` or
    # `addr[0]` would otherwise be parsed as id+class / id+attribute and never match.
    quoted = str(field_id).replace('\\', '\\\\').replace('"', '\\"')
    elem = page.locator(f'[id="{quoted}"], [name="{quoted}"]')
    await elem.wait_for(timeout=1000) # 1 second. default timeout for non-found fields is 30 seconds.
    tag = await elem.evaluate('el => el.tagName.toLowerCase()')
    input_type = await elem.evaluate('el => el.type')
    return elem, tag, input_type


async def should_skip(elem, tag, input_type, skip_prefilled) -> bool:
    """Should we leave this element alone? Yes if skipping is enabled and it already has a value.

    Selects, radio buttons and checkboxes are never skipped: a radio/checkbox reports its
    `value` attribute ("on" when none is set) whether or not it is ticked, so a non-empty
    value says nothing about whether the user has already answered.
    """
    if not skip_prefilled: return False
    if tag == 'select' or input_type in ('radio', 'checkbox'): return False
    current = await elem.input_value()
    return bool(current) # a non-empty value is already there, skip it


async def fill_element(elem, tag, input_type, field_id, value):
    """Actually fill in this element.

    - `<select>`: choose the option whose label equals `value`
    - radio: click the located button
    - checkbox: tick or untick it depending on `value` (booleans as-is; the strings
      '', '0', 'false', 'no', 'off', 'unchecked' mean untick, anything else ticks)
    - anything else: type `value` into the field
    """
    # Values come from LLM-produced JSON, so they may be numbers, booleans or null.
    text = '' if value is None else str(value)
    if tag == 'select':
        await elem.select_option(label=text)
    elif input_type == 'radio':
        print(f"  Clicking radio button {field_id} with value {value}")
        await elem.click()
    elif input_type == 'checkbox':
        # A checkbox cannot be text-filled; set its checked state instead.
        if isinstance(value, bool):
            checked = value
        else:
            checked = text.strip().lower() not in ('', '0', 'false', 'no', 'off', 'unchecked')
        await elem.set_checked(checked)
    else:
        await elem.fill(text)


async def fill_form(page, mappings, skip_prefilled=True, debug=False):
    """Fill form fields using Playwright.

    `mappings` is a sequence of `{'id': ..., 'value': ...}` dicts. Each one is handled
    independently: a failure on one field (including a malformed mapping entry) is printed,
    recorded, and does not stop the remaining fields.

    Returns `{'filled': [ids], 'skipped': [ids], 'errors': [{'id': ..., 'error': ...}]}`.
    """
    results = {'filled': [], 'skipped': [], 'errors': []}
    for i, item in enumerate(mappings):
        # The mappings are LLM output; don't let one bad entry abort the whole form.
        field_id = item.get('id') if isinstance(item, dict) else None
        if not field_id or 'value' not in item:
            print(f"  Error: malformed mapping {item!r}")
            results['errors'].append({'id': field_id, 'error': 'malformed mapping (needs "id" and "value")'})
            continue
        value = item['value']
        if debug: print(f"Mapping {i+1} of {len(mappings)}:  Processing {field_id}...")
        try:
            elem, tag, input_type = await get_element_info(page, field_id)

            if await should_skip(elem, tag, input_type, skip_prefilled):
                results['skipped'].append(field_id)
                continue

            await fill_element(elem, tag, input_type, field_id, value)
            results['filled'].append(field_id)
        except Exception as e:
            print(f"  Error filling {field_id}: {e}")
            results['errors'].append({'id': field_id, 'error': str(e)[:50]})
    return results

In [None]:
#| export
async def upload_recommendation(page, file_path):
    """Upload the recommendation PDF via the first `<input type="file">` on the page.

    `file_path` may start with `~`; it is expanded here because the CLI passes the path
    through exactly as the user typed it.
    """
    file_input = page.locator('input[type="file"]').first
    await file_input.set_input_files(os.path.expanduser(file_path))

In [None]:
#| export
async def process_url(page, url, recc_info, letter_text, pdf_path, debug=False):
    """Handle one recommendation URL from start to finish.

    Steps, in order: open `url`, scrape the form fields from the page HTML, ask the LLM for
    field mappings, fill the form, upload the PDF at `pdf_path`, then block on `input()` so
    the user can review the page. With `debug` set, progress is printed along the way.
    """
    await page.goto(url)
    page_html = await page.content()

    if debug:
        print("Scraping form fields")
    fields = scrape_form_fields(page_html)
    if debug:
        print(f"Found {len(fields)} fields")
        print("Calling LLM to get field mappings")

    mappings = get_field_mappings(fields, recc_info, letter_text, debug=debug)
    if debug:
        print(f"Got {len(mappings)} mappings from LLM")
        print("Filling in form")

    results = await fill_form(page, mappings, debug=debug)
    if debug:
        print(f"Filled: {len(results['filled'])}, Errors: {len(results['errors'])}")
        print("Uploading PDF") 

    await upload_recommendation(page, pdf_path)
    if debug:
        print("Uploaded PDF")

    # Pause so a human can inspect (and submit) the form before moving on.
    input("Review the form, then press Enter to continue to next URL (or Ctrl+C to stop)...")

## `formalyzer` CLI script

In [None]:
#| export
def read_info(recc_info:str, pdf_path:str, urls:str):
    """Read the input files named on the command line.

    `recc_info` and `pdf_path` are file paths (`~` is expanded) that must exist. `urls` is
    either the path of a file listing urls or, when no such file exists, a single url.
    Returns `(recommender info text, letter text, list of urls)`.
    """
    info_path = os.path.expanduser(recc_info)
    letter_path = os.path.expanduser(pdf_path)
    for required in (info_path, letter_path):
        assert os.path.exists(required), f"File not found: {required}"

    info_text = read_recc_info(info_path)
    letter_text = read_pdf_text(letter_path)

    if not os.path.exists(os.path.expanduser(urls)):
        print(f"No file {urls}. Treating it as a single url") 
        url_list = [urls]
    else:
        print(f"File {urls} exists. Reading.")
        url_list = read_urls_file(urls)
    return info_text, letter_text, url_list

In [None]:
#| export
async def setup_browser(cdp_url="http://localhost:9222"):
    """Connect to Chrome with remote debugging.

    Starts Playwright, attaches over CDP to the browser listening at `cdp_url`, and opens a
    new page. Returns `(pw, browser, page)`; the caller closes `browser` and stops `pw`.
    If connecting or opening the page fails, Playwright is stopped before the error
    propagates, so a failed attempt does not leave its driver running.
    """
    from playwright.async_api import async_playwright

    pw = await async_playwright().start()
    try:
        browser = await pw.chromium.connect_over_cdp(cdp_url)
        page = await browser.new_page()
    except BaseException:
        await pw.stop()
        raise
    return pw, browser, page


async def run_formalyzer(recc_info, letter_text, urls, pdf_path, debug=False):
    """Main async workflow: open the browser, process each url in turn, then shut down.

    Blank urls are ignored. The browser is closed and Playwright stopped even when
    processing a url raises.
    """
    pw, browser, page = await setup_browser()
    try:
        for position, url in enumerate(urls, start=1):
            if url.strip():  # blank entries are ignored
                print(f"\nURL {position} of {len(urls)}: {url}")
                await process_url(page, url, recc_info, letter_text, pdf_path, debug=debug)
    finally:
        await browser.close()
        await pw.stop()


from fastcore.script import call_parse
import asyncio

@call_parse
def main(recc_info:str, pdf_path:str, urls:str, debug:bool=False):
    "Command-line entry point: read the recommender info, letter PDF and url(s), then run the form-filling workflow"
    assert os.environ.get('ANTHROPIC_API_KEY'), "Please set ANTHROPIC_API_KEY environment variable" # used by Claudette
    recc_info, letter_text, urls = read_info(recc_info, pdf_path, urls)
    if debug:
        for label, value in (("recc_info", recc_info), ("letter_text", letter_text), ("urls", urls)):
            print(f"{label} =\n", value)

    # Hand the whole async workflow to a fresh event loop
    workflow = run_formalyzer(recc_info, letter_text, urls, pdf_path, debug)
    asyncio.run(workflow)

In [None]:
#main("~/recc_info.txt", "~/recc_letter.pdf","~/recc_urls.txt", debug=True)

In [None]:
#| hide
# Write the `#| export` cells of the project's notebooks out to its Python modules.
import nbdev; nbdev.nbdev_export()