# core

>Here's where the main `formalyzer` workflow is defined

```markdown
formalyzer: 

Reads PDF recommendation letter, fills in admissions form(s)

usage: 
  formalyzer <recc_letter.pdf> <url_list.txt>

Instead of url_list.txt, a single URL can be given (esp. for testing purposes) 

Description: 
Formalyzer will scrape the text from the PDF recc letter, 
and for each URL in url_list, it will: 
- launch a browser tab for that url 
- fill in the form using what the LLM has gleaned from the recc letter
- attach the PDF via the form's upload/attachment button
...and do no more. 
The user will need to review the page and press the Submit button manually.


Requirements: 
- Playwright 
- ANTHROPIC_API_KEY env var. (Could support other LLMs later)
- pypdf  

Author: Scott H. Hawley, @drscotthawley
```



In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import os 

def read_recc_info(info_file:str) -> str:
    "read a text file of info on the reviewer; returns the whole file as a single string"
    # `~` is expanded so paths like "~/recc_info.txt" work
    with open(os.path.expanduser(info_file)) as f:
        return f.read()

In [None]:
recc_info = read_recc_info("~/recc_info.txt") 
recc_info

'Reccomender Name: Scott H. Hawley \nTitle: Professor of Physics \n\nAddress: \nBelmont University \n1900 Belmont Blvd \nNashville, TN 37211\n\nPhone: 615-460-6206\nEmail: scott.hawley@belmont.edu\n'

In [None]:
#| export
def read_urls_file(urls_file:str) -> list:
    "load a text file of submission-site urls (one per line) and return the lines as a list"
    expanded_path = os.path.expanduser(urls_file)
    with open(expanded_path) as fh:
        contents = fh.read()
    # splitlines() drops the line terminators but keeps blank lines as ''
    return contents.splitlines()

In [None]:
urls = read_urls_file("~/recc_urls.txt") 
print(f"{len(urls)} urls in list")

11 urls in list


In [None]:
#| export
from pypdf import PdfReader
import logging
logging.getLogger("pypdf").setLevel(logging.ERROR)

def read_pdf_text(pdf_file):
    "extract the text of every page of a PDF and return it as one newline-joined string"
    pdf = PdfReader(os.path.expanduser(pdf_file))
    page_texts = [pg.extract_text() for pg in pdf.pages]
    return "\n".join(page_texts)

In [None]:
letter_text = read_pdf_text("~/recc_letter.pdf")
#letter_text

In [None]:
#| export
from bs4 import BeautifulSoup
import json, re


def scrape_form_fields(html):
    """Collect the fillable form controls found in `html`.

    Looks at every <input>, <select> and <textarea> that has an `id`, ignoring
    inputs of type hidden/submit/button. Returns a list of dicts with keys
    'id', 'label', 'type', 'options' and 'prefilled'.
    """
    non_fillable = ('hidden', 'submit', 'button')
    doc = BeautifulSoup(html, 'html.parser')
    found = []
    for ctrl in doc.find_all(['input', 'select', 'textarea']):
        ctrl_id = ctrl.get('id', '')
        if not ctrl_id:
            continue
        # controls without a `type` attribute fall back to their tag name
        kind = ctrl.get('type', ctrl.name)
        if kind in non_fillable:
            continue
        is_select = ctrl.name == 'select'

        # only labels tied to the control through for="<id>" are picked up
        label_tag = doc.find('label', {'for': ctrl_id})
        label_text = label_tag.get_text(strip=True) if label_tag else ''

        if is_select:
            option_texts = (opt.get_text(strip=True) for opt in ctrl.find_all('option'))
            options = [txt for txt in option_texts if txt]
        else:
            options = None

        # a `value` attribute only counts as "prefilled" for free-text style controls
        has_value = bool(ctrl.get('value', ''))
        prefilled = has_value and kind not in ('radio', 'checkbox') and not is_select

        found.append({
            'id': ctrl_id, 'label': label_text, 'type': kind,
            'options': options, 'prefilled': prefilled
        })
    return found

In [None]:
#| export
from claudette import Chat

def _parse_json_reply(reply):
    "pull the JSON payload out of an LLM reply: ```json fence first, then any fence, then a bare [...] array"
    patterns = (r'```json\s*(.*?)\s*```', r'```\s*(.*?)\s*```', r'(\[.*\])')
    last_err = None
    for pat in patterns:
        found = re.search(pat, reply, re.DOTALL)
        if not found: continue
        try:
            return json.loads(found.group(1))
        except json.JSONDecodeError as e:
            last_err = e  # keep trying the looser patterns
    raise ValueError(f"Could not find a JSON array in the LLM response: {reply[:200]!r}") from last_err


def get_field_mappings(fields, recc_info, letter_text, model="claude-sonnet-4-20250514"):
    """Use LLM to map recommender info and letter to form fields

    `fields` is a list of dicts as produced by `scrape_form_fields`; entries whose
    'prefilled' flag is truthy are left out of the prompt.
    Returns the parsed JSON from the model's reply (requested as a list of
    {"id": ..., "value": ...} dicts).
    Raises ValueError when no parseable JSON can be found in the reply.
    """
    prompt = f"""You are filling out a graduate school recommendation form.

RECOMMENDER INFO:
{recc_info}

RECOMMENDATION LETTER:
{letter_text}

FORM FIELDS TO FILL:
{json.dumps([f for f in fields if not f['prefilled']], indent=2)}

For each field, provide the field ID and value to fill. For dropdowns, pick from the options listed.
Return as JSON array: [{{"id": "form_xxx", "value": "..."}}]
Skip radio buttons.
"""
    chat = Chat(model=model)
    response = chat(prompt)
    # The model does not always wrap its answer in a ```json fence, so parsing is tolerant
    return _parse_json_reply(response.content[0].text)

In [None]:
#| export
async def fill_form(page, mappings, skip_prefilled=True):
    """Fill form fields using Playwright

    `mappings` is an iterable of dicts with 'id' and 'value' keys. Each id is
    looked up on `page` with a `#id` selector. Returns a dict with three lists:
    'filled' (ids), 'skipped' (ids that already held a value) and 'errors'
    (dicts with the id and the first 50 characters of the error message).
    """
    filled, skipped, errors = [], [], []
    for entry in mappings:
        field_id = entry['id']
        value = entry['value']
        try:
            target = page.locator(f'#{field_id}')
            tag_name = await target.evaluate('el => el.tagName.toLowerCase()')
            is_select = tag_name == 'select'

            # <select> elements are never treated as prefilled
            if skip_prefilled and not is_select and await target.input_value():
                skipped.append(field_id)
                continue

            if is_select:
                await target.select_option(label=value)
            else:
                await target.fill(value)
            filled.append(field_id)
        except Exception as exc:
            # one bad field should not abort the rest of the form
            errors.append({'id': field_id, 'error': str(exc)[:50]})
    return {'filled': filled, 'skipped': skipped, 'errors': errors}

In [None]:
#| export
async def upload_recommendation(page, file_path):
    """Upload the recommendation PDF

    Attaches `file_path` to the first `input[type="file"]` element on `page`.
    A leading `~` in a string/PathLike path is expanded, matching how the
    file-reading helpers in this module treat their path arguments.
    """
    if isinstance(file_path, (str, os.PathLike)):
        # the CLI forwards the path as typed, so "~/letter.pdf" must be expanded here
        file_path = os.path.abspath(os.path.expanduser(file_path))
    file_input = page.locator('input[type="file"]').first
    await file_input.set_input_files(file_path)

In [None]:
#| export
import asyncio

async def process_url(page, url, recc_info, letter_text, pdf_path, debug=False):
    """Process a single recommendation URL

    Navigates `page` to `url`, scrapes its form fields, asks the LLM for values,
    fills the form, uploads the PDF, then blocks on Enter so the user can review
    the page. With `debug=True` a progress line is printed after each step.
    """
    await page.goto(url)
    page_html = await page.content()

    form_fields = scrape_form_fields(page_html)
    if debug:
        print(f"Found {len(form_fields)} fields")

    llm_mappings = get_field_mappings(form_fields, recc_info, letter_text)
    if debug:
        print(f"Got {len(llm_mappings)} mappings from LLM")

    outcome = await fill_form(page, llm_mappings)
    if debug:
        print(f"Filled: {len(outcome['filled'])}, Errors: {len(outcome['errors'])}")

    await upload_recommendation(page, pdf_path)
    if debug:
        print("Uploaded PDF")

    # nothing is submitted automatically; wait here while the user reviews the page
    input("Review the form, then press Enter to continue to next URL (or Ctrl+C to stop)...")

# `formalyzer` CLI script

In [None]:
#| export
def read_info(recc_info:str, pdf_path:str, urls:str):
    "read the recommender-info file, the letter PDF and the url list (or single url); returns (recc_info, letter_text, urls)"
    recc_info = os.path.expanduser(recc_info)
    pdf_path = os.path.expanduser(pdf_path)
    assert os.path.exists(recc_info), f"File not found: {recc_info}"
    assert os.path.exists(pdf_path), f"File not found: {pdf_path}"
    recc_info = read_recc_info(recc_info)
    letter_text = read_pdf_text(pdf_path)
    # `urls` is either a path to a file of urls or, failing that, a single url
    urls_is_file = os.path.exists(os.path.expanduser(urls))
    if not urls_is_file:
        print(f"No file {urls}. Treating it as a single url") 
        urls = [urls]
    else:
        print(f"File {urls} exists. Reading.")
        urls = read_urls_file(urls)
    return recc_info, letter_text, urls

In [None]:
#| export
import os 
from playwright.async_api import async_playwright
from fastcore.script import call_parse

async def setup_browser(cdp_url="http://127.0.0.1:9222"):
    """Connect to Chrome with remote debugging

    `cdp_url` is the DevTools endpoint of an already-running Chrome, i.e. one
    started with `--remote-debugging-port=9222`.
    Returns `(pw, browser, page)`; the caller is responsible for closing
    `browser` and stopping `pw`.
    """
    pw = await async_playwright().start()
    try:
        # 127.0.0.1 rather than "localhost": the latter can resolve to IPv6 ::1,
        # which gives "ECONNREFUSED ::1:9222" when Chrome listens on IPv4 only
        browser = await pw.chromium.connect_over_cdp(cdp_url)
        page = await browser.new_page()
    except Exception:
        # don't leave the Playwright driver running if we could not connect
        await pw.stop()
        raise
    return pw, browser, page


async def run_formalyzer(recc_info, letter_text, urls, pdf_path, debug=False):
    """Main async workflow

    Opens the browser connection once, runs `process_url` on every non-blank
    entry of `urls` in order, and always closes the browser and stops
    Playwright afterwards, even if a URL fails.
    """
    pw, browser, page = await setup_browser()
    total = len(urls)  # blank entries are counted in the total, as they are in the numbering
    try:
        for position, url in enumerate(urls, start=1):
            if url.strip() == "":
                continue  # skip empty urls
            print(f"\nURL {position} of {total}: {url}")
            await process_url(page, url, recc_info, letter_text, pdf_path, debug=debug)
    finally:
        await browser.close()
        await pw.stop()


@call_parse
def main(recc_info:str, pdf_path:str, urls:str, debug:bool=False):
    # CLI entry point: checks the API key, loads the three inputs, then hands
    # everything to the async workflow.
    api_key = os.environ.get('ANTHROPIC_API_KEY')  # used by Claudette
    assert api_key, "Please set ANTHROPIC_API_KEY environment variable"
    recc_info, letter_text, urls = read_info(recc_info, pdf_path, urls)
    if debug:
        for label, loaded in (("recc_info", recc_info), ("letter_text", letter_text), ("urls", urls)):
            print(f"{label} =\n", loaded)

    # Run the async workflow; pdf_path is forwarded as given on the command line
    workflow = run_formalyzer(recc_info, letter_text, urls, pdf_path, debug)
    asyncio.run(workflow)

In [None]:
#main("~/recc_info.txt", "~/recc_letter.pdf","~/recc_urls.txt", debug=True)

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()

Okay, we built this whole thing and it worked fine when we were running it from Solveit, connecting to my laptop via an ssh tunnel.  
But the goal is to make it so I can run it locally. When I try to run it locally, I get this error: 
```
$ formalyzer recc_info.txt BrodyBlackwood.pdf brody_urls.txt 
File brody_urls.txt exists. Reading.
Traceback (most recent call last):
  File "/Users/shawley/github/formalyzer/.venv/bin/formalyzer", line 10, in <module>
    sys.exit(main())
             ~~~~^^
  File "/Users/shawley/github/formalyzer/.venv/lib/python3.13/site-packages/fastcore/script.py", line 125, in _f
    return tfunc(**merge(args, args_from_prog(func, xtra)))
  File "/Users/shawley/github/formalyzer/formalyzer/core.py", line 189, in main
    asyncio.run(run_formalyzer(recc_info, letter_text, urls, pdf_path, debug))
    ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/shawley/.local/share/uv/python/cpython-3.13.2-macos-aarch64-none/lib/python3.13/asyncio/runners.py", line 195, in run
    return runner.run(main)
           ~~~~~~~~~~^^^^^^
  File "/Users/shawley/.local/share/uv/python/cpython-3.13.2-macos-aarch64-none/lib/python3.13/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^
  File "/Users/shawley/.local/share/uv/python/cpython-3.13.2-macos-aarch64-none/lib/python3.13/asyncio/base_events.py", line 725, in run_until_complete
    return future.result()
           ~~~~~~~~~~~~~^^
  File "/Users/shawley/github/formalyzer/formalyzer/core.py", line 168, in run_formalyzer
    pw, browser, page = await setup_browser()
                        ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/shawley/github/formalyzer/formalyzer/core.py", line 161, in setup_browser
    browser = await pw.chromium.connect_over_cdp("http://localhost:9222")
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/shawley/github/formalyzer/.venv/lib/python3.13/site-packages/playwright/async_api/_generated.py", line 14898, in connect_over_cdp
    await self._impl_obj.connect_over_cdp(
    ...<4 lines>...
    )
  File "/Users/shawley/github/formalyzer/.venv/lib/python3.13/site-packages/playwright/_impl/_browser_type.py", line 207, in connect_over_cdp
    response = await self._channel.send_return_as_dict(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        "connectOverCDP", TimeoutSettings.launch_timeout, params
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/Users/shawley/github/formalyzer/.venv/lib/python3.13/site-packages/playwright/_impl/_connection.py", line 83, in send_return_as_dict
    return await self._connection.wrap_api_call(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<3 lines>...
    )
    ^
  File "/Users/shawley/github/formalyzer/.venv/lib/python3.13/site-packages/playwright/_impl/_connection.py", line 559, in wrap_api_call
    raise rewrite_error(error, f"{parsed_st['apiName']}: {error}") from None
playwright._impl._errors.Error: BrowserType.connect_over_cdp: connect ECONNREFUSED ::1:9222
Call log:
  - <ws preparing> retrieving websocket url from http://localhost:9222
```
Presumably that's because there's absolutely nothing running on port 9222: the browser's not running. I think we need more instructions for running this locally -- on the Mac itself. 

##### 🤖Reply🤖<!-- SOLVEIT_SEPARATOR_7f3a9b2c -->

ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ 

You're exactly right! The `setup_browser()` function expects Chrome to already be running with remote debugging enabled on port 9222. When you were using the SSH tunnel, that port was being forwarded to a Chrome instance running elsewhere.

To run locally, you need to start Chrome with the `--remote-debugging-port=9222` flag *before* running formalyzer.

On Mac, try this in your terminal:

```bash
/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222
```

This will launch Chrome with the debugging port open. Then in a *separate* terminal window, run your formalyzer command.

Does that make sense? Give it a try and let me know if it works or if you hit another snag!

Ok... but I still get an error: 
```
$ /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222
Opening in existing browser session.
$
```

Then I run it and again get this: 
```
$ formalyzer recc_info.txt BrodyBlackwood.pdf brody_urls.txt 
File brody_urls.txt exists. Reading.
Traceback (most recent call last):
  File "/Users/shawley/github/formalyzer/.venv/bin/formalyzer", line 10, in <module>
    sys.exit(main())
             ~~~~^^
  File "/Users/shawley/github/formalyzer/.venv/lib/python3.13/site-packages/fastcore/script.py", line 125, in _f
    return tfunc(**merge(args, args_from_prog(func, xtra)))
  File "/Users/shawley/github/formalyzer/formalyzer/core.py", line 189, in main
    asyncio.run(run_formalyzer(recc_info, letter_text, urls, pdf_path, debug))
    ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/shawley/.local/share/uv/python/cpython-3.13.2-macos-aarch64-none/lib/python3.13/asyncio/runners.py", line 195, in run
    return runner.run(main)
           ~~~~~~~~~~^^^^^^
  File "/Users/shawley/.local/share/uv/python/cpython-3.13.2-macos-aarch64-none/lib/python3.13/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^
  File "/Users/shawley/.local/share/uv/python/cpython-3.13.2-macos-aarch64-none/lib/python3.13/asyncio/base_events.py", line 725, in run_until_complete
    return future.result()
           ~~~~~~~~~~~~~^^
  File "/Users/shawley/github/formalyzer/formalyzer/core.py", line 168, in run_formalyzer
    pw, browser, page = await setup_browser()
                        ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/shawley/github/formalyzer/formalyzer/core.py", line 161, in setup_browser
    browser = await pw.chromium.connect_over_cdp("http://localhost:9222")
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/shawley/github/formalyzer/.venv/lib/python3.13/site-packages/playwright/async_api/_generated.py", line 14898, in connect_over_cdp
    await self._impl_obj.connect_over_cdp(
    ...<4 lines>...
    )
  File "/Users/shawley/github/formalyzer/.venv/lib/python3.13/site-packages/playwright/_impl/_browser_type.py", line 207, in connect_over_cdp
    response = await self._channel.send_return_as_dict(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        "connectOverCDP", TimeoutSettings.launch_timeout, params
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "/Users/shawley/github/formalyzer/.venv/lib/python3.13/site-packages/playwright/_impl/_connection.py", line 83, in send_return_as_dict
    return await self._connection.wrap_api_call(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<3 lines>...
    )
    ^
  File "/Users/shawley/github/formalyzer/.venv/lib/python3.13/site-packages/playwright/_impl/_connection.py", line 559, in wrap_api_call
    raise rewrite_error(error, f"{parsed_st['apiName']}: {error}") from None
playwright._impl._errors.Error: BrowserType.connect_over_cdp: connect ECONNREFUSED ::1:9222
Call log:
  - <ws preparing> retrieving websocket url from http://localhost:9222