# core

>Here's where the main `formalyzer` workflow is defined

```markdown
formalyzer: 

Reads a PDF recommendation letter and fills in admissions form(s)

usage:
  formalyzer <recc_info.txt> <recc_letter.pdf> <url_list.txt>

Instead of url_list.txt, a single URL can be given (esp. for testing purposes) 

Description: 
Formalyzer will scrape the text from the PDF recc letter, 
and for each URL in url_list, it will: 
- launch a browser tab for that url 
- fill in the form using what the LLM has gleaned from the recc letter
- attach the PDF via the form's upload/attachment button
...and do no more. 
The user will need to review the page and press the Submit button manually.


Requirements:
- Playwright
- ANTHROPIC_API_KEY env var. (Could support other LLMs later)
- pypdf
- claudette, beautifulsoup4, fastcore

Author: Scott H. Hawley, @drscotthawley
```



In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import os 

def read_recc_info(info_file:str) -> str:
    """Read the text file describing the recommender and return its contents.

    `info_file` may begin with `~`; it is expanded to the user's home directory.
    The whole file is returned as a single string, newlines included.
    """
    with open(os.path.expanduser(info_file)) as f:
        return f.read()

In [None]:
recc_info = read_recc_info("~/recc_info.txt") 
recc_info

'Reccomender Name: Scott H. Hawley \nTitle: Professor of Physics \n\nAddress: \nBelmont University \n1900 Belmont Blvd \nNashville, TN 37211\n\nPhone: 615-460-6206\nEmail: scott.hawley@belmont.edu\n'

In [None]:
#| export
def read_urls_file(urls_file:str) -> list:
    """Read a text file where each line is the URL of a submission site.

    `urls_file` may begin with `~`. Surrounding whitespace is removed from every
    line and blank lines are dropped, so the result contains only usable URLs
    (a trailing newline or stray space in the file does not produce bogus entries).
    """
    with open(os.path.expanduser(urls_file)) as f:
        stripped = (line.strip() for line in f)
        return [url for url in stripped if url]

In [None]:
urls = read_urls_file("~/recc_urls.txt") 
print(f"{len(urls)} urls in list")

11 urls in list


In [None]:
#| export
from pypdf import PdfReader
import logging
logging.getLogger("pypdf").setLevel(logging.ERROR)

def read_pdf_text(pdf_file):
    "Extract the text of every page in a PDF and join the pages with newlines"
    pdf_path = os.path.expanduser(pdf_file)  # allow `~/...` paths
    page_texts = [page.extract_text() for page in PdfReader(pdf_path).pages]
    return "\n".join(page_texts)

In [None]:
letter_text = read_pdf_text("~/recc_letter.pdf")
#letter_text

In [None]:
import json

# Exploratory cell: builds a first-pass prompt from the recommender info, the
# first 2000 characters of the letter, and the first 20 entries of
# `fields_to_fill`.
# NOTE(review): `fields_to_fill` is not defined by any cell visible in this
# notebook, so running this cell as-is raises NameError (see the cell output).

# Create a structured prompt for the LLM
prompt = f"""You are filling out a graduate school recommendation form.

RECOMMENDER INFO:
{recc_info}

RECOMMENDATION LETTER:
{letter_text[:2000]}... [truncated]

FORM FIELDS TO FILL (first 20):
{json.dumps(fields_to_fill[:20], indent=2)}

For each field, provide:
1. The field ID
2. The value to fill in (or select from options)
3. Brief reasoning

Return as JSON array like:
[{{"id": "form_xxx", "value": "Outstanding", "reason": "Letter says top 2%"}}]
"""

# Sanity check: show the total size and the first 450 characters of the prompt.
print(f"Prompt length: {len(prompt)} chars")
print(prompt[:450])

NameError: name 'fields_to_fill' is not defined

Next? 

##### ðŸ¤–ReplyðŸ¤–<!-- SOLVEIT_SEPARATOR_7f3a9b2c -->

ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ 

**Note:** Some earlier messages were truncated due to context limits, but we have what we need.

Now let's call the LLM to get the field mappings. We'll use claudette:

```python
from claudette import Chat

chat = Chat(model="claude-sonnet-4-20250514")
response = chat(prompt)
print(response)
```

This will send the prompt with the recommender info, letter, and field structure to Claude and get back the JSON mapping. Run that and let's see what it suggests!

In [None]:
from claudette import Chat

# One-off experiment: send the first-pass `prompt` built above to the model and
# print the raw response object (not just its text).
chat = Chat(model="claude-sonnet-4-20250514")
response = chat(prompt)
print(response)

Message(id='msg_01KAg9RcNiQJa2PrputhLozs', content=[TextBlock(citations=None, text='```json\n[\n  {\n    "id": "form_b2d46a9c-36e6-4095-bf56-9d15b524bea5",\n    "value": "Standard",\n    "reason": "This is a physics student recommendation, not nursing-specific"\n  },\n  {\n    "id": "form_d4a7d23d-9fd3-5542-466e-d2e30fe30e6c",\n    "value": "Belmont University",\n    "reason": "Primary institution address from recommender info"\n  },\n  {\n    "id": "form_210773b2-503d-7748-63b5-f0f37c83b6ed",\n    "value": "1900 Belmont Blvd",\n    "reason": "Street address from recommender info"\n  },\n  {\n    "id": "form_cb198687-7870-debf-65f3-8daf444c1a7f",\n    "value": "Nashville",\n    "reason": "City from recommender address"\n  },\n  {\n    "id": "form_1f9293f9-1537-56e3-7bb2-0f4d3a1848a5",\n    "value": "TN",\n    "reason": "State from recommender address"\n  },\n  {\n    "id": "form_a06b225f-d0e7-701d-34c7-e92c5b4d3f3e",\n    "value": "37211",\n    "reason": "Zip code from recommender addr

That's looking pretty good, although I'm not sure if it included all of the fields. 

##### ðŸ¤–ReplyðŸ¤–<!-- SOLVEIT_SEPARATOR_7f3a9b2c -->

ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ 

You're right — we only sent the first 20 fields in the prompt (`fields_to_fill[:20]`). We have 123 total! Let's send all of them:

```python
# Send ALL fields this time
prompt_full = f"""You are filling out a graduate school recommendation form.

RECOMMENDER INFO:
{recc_info}

RECOMMENDATION LETTER:
{letter_text}

FORM FIELDS TO FILL:
{json.dumps(fields_to_fill, indent=2)}

For each field, provide the field ID and value to fill, and brief reasoning. 
For dropdowns, pick from the options listed.
Return as JSON array: [{{"id": "form_xxx", "value": "..."}}]
Skip fields that are already filled or radio buttons.
"""

print(f"Full prompt length: {len(prompt_full)} chars")
response_full = chat(prompt_full)
print(response_full)
```

This sends the complete letter and all 123 fields. Run that?

In [None]:
#| eval: false
# Send ALL fields this time
# Unlike the first-pass prompt, this one embeds the whole letter and the whole
# `fields_to_fill` list (no [:2000] / [:20] slicing), and it does not ask the
# model for per-field reasoning.
prompt_full = f"""You are filling out a graduate school recommendation form.

RECOMMENDER INFO:
{recc_info}

RECOMMENDATION LETTER:
{letter_text}

FORM FIELDS TO FILL:
{json.dumps(fields_to_fill, indent=2)}

For each field, provide the field ID and value to fill. For dropdowns, pick from the options listed.
Return as JSON array: [{{"id": "form_xxx", "value": "..."}}]
Skip fields that are already filled or radio buttons.
"""

print(f"Full prompt length: {len(prompt_full)} chars")
# NOTE(review): this calls the same `chat` object as the first prompt; if it
# keeps conversation history, the earlier exchange is sent along too -- confirm.
response_full = chat(prompt_full)
#print(response_full)

Okay, how are we doing? Are we ready to try a sample filling into the form? By the way, it's crucial that you do **not** press the final submit button at the bottom of the form. I need to review the filled-in page manually before we submit!  Note that some of the fields are already filled in. For example, it looks like the first four text fields are already filled in with values. I would say if there are already values filled in, we can skip them and move on to the others. 

##### ðŸ¤–ReplyðŸ¤–<!-- SOLVEIT_SEPARATOR_7f3a9b2c -->

ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ 

Great! Let's parse the response and fill the form. I'll skip pre-filled fields and definitely **not** click submit:

```python
import re

# Extract JSON from response
json_match = re.search(r'```json\s*(.*?)\s*```', response_full.content[0].text, re.DOTALL)
fill_data = json.loads(json_match.group(1))

print(f"Got {len(fill_data)} field mappings to apply")

# Fill the form, skipping pre-filled fields
for item in fill_data:
    field_id = item['id']
    value = item['value']
    
    try:
        elem = page.locator(f'#{field_id}')
        
        # Check if it's a select or input/textarea
        tag = await elem.evaluate('el => el.tagName.toLowerCase()')
        
        # Check if already has a value (skip if so)
        current = await elem.input_value() if tag != 'select' else await elem.evaluate('el => el.value')
        if current and tag != 'select':
            print(f"SKIP (prefilled): {field_id[:30]}")
            continue
        
        if tag == 'select':
            await elem.select_option(label=value)
            print(f"SELECT: {value[:30]}")
        else:
            await elem.fill(value)
            print(f"FILL: {value[:30]}")
            
    except Exception as e:
        print(f"ERROR {field_id[:20]}: {e}")

print("\nâœ“ Done! Review the form in your browser before submitting.")
```

This will fill the fields and print what it's doing. You can watch it happen in your Chrome window. Ready to try it?

In [None]:
import re

# Notebook-only cell: it uses top-level `await` and relies on two names from
# earlier cells -- `page` (presumably a Playwright page) and `response_full`
# (the LLM reply). The recorded output shows `response_full` was not defined
# on this run.

# Extract JSON from response
# (assumes the reply wraps its JSON in a json code fence; otherwise json_match
# is None and the next line raises AttributeError)
json_match = re.search(r'```json\s*(.*?)\s*```', response_full.content[0].text, re.DOTALL)
fill_data = json.loads(json_match.group(1))

print(f"Got {len(fill_data)} field mappings to apply")

# Fill the form, skipping pre-filled fields
for item in fill_data:
    field_id = item['id']
    value = item['value']
    
    try:
        elem = page.locator(f'#{field_id}')
        
        # Check if it's a select or input/textarea
        tag = await elem.evaluate('el => el.tagName.toLowerCase()')
        
        # Check if already has a value (skip if so)
        # Only non-<select> elements are ever skipped: for a select, `current`
        # is read but the `tag != 'select'` test below ignores it.
        current = await elem.input_value() if tag != 'select' else await elem.evaluate('el => el.value')
        if current and tag != 'select':
            print(f"SKIP (prefilled): {field_id[:30]}")
            continue
        
        if tag == 'select':
            # Dropdowns are chosen by visible option label, not by option value.
            await elem.select_option(label=value)
            print(f"SELECT: {value[:30]}")
        else:
            await elem.fill(value)
            print(f"FILL: {value[:30]}")
            
    except Exception as e:
        # Best effort: report the failing field and carry on with the rest.
        print(f"ERROR {field_id[:20]}: {e}")

# Nothing above clicks a submit button; the user reviews and submits by hand.
print("\nâœ“ Done! Review the form in your browser before submitting.")

NameError: name 'response_full' is not defined

That worked great. The only thing left is the file upload; I guess for that you would need to know where the file is on my local file system. Currently we're running on Solveit, but ultimately my intent is to produce a script that can be run from my local machine, in which case that file will be available.
How about we focus on packaging up what we've got so far into a nice set of callable functions, and then we can fill in the rest of the main function further below? So the code we have above -- the BeautifulSoup scraping, the filling in of the form, etc. -- let's modularize it into callable functions, yeah?

##### ðŸ¤–ReplyðŸ¤–<!-- SOLVEIT_SEPARATOR_7f3a9b2c -->

ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ 

Exactly! Let's modularize. Here's a clean structure:

```python
from bs4 import BeautifulSoup
import json, re
from claudette import Chat

def scrape_form_fields(html):
    """Extract all fillable form fields from HTML"""
    soup = BeautifulSoup(html, 'html.parser')
    fields = []
    for inp in soup.find_all(['input', 'select', 'textarea']):
        field_id = inp.get('id', '')
        if not field_id: continue
        field_type = inp.get('type', inp.name)
        if field_type in ['hidden', 'submit', 'button']: continue
        
        label = soup.find('label', {'for': field_id})
        label_text = label.get_text(strip=True) if label else ''
        current_value = inp.get('value', '')
        
        options = None
        if inp.name == 'select':
            options = [opt.get_text(strip=True) for opt in inp.find_all('option') if opt.get_text(strip=True)]
        
        fields.append({
            'id': field_id, 'label': label_text, 'type': field_type,
            'options': options, 'prefilled': bool(current_value and field_type not in ['radio','checkbox'] and inp.name != 'select')
        })
    return fields

def get_field_mappings(fields, recc_info, letter_text, model="claude-sonnet-4-20250514"):
    """Use LLM to map recommender info and letter to form fields"""
    prompt = f"""You are filling out a graduate school recommendation form.

RECOMMENDER INFO:
{recc_info}

RECOMMENDATION LETTER:
{letter_text}

FORM FIELDS TO FILL:
{json.dumps([f for f in fields if not f['prefilled']], indent=2)}

For each field, provide the field ID and value to fill. For dropdowns, pick from the options listed.
Return as JSON array: [{{"id": "form_xxx", "value": "..."}}]
Skip radio buttons.
"""
    chat = Chat(model=model)
    response = chat(prompt)
    json_match = re.search(r'```json\s*(.*?)\s*```', response.content[0].text, re.DOTALL)
    return json.loads(json_match.group(1))

async def fill_form(page, mappings, skip_prefilled=True):
    """Fill form fields using Playwright"""
    results = {'filled': [], 'skipped': [], 'errors': []}
    for item in mappings:
        field_id, value = item['id'], item['value']
        try:
            elem = page.locator(f'#{field_id}')
            tag = await elem.evaluate('el => el.tagName.toLowerCase()')
            
            if skip_prefilled and tag != 'select':
                current = await elem.input_value()
                if current:
                    results['skipped'].append(field_id)
                    continue
            
            if tag == 'select':
                await elem.select_option(label=value)
            else:
                await elem.fill(value)
            results['filled'].append(field_id)
        except Exception as e:
            results['errors'].append({'id': field_id, 'error': str(e)[:50]})
    return results

async def upload_recommendation(page, file_path):
    """Upload the recommendation PDF"""
    file_input = page.locator('input[type="file"]').first
    await file_input.set_input_files(file_path)
```

Run that to define the functions, then we can write the main workflow that uses them. Sound good?

In [None]:
#| export
from bs4 import BeautifulSoup
import json, re


def scrape_form_fields(html):
    """Extract all fillable form fields from HTML.

    Scans every `<input>`, `<select>` and `<textarea>` that has an `id`, skipping
    inputs of type hidden/submit/button.

    Returns a list of dicts, one per field, with keys:
      - 'id':        the element's id attribute
      - 'label':     text of the first `<label for=id>`, or '' if there is none
      - 'type':      the `type` attribute, or the tag name when it has no `type`
      - 'options':   non-empty option texts for a `<select>`, otherwise None
      - 'prefilled': True when a text-like field already holds a value
                     (always False for radio, checkbox and select)
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Index the labels once instead of searching the whole document per field.
    # setdefault keeps the FIRST label for an id, which is what soup.find() returned.
    label_text_by_id = {}
    for label in soup.find_all('label'):
        target = label.get('for')
        if target:
            label_text_by_id.setdefault(target, label.get_text(strip=True))

    fields = []
    for inp in soup.find_all(['input', 'select', 'textarea']):
        field_id = inp.get('id', '')
        if not field_id: continue
        field_type = inp.get('type', inp.name)
        if field_type in ['hidden', 'submit', 'button']: continue

        if inp.name == 'textarea':
            # A textarea has no `value` attribute; its initial value is its text
            # content. Whitespace-only content (template indentation) counts as empty.
            current_value = inp.get_text().strip()
        else:
            current_value = inp.get('value', '')

        options = None
        if inp.name == 'select':
            option_texts = (opt.get_text(strip=True) for opt in inp.find_all('option'))
            options = [text for text in option_texts if text]

        # Radios/checkboxes always carry a `value`, and selects are chosen by
        # option, so a non-empty value only means "prefilled" for text-like fields.
        is_text_like = field_type not in ['radio', 'checkbox'] and inp.name != 'select'
        fields.append({
            'id': field_id, 'label': label_text_by_id.get(field_id, ''), 'type': field_type,
            'options': options, 'prefilled': bool(current_value and is_text_like)
        })
    return fields

In [None]:
#| export
from claudette import Chat

def get_field_mappings(fields, recc_info, letter_text, model="claude-sonnet-4-20250514"):
    """Use LLM to map recommender info and letter to form fields.

    Args:
        fields: field dicts as produced by `scrape_form_fields`; entries whose
            'prefilled' flag is truthy are left out of the prompt.
        recc_info: text describing the recommender.
        letter_text: full text of the recommendation letter.
        model: model name handed to claudette's `Chat`.

    Returns:
        The parsed JSON from the model's reply -- the prompt asks for a list of
        `{"id": ..., "value": ...}` dicts.

    Raises:
        ValueError: if no JSON can be parsed out of the reply
            (`json.JSONDecodeError` is a subclass of ValueError).
    """
    # .get(): a field dict without a 'prefilled' key is treated as not prefilled.
    open_fields = [f for f in fields if not f.get('prefilled')]
    prompt = f"""You are filling out a graduate school recommendation form.

RECOMMENDER INFO:
{recc_info}

RECOMMENDATION LETTER:
{letter_text}

FORM FIELDS TO FILL:
{json.dumps(open_fields, indent=2)}

For each field, provide the field ID and value to fill. For dropdowns, pick from the options listed.
Return as JSON array: [{{"id": "form_xxx", "value": "..."}}]
Skip radio buttons.
"""
    chat = Chat(model=model)
    response = chat(prompt)
    return _parse_json_reply(response.content[0].text)


def _parse_json_reply(text):
    "Parse the JSON array out of an LLM reply, whether or not it is wrapped in a code fence"
    fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', text, re.DOTALL)
    candidate = fenced.group(1) if fenced else text
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # The model may surround the array with prose; retry on the outermost [...] span.
        start, end = candidate.find('['), candidate.rfind(']')
        if start == -1 or end <= start:
            raise ValueError(f"No JSON array found in LLM response: {text[:200]!r}") from None
        return json.loads(candidate[start:end + 1])

In [None]:
#| export
async def fill_form(page, mappings, skip_prefilled=True):
    """Fill form fields using Playwright.

    Args:
        page: Playwright page showing the form.
        mappings: iterable of `{'id': ..., 'value': ...}` dicts (LLM output).
        skip_prefilled: when True, inputs/textareas that already hold a value
            are left untouched. `<select>` elements are always set.

    Returns:
        dict with three lists: 'filled' (ids that were set), 'skipped' (ids left
        alone because they were prefilled or the mapping had no value) and
        'errors' (dicts with 'id' and a truncated 'error' message).

    Never raises for a single bad field: every failure is recorded and the loop
    moves on, so one broken mapping cannot abort the whole form. Nothing here
    clicks a submit button.
    """
    results = {'filled': [], 'skipped': [], 'errors': []}
    for item in mappings:
        # The mappings come from an LLM, so an entry may be malformed or lack keys.
        field_id = item.get('id') if isinstance(item, dict) else None
        if not field_id:
            results['errors'].append({'id': field_id, 'error': f"mapping without an id: {item!r}"[:50]})
            continue
        value = item.get('value')
        if value is None:
            # JSON null / missing value: nothing to enter for this field.
            results['skipped'].append(field_id)
            continue
        value = str(value)  # numbers/booleans in the JSON must be typed as text
        try:
            # An attribute selector works for any id; '#id' is invalid CSS when the
            # id starts with a digit or contains characters such as '.' or ':'.
            escaped = str(field_id).replace('\\', '\\\\').replace('"', '\\"')
            elem = page.locator(f'[id="{escaped}"]')
            tag = await elem.evaluate('el => el.tagName.toLowerCase()')

            if skip_prefilled and tag != 'select':
                current = await elem.input_value()
                if current:
                    results['skipped'].append(field_id)
                    continue

            if tag == 'select':
                # Dropdowns are chosen by their visible option label.
                await elem.select_option(label=value)
            else:
                await elem.fill(value)
            results['filled'].append(field_id)
        except Exception as e:
            results['errors'].append({'id': field_id, 'error': str(e)[:50]})
    return results

In [None]:
#| export
async def upload_recommendation(page, file_path):
    """Attach the recommendation PDF to the first file-upload input on the page.

    Args:
        page: Playwright page showing the form.
        file_path: path to the PDF; a leading `~` is expanded.

    Returns:
        True if the file was handed to a file input, False if the page has no
        `input[type="file"]` at the time of the call (the check does not wait
        for one to appear).
    """
    file_inputs = page.locator('input[type="file"]')
    # Without this check, a form that has no upload widget would block until
    # Playwright's timeout and then raise.
    if await file_inputs.count() == 0:
        return False
    await file_inputs.first.set_input_files(os.path.expanduser(file_path))
    return True

In [None]:
#| export
import asyncio

async def process_url(page, url, recc_info, letter_text, pdf_path, debug=False):
    """Process a single recommendation URL.

    Steps: open the URL, scrape the form fields from the page HTML, ask the LLM
    for field values, fill the form, attach the PDF, then block on `input()` so
    the user can review (and manually submit) before the next URL is loaded.

    Args:
        page: Playwright page to drive.
        url: address of the recommendation form.
        recc_info: text describing the recommender.
        letter_text: full text of the recommendation letter.
        pdf_path: path of the PDF to attach.
        debug: print per-step progress when True.
    """
    await page.goto(url)
    html = await page.content()

    fields = scrape_form_fields(html)
    if debug: print(f"Found {len(fields)} fields")

    mappings = get_field_mappings(fields, recc_info, letter_text)
    if debug: print(f"Got {len(mappings)} mappings from LLM")

    results = await fill_form(page, mappings)
    if debug: print(f"Filled: {len(results['filled'])}, Errors: {len(results['errors'])}")
    if results['errors']:
        # Always tell the reviewer which fields still need attention.
        print(f"WARNING: {len(results['errors'])} field(s) could not be filled:")
        for err in results['errors']: print(f"  {err['id']}: {err['error']}")

    # An upload problem must not propagate: the caller would tear down the browser
    # and the already-filled form would be lost before the user could review it.
    try:
        attached = await upload_recommendation(page, pdf_path)
    except Exception as e:
        print(f"WARNING: could not attach {pdf_path}: {e}")
    else:
        # A return value of False is treated as "nothing was attached".
        if attached is False: print("WARNING: no file-upload input found; attach the PDF manually")
        elif debug: print("Uploaded PDF")

    input("Review the form, then press Enter to continue to next URL (or Ctrl+C to stop)...")

# `formalyzer` CLI script

In [None]:
#| export
def read_info(recc_info:str, pdf_path:str, urls:str):
    "load the recommender info and letter text from disk, and resolve `urls` to a list of URLs"
    info_file = os.path.expanduser(recc_info)
    letter_file = os.path.expanduser(pdf_path)
    # Both input files are mandatory; check them in the order they were given.
    for required in (info_file, letter_file):
        assert os.path.exists(required), f"File not found: {required}"
    info_text = read_recc_info(info_file)
    letter_text = read_pdf_text(letter_file)
    # `urls` is either the name of a file listing URLs or a single URL itself.
    urls_is_file = os.path.exists(os.path.expanduser(urls))
    if not urls_is_file:
        print(f"No file {urls}. Treating it as a single url")
        return info_text, letter_text, [urls]
    print(f"File {urls} exists. Reading.")
    return info_text, letter_text, read_urls_file(urls)

In [None]:
#main("~/recc_info.txt", "~/recc_letter.pdf","~/recc_urls.txt", debug=True)

Okay, so above you see my minimal `main` function, but we should add to it now, right? We need to tell it to use the Chrome browser, and we need some kind of async stuff? I don't understand that at all, e.g. the error: 'await' outside async function

##### ðŸ¤–ReplyðŸ¤–<!-- SOLVEIT_SEPARATOR_7f3a9b2c -->

ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ ðŸ§ 

Good question! Here's the deal with async:

- `await` can only be used inside functions marked with `async def`
- Playwright's browser control is async (so multiple things can happen without blocking)
- To run async code from a regular function, you use `asyncio.run()`

Here's the fix — make `main` call an async function:

```python
async def setup_browser():
    """Connect to Chrome with remote debugging"""
    pw = await async_playwright().start()
    browser = await pw.chromium.connect_over_cdp("http://localhost:9222")
    page = await browser.new_page()
    return pw, browser, page

async def run_formalyzer(recc_info, letter_text, urls, pdf_path, debug=False):
    """Main async workflow"""
    pw, browser, page = await setup_browser()
    try:
        for i, url in enumerate(urls):
            if not url.strip(): continue  # skip empty urls
            print(f"\nURL {i+1} of {len(urls)}: {url}")
            await process_url(page, url, recc_info, letter_text, pdf_path, debug=debug)
    finally:
        await browser.close()
        await pw.stop()

@call_parse
def main(recc_info:str, pdf_path:str, urls:str, debug:bool=False):
    recc_info, letter_text, urls = read_info(recc_info, pdf_path, urls)
    if debug:
        print("recc_info =\n", recc_info)
        print("letter_text =\n", letter_text)
        print("urls =\n", urls)
    
    # Run the async workflow
    asyncio.run(run_formalyzer(recc_info, letter_text, urls, pdf_path, debug))
```

The pattern is: regular `main()` → calls `asyncio.run()` → which runs the `async def run_formalyzer()` → which can use `await`.

Does that make sense?

In [None]:
#| export
import os 
from playwright.async_api import async_playwright
from fastcore.script import call_parse

async def setup_browser(cdp_url="http://localhost:9222"):
    """Connect to Chrome with remote debugging and open a new page.

    Args:
        cdp_url: DevTools (CDP) endpoint of an already-running browser.

    Returns:
        `(pw, browser, page)`: the Playwright driver, the connected browser and
        a fresh page. The caller is responsible for `browser.close()` and
        `pw.stop()`.

    Raises:
        Whatever Playwright raises when it cannot connect or open a page; the
        driver is stopped first so it is not left running.
    """
    pw = await async_playwright().start()
    try:
        browser = await pw.chromium.connect_over_cdp(cdp_url)
        page = await browser.new_page()
    except Exception:
        # Nothing is returned on failure, so nobody else could stop the driver.
        await pw.stop()
        raise
    return pw, browser, page


async def run_formalyzer(recc_info, letter_text, urls, pdf_path, debug=False):
    """Main async workflow: process every URL in turn in one browser page.

    Args:
        recc_info: text describing the recommender.
        letter_text: full text of the recommendation letter.
        urls: iterable of form URLs; blank entries are ignored and surrounding
            whitespace is stripped.
        pdf_path: path of the PDF to attach to each form.
        debug: passed through to `process_url`.

    Returns None. When there is nothing to process, no browser connection is made.
    """
    # Clean the list up front so the "URL i of N" counter reflects real URLs only
    # and no padded string is handed to the browser.
    targets = [u.strip() for u in urls if u and u.strip()]
    if not targets:
        print("No URLs to process.")
        return

    pw, browser, page = await setup_browser()
    try:
        for i, url in enumerate(targets, start=1):
            print(f"\nURL {i} of {len(targets)}: {url}")
            await process_url(page, url, recc_info, letter_text, pdf_path, debug=debug)
    finally:
        # Stop the driver even if closing the browser connection fails.
        try:
            await browser.close()
        finally:
            await pw.stop()


@call_parse
def main(
    recc_info:str,     # Path to a text file with the recommender's details
    pdf_path:str,      # Path to the recommendation letter PDF
    urls:str,          # Path to a text file of form URLs (one per line), or a single URL
    debug:bool=False,  # Print the parsed inputs and per-step progress
):
    "Fill in recommendation forms from a recommender-info file, a letter PDF and one or more URLs"
    assert os.environ.get('ANTHROPIC_API_KEY'), "Please set ANTHROPIC_API_KEY environment variable" # used by Claudette
    # Resolve the PDF path once, here: the same path is later handed to the
    # browser for the upload, which does not understand `~`.
    pdf_path = os.path.abspath(os.path.expanduser(pdf_path))
    recc_info, letter_text, urls = read_info(recc_info, pdf_path, urls)
    if debug:
        print("recc_info =\n", recc_info)
        print("letter_text =\n", letter_text)
        print("urls =\n", urls)

    # Run the async workflow
    asyncio.run(run_formalyzer(recc_info, letter_text, urls, pdf_path, debug))

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()