Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ pull-model:
docker compose exec ollama ollama pull mistral

# Run the pytest suite inside the "app" compose service. Tests live in tests/
# (the location pytest.ini points at), so the stale src/test/ invocation is dropped.
test:
	docker compose exec app python3 -m pytest tests/ -v

clean:
docker compose down -v
Expand Down
12 changes: 9 additions & 3 deletions api/db/models.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,24 @@
from sqlmodel import SQLModel, Field
from sqlalchemy import Column, JSON
from datetime import datetime
from datetime import datetime, timezone


def _utcnow() -> datetime:
    """Return the current moment as a timezone-aware UTC ``datetime``.

    Serves as the ``default_factory`` for ``created_at`` columns in place of
    the naive ``datetime.utcnow`` (Python 3.12+ safe).
    """
    return datetime.now(tz=timezone.utc)


class Template(SQLModel, table=True):
    """SQLModel table for a form template: its name, field mapping and PDF path."""

    # Primary key; None until the row has been given an id.
    id: int | None = Field(default=None, primary_key=True)
    name: str
    # Field mapping persisted in a JSON column.
    fields: dict = Field(sa_column=Column(JSON))
    pdf_path: str
    # Declared once, with the timezone-aware factory. The earlier duplicate
    # declaration used datetime.utcnow, which yields naive datetimes and is
    # deprecated since Python 3.12.
    created_at: datetime = Field(default_factory=_utcnow)


class FormSubmission(SQLModel, table=True):
    """SQLModel table for one form fill: the template used, the input text and the output PDF path."""

    # Primary key; None until the row has been given an id.
    id: int | None = Field(default=None, primary_key=True)
    # Id of the Template row this submission was filled against
    # (plain int column; no foreign-key constraint is declared here).
    template_id: int
    input_text: str
    output_pdf_path: str
    # Declared once, with the timezone-aware factory. The earlier duplicate
    # declaration used datetime.utcnow, which yields naive datetimes and is
    # deprecated since Python 3.12.
    created_at: datetime = Field(default_factory=_utcnow)
3 changes: 3 additions & 0 deletions api/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
from fastapi import FastAPI
from api.routes import templates, forms
from api.errors.handlers import register_exception_handlers

# FastAPI application instance; handlers and routers below are attached to it.
app = FastAPI()

# Attach the handlers from api.errors.handlers before any router is mounted.
register_exception_handlers(app)

# Mount the routers exposed by the api.routes.templates and api.routes.forms modules.
app.include_router(templates.router)
app.include_router(forms.router)
3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[pytest]
testpaths = tests
pythonpath = .
3 changes: 2 additions & 1 deletion src/file_manipulator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import os
from src.filler import Filler
from src.llm import LLM
from commonforms import prepare_form


class FileManipulator:
Expand All @@ -12,7 +11,9 @@ def __init__(self):
def create_template(self, pdf_path: str) -> str:
    """
    Create an editable PDF template from ``pdf_path`` using commonforms and store it.

    The template is written next to the source file as ``<stem>_template.pdf``.

    Args:
        pdf_path: Path of the source PDF form.

    Returns:
        Path of the generated template file.
    """
    # Lazy import: per the original note, this keeps ultralytics/YOLO from
    # loading during test collection.
    from commonforms import prepare_form

    # Drop whatever extension is present instead of blindly cutting the last
    # four characters, which mangled paths not ending in a 3-letter extension
    # (e.g. "form" became "_template.pdf").
    stem, _ext = os.path.splitext(pdf_path)
    template_path = stem + "_template.pdf"
    prepare_form(pdf_path, template_path)
    return template_path
Expand Down
7 changes: 5 additions & 2 deletions src/filler.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@ def fill_form(self, pdf_form: str, llm: LLM):
+ "_filled.pdf"
)

# Generate dictionary of answers from your original function
t2j = llm.main_loop()
# Generate dictionary of answers from your original function.
# main_loop_batch() extracts all fields in a single LLM call instead of
# one call per field, significantly reducing latency for large forms.
# Falls back to the sequential main_loop() if the LLM returns invalid JSON.
t2j = llm.main_loop_batch()
textbox_answers = t2j.get_data() # This is a dictionary

answers_list = list(textbox_answers.values())
Expand Down
90 changes: 90 additions & 0 deletions src/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,5 +131,95 @@ def handle_plural_values(self, plural_value):

return values

def build_batch_prompt(self):
    """
    Compose the single prompt used for batch extraction.

    The prompt lists every key of ``self._target_fields`` as a JSON array,
    embeds ``self._transcript_text`` and asks the model to answer with one
    JSON object covering all fields, so a single request can replace the
    per-field calls.
    """
    field_names = list(self._target_fields.keys())
    wanted_fields = json.dumps(field_names, indent=2)
    transcript = self._transcript_text
    return f"""
SYSTEM PROMPT:
You are an AI assistant that extracts structured data from incident transcriptions.
Extract values for ALL of the following JSON fields from the text below.
Return ONLY a valid JSON object with no extra explanation, commentary, or markdown fences.
If a field is plural and multiple values exist in the text, use a list of strings.
If a value cannot be found in the text, use null.

FIELDS TO EXTRACT:
{wanted_fields}

TEXT:
{transcript}

OUTPUT FORMAT:
{{
"field_name": "extracted value or null",
...
}}
"""

def main_loop_batch(self):
    """
    Single-call extraction: the batch counterpart of main_loop().

    Sends one prompt covering every target field to Ollama's /api/generate
    endpoint, parses the model's answer as JSON and records each field
    through add_response_to_json().

    Returns:
        ``self`` once every target field has been recorded, or the result of
        main_loop() when the model's answer is not a JSON object and the
        sequential fallback is used.

    Raises:
        ConnectionError: Ollama could not be reached.
        RuntimeError: Ollama answered with an HTTP error status.
    """
    prompt = self.build_batch_prompt()
    ollama_host = os.getenv("OLLAMA_HOST", "http://localhost:11434").rstrip("/")
    ollama_url = f"{ollama_host}/api/generate"

    payload = {
        "model": "mistral",
        "prompt": prompt,
        "stream": False,
    }

    try:
        response = requests.post(ollama_url, json=payload)
        response.raise_for_status()
    except requests.exceptions.ConnectionError as e:
        # Chain the cause so the underlying network error stays in the traceback.
        raise ConnectionError(
            f"Could not connect to Ollama at {ollama_url}. "
            "Please ensure Ollama is running and accessible."
        ) from e
    except requests.exceptions.HTTPError as e:
        raise RuntimeError(f"Ollama returned an error: {e}") from e

    raw = response.json()["response"].strip()

    # Strip markdown code fences if the model wrapped the output.
    if raw.startswith("```"):
        # raw starts with the fence, so element 0 is "" and element 1 is the fenced block.
        fenced = raw.split("```")[1]
        # Skip an optional leading language tag ("json", "JSON", "js", ...).
        # str.lstrip("json") strips a *set of characters*, not a prefix, and
        # missed upper-case tags; skipping the alphabetic run is explicit and
        # never eats into an object payload, which starts with "{".
        payload_start = 0
        while payload_start < len(fenced) and fenced[payload_start].isalpha():
            payload_start += 1
        raw = fenced[payload_start:].strip()

    try:
        extracted = json.loads(raw)
    except json.JSONDecodeError as e:
        print(
            f"\t[WARN] main_loop_batch: LLM did not return valid JSON ({e}). "
            "Falling back to sequential main_loop()."
        )
        return self.main_loop()

    # Valid JSON that is not an object (list, string, number, null) has no
    # per-field values to read; previously this crashed on extracted.get().
    if not isinstance(extracted, dict):
        print(
            "\t[WARN] main_loop_batch: LLM returned JSON that is not an object "
            f"({type(extracted).__name__}). Falling back to sequential main_loop()."
        )
        return self.main_loop()

    # Populate self._json using the existing add_response_to_json logic.
    for field in self._target_fields:
        value = extracted.get(field)
        if value is None:
            # Missing or null field: record the "-1" placeholder.
            self.add_response_to_json(field, "-1")
        elif isinstance(value, list):
            # Multiple values are flattened into one "; "-separated string.
            self.add_response_to_json(field, "; ".join(str(v) for v in value))
        else:
            self.add_response_to_json(field, str(value))

    print("----------------------------------")
    print("\t[LOG] Resulting JSON created from the input text (batch mode):")
    print(json.dumps(self._json, indent=2))
    print("--------- extracted data ---------")

    return self

def get_data(self):
    """Return the extracted-field data held in ``self._json`` (the same object, not a copy)."""
    extracted = self._json
    return extracted
92 changes: 68 additions & 24 deletions tests/test_forms.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,69 @@
from unittest.mock import patch


def test_submit_form(client):
    """Create a template, then fill a form against it, with Controller mocked in both routes.

    The leftover ``pass`` placeholder and the commented-out draft (which
    targeted different endpoints and payloads) are removed; only the live
    test remains.
    """
    # Step 1: Create a template first
    with patch("api.routes.templates.Controller") as MockController:
        MockController.return_value.create_template.return_value = "src/inputs/file_template.pdf"

        template_payload = {
            "name": "Test Template",
            "pdf_path": "src/inputs/file.pdf",
            "fields": {
                "reporting_officer": "string",
                "incident_location": "string",
                "amount_of_victims": "string",
                "victim_name_s": "string",
                "assisting_officer": "string",
            },
        }
        template_res = client.post("/templates/create", json=template_payload)
        assert template_res.status_code == 200
        template_id = template_res.json()["id"]

    # Step 2: Fill form using that template
    with patch("api.routes.forms.Controller") as MockController:
        MockController.return_value.fill_form.return_value = "src/outputs/filled_test.pdf"

        form_payload = {
            "template_id": template_id,
            "input_text": (
                "Officer Voldemort here, at an incident reported at 456 Oak Street. "
                "Two victims, Mark Smith and Jane Doe. "
                "Handed off to Sheriff's Deputy Alvarez. End of transmission."
            ),
        }

        # The request must be made while the Controller patch is active.
        response = client.post("/forms/fill", json=form_payload)

    assert response.status_code == 200
    data = response.json()
    assert data["template_id"] == template_id
    assert data["output_pdf_path"] == "src/outputs/filled_test.pdf"
    assert data["input_text"] == form_payload["input_text"]
    assert "id" in data


def test_submit_form_invalid_template(client):
    """Posting to /forms/fill with template_id 99999 must answer 404."""
    request_body = {
        "template_id": 99999,
        "input_text": "Some random incident text here.",
    }

    with patch("api.routes.forms.Controller") as controller_cls:
        # Controller is stubbed for the duration of the request.
        controller_cls.return_value.fill_form.return_value = "src/outputs/filled_test.pdf"
        result = client.post("/forms/fill", json=request_body)

    assert result.status_code == 404


def test_submit_form_missing_input_text(client):
    """Posting to /forms/fill with no ``input_text`` key must answer 422."""
    # Payload deliberately carries only the template id.
    incomplete_body = {"template_id": 1}

    with patch("api.routes.forms.Controller") as controller_cls:
        controller_cls.return_value.fill_form.return_value = "src/outputs/filled_test.pdf"
        result = client.post("/forms/fill", json=incomplete_body)

    assert result.status_code == 422
Loading