# Invalid Data Test
Test the QA system with intentionally wrong data.

## Test Criteria

The test data (`test_invalid_yesno.json`) has 6 intentionally wrong Yes/No values:
- `nuclearEnergyRelatedActivitiesSection426`, `…Section427`, `…Section428` (3 values)
- `fossilGasRelatedActivitiesSection429`, `…Section430`, `…Section431` (3 values)

These values are set to be **the opposite of what the AI extracts from the actual PDF**, ensuring all 6 will be rejected regardless of any errors in `enbw.json`.

**Expected Result:** All 6 should be rejected by the QA system (mismatches between submitted and AI-predicted values)

**Note:** Uses a unique reporting period to ensure a new dataset is created each run.

In [5]:
import json
from pathlib import Path

import pandas as pd
import requests

In [6]:
# Reference dataset for the company under test.
reference_file = Path("..", "data", "jsons", "enbw.json")
correct_data = json.loads(reference_file.read_text(encoding="utf-8"))

# Dataset whose Yes/No values were deliberately set wrong for this test.
invalid_file = Path("..", "data", "jsons_invalid", "test_invalid_yesno.json")
invalid_data = json.loads(invalid_file.read_text(encoding="utf-8"))

print("Data files loaded")

Data files loaded


In [7]:
import time

from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError

from dataland_qa_lab.database import database_engine
from dataland_qa_lab.dataland.provide_test_data import get_company_id, upload_dataset, upload_pdf
from dataland_qa_lab.utils import config

dataland_client = config.get_config().dataland_client
pdf_path = Path("../data/pdfs")
pdf_id = "9c0a555a29683aedd2cd50ff7e837181a7fbb2d1c567d336897e2356fc17a595"
company_name = "enbw"

# Remove cached_documents rows for this PDF so the review cannot reuse a cached
# OCR result. A database error is reported but does not stop the notebook.
db_session = database_engine.SessionLocal()
try:
    deletion = db_session.execute(
        text("DELETE FROM cached_documents WHERE file_reference = :ref"),
        {"ref": pdf_id},
    )
    db_session.commit()
    print(f"Cleared {deletion.rowcount} cached OCR entries")
except SQLAlchemyError as exc:
    db_session.rollback()
    print(f"Could not clear cache: {exc}")
finally:
    db_session.close()

upload_pdf(pdf_path=pdf_path, pdf_id=pdf_id, company=company_name, dataland_client=dataland_client)

company_id = get_company_id(company=company_name, dataland_client=dataland_client)

# A timestamp-based reporting period differs on every run, which forces a new
# dataset to be created instead of reusing an earlier upload.
unique_period = f"test-invalid-{int(time.time())}"

invalid_data.update({"companyId": company_id, "reportingPeriod": unique_period})
json_str = json.dumps(invalid_data, indent=4)

dataset_id = upload_dataset(
    company_id=company_id,
    json_str=json_str,
    dataland_client=dataland_client,
    reporting_period=unique_period,
)

print(f"Dataset uploaded: {dataset_id}")

Cleared 0 cached OCR entries
Dataset uploaded: 79889521-2fe7-419e-b818-417b25e3c95d


In [8]:
import textwrap

# Ask the locally running QA service to review the dataset uploaded above.
api_url = "http://localhost:8000/data-point-flow/review-dataset/" + dataset_id
payload = {"ai_model": "gpt-5", "use_ocr": True, "override": True}
response = requests.post(api_url, json=payload, timeout=60)
response.raise_for_status()
results = response.json()

# Submitted values from invalid_data, used for the side-by-side comparison
submitted_values = invalid_data["data"]["general"]["general"]

# Fixing the column set up front keeps the frame well-formed (and the column
# selections below valid) even when the response contains no dict entries.
report_columns = ["field", "submitted", "predicted", "status", "explanation", "reasoning"]

# Build one report row per reviewed data point
data = []
for key, value in results.items():
    if not isinstance(value, dict):
        continue  # only dict entries are treated as per-data-point results

    field_name = key.replace("extendedEnumYesNo", "")
    short_name = field_name.replace("RelatedActivities", "")

    # Lower-case the first letter to obtain the camelCase key of the submitted JSON.
    # Slicing instead of indexing avoids an IndexError on an empty name.
    submitted_key = field_name[:1].lower() + field_name[1:]
    # `or {}` also covers a field that is present but stored as JSON null.
    submitted = (submitted_values.get(submitted_key) or {}).get("value", "?")

    predicted = value.get("predicted_answer")
    status = value.get("qa_status")

    if status == "QaRejected":
        explanation = f"Submitted '{submitted}' but PDF shows '{predicted}' → MISMATCH detected"
    elif status == "QaAccepted":
        explanation = f"Submitted '{submitted}' matches PDF value '{predicted}' → No mismatch"
    else:
        # Any other (or missing) status is neither an accept nor a reject;
        # do not report it as a match.
        explanation = f"Submitted '{submitted}', PDF shows '{predicted}' → No verdict (status: {status})"

    # The API may return an explicit null for the reasoning; textwrap needs a str.
    reasoning = value.get("reasoning")
    reasoning = "" if reasoning is None else str(reasoning)

    data.append(
        {
            "field": short_name,
            "submitted": submitted,
            "predicted": predicted,
            "status": status,
            "explanation": explanation,
            "reasoning": reasoning,
        }
    )

df = pd.DataFrame(data, columns=report_columns)

# Display as table
print(f"Validated {len(df)} data points\n")
print(df[["field", "submitted", "predicted", "status", "explanation"]].to_string(index=False))

# Show full reasoning separately
print("\n" + "=" * 80)
print("DETAILED REASONING")
print("=" * 80)
for _, row in df.iterrows():
    print(f"\n{row['field']}:")
    wrapped = textwrap.fill(row["reasoning"], width=80, initial_indent="  ", subsequent_indent="  ")
    print(wrapped)

ConnectionError: HTTPConnectionPool(host='localhost', port=8000): Max retries exceeded with url: /data-point-flow/review-dataset/79889521-2fe7-419e-b818-417b25e3c95d (Caused by NewConnectionError("HTTPConnection(host='localhost', port=8000): Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it"))

In [None]:
# Tally the QA verdicts across all reviewed data points.
verdict_counts = df["status"].value_counts()
rejected = verdict_counts.get("QaRejected", 0)
accepted = verdict_counts.get("QaAccepted", 0)

print("Test Results:", f"QaRejected: {rejected}", f"QaAccepted: {accepted}", sep="\n")

# The test data contains six deliberately wrong values; each should be rejected.
expected_rejected = 6
outcome = (
    f"\nSUCCESS: {rejected}/{expected_rejected} invalid values were rejected"
    if rejected >= expected_rejected
    else f"\nFAILED: Only {rejected}/{expected_rejected} invalid values were rejected"
)
print(outcome)

Test Results:
QaRejected: 6
QaAccepted: 0

SUCCESS: 6/6 invalid values were rejected
