# Invalid Data Test
Test the QA system with intentionally wrong data.

Test Criteria:
The test data has 6 intentionally wrong Yes/No values:
- nuclearEnergyRelatedActivitiesSection426 to Section428 (3 values, inverted from the correct values)
- fossilGasRelatedActivitiesSection429 to Section431 (3 values, inverted from the correct values)

Expected Result: All 6 should be rejected by the QA system (mismatches between the submitted values and the previous answers).
Success: If QaRejected >= 6, the comparison logic is working correctly.

In [None]:
import json
from pathlib import Path

import pandas as pd
import requests

In [None]:
# Load the reference dataset (named "correct" here) and the deliberately
# invalid variant. Both are plain JSON files read as UTF-8.
reference_file = Path("..") / "data" / "jsons" / "enbw.json"
correct_data = json.loads(reference_file.read_text(encoding="utf-8"))

invalid_file = Path("..") / "data" / "jsons_invalid" / "test_invalid_yesno.json"
invalid_data = json.loads(invalid_file.read_text(encoding="utf-8"))

print("Data files loaded")

In [None]:
from dataland_qa_lab.dataland.provide_test_data import get_company_id, upload_dataset, upload_pdf
from dataland_qa_lab.utils import config

# Client object passed to every Dataland helper call in this cell; it is
# taken from the project configuration.
dataland_client = config.get_config().dataland_client
# Path handed to upload_pdf as pdf_path (relative to the notebook's working directory).
pdf_path = Path("../data/pdfs")

# Upload the PDF for company "enbw" before the dataset itself.
# NOTE(review): pdf_id looks like a SHA-256 hex digest of the PDF — confirm
# against upload_pdf's contract.
upload_pdf(
    pdf_path=pdf_path,
    pdf_id="9c0a555a29683aedd2cd50ff7e837181a7fbb2d1c567d336897e2356fc17a595",
    company="enbw",
    dataland_client=dataland_client,
)

# Look up the company ID for "enbw" and write it into the invalid payload,
# replacing whatever "companyId" the JSON file contained.
company_id = get_company_id(company="enbw", dataland_client=dataland_client)
invalid_data["companyId"] = company_id

# upload_dataset receives the dataset as a JSON string, so serialize the
# patched payload first.
json_str = json.dumps(invalid_data, indent=4)

# Upload the invalid dataset; the return value is stored as dataset_id and is
# appended to the review-dataset URL by the next cell.
dataset_id = upload_dataset(
    company_id=company_id,
    json_str=json_str,
    dataland_client=dataland_client,
    reporting_period=invalid_data["reportingPeriod"],
)

print(f"Dataset uploaded: {dataset_id}")

In [None]:
# Ask the QA service on localhost:8000 to review the dataset uploaded above.
api_url = f"http://localhost:8000/data-point-flow/review-dataset/{dataset_id}"
payload = {"ai_model": "gpt-5", "use_ocr": True, "override": False}
response = requests.post(api_url, json=payload)
# Fail loudly on an HTTP error instead of parsing an error body and counting
# it as if it were review results.
response.raise_for_status()
results = response.json()

# Keep only the dict-valued entries of the response and reduce each one to
# the two fields the evaluation below needs.
data = [
    {"data_point_type": value.get("data_point_type"), "qa_status": value.get("qa_status")}
    for value in results.values()
    if isinstance(value, dict)
]

# Explicit columns keep "qa_status" present even when no entries came back,
# so the status count reports 0 rejections rather than raising a KeyError.
df = pd.DataFrame(data, columns=["data_point_type", "qa_status"])
print(f"Total data points: {len(df)}")

In [None]:
# Tally how many data points ended up in each QA status; a status that does
# not occur counts as 0.
status_counts = df["qa_status"].value_counts()

rejected = status_counts.get("QaRejected", 0)
accepted = status_counts.get("QaAccepted", 0)
not_attempted = status_counts.get("QaNotAttempted", 0)

print("Test Results:")
for label, count in (
    ("QaRejected", rejected),
    ("QaAccepted", accepted),
    ("QaNotAttempted", not_attempted),
):
    print(f"{label}: {count}")
print()

# Number of rejections required for the test to count as a success.
expected_rejected = 6
if rejected < expected_rejected:
    print(f"Issue: Only {rejected}/{expected_rejected} wrong values were rejected")
    print("The comparison logic may have failed for some values")
else:
    print(f"Success: All {expected_rejected} wrong values were rejected")
    print("The comparison logic correctly detects mismatches")