# DIN 276 cost groups matching

In [None]:
import os
import json
import csv
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, AfterValidator
from typing import Annotated, List

# 1) Load your API key
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment.")

client = OpenAI(api_key=api_key)

In [None]:
# 2) Pydantic model to enforce structured output: we expect a list of DIN 276 codes
def validate_three_digit_numeric(v: List[str]) -> List[str]:
    """Check that every entry of *v* is a three-digit ASCII numeric string.

    Args:
        v: Candidate cost group codes, e.g. ``["310", "331"]``.

    Returns:
        The same list object, unchanged, when every entry is valid.

    Raises:
        ValueError: If any entry is not exactly three ASCII digits (0-9).
    """
    for item in v:
        # str.isdigit() alone also accepts non-ASCII digits (superscripts,
        # Arabic-Indic numerals, ...), so require ASCII explicitly.
        if not (len(item) == 3 and item.isascii() and item.isdigit()):
            raise ValueError(f"'{item}' is not a three-digit number.")
    return v

# Use Annotated with AfterValidator to enforce the constraint on cost_group_codes:
# a list of strings on which validate_three_digit_numeric is run as an "after"
# validator, so an invalid code surfaces as a validation failure of the model.
CostGroupCodes = Annotated[List[str], AfterValidator(validate_three_digit_numeric)]

# Structured-output model for one classification answer. It is used both to
# generate the JSON schema sent with each request and to validate the returned
# JSON afterwards.
# NOTE(review): documented with `#` comments on purpose; a class docstring would
# presumably be emitted as the schema "description" by model_json_schema() and
# thereby change the request payload; confirm before converting to a docstring.
class Din276CostGroupResponse(BaseModel):
    # The DIN 276 cost group codes, validated through CostGroupCodes.
    cost_group_codes: CostGroupCodes


# 3) Load the DIN 276 cost group table from CSV.
# Each row becomes a dict with the keys "nr", "cg_name" and "notes", taken from
# the CSV columns "Nr", "Cost group (CG)" and "Notes" respectively.
csv_file_path = "../../data/pipeline2/csv/din276_concrete_sub.csv"
din276_costgroups = []
with open(csv_file_path, "r", encoding="utf-8") as csvfile:
    reader = csv.DictReader(csvfile)
    din276_costgroups.extend(
        {
            "nr": row["Nr"],
            "cg_name": row["Cost group (CG)"],
            "notes": row["Notes"],
        }
        for row in reader
    )

# 4) Load the EPD records.
# The file is JSON Lines: every non-blank line is parsed as one JSON document.
epd_file_path = "../../data/pipeline2/json/edited_epds.jsonl"
epds = []
with open(epd_file_path, "r", encoding="utf-8") as epd_file:
    epds.extend(json.loads(raw_line) for raw_line in epd_file if raw_line.strip())

# To try the pipeline on a small sample, keep only the first N EPDs:
# epds = epds[:3]

# 5) Define a function to extract EPD details according to the provided paths
def _first_entry_value(entries):
    """Return the ``"value"`` of the first dict in *entries*, or ``""``.

    Tolerates a missing key (``None``), an empty list, or a non-list value
    instead of raising ``IndexError`` / ``AttributeError``.
    """
    if isinstance(entries, list) and entries and isinstance(entries[0], dict):
        return entries[0].get("value", "")
    return ""


def _property_name(raw_name) -> str:
    """Return a material-property name as text, or ``""`` when unusable.

    Accepts either a plain string or a list of ``{"value": ...}`` entries.
    """
    if isinstance(raw_name, str):
        return raw_name
    text = _first_entry_value(raw_name)
    return text if isinstance(text, str) else ""


def extract_epd_details(epd_item: dict) -> dict:
    """Extract the EPD fields used to build the classification prompt.

    Args:
        epd_item: One parsed EPD record; the fields are read from its
            ``"document"`` entry (``processInformation`` and ``exchanges``).

    Returns:
        A dict with the keys ``"Product"``, ``"Category"``, ``"Description"``,
        ``"Applicability"``, ``"Compressive Strength"``, ``"Bulk Density"``
        and ``"Flow Property"``. Fields that cannot be found are empty
        strings; ``"Category"`` is a fixed string and is not read from the EPD.
    """
    doc = epd_item.get("document", {})
    process_info = doc.get("processInformation", {})
    technology = process_info.get("technology", {})

    # Product path: processInformation.dataSetInformation.name.baseName[0].value
    product = _first_entry_value(
        process_info.get("dataSetInformation", {}).get("name", {}).get("baseName")
    )

    # Category (static for demonstration, or can be read from the EPD if needed)
    category = "Mineral building products > Mortar and Concrete > Ready mixed concrete"

    # Description path: technology.technologyDescriptionAndIncludedProcesses[0].value
    description = _first_entry_value(
        technology.get("technologyDescriptionAndIncludedProcesses")
    )

    # Applicability path: technology.technologicalApplicability[0].value
    applicability = _first_entry_value(technology.get("technologicalApplicability"))

    compressive_strength = ""
    bulk_density = ""
    flow_property = ""

    # Only the first exchange is inspected for material and flow properties.
    exchanges = doc.get("exchanges", {}).get("exchange", [])
    if exchanges:
        first_exchange = exchanges[0]

        # Compressive Strength & Bulk Density: matched by substring on the
        # lower-cased property name; a later match overwrites an earlier one.
        for prop in first_exchange.get("materialProperties", []):
            # A property without a usable name yields "" and is skipped,
            # instead of crashing on `.lower()` of a non-string default.
            prop_name = _property_name(prop.get("name")).lower()
            prop_value = prop.get("value", "")
            prop_unit = prop.get("unit", "")
            if "compressive" in prop_name:
                compressive_strength = f"{prop_value} {prop_unit}"
            elif "density" in prop_name:
                bulk_density = f"{prop_value} {prop_unit}"

        # Flow Property: only the first flow property is reported.
        flow_props = first_exchange.get("flowProperties", [])
        if flow_props:
            fp = flow_props[0]
            fp_name = _first_entry_value(fp.get("name"))
            fp_mean_val = fp.get("meanValue", "")
            fp_ref_unit = fp.get("referenceUnit", "")
            flow_property = f"{fp_mean_val} {fp_ref_unit} ({fp_name})"

    # Return a dictionary of all extracted data
    return {
        "Product": product,
        "Category": category,
        "Description": description,
        "Applicability": applicability,
        "Compressive Strength": compressive_strength,
        "Bulk Density": bulk_density,
        "Flow Property": flow_property
    }

# 6) Build our base system prompt
# This text is sent as the "system" message of every batch request, so it is
# identical for all EPDs. The `"""\` opener suppresses the leading newline; the
# string ends with a single trailing newline.
system_prompt = """\
You are an expert in construction cost classification with in-depth knowledge of DIN 276 cost groups and Environmental Product Declarations (EPDs). In the DIN 276 classification, cost groups are organized hierarchically into parent and child groups. The parent groups are identified by the codes 310, 320, 330, 340, 350, 360, and 370. If you list a parent group, you must also include at least one corresponding child group to ensure a thorough classification. Your task is to thoroughly analyze the provided EPD details, evaluate both primary and secondary cost factors, and determine all applicable DIN 276 cost group codes by listing both parent and child groups where relevant.
"""

# 7) Utility function to build each request
def build_request(i, epd_item):
    """Build one Batch API request (a JSON-serialisable dict) for a single EPD.

    Args:
        i: Identifier of the request; written to ``custom_id`` as a string.
        epd_item: Parsed EPD record, passed to ``extract_epd_details``.

    Returns:
        A dict with ``custom_id``, ``method``, ``url`` and ``body`` describing
        a chat-completions call with a JSON-schema response format.

    Side effects:
        Prints the assembled user prompt for debugging.
    """
    details = extract_epd_details(epd_item)

    # Emit the product fields in a fixed order, skipping any that are empty.
    field_order = (
        "Product",
        "Category",
        "Description",
        "Applicability",
        "Compressive Strength",
        "Bulk Density",
        "Flow Property",
    )
    product_lines = []
    for key in field_order:
        value = details.get(key, "")
        if value:
            product_lines.append(f"- {key}: {value}")

    # One bullet per cost group loaded from the CSV: "(nr) name: notes".
    cost_group_lines = [
        f"- ({group['nr']}) {group['cg_name']}: {group['notes']}"
        for group in din276_costgroups
    ]
    cost_groups_str = "\n".join(cost_group_lines)

    # Assemble the user prompt: product details, cost group list, then the task.
    final_prompt = "".join([
        "Product Details:\n",
        "\n".join(product_lines),
        "\n\nDIN 276 Cost Groups:\n",
        cost_groups_str,
        "\n\nWhich cost group codes are applicable?",
        "\n\nPlease respond in valid JSON format exactly as specified, "
        "with only a key 'cost_group_codes' whose value is a list of three-digit numeric codes.",
    ])

    # Echo the prompt so it can be inspected in the notebook output.
    print("\n============ Final Prompt ============")
    print(final_prompt)
    print("======================================\n")

    # JSON schema derived from the Pydantic model, with extra keys disallowed.
    schema = Din276CostGroupResponse.model_json_schema()
    schema["additionalProperties"] = False

    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "Din276CostGroupResponse",
            "schema": schema,
            "strict": True,
        },
    }
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": final_prompt},
    ]
    body = {
        "model": "o3-mini",
        "reasoning_effort": "high",
        "messages": messages,
        "response_format": response_format,
    }

    return {
        "custom_id": f"{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": body,
    }

# 8) Write the batch input file: one JSON-encoded request per EPD, one per line.
# Requests are numbered from 1; that number becomes each request's custom_id.
input_jsonl_file = "../../data/pipeline2/json/openai/batch_input.jsonl"
with open(input_jsonl_file, "w", encoding="utf-8") as out_file:
    for i, epd in enumerate(epds, start=1):
        request_obj = build_request(i, epd)
        print(json.dumps(request_obj), file=out_file)

print(f"Created '{input_jsonl_file}' with {len(epds)} lines for batch processing.")


In [None]:
# 9) Upload the input file to OpenAI for batch processing.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns, instead of being left open for the life of the kernel.
with open(input_jsonl_file, "rb") as batch_file_handle:
    batch_input_file = client.files.create(
        file=batch_file_handle,
        purpose="batch"
    )
print("Uploaded file:", batch_input_file)

# 10) Create the batch job (with a 24-hour completion window)
batch = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "EPD DIN 276 cost group classification (2025-04-05_1728)"}
)
print("Created Batch:", batch)

# 11) Save the batch id for later status checks
batch_id = batch.id
print(f"Batch submitted with id: {batch_id}.")
print("You can check the status later with:")
print(f"client.batches.retrieve('{batch_id}')")

In [None]:
# Look up a batch job and, once it has completed, download its output file.
# Replace with your actual batch id if needed
batch_id = "batch_68178796ad60819093950a4807f772f9"  # for example
batch_status = client.batches.retrieve(batch_id)
print("Current batch status:", batch_status.status)

if batch_status.status == "completed":
    output_file_id = batch_status.output_file_id
    if not output_file_id:
        print("No output file available yet.")
    else:
        # Store the results next to the inputs, named after the batch id.
        output_jsonl_file = f"../../data/pipeline2/json/openai/{batch_id}_output.jsonl"
        file_response = client.files.content(output_file_id)
        with open(output_jsonl_file, "wb") as f:
            f.write(file_response.content)
        print(f"Batch results saved to {output_jsonl_file}")

In [None]:
# Tag the request file with the batch id so it can be matched to its output.
# Directory and file name of the current input file.
dir_name = os.path.dirname(input_jsonl_file)
file_name = os.path.basename(input_jsonl_file)

# New path: same directory, file name prefixed with the batch id.
new_file_name = f"{batch_id}_input.jsonl"
new_input_jsonl_file = os.path.join(dir_name, new_file_name)

# Move the file to its new name on disk.
os.rename(input_jsonl_file, new_input_jsonl_file)

print(f"Input file renamed to: {new_input_jsonl_file}")

In [None]:
import pprint

# Pretty-print every batch job returned by the API.
# The loop variable is deliberately not named `batch`, so that the `batch`
# object created when the job was submitted is not overwritten by this cell.
for listed_batch in client.batches.list():
    pprint.pprint(listed_batch.model_dump())


In [None]:
# Download a batch error file by id and show its text content.
error_file_id = "file-5iENHkNczXuUZTTNXgP7qu"
error_file = client.files.content(file_id=error_file_id)
print(error_file.text)


In [None]:
# Bare expression: in a notebook cell this displays the current value of the path.
new_input_jsonl_file

In [None]:
import jsonlines
from pydantic import ValidationError

# List to hold validated outputs
validated_outputs = []

# NOTE(review): despite its name, this variable is pointed at a batch *output*
# file here; the name is kept so that code relying on it keeps working.
new_input_jsonl_file = "../../data/pipeline2/json/openai/batch_67d5a00f7f2c8190a0e2cdc3cf04382b_output.jsonl"

# Read the batch output line by line; each record is one request's result.
with jsonlines.open(new_input_jsonl_file, mode='r') as reader:
    for record in reader:
        custom_id = record.get("custom_id")
        # For debugging: print the keys in the record
        print("Record custom_id:", custom_id, "keys:", record.keys())

        try:
            # Extract the LLM answer from the nested structure
            answer_content = record["response"]["body"]["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as e:
            # TypeError covers a null level in the path (e.g. "response" being
            # None), which would otherwise abort the whole loop.
            print("Error extracting answer for record with custom_id:", custom_id, "Error:", repr(e))
            continue

        if not isinstance(answer_content, str):
            # No text to validate (content is null or not a string); skip it.
            print("No text content for record with custom_id:", custom_id)
            continue

        try:
            # Validate the response using the Pydantic model
            validated = Din276CostGroupResponse.model_validate_json(answer_content)
        except ValidationError as e:
            print("Validation error for record custom_id:", custom_id, "Error:", e)
        else:
            validated_outputs.append(validated)
            print("Valid output for custom_id", custom_id, ":", validated)

# validated_outputs now holds all successfully validated responses
