# Unmatched Regex

In [None]:
import os
import json
import pandas as pd
import datetime
from typing import List, Dict
import torch
import gc
import re

from sentence_transformers import SentenceTransformer
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.embeddings.base import Embeddings


##################################################
# Custom SentenceTransformer Embeddings
##################################################
class CustomSentenceTransformerEmbeddings(Embeddings):
    """
    LangChain ``Embeddings`` adapter around a SentenceTransformer model.

    Lets a SentenceTransformer model act as the embedding function of a
    LangChain FAISS store. A few known models are loaded with model-specific
    arguments; every other model name uses a plain default configuration.
    """

    def __init__(self, embedding_model_name: str):
        """Store the model name and load the model immediately."""
        self.embedding_model_name = embedding_model_name
        self.model = self._initialize_model()

    def _initialize_model(self) -> SentenceTransformer:
        """
        Instantiate the SentenceTransformer for ``self.embedding_model_name``.

        Returns:
            The loaded model.

        Raises:
            Exception: Whatever SentenceTransformer raises while loading; the
                error is printed and re-raised unchanged.
        """
        # Use the GPU when one is visible; otherwise fall back to the CPU
        # instead of failing on a hard-coded "cuda" device.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        model_configs = {
            "jinaai/jina-embeddings-v3": {
                "trust_remote_code": True,
                "revision": "main",
                "device": device,
                "model_kwargs": {"use_flash_attn": False},
            },
            "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5": {
                "local_files_only": True,
                "device": device,
                "model_kwargs": {"attn_implementation": "eager"},
            },
            "Alibaba-NLP/gte-large-en-v1.5": {
                "trust_remote_code": True,
                "revision": "main",
                "device": device,
                "model_kwargs": {"attn_implementation": "eager"},
            },
        }
        config = model_configs.get(
            self.embedding_model_name,
            {"device": device, "model_kwargs": {}},  # default fallback
        )
        try:
            model = SentenceTransformer(self.embedding_model_name, **config)
            print(f"Initialized SentenceTransformer model: {self.embedding_model_name}")
            return model
        except Exception as e:
            print(f"Error initializing model {self.embedding_model_name}: {e}")
            raise

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a single query string as a list of floats."""
        return self.model.encode(text).tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding (list of floats) per input text."""
        return self.model.encode(texts).tolist()

    def unload_model(self):
        """Drop the model reference and release cached GPU memory."""
        # Compare against None explicitly: the truthiness of a model object
        # is not a reliable "is loaded" signal.
        if getattr(self, "model", None) is None:
            print("[DEBUG] Embedding model was already None or not set.")
            return

        self.model = None
        # Collect first so unreachable tensors are actually freed, then hand
        # the now-unused cached blocks back to the CUDA driver.
        gc.collect()
        torch.cuda.empty_cache()
        print(f"Unloaded embedding model: {self.embedding_model_name}")


##################################################
# Build Context and Save as JSON
##################################################
def build_context_and_save(
    df: pd.DataFrame,
    vectorstore: "FAISS",
    k_value: int = 5,
    output_file: str = "../../data/pipeline2/json/context.json",
    score_margin: float = 0.1,
):
    """
    Retrieve candidate categories for every product row and save them as JSON.

    For each row in the DataFrame:
      1) Build a query from the row (Product Name plus, when present,
         Classification, Technology Description, Applicability and the
         flow-property fields).
      2) Call ``similarity_search_with_score`` to get (Document, score) tuples.
      3) Compute transformed_score = 1 - score so that higher is better.
         NOTE(review): this assumes the store returns a distance where lower
         is better - confirm for the index in use.
      4) Sort the results by transformed_score descending.
      5) Keep only the documents whose transformed_score is within
         ``score_margin`` of the best score.
      6) Build one JSON object per product with all qualifying
         ``category_score`` entries.
    All objects are then written to ``output_file`` as a single JSON list.

    Args:
        df: Product rows; missing columns and NaN cells become "N/A".
        vectorstore: Object exposing ``similarity_search_with_score(query, k=...)``.
        k_value: Number of candidates to retrieve per product.
        output_file: Destination path; its parent directory is created if needed.
        score_margin: Maximum allowed gap to the best transformed score.
    """

    def _clean(value):
        """Map a missing (NaN/None) cell value to the "N/A" placeholder."""
        return "N/A" if pd.isna(value) else value

    all_payloads = []

    # os.makedirs("") raises, so only create a directory when the path has one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for _, row in df.iterrows():
        # Normalise every field, so NaN never reaches the query or the JSON.
        product_name = _clean(row.get("Product Name", "N/A"))
        classification = _clean(row.get("Classification", "N/A"))
        technology_description = _clean(row.get("Technology Description", "N/A"))
        technological_applicability = _clean(
            row.get("Technological Applicability", "N/A")
        )
        flow_prop_name = _clean(row.get("Flow Property Name", "N/A"))
        flow_prop_mean_value = _clean(row.get("Flow Property Mean Value", "N/A"))
        flow_prop_ref_unit = _clean(row.get("Flow Property Reference Unit", "N/A"))
        uuid_val = _clean(row.get("UUID", "N/A"))

        # Build the query string; optional fields are added only when present.
        optional_fields = (
            ("Classification", classification),
            ("Description", technology_description),
            ("Applicability", technological_applicability),
            ("Flow Property Name", flow_prop_name),
            ("Flow Property Mean Value", flow_prop_mean_value),
            ("Flow Property Reference Unit", flow_prop_ref_unit),
        )
        query_str = f"Product: {product_name}\n" + "".join(
            f"{label}: {value}\n"
            for label, value in optional_fields
            if value != "N/A"
        )

        print("=== Query ===")
        print(query_str)

        docs_with_scores = vectorstore.similarity_search_with_score(
            query_str, k=k_value
        )

        # Compute transformed scores (1 - original_score) so that higher is better
        transformed_docs = [
            (doc, 1 - orig_score) for doc, orig_score in docs_with_scores
        ]

        print("=== Retrieved Documents ===")
        for doc, transformed_score in transformed_docs:
            print(f"[{transformed_score:.4f}] {doc.page_content}")
        print("-" * 40)

        if not transformed_docs:
            print("No documents retrieved for this query.\n")
            continue

        # Sort by transformed score descending (highest first)
        transformed_docs.sort(key=lambda x: x[1], reverse=True)
        best_score = transformed_docs[0][1]

        # Dynamic filtering: include docs within score_margin of the best score
        filtered_docs = [
            (doc, score)
            for doc, score in transformed_docs
            if (best_score - score) <= score_margin
        ]

        if not filtered_docs:
            # Only possible with a negative margin or NaN scores: still keep
            # the best document so every product has at least one candidate.
            filtered_docs = [transformed_docs[0]]
            print(
                "No documents passed the dynamic filtering; using the top result only.\n"
            )

        # Group filtered documents for the current product into one payload
        all_payloads.append(
            {
                "Product": product_name,
                "UUID": uuid_val,
                "Description": technology_description,
                "Applicability": technological_applicability,
                "Flow Property Name": flow_prop_name,
                "Flow Property Mean Value": flow_prop_mean_value,
                "Flow Property Reference Unit": flow_prop_ref_unit,
                "category_score": [
                    {"category": doc.page_content, "score": float(score)}
                    for doc, score in filtered_docs
                ],
            }
        )

    # Save all JSON payloads into one file as a list
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_payloads, f, ensure_ascii=False, indent=2)

    print(f"\nSaved context JSON to: {output_file}")


##################################################
# Main Flow
##################################################
if __name__ == "__main__":

    # 1) Read the regex-classified CSV and keep only rows labelled exactly "Concrete"
    summaries_path = "../../data/pipeline2/sql/regex_classified/filtered_epd_data02_classified_concrete05.csv"
    df = pd.read_csv(summaries_path)
    is_concrete = df["RegEx Classification"].str.match(
        r"^Concrete$", case=True, na=False
    )
    df = df[is_concrete]
    print(f"Loaded {len(df)} rows from {summaries_path}")

    # 2) Open the FAISS index with the Ollama embedding model named below
    embedding_model_name = "mxbai-embed-large:latest"
    embeddings = OllamaEmbeddings(model=embedding_model_name)
    faiss_path = "../../embeddings/pipeline2/mxbai-embed-large/faiss_index_COS_EN"
    # NOTE: allow_dangerous_deserialization opts in to unpickling the stored
    # index; only point this at index files you trust.
    vectorstore = FAISS.load_local(
        faiss_path, embeddings=embeddings, allow_dangerous_deserialization=True
    )

    # 3) Retrieve candidate categories per product and write the JSON output
    k_value = 10
    build_context_and_save(
        df=df,
        vectorstore=vectorstore,
        k_value=k_value,
        output_file="../../data/pipeline2/json/TIES_concrete.json",
    )

    # Only the SentenceTransformer wrapper holds a model that must be unloaded
    if not isinstance(embeddings, CustomSentenceTransformerEmbeddings):
        print(f"Done with {embedding_model_name}.")
    else:
        embeddings.unload_model()

Loaded 93 rows from ../../data/pipeline2/sql/regex_classified/filtered_epd_data02_classified_concrete05.csv
=== Query ===
Product: Architectural concrete coping
Classification: construction products, infrastructure and buildings
Description: The declared blocks consist of cement, aggregates mix, and additives. The manufacturing includes mixing the raw materials according to the relevant recipes of the product.
Applicability: To be applied internal and external use in all construction sectors.
Flow Property Name: Mass
Flow Property Mean Value: 0.8
Flow Property Reference Unit: kg

=== Retrieved Documents ===
[0.3824] Mineral building products > Bricks, blocks and elements > Precast concrete elements and goods
[0.3823] Mineral building products > Bricks, blocks and elements > Light concrete
[0.3735] Mineral building products > Binder > Cement
[0.3667] Mineral building products > Mortar and Concrete > Mortar (masonry)
[0.3569] Mineral building products > Concrete aggregates > Byproducts f

## Select One Category

### Real-Time API

### Batch API

In [6]:
import os
import json
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI

# 1) Read the API key (a .env file is honoured) and stop early if it is missing
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("OPENAI_API_KEY not found in environment.")

client = OpenAI(api_key=api_key)


# Structured-output schema: the model must answer with exactly one category.
# (Documented with a comment on purpose: a class docstring would be emitted
# into the generated JSON schema.)
class BestCategoryResponse(BaseModel):
    best_category: str


# 2) Load the per-product payloads with their candidate categories
with open("../../data/pipeline2/json/TIES_concrete.json", encoding="utf-8") as file:
    products = json.load(file)

# System prompt shared by every request in the batch
base_prompt = """\
You are an expert in product categorization. The following product information comes from an Environmental Product Declaration (EPD).
Review the details below and choose the best category from the list, considering both the score values and the contextual product details.

Pick only one best category.
"""

def build_request(i, product):
    """
    Build one Batch API request asking the model to pick the best category.

    Args:
        i: Position of the product in the input list; used in ``custom_id``.
        product: Product payload dict with the detail fields and a
            ``category_score`` list of ``{"category", "score"}`` entries.

    Returns:
        A dict with ``custom_id``, ``method``, ``url`` and ``body`` keys that
        can be serialized as one line of the batch-input JSONL file.
    """
    # Remove keys with "N/A"
    clean_product = {k: v for k, v in product.items() if v != "N/A"}

    # Detail fields shown to the model, in display order
    field_names = (
        "Product",
        "Description",
        "Applicability",
        "Flow Property Name",
        "Flow Property Mean Value",
        "Flow Property Reference Unit",
    )
    # Skip only missing/empty fields. A plain truthiness test would also drop
    # legitimate values such as a mean value of 0.
    context_parts = [
        f"- {name}: {clean_product[name]}"
        for name in field_names
        if clean_product.get(name) not in (None, "")
    ]

    # Category scores with two decimals
    category_scores = clean_product.get("category_score", [])
    categories_str = "\n".join(
        f"- {c['category']} (score: {c['score']:.2f})"
        for c in category_scores
    )

    # Final user prompt
    final_prompt = (
        "Product Details:\n"
        + "\n".join(context_parts)
        + "\n\nPossible Categories:\n"
        + categories_str
        + "\n\nWhich category is best? Please respond in valid json format."
    )

    # (Optional) Print the final prompt for debugging
    print("\n============ Final Prompt ============")
    print(final_prompt)
    print("======================================\n")

    # Generate the JSON Schema from the Pydantic model and forbid extra
    # properties so it can be used together with "strict": True
    schema = BestCategoryResponse.model_json_schema()
    schema["additionalProperties"] = False

    return {
        "custom_id": f"req_{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "o3-mini",
            "reasoning_effort": "low",
            "messages": [
                {"role": "system", "content": base_prompt},
                {"role": "user", "content": final_prompt},
            ],
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "BestCategoryResponse",
                    "schema": schema,
                    "strict": True,
                },
            },
        },
    }

# 3) Serialize one request per product into the batch-input JSONL file
input_jsonl_file = "../../data/pipeline2/json/openai/batch_input_TIES_concrete.jsonl"
os.makedirs(os.path.dirname(input_jsonl_file), exist_ok=True)
with open(input_jsonl_file, "w", encoding="utf-8") as out_file:
    for i, product in enumerate(products):
        req_obj = build_request(i, product)
        # print() appends the newline that terminates each JSONL record
        print(json.dumps(req_obj), file=out_file)

print(f"Created {input_jsonl_file} with {len(products)} lines.")



Product Details:
- Product: Architectural concrete coping
- Description: The declared blocks consist of cement, aggregates mix, and additives. The manufacturing includes mixing the raw materials according to the relevant recipes of the product.
- Applicability: To be applied internal and external use in all construction sectors.
- Flow Property Name: Mass
- Flow Property Mean Value: 0.8
- Flow Property Reference Unit: kg

Possible Categories:
- Mineral building products > Bricks, blocks and elements > Precast concrete elements and goods (score: 0.38)
- Mineral building products > Bricks, blocks and elements > Light concrete (score: 0.38)
- Mineral building products > Binder > Cement (score: 0.37)
- Mineral building products > Mortar and Concrete > Mortar (masonry) (score: 0.37)
- Mineral building products > Concrete aggregates > Byproducts from power plant (score: 0.36)
- Mineral building products > Mortar and Concrete > Concrete additive (score: 0.36)
- Mineral building products > Br

In [20]:
# Replace with your actual batch id
batch_id = "batch_6814cdb67bc08190902c4a7f1940df1d"
batch_status = client.batches.retrieve(batch_id)
print("Current batch status:", batch_status.status)

# Results can only be downloaded once the batch has completed
if batch_status.status == "completed":
    output_file_id = batch_status.output_file_id
    if not output_file_id:
        print("No output file available yet.")
    else:
        output_jsonl_file = f"../../data/pipeline2/json/openai/batch_output_{batch_id}.jsonl"
        file_response = client.files.content(output_file_id)
        # Written in binary mode: the response content is stored as-is
        with open(output_jsonl_file, "wb") as f:
            f.write(file_response.content)
        print(f"Batch results saved to {output_jsonl_file}")


Current batch status: completed
Batch results saved to ../../data/pipeline2/json/openai/batch_output_batch_6814cdb67bc08190902c4a7f1940df1d.jsonl


In [15]:
# Cancel the batch with the given id if needed. The call is left as the last
# expression of the cell so the notebook displays the returned Batch object.
client.batches.cancel("batch_6814d7b49e988190a46ba0036dfc9df6")

Batch(id='batch_6814d7b49e988190a46ba0036dfc9df6', completion_window='24h', created_at=1746196404, endpoint='/v1/chat/completions', input_file_id='file-BzLmhxBLucXocisTQkYvGe', object='batch', status='cancelling', cancelled_at=None, cancelling_at=1746196422, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1746282804, failed_at=None, finalizing_at=None, in_progress_at=1746196406, metadata={'description': 'The International EPD System concrete product category classification'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=93))

In [19]:
import pprint

# Print every batch job returned by the listing, including its status.
# Each Batch object is converted to a plain dict so pprint can lay it out.
for batch in list(client.batches.list()):
    pprint.pprint(batch.model_dump())


{'cancelled_at': 1746197030,
 'cancelling_at': 1746196422,
 'completed_at': None,
 'completion_window': '24h',
 'created_at': 1746196404,
 'endpoint': '/v1/chat/completions',
 'error_file_id': 'file-3maNchA6DpVmeFyssGSUi2',
 'errors': None,
 'expired_at': None,
 'expires_at': 1746282804,
 'failed_at': None,
 'finalizing_at': None,
 'id': 'batch_6814d7b49e988190a46ba0036dfc9df6',
 'in_progress_at': 1746196406,
 'input_file_id': 'file-BzLmhxBLucXocisTQkYvGe',
 'metadata': {'description': 'The International EPD System concrete product '
                             'category classification'},
 'object': 'batch',
 'output_file_id': None,
 'request_counts': {'completed': 0, 'failed': 0, 'total': 93},
 'status': 'cancelled'}
{'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1746200623,
 'completion_window': '24h',
 'created_at': 1746193846,
 'endpoint': '/v1/chat/completions',
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1746280246,
 'failed_at':

In [None]:
# Check output file: fetch the file with the given id and print its text.
# Replace the id with the output_file_id of the batch you want to inspect.
output_file_id = "file-KtupsEnFunSJxoUQcycSQ3"
output_file = client.files.content(output_file_id)
print(output_file.text)

In [None]:
# Check error file: fetch the file with the given id and print its text.
# Replace the id with the error_file_id of the batch you want to inspect.
error_file_id = "file-X2X7sMjyw2tpLoomziXjwz"
error_file = client.files.content(error_file_id)
print(error_file.text)


## Validation Step
- TODO: this step should be run through the Batch API; the code below still makes one real-time API call per product.

In [None]:
import os
import json
import datetime
from dotenv import load_dotenv
from pydantic import BaseModel
from openai import OpenAI

# 1) Load your API key, create the client
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# Fail fast with a clear message, as the other cells of this notebook do
if not api_key:
    raise ValueError("OPENAI_API_KEY not found in environment.")
client = OpenAI(api_key=api_key)

# 2) Define a Pydantic model for our "validation check" structured output.
#    (Documented with comments on purpose: a class docstring would be emitted
#    into the generated JSON schema.)
class CategoryValidation(BaseModel):
    # True when the assigned category fits the product details
    is_consistent: bool
    # Required string: a short explanation when is_consistent is False,
    # otherwise an empty string (the field type does not allow null)
    issue: str

def validate_chosen_category(product_dict: dict, chosen_category: str) -> dict:
    """
    Ask a reasoning model (o3-mini) whether a category fits a product.

    Args:
        product_dict: Product details; the keys read are "Product",
            "Description", "Applicability" and the three "Flow Property ..."
            fields. Missing keys are rendered as empty values.
        chosen_category: The category previously assigned to the product.

    Returns:
        The parsed ``CategoryValidation`` as a dict with the keys
        ``is_consistent`` and ``issue``.

    Raises:
        ValueError: If the response contains no parsed result (for example
            because the model refused to answer).
    """
    # Build a human-readable summary of the product
    field_names = (
        "Product",
        "Description",
        "Applicability",
        "Flow Property Name",
        "Flow Property Mean Value",
        "Flow Property Reference Unit",
    )
    context_str = "\n".join(
        f"- {name}: {product_dict.get(name, '')}" for name in field_names
    )

    # Create a prompt. The "consistent" example uses an empty string for
    # "issue" because the enforced response schema declares it as a string;
    # asking for null would contradict that schema.
    prompt = f"""
You are an expert in building product classification. Below are the details of a product 
(along with EPD info) and the category we have assigned to it.

Product details:
{context_str}

Assigned category:
{chosen_category}

Check whether the assigned category accurately fits the product details.
If the category is correct, respond with:
{{
  "is_consistent": true,
  "issue": ""
}}

If the category is NOT correct or if there's a mismatch, respond with:
{{
  "is_consistent": false,
  "issue": "Short explanation of what's wrong"
}}

Return your answer as valid JSON and nothing else.
"""

    # 3) Call the beta chat completions with a structured parse
    completion = client.beta.chat.completions.parse(
        model="o3-mini",            # or "gpt-4o-mini" / "o1" etc.
        reasoning_effort="low",
        messages=[
            {"role": "user", "content": prompt}
        ],
        response_format=CategoryValidation,
    )

    # 4) Extract the structured Pydantic object, convert to dict
    message = completion.choices[0].message
    if message.parsed is None:
        # Surface the problem explicitly instead of failing on None.model_dump()
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Model returned no parsed validation result (refusal: {refusal!r})."
        )

    return message.parsed.model_dump()

# ---------------------------------------
# EXAMPLE USAGE:

# Demonstration: validate one hand-written product against one category and
# print the resulting dict. The commented blocks sketch the full-dataset loop.
if __name__ == "__main__":
    # Suppose you already have 'all_responses' from your classification step:
    # all_responses = [
    #   {
    #       "best_category": "Mineral building products > Bricks, ...",
    #       "Product": "Architectural concrete coping",
    #       ...
    #   },
    #   ...
    # ]
    # For each product, retrieve the product details dictionary you originally used
    # plus the chosen category, then do:

    # Example single product dictionary for demonstration:
    example_product = {
        "Product": "Architectural concrete coping",
        "Description": "The declared blocks consist of cement...",
        "Applicability": "For internal and external use...",
        "Flow Property Name": "Mass",
        "Flow Property Mean Value": 0.8,
        "Flow Property Reference Unit": "kg",
        # etc. (whatever fields you have)
    }
    chosen_category = "Mineral building products > Bricks, blocks and elements > Precast concrete elements and goods"

    # One API call; the result is a dict with "is_consistent" and "issue"
    result = validate_chosen_category(example_product, chosen_category)
    print("Validation result:", result)

    # If you want to do this for your entire dataset:
    # validated_outcomes = []
    # for row in all_responses:
    #     # row should contain 'Product' or the entire product dictionary
    #     # plus 'best_category'
    #     prod_dict = {...}  # reconstruct or retrieve the product details 
    #     chosen_cat = row["best_category"]
    #     validation = validate_chosen_category(prod_dict, chosen_cat)
    #     validated_outcomes.append({
    #         "product": row["Product"],
    #         "chosen_category": chosen_cat,
    #         "is_consistent": validation["is_consistent"],
    #         "issue": validation["issue"]
    #     })
    #
    # # Now 'validated_outcomes' has a list of yes/no answers and any mismatch issues.


# EPDNorge 

In [27]:
import os
import json
import pandas as pd
import datetime
from typing import List, Dict
import torch
import gc
import re

from sentence_transformers import SentenceTransformer
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.embeddings.base import Embeddings


##################################################
# Custom SentenceTransformer Embeddings
##################################################
class CustomSentenceTransformerEmbeddings(Embeddings):
    """
    LangChain ``Embeddings`` adapter around a SentenceTransformer model.

    Lets a SentenceTransformer model act as the embedding function of a
    LangChain FAISS store. A few known models are loaded with model-specific
    arguments; every other model name uses a plain default configuration.
    """

    def __init__(self, embedding_model_name: str):
        """Store the model name and load the model immediately."""
        self.embedding_model_name = embedding_model_name
        self.model = self._initialize_model()

    def _initialize_model(self) -> SentenceTransformer:
        """
        Instantiate the SentenceTransformer for ``self.embedding_model_name``.

        Returns:
            The loaded model.

        Raises:
            Exception: Whatever SentenceTransformer raises while loading; the
                error is printed and re-raised unchanged.
        """
        # Use the GPU when one is visible; otherwise fall back to the CPU
        # instead of failing on a hard-coded "cuda" device.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        model_configs = {
            "jinaai/jina-embeddings-v3": {
                "trust_remote_code": True,
                "revision": "main",
                "device": device,
                "model_kwargs": {"use_flash_attn": False},
            },
            "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5": {
                "local_files_only": True,
                "device": device,
                "model_kwargs": {"attn_implementation": "eager"},
            },
            "Alibaba-NLP/gte-large-en-v1.5": {
                "trust_remote_code": True,
                "revision": "main",
                "device": device,
                "model_kwargs": {"attn_implementation": "eager"},
            },
        }
        config = model_configs.get(
            self.embedding_model_name,
            {"device": device, "model_kwargs": {}},  # default fallback
        )
        try:
            model = SentenceTransformer(self.embedding_model_name, **config)
            print(f"Initialized SentenceTransformer model: {self.embedding_model_name}")
            return model
        except Exception as e:
            print(f"Error initializing model {self.embedding_model_name}: {e}")
            raise

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a single query string as a list of floats."""
        return self.model.encode(text).tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding (list of floats) per input text."""
        return self.model.encode(texts).tolist()

    def unload_model(self):
        """Drop the model reference and release cached GPU memory."""
        # Compare against None explicitly: the truthiness of a model object
        # is not a reliable "is loaded" signal.
        if getattr(self, "model", None) is None:
            print("[DEBUG] Embedding model was already None or not set.")
            return

        self.model = None
        # Collect first so unreachable tensors are actually freed, then hand
        # the now-unused cached blocks back to the CUDA driver.
        gc.collect()
        torch.cuda.empty_cache()
        print(f"Unloaded embedding model: {self.embedding_model_name}")


##################################################
# Build Context and Save as JSON
##################################################
def build_context_and_save(
    df: pd.DataFrame,
    vectorstore: "FAISS",
    k_value: int = 5,
    output_file: str = "../../data/pipeline2/json/context.json",
):
    """
    Retrieve candidate categories for every product row and save them as JSON.

    For each row in the DataFrame:
      1) Build a query from the row (Product Name plus, when present,
         Classification, PCR / LCA Method Details, Technology Description and
         Applicability). Flow properties are stored in the payload but are
         not part of the retrieval query.
      2) Call ``similarity_search_with_score`` to get (Document, score) tuples.
      3) Compute transformed_score = 1 - score so that higher is better.
         NOTE(review): this assumes the store returns a distance where lower
         is better - confirm for the index in use.
      4) Sort the results by transformed_score descending.
      5) Keep ALL retrieved documents (no score-based filtering is applied).
      6) Build one JSON object per product, with a sequential ``id`` and all
         ``category_score`` entries.
    All objects are then written to ``output_file`` as a single JSON list.

    Args:
        df: Product rows; missing columns and NaN cells become "N/A".
        vectorstore: Object exposing ``similarity_search_with_score(query, k=...)``.
        k_value: Number of candidates to retrieve per product.
        output_file: Destination path; its parent directory is created if needed.
    """

    def _clean(value):
        """Map a missing (NaN/None) cell value to the "N/A" placeholder."""
        return "N/A" if pd.isna(value) else value

    all_payloads = []

    # os.makedirs("") raises, so only create a directory when the path has one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for _, row in df.iterrows():
        # Normalise every field, so NaN never reaches the query or the JSON.
        product_name = _clean(row.get("Product Name", "N/A"))
        classification = _clean(row.get("Classification", "N/A"))
        lca_method_details = _clean(row.get("LCA Method Details", "N/A"))
        technology_description = _clean(row.get("Technology Description", "N/A"))
        technological_applicability = _clean(
            row.get("Technological Applicability", "N/A")
        )
        flow_prop_name = _clean(row.get("Flow Property Name", "N/A"))
        flow_prop_mean_value = _clean(row.get("Flow Property Mean Value", "N/A"))
        flow_prop_ref_unit = _clean(row.get("Flow Property Reference Unit", "N/A"))
        uuid_val = _clean(row.get("UUID", "N/A"))

        # Build the query string; optional fields are added only when present.
        optional_fields = (
            ("Classification", classification),
            ("PCR", lca_method_details),
            ("Description", technology_description),
            ("Applicability", technological_applicability),
        )
        query_str = f"Product: {product_name}\n" + "".join(
            f"{label}: {value}\n"
            for label, value in optional_fields
            if value != "N/A"
        )

        print("=== Query ===")
        print(query_str)

        docs_with_scores = vectorstore.similarity_search_with_score(
            query_str, k=k_value
        )

        # Compute transformed scores (1 - original_score) so that higher is better
        transformed_docs = [
            (doc, 1 - orig_score) for doc, orig_score in docs_with_scores
        ]

        print("=== Retrieved Documents ===")
        for doc, transformed_score in transformed_docs:
            print(f"[{transformed_score:.4f}] {doc.page_content}")
        print("-" * 40)

        if not transformed_docs:
            print("No documents retrieved for this query.\n")
            continue

        # Sort by transformed score descending (highest first); every
        # retrieved document is kept.
        transformed_docs.sort(key=lambda x: x[1], reverse=True)

        # Group all documents for the current product into one payload. The id
        # counts saved payloads, so rows skipped above do not leave gaps.
        all_payloads.append(
            {
                "id": len(all_payloads),
                "Product": product_name,
                "UUID": uuid_val,
                "Classification": classification,
                "PCR": lca_method_details,
                "Description": technology_description,
                "Applicability": technological_applicability,
                "Flow Property Name": flow_prop_name,
                "Flow Property Mean Value": flow_prop_mean_value,
                "Flow Property Reference Unit": flow_prop_ref_unit,
                "category_score": [
                    {"category": doc.page_content, "score": float(score)}
                    for doc, score in transformed_docs
                ],
            }
        )

    # Save all JSON payloads into one file as a list
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_payloads, f, ensure_ascii=False, indent=2)

    print(f"\nSaved context JSON to: {output_file}")


##################################################
# Main Flow
##################################################
if __name__ == "__main__":

    # 1) Read the CSV and keep rows whose classification system starts with "EPDNorge"
    summaries_path = "../../data/pipeline2/sql/filtered_epd_other_categories.csv"
    df = pd.read_csv(summaries_path)
    is_epdnorge = df["Classification System"].str.match(
        r"EPDNorge", case=True, na=False
    )
    df = df[is_epdnorge]
    print(f"Loaded {len(df)} rows from {summaries_path}")

    # 2) Load the embedding model and open the matching FAISS index
    embedding_model_name = "jinaai/jina-embeddings-v3"
    embeddings = CustomSentenceTransformerEmbeddings(embedding_model_name)

    faiss_path = "../../embeddings/pipeline2/jinaai_jina-embeddings-v3/faiss_index_COS_EN"
    # NOTE: allow_dangerous_deserialization opts in to unpickling the stored
    # index; only point this at index files you trust.
    vectorstore = FAISS.load_local(
        faiss_path, embeddings=embeddings, allow_dangerous_deserialization=True
    )

    k_value = 20

    timestamp_str = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    # 3) Derive a filesystem-friendly model name for the output file.
    #    Names without a special prefix or tag (including the jina model)
    #    simply get "/" replaced by "_" in the final branch.
    if embedding_model_name.startswith(("HIT-TMG", "Alibaba-NLP")):
        model_name = embedding_model_name.replace("/", "_")
    elif embedding_model_name.startswith("granite-embedding"):
        model_name = embedding_model_name.replace(":", "-")
    elif ":" in embedding_model_name:
        model_name = embedding_model_name.split(":")[0].replace("/", "_")
    else:
        model_name = embedding_model_name.replace("/", "_")

    # 4) Create JSON output
    build_context_and_save(
        df=df,
        vectorstore=vectorstore,
        k_value=k_value,
        output_file=f"../../data/pipeline2/json/context_{model_name}_{timestamp_str}.json",
    )

    # Only the SentenceTransformer wrapper holds a model that must be unloaded
    if not isinstance(embeddings, CustomSentenceTransformerEmbeddings):
        print(f"Done with {embedding_model_name}.")
    else:
        embeddings.unload_model()

Loaded 540 rows from ../../data/pipeline2/sql/filtered_epd_other_categories.csv
Initialized SentenceTransformer model: jinaai/jina-embeddings-v3
=== Query ===
Product: 2-løps Kabelkanal 620x280x2400mm
Classification: Bygg / Betongvarer
PCR: NPCR 020:2018 Part B for Concrete and concrete elements
Applicability: Kabelkanalene blir brukt ved fremføring av signalkoblinger og annen elektrisitet og fremstår som en effektiv sikring mot potensielle kabelbrudd. 2 løps kabelkanal har mål på 620x280x2400mm

=== Retrieved Documents ===
[0.1445] Building service engineering > Electrical > Cable
[0.0510] Plastics > Roofing membranes > ECB roofing membrane (Ethylene Copolymer Bitumen)
[0.0210] Mineral building products > Bricks, blocks and elements > Concrete roof tiles
[0.0095] Plastics > Roofing membranes > Bituminous sheet
[0.0054] Mineral building products > Bricks, blocks and elements > Fiber cement
[-0.0129] Mineral building products > Concrete aggregates > Expanded clay
[-0.0222] Mineral build

### Batch API

In [None]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel

# 1) Pull the OpenAI key from the environment (.env is loaded first)
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    # Fail early instead of constructing a client without credentials
    raise ValueError("OPENAI_API_KEY not found in environment.")

client = OpenAI(api_key=api_key)


# Pydantic model to enforce structured output.
# build_request() derives the JSON schema of the response from this class.
# NOTE(review): documented with comments rather than a docstring on purpose —
# pydantic is understood to copy a class docstring into the generated schema
# as "description", which would alter the schema sent with each request;
# confirm before adding one.
class BestCategoryResponse(BaseModel):
    # The single category chosen by the model for the product.
    best_category: str


# Destination of the batch request file that gets created and uploaded
input_jsonl_file = "../../data/pipeline2/json/openai/batch_input_EPDNorge_concrete_20250503134414.jsonl"

# 2) Load the product records (all products or a subset) from the context JSON
context_json_file = "../../data/pipeline2/json/context_jinaai_jina-embeddings-v3_20250503134414.json"
with open(context_json_file, encoding="utf-8") as file:
    products = json.loads(file.read())

# Instruction text shared by every request: build_request() sends it as the
# "system" message, followed by a per-product "user" message.
# The trailing backslash after the opening quotes suppresses the leading newline.
base_prompt = """\
You are an expert in product categorization. The following product information comes from an Environmental Product Declaration (EPD).

Your task:
- Review the product details and the list of possible categories.
- Treat the numeric scores only as guidance—choose the category that best fits the product based on its description and applicability, even if it is not the top score.
- Do not simply pick the highest-scoring category.
- Use exactly one of the listed categories, matching its name character-for-character.
- Do not invent any new categories.
"""


def _has_content(value):
    """Return True if *value* should be rendered as a line in the prompt.

    Numeric values are always kept so that a legitimate ``0`` / ``0.0`` is not
    mistaken for "missing"; everything else falls back to plain truthiness,
    which drops ``None``, ``""`` and empty containers.
    """
    # bool is a subclass of int; keep the plain truthiness rule for it.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return True
    return bool(value)


def build_request(i, product):
    """Build the request dict for one product (one line of the JSONL file).

    Args:
        i: Index of the product; used to form the ``custom_id`` ``req_<i>``.
        product: Mapping of product fields. Entries whose value is the string
            ``"N/A"`` are ignored. ``category_score`` is read as a list of
            dicts with ``category`` and ``score`` keys.

    Returns:
        A dict with ``custom_id``, ``method``, ``url`` and ``body`` keys. The
        body carries the system prompt, the per-product user prompt and a
        strict JSON-schema response format derived from
        ``BestCategoryResponse``.

    Side effects:
        Prints the generated user prompt to stdout for debugging.
    """
    # Remove keys with "N/A"
    clean_product = {k: v for k, v in product.items() if v != "N/A"}

    # (key in the product dict, label shown in the prompt)
    fields = [
        ("Product", "Product"),
        ("Classification", "Classification"),
        ("PCR", "PCR"),
        ("Description", "Description"),
        ("Applicability", "Applicability"),
        ("Flow Property Name", "Flow Property Name"),
        ("Flow Property Mean Value", "Flow Property Mean Value"),
        ("Flow Property Reference Unit", "Flow Property Reference Unit"),
    ]

    # Build context lines; missing/empty fields are skipped, but a numeric
    # zero (e.g. a mean value of 0) is real data and must be kept.
    context_parts = []
    for key, label in fields:
        value = clean_product.get(key, "")
        if _has_content(value):
            context_parts.append(f"- {label}: {value}")

    # Category scores with two decimals
    category_scores = clean_product.get("category_score", [])
    categories_str = "\n".join(
        f"- {c['category']} (score: {c['score']:.2f})" for c in category_scores
    )

    final_prompt = (
        "Product Details:\n"
        + "\n".join(context_parts)
        + "\n\nPossible Categories:\n"
        + categories_str
        + "\n\nWhich category is best? Please respond in valid json format."
    )

    # (Optional) Print the final prompt for debugging
    print("\n============ Final Prompt ============")
    print(final_prompt)
    print("======================================\n")

    # Generate schema from the Pydantic model and add "additionalProperties": false
    schema = BestCategoryResponse.model_json_schema()
    schema["additionalProperties"] = False

    # Return a dict for a single request
    return {
        "custom_id": f"req_{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "o3-mini",
            "reasoning_effort": "high", # price: low <0.50; high <2.00
            "messages": [
                {"role": "system", "content": base_prompt},
                {"role": "user", "content": final_prompt},
            ],
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "BestCategoryResponse",
                    "schema": schema,
                    "strict": True,
                },
            },
        },
    }


# 3) Serialize every product into its own request line of the .jsonl file
with open(input_jsonl_file, mode="w", encoding="utf-8") as out_file:
    for i, product in enumerate(products):
        request_line = json.dumps(build_request(i, product))
        out_file.write(f"{request_line}\n")

print(f"Created {input_jsonl_file} with {len(products)} lines.")


Product Details:
- Product: 2-løps Kabelkanal 620x280x2400mm
- Classification: Bygg / Betongvarer
- PCR: NPCR 020:2018 Part B for Concrete and concrete elements
- Applicability: Kabelkanalene blir brukt ved fremføring av signalkoblinger og annen elektrisitet og fremstår som en effektiv sikring mot potensielle kabelbrudd. 2 løps kabelkanal har mål på 620x280x2400mm
- Flow Property Name: Innhold av biogent karbon i produkt

Possible Categories:
- Building service engineering > Electrical > Cable (score: 0.14)
- Plastics > Roofing membranes > ECB roofing membrane (Ethylene Copolymer Bitumen) (score: 0.05)
- Mineral building products > Bricks, blocks and elements > Concrete roof tiles (score: 0.02)
- Plastics > Roofing membranes > Bituminous sheet (score: 0.01)
- Mineral building products > Bricks, blocks and elements > Fiber cement (score: 0.01)
- Mineral building products > Concrete aggregates > Expanded clay (score: -0.01)
- Mineral building products > Bricks, blocks and elements > Cer

In [41]:
# Replace with your actual batch id
batch_id = "batch_6816185b6cc08190a28a2829a6c1f780"
batch_status = client.batches.retrieve(batch_id)
print("Current batch status:", batch_status.status)

# Results are only downloaded once the batch reports "completed"
if batch_status.status == "completed":
    output_file_id = batch_status.output_file_id
    if not output_file_id:
        print("No output file available yet.")
    else:
        output_jsonl_file = f"../../data/pipeline2/json/openai/batch_output_{batch_id}.jsonl"
        file_response = client.files.content(output_file_id)
        # Raw bytes are written as-is, hence binary mode
        with open(output_jsonl_file, "wb") as f:
            f.write(file_response.content)
        print(f"Batch results saved to {output_jsonl_file}")


Current batch status: completed
Batch results saved to ../../data/pipeline2/json/openai/batch_output_batch_6816185b6cc08190a28a2829a6c1f780.jsonl


In [None]:
# Cancel the batch if needed.
# "batch_xxxxxxxxxx" is a placeholder: substitute the id of the batch to cancel
# before running this cell.
client.batches.cancel("batch_xxxxxxxxxx")

In [40]:
import pprint

# Print batch jobs and their status.
# The result of client.batches.list() is iterated directly rather than being
# copied into a temporary list first; each batch object is dumped as a dict.
for batch in client.batches.list():
    pprint.pprint(batch.model_dump())


{'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1746281053,
 'completion_window': '24h',
 'created_at': 1746278491,
 'endpoint': '/v1/chat/completions',
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1746364891,
 'failed_at': None,
 'finalizing_at': 1746280996,
 'id': 'batch_6816185b6cc08190a28a2829a6c1f780',
 'in_progress_at': 1746278492,
 'input_file_id': 'file-6styPrDc1H3Kr6nuRaRQ7b',
 'metadata': {'description': 'EPDNorge product category classification '
                             '(o3-mini-high)'},
 'object': 'batch',
 'output_file_id': 'file-BAiQxjZ3PsQTGPikXaQHpk',
 'request_counts': {'completed': 540, 'failed': 0, 'total': 540},
 'status': 'completed'}
{'cancelled_at': 1746197030,
 'cancelling_at': 1746196422,
 'completed_at': None,
 'completion_window': '24h',
 'created_at': 1746196404,
 'endpoint': '/v1/chat/completions',
 'error_file_id': 'file-3maNchA6DpVmeFyssGSUi2',
 'errors': None,
 'expired_at': None,
 'expires_at': 17462

In [None]:
# Fetch the batch output file by id and dump its text for inspection
output_file_id = "file-KtupsEnFunSJxoUQcycSQ3"
output_file = client.files.content(output_file_id)
output_text = output_file.text
print(output_text)

In [None]:
# Fetch the batch error file by id and dump its text for inspection
error_file_id = "file-X2X7sMjyw2tpLoomziXjwz"
error_file = client.files.content(error_file_id)
error_text = error_file.text
print(error_text)
