# Chunk per row

## OllamaEmbeddings

In [1]:
import pandas as pd
import os
from typing import List, Dict, Optional, Set
import torch
import gc

from sentence_transformers import SentenceTransformer

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_ollama import OllamaEmbeddings
from langchain.embeddings.base import Embeddings


class CustomSentenceTransformerEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a pre-loaded SentenceTransformer.

    The wrapped model is supplied by the caller; this class only forwards
    texts to ``model.encode`` and converts the result to plain Python lists.
    """

    def __init__(self, model: SentenceTransformer, model_name: str):
        """
        Initialize with a pre-loaded SentenceTransformer model.

        Args:
            model: The already-constructed model used for encoding.
            model_name: Identifier kept for log messages.
        """
        self.model = model
        self.model_name = model_name

    def embed_query(self, text: str) -> List[float]:
        """
        Embed a single query text and return the vector as a list.
        """
        return self.model.encode(text).tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Embed multiple documents and return one vector (as a list) per text.
        """
        return self.model.encode(texts).tolist()

    def unload_model(self):
        """
        Drop this wrapper's reference to the model and release cached GPU memory.

        Prints a debug message instead when the model has already been unloaded.
        Memory can only actually be reclaimed once no other reference to the
        model object exists elsewhere.
        """
        # Compare against None explicitly rather than relying on the model
        # object's truthiness.
        if self.model is None:
            print("[DEBUG] Embedding model was already None or not set.")
            return

        self.model = None  # Drop our reference to the model instance
        # Collect first so memory held by now-unreachable objects is freed
        # before the CUDA caching allocator is asked to release its cache.
        gc.collect()
        torch.cuda.empty_cache()


def load_and_store(
    csv_paths: List[str],
    vectorstore_path: str,
    model: str,
    chunk_size: int = 1500,
    device: str = "cuda",
    normalize: bool = False,
    model_name: Optional[str] = None,
    chunk_strat: Optional[str] = None,
):
    """
    Build a FAISS vector store with one document per CSV row.

    Each row of every CSV is rendered as a comma-separated list of quoted
    field values, split into chunks, dumped to a text file for inspection,
    embedded with the selected model and saved to ``vectorstore_path``.

    Args:
        csv_paths: CSV files to read.
        vectorstore_path: Directory passed to ``FAISS.save_local``.
        model: Embedding model identifier. ``jinaai/jina-embeddings-v3`` and
            names starting with ``HIT-TMG`` or ``Alibaba-NLP`` are loaded with
            SentenceTransformer; everything else is passed to OllamaEmbeddings.
        chunk_size: Chunk size handed to the text splitter (overlap is 0).
        device: Device passed to SentenceTransformer.
        normalize: Forwarded to FAISS as ``normalize_L2``.
        model_name: Directory-safe model name used for the chunk dump. When
            omitted it is derived from ``model`` (tag after ``:`` dropped,
            ``/`` replaced by ``_``).
        chunk_strat: Label used in the chunk dump file name. Defaults to
            ``row_cs{chunk_size}_co0``.

    Returns:
        None. Returns early (after printing a message) when no rows were read.
    """
    # Derive the naming pieces locally so the function does not depend on
    # module-level variables being set by the calling script.
    if model_name is None:
        model_name = model.split(":")[0].replace("/", "_")
    if chunk_strat is None:
        chunk_strat = f"row_cs{chunk_size}_co0"

    content_columns = [
        "Field Name (de)",
        "Field Name (en)",
        "Element/Attribute Name",
        "Datatype",
        "Definition (de)",
        "Definition (en)",
        "Original ILCD Format Definition (en)",
    ]

    documents = []

    # Process each CSV
    for csv_path in csv_paths:
        df = pd.read_csv(csv_path)

        # A missing column is rendered as an empty field. The previous check
        # announced that the file would be skipped but never skipped it.
        for column in content_columns:
            if column not in df.columns:
                print(
                    f"Warning: Missing column '{column}' in {csv_path}. Filling with empty strings."
                )
                df[column] = ""

        # Add schema_type based on file name (portable across path separators)
        schema_type = os.path.splitext(os.path.basename(csv_path))[0]

        for _, row in df.iterrows():
            content = ",".join(f"'{row[column]}'" for column in content_columns)
            documents.append(
                Document(
                    page_content=content.strip(),
                    metadata={"source": csv_path, "schema_type": schema_type},
                )
            )

    # Nothing to embed: stop before creating directories or loading a model.
    if not documents:
        print("No documents to process. Exiting.")
        return

    # Split into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=0
    )
    all_splits = text_splitter.split_documents(documents)

    # Ensure the directory exists before opening the file
    output_dir = f"../data/chunks/{model_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Write the split chunks to a text file with metadata
    with open(
        os.path.join(output_dir, f"chunk_{chunk_strat}.txt"), "w", encoding="utf-8"
    ) as file:
        for i, chunk in enumerate(all_splits):
            file.write(f"Chunk {i+1}:\n")
            file.write(chunk.page_content + "\n")
            file.write("Metadata:\n")
            file.write(str(chunk.metadata) + "\n")
            file.write("-" * 50 + "\n")

    # Per-family SentenceTransformer options; None means "use Ollama".
    if model == "jinaai/jina-embeddings-v3":
        st_kwargs = {
            "trust_remote_code": True,
            "revision": "main",
            "model_kwargs": {"use_flash_attn": False},
        }
    elif model.startswith("HIT-TMG"):
        st_kwargs = {
            "local_files_only": True,
            "model_kwargs": {"attn_implementation": "eager"},
        }
    elif model.startswith("Alibaba-NLP"):
        st_kwargs = {
            "trust_remote_code": True,
            "revision": "main",
            "model_kwargs": {"attn_implementation": "eager"},
        }
    else:
        st_kwargs = None

    # Create embeddings. The SentenceTransformer is constructed inline so the
    # wrapper holds the only reference and unload_model() can really free it.
    if st_kwargs is None:
        embeddings = OllamaEmbeddings(model=model)
    else:
        embeddings = CustomSentenceTransformerEmbeddings(
            model=SentenceTransformer(model, device=device, **st_kwargs),
            model_name=model,
        )

    try:
        # Create the FAISS vector store from the chunked documents.
        # NOTE(review): confirm how the FAISS wrapper treats
        # DistanceStrategy.COSINE together with normalize_L2 -- TODO verify.
        vectorstore = FAISS.from_documents(
            all_splits,
            embedding=embeddings,
            distance_strategy=DistanceStrategy.COSINE,
            normalize_L2=normalize,
        )
        vectorstore.save_local(vectorstore_path)
        print(f"Vector store saved to {vectorstore_path}")
    finally:
        # Release the locally loaded model even if indexing or saving failed.
        if isinstance(embeddings, CustomSentenceTransformerEmbeddings):
            print(f"Unloading embedding model: {embeddings.model_name}")
            embeddings.unload_model()


#########
# Usage
#########
# Define the CSV files to process
csv_files = [
    "../data/csv/EPD_DataSet.csv",
    "../data/csv/EPD_FlowDataSet.csv",
    "../data/csv/ILCD_FlowPropertyDataSet.csv",
    "../data/csv/ILCD_LCIAMethodDataSet.csv",
]

# Embedding models to build one vector store each for
models = [
    "bge-m3:latest",  # Ollama
    "snowflake-arctic-embed2:latest",  # Ollama
    "jina/jina-embeddings-v2-base-de:latest",  # Ollama
    "paraphrase-multilingual:latest",  # Ollama
    "jeffh/intfloat-multilingual-e5-large-instruct:f32",  # Ollama
    "granite-embedding:278m",  # Ollama
    "bge-large:latest",  # Ollama
    "mxbai-embed-large:latest",  # Ollama
    "jinaai/jina-embeddings-v3",  # HuggingFace
    "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5",  # HuggingFace
    "Alibaba-NLP/gte-large-en-v1.5",  # HuggingFace
]

chunk_size = 1500

# Label of the chunking strategy; it is the same for every model.
chunk_strat = f"row_cs{chunk_size}_co0"

for model in models:
    # Directory-safe name: drop any ":tag" suffix, then replace "/" with "_".
    # Identifiers without a ":" pass through the split unchanged.
    model_name = model.split(":")[0].replace("/", "_")

    vectorstore_dir = f"../embeddings/{model_name}/{chunk_strat}_faiss_index_COS"

    load_and_store(
        csv_files,
        vectorstore_dir,
        model=model,
        chunk_size=chunk_size,
        device="cuda",
        normalize=False,
    )

Vector store saved to ../embeddings/bge-m3/row_cs1500_co0_faiss_index_COS
Vector store saved to ../embeddings/snowflake-arctic-embed2/row_cs1500_co0_faiss_index_COS
Vector store saved to ../embeddings/jina_jina-embeddings-v2-base-de/row_cs1500_co0_faiss_index_COS
Vector store saved to ../embeddings/paraphrase-multilingual/row_cs1500_co0_faiss_index_COS
Vector store saved to ../embeddings/jeffh_intfloat-multilingual-e5-large-instruct/row_cs1500_co0_faiss_index_COS
Vector store saved to ../embeddings/granite-embedding/row_cs1500_co0_faiss_index_COS
Vector store saved to ../embeddings/bge-large/row_cs1500_co0_faiss_index_COS
Vector store saved to ../embeddings/mxbai-embed-large/row_cs1500_co0_faiss_index_COS
Vector store saved to ../embeddings/jinaai_jina-embeddings-v3/row_cs1500_co0_faiss_index_COS
Unloading embedding model: jinaai/jina-embeddings-v3
Vector store saved to ../embeddings/HIT-TMG_KaLM-embedding-multilingual-mini-instruct-v1.5/row_cs1500_co0_faiss_index_COS
Unloading embedding model: HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5

## Add More Metadata

In [1]:
import pandas as pd
import os
from typing import List, Dict, Optional
import torch
import gc
import yaml  # For YAML serialization

from sentence_transformers import SentenceTransformer

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_ollama import OllamaEmbeddings
from langchain.embeddings.base import Embeddings


class CustomSentenceTransformerEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a pre-loaded SentenceTransformer.

    The wrapped model is supplied by the caller; this class only forwards
    texts to ``model.encode`` and converts the result to plain Python lists.
    """

    def __init__(self, model: SentenceTransformer, model_name: str):
        """
        Initialize with a pre-loaded SentenceTransformer model.

        Args:
            model: The already-constructed model used for encoding.
            model_name: Identifier kept for log messages.
        """
        self.model = model
        self.model_name = model_name

    def embed_query(self, text: str) -> List[float]:
        """
        Embed a single query text and return the vector as a list.
        """
        return self.model.encode(text).tolist()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Embed multiple documents and return one vector (as a list) per text.
        """
        return self.model.encode(texts).tolist()

    def unload_model(self):
        """
        Drop this wrapper's reference to the model and release cached GPU memory.

        Prints a debug message instead when the model has already been unloaded.
        Memory can only actually be reclaimed once no other reference to the
        model object exists elsewhere.
        """
        # Compare against None explicitly rather than relying on the model
        # object's truthiness.
        if self.model is None:
            print("[DEBUG] Embedding model was already None or not set.")
            return

        self.model = None  # Drop our reference to the model instance
        # Collect first so memory held by now-unreachable objects is freed
        # before the CUDA caching allocator is asked to release its cache.
        gc.collect()
        torch.cuda.empty_cache()


def create_hierarchy_mapping(csv_paths: List[str]) -> Dict[str, str]:
    """
    Create a mapping from eDoc ID to Element/Attribute Name across all CSVs.

    Files that cannot be read are reported and skipped. Rows whose eDoc ID or
    element name is missing or blank are ignored. When the same eDoc ID occurs
    more than once, the last occurrence (in ``csv_paths`` order) wins.

    Args:
        csv_paths: CSV files to scan.

    Returns:
        Dictionary mapping each stripped eDoc ID to its stripped element name.
    """
    id_column = "eDoc ID"
    name_column = "Element/Attribute Name"

    mapping = {}
    for csv_path in csv_paths:
        try:
            # Read everything as text so IDs such as "1" are not turned into
            # numbers (and then into "1.0") when the column has empty cells.
            df = pd.read_csv(csv_path, dtype=str)
        except (OSError, ValueError) as e:
            # Covers missing/unreadable files and pandas parsing errors.
            print(f"Error reading {csv_path}: {e}")
            continue

        if id_column not in df.columns or name_column not in df.columns:
            continue

        # Drop rows with a missing cell first; otherwise str(NaN) would add a
        # bogus "nan" key or value to the mapping.
        pairs = df[[id_column, name_column]].dropna()
        for raw_id, raw_name in zip(pairs[id_column], pairs[name_column]):
            edoc_id = str(raw_id).strip()
            name = str(raw_name).strip()
            if edoc_id and name:
                mapping[edoc_id] = name
    return mapping


def parse_edoc_id(edoc_id: str, hierarchy_mapping: Dict[str, str]) -> Dict[str, str]:
    """
    Expand an eDoc ID into one entry per ancestor level, outermost first.

    The ID is split on "-" and every leading prefix becomes a key of the form
    ``schema_hierarchy_<prefix>``. The value is looked up in
    ``hierarchy_mapping`` and falls back to ``Level <prefix>`` when the prefix
    is unknown. For example, '1-1-2-1' yields:
    {
        'schema_hierarchy_1': 'Process data set',
        'schema_hierarchy_1-1': 'Process information',
        'schema_hierarchy_1-1-2': 'Key Data Set Information',
        'schema_hierarchy_1-1-2-1': 'UUID of Process data set'
    }
    """
    segments = edoc_id.split("-")
    # All leading prefixes: "1", "1-1", "1-1-2", ...
    prefixes = ["-".join(segments[:depth]) for depth in range(1, len(segments) + 1)]
    return {
        f"schema_hierarchy_{prefix}": hierarchy_mapping.get(prefix, f"Level {prefix}")
        for prefix in prefixes
    }


def format_metadata_as_yaml(metadata: Dict[str, str]) -> str:
    """
    Serialize the metadata dictionary with ``yaml.dump`` (flow style disabled)
    and return the text without leading or trailing whitespace.
    """
    rendered = yaml.dump(metadata, default_flow_style=False)
    return rendered.strip()


def load_and_store(
    csv_paths: List[str],
    vectorstore_path: str,
    hierarchy_mapping: Dict[str, str],
    model: str,
    model_name: str,
    chunk_size: int = 1500,
    device: str = "cuda",
    normalize: bool = False,
):
    """
    Load CSV files, create Documents with hierarchical metadata incorporated into chunk content,
    split into chunks, embed, and store in FAISS vector store.

    Args:
        csv_paths: CSV files to read; unreadable files are reported and skipped.
        vectorstore_path: Directory passed to ``FAISS.save_local``.
        hierarchy_mapping: eDoc ID -> element name lookup used to label the
            hierarchy levels of each row.
        model: Embedding model identifier. ``jinaai/jina-embeddings-v3`` and
            names starting with ``HIT-TMG`` or ``Alibaba-NLP`` are loaded with
            SentenceTransformer; everything else is passed to OllamaEmbeddings.
        model_name: Directory-safe model name used for the chunk dump folder.
        chunk_size: Chunk size handed to the text splitter (overlap is 0).
        device: Device passed to SentenceTransformer.
        normalize: Forwarded to FAISS as ``normalize_L2``.

    Returns:
        None. Returns early (after printing a message) when no rows were read.
    """
    content_columns = [
        "Field Name (de)",
        "Field Name (en)",
        "Element/Attribute Name",
        "Datatype",
        "Definition (de)",
        "Definition (en)",
        "Original ILCD Format Definition (en)",
    ]
    # Columns to extract; the eDoc ID only feeds the hierarchy front matter.
    columns_to_extract = content_columns + ["eDoc ID"]

    documents = []

    # Process each CSV
    for csv_path in csv_paths:
        try:
            df = pd.read_csv(csv_path)
        except (OSError, ValueError) as e:
            # Covers missing/unreadable files and pandas parsing errors.
            print(f"Error reading {csv_path}: {e}")
            continue

        # Replace missing columns with empty strings
        for column in columns_to_extract:
            if column not in df.columns:
                print(f"Warning: Missing column '{column}' in {csv_path}. Filling with empty strings.")
                df[column] = ""

        # Add schema_type based on file name
        schema_type = os.path.splitext(os.path.basename(csv_path))[0]

        for _, row in df.iterrows():
            content = ",".join(
                f"'{row.get(column, '')}'" for column in content_columns
            )

            # An empty CSV cell is read as NaN; str(NaN) is "nan", which would
            # otherwise be parsed as a real ID and yield bogus front matter.
            raw_edoc_id = row.get("eDoc ID", "")
            edoc_id = "" if pd.isna(raw_edoc_id) else str(raw_edoc_id).strip()
            hierarchy_metadata = (
                parse_edoc_id(edoc_id, hierarchy_mapping) if edoc_id else {}
            )

            # Only include non-empty metadata
            if hierarchy_metadata:
                yaml_metadata = format_metadata_as_yaml(hierarchy_metadata)
                # Prepend YAML metadata to content
                combined_content = f"---\n{yaml_metadata}\n---\n{content.strip()}"
            else:
                combined_content = content.strip()

            documents.append(
                Document(
                    page_content=combined_content,
                    metadata={
                        "source": csv_path,
                        "schema_type": schema_type,
                    },
                )
            )

    if not documents:
        print("No documents to process. Exiting.")
        return

    # Split into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=0
    )
    all_splits = text_splitter.split_documents(documents)

    # Ensure the directory exists before opening the file
    output_dir = f"../data/chunks/{model_name}"
    os.makedirs(output_dir, exist_ok=True)

    chunk_strat = f"row_cs{chunk_size}_co0"

    # Write the split chunks to a text file with metadata
    with open(
        os.path.join(output_dir, f"chunk_{chunk_strat}_meta.txt"), "w", encoding="utf-8"
    ) as file:
        for i, chunk in enumerate(all_splits):
            file.write(f"Chunk {i+1}:\n")
            file.write(chunk.page_content + "\n")
            file.write("Metadata:\n")
            file.write(str(chunk.metadata) + "\n")
            file.write("-" * 50 + "\n")

    # Per-family SentenceTransformer options; None means "use Ollama".
    if model == "jinaai/jina-embeddings-v3":
        st_kwargs = {
            "trust_remote_code": True,
            "revision": "main",
            "model_kwargs": {"use_flash_attn": False},
        }
    elif model.startswith("HIT-TMG"):
        st_kwargs = {
            "local_files_only": True,
            "model_kwargs": {"attn_implementation": "eager"},
        }
    elif model.startswith("Alibaba-NLP"):
        st_kwargs = {
            "trust_remote_code": True,
            "revision": "main",
            "model_kwargs": {"attn_implementation": "eager"},
        }
    else:
        st_kwargs = None

    # Create embeddings. The SentenceTransformer is constructed inline so the
    # wrapper holds the only reference and unload_model() can really free it.
    if st_kwargs is None:
        embeddings = OllamaEmbeddings(model=model)
    else:
        embeddings = CustomSentenceTransformerEmbeddings(
            model=SentenceTransformer(model, device=device, **st_kwargs),
            model_name=model,
        )

    try:
        # Create the FAISS vector store from the chunked documents.
        # NOTE(review): confirm how the FAISS wrapper treats
        # DistanceStrategy.COSINE together with normalize_L2 -- TODO verify.
        vectorstore = FAISS.from_documents(
            all_splits,
            embedding=embeddings,
            distance_strategy=DistanceStrategy.COSINE,
            normalize_L2=normalize,
        )
        vectorstore.save_local(vectorstore_path)
        print(f"Vector store saved to {vectorstore_path}")
    finally:
        # Unload the embedding model if it's a custom SentenceTransformer,
        # even when indexing or saving failed.
        if isinstance(embeddings, CustomSentenceTransformerEmbeddings):
            print(f"Unloading embedding model: {embeddings.model_name}")
            embeddings.unload_model()


#########
# Usage
#########
# Define the CSV files to process
csv_files = [
    "../data/csv/EPD_DataSet.csv",
    "../data/csv/EPD_FlowDataSet.csv",
    "../data/csv/ILCD_FlowPropertyDataSet.csv",
    "../data/csv/ILCD_LCIAMethodDataSet.csv",
]

# Embedding models to build one vector store each for
models = [
    "bge-m3:latest",  # Ollama
    "snowflake-arctic-embed2:latest",  # Ollama
    "jina/jina-embeddings-v2-base-de:latest",  # Ollama
    "paraphrase-multilingual:latest",  # Ollama
    "jeffh/intfloat-multilingual-e5-large-instruct:f32",  # Ollama
    "granite-embedding:278m",  # Ollama
    "bge-large:latest",  # Ollama
    "mxbai-embed-large:latest",  # Ollama
    "jinaai/jina-embeddings-v3",  # HuggingFace
    "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5",  # HuggingFace
    "Alibaba-NLP/gte-large-en-v1.5",  # HuggingFace
]

# Chunk size shared by every run
chunk_size = 1500

# The eDoc ID -> element name lookup is built once and reused for all models.
hierarchy_mapping = create_hierarchy_mapping(csv_files)
print("Hierarchy mapping created.")

# Label of the chunking strategy; it is the same for every model.
chunk_strat = f"row_cs{chunk_size}_co0"

for model in models:
    # Directory-safe name: drop any ":tag" suffix, then replace "/" with "_".
    # Identifiers without a ":" pass through the split unchanged.
    model_name = model.split(":")[0].replace("/", "_")

    index_folder = f"{chunk_strat}_faiss_index_COS_Meta"
    vectorstore_dir = os.path.join("..", "embeddings", model_name, index_folder)

    load_and_store(
        csv_paths=csv_files,
        vectorstore_path=vectorstore_dir,
        hierarchy_mapping=hierarchy_mapping,
        model=model,
        model_name=model_name,
        chunk_size=chunk_size,
        device="cuda",
        normalize=False,
    )

Hierarchy mapping created.
Vector store saved to ..\embeddings\bge-m3\row_cs1500_co0_faiss_index_COS_Meta
Vector store saved to ..\embeddings\snowflake-arctic-embed2\row_cs1500_co0_faiss_index_COS_Meta
Vector store saved to ..\embeddings\jina_jina-embeddings-v2-base-de\row_cs1500_co0_faiss_index_COS_Meta
Vector store saved to ..\embeddings\paraphrase-multilingual\row_cs1500_co0_faiss_index_COS_Meta
Vector store saved to ..\embeddings\jeffh_intfloat-multilingual-e5-large-instruct\row_cs1500_co0_faiss_index_COS_Meta
Vector store saved to ..\embeddings\granite-embedding\row_cs1500_co0_faiss_index_COS_Meta
Vector store saved to ..\embeddings\bge-large\row_cs1500_co0_faiss_index_COS_Meta
Vector store saved to ..\embeddings\mxbai-embed-large\row_cs1500_co0_faiss_index_COS_Meta
Vector store saved to ..\embeddings\jinaai_jina-embeddings-v3\row_cs1500_co0_faiss_index_COS_Meta
Unloading embedding model: jinaai/jina-embeddings-v3
Vector store saved to ..\embeddings\HIT-TMG_KaLM-embedding-multilin