# Chunk per row

## OllamaEmbeddings

In [22]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
import os


def load_and_store(csv_paths, vectorstore_path):
    """Embed every row of the given CSV files with Ollama and save a FAISS index.

    Each CSV row is rendered as one comma-separated, single-quoted record and
    wrapped in a ``Document``. The documents are split with a
    ``RecursiveCharacterTextSplitter`` (chunk size 1000, no overlap), dumped to
    a text file for inspection, embedded, and stored in a FAISS index.

    Besides its arguments, this function reads three module-level names that
    the calling script must define before the call: ``model`` (the Ollama model
    tag), ``model_name`` (directory-safe short name) and ``chunk_strat`` (label
    used in the chunk dump's file name).

    Args:
        csv_paths: Iterable of CSV file paths to index.
        vectorstore_path: Directory the FAISS index is saved to.

    Returns:
        None. If no documents could be built (all files skipped or empty),
        nothing is written and the function returns early.
    """
    # These columns are both required in every CSV and, in this order, make up
    # the text of each document.
    required_columns = [
        "Field Name (de)",
        "Field Name (en)",
        "Element/Attribute Name",
        "Datatype",
        "Definition (de)",
        "Definition (en)",
        "Original ILCD Format Definition (en)",
    ]
    documents = []

    for csv_path in csv_paths:
        df = pd.read_csv(csv_path)

        missing_columns = [
            column for column in required_columns if column not in df.columns
        ]
        if missing_columns:
            for column in missing_columns:
                print(
                    f"Warning: Missing column '{column}' in {csv_path}. Skipping this file."
                )
            # Skip the whole file. Previously the `continue` lived inside the
            # column loop, so the file was processed anyway despite the warning.
            continue

        # Schema type is the file name without directory and extension.
        schema_type = csv_path.split("/")[-1].split(".")[0]

        for _, row in df.iterrows():
            content = ",".join(
                f"'{row.get(column, '')}'" for column in required_columns
            )
            documents.append(
                Document(
                    page_content=content.strip(),
                    metadata={"source": csv_path, "schema_type": schema_type},
                )
            )

    if not documents:
        # Nothing to split or embed; avoid handing an empty list to FAISS.
        print("Warning: No documents were created. Vector store was not saved.")
        return

    # Split into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    all_splits = text_splitter.split_documents(documents)

    # Ensure the directory exists before opening the file
    output_dir = f"../data/chunks/{model_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Write the split chunks to a text file with metadata
    with open(
        f"../data/chunks/{model_name}/chunk_{chunk_strat}.txt", "w", encoding="utf-8"
    ) as file:
        for i, chunk in enumerate(all_splits):
            file.write(f"Chunk {i+1}:\n")
            file.write(chunk.page_content + "\n")
            file.write("Metadata:\n")
            file.write(str(chunk.metadata) + "\n")
            file.write("-" * 50 + "\n")

    # Create and save FAISS vector store
    embeddings = OllamaEmbeddings(model=model)
    vectorstore = FAISS.from_documents(all_splits, embedding=embeddings)
    vectorstore.save_local(vectorstore_path)
    print(f"Vector store saved to {vectorstore_path}")


if __name__ == "__main__":
    # Schema-specific CSV files to index; the commented-out entries are
    # currently excluded from this run.
    csv_files = [
        "../data/csv/EPD_DataSet.csv",
        # "../data/csv/EPD_FlowDataSet.csv",
        # "../data/csv/ILCD_FlowPropertyDataSet.csv",
        # "../data/csv/ILCD_LCIAMethodDataSet.csv",
    ]

    # Ollama embedding models; one FAISS index is built per model.
    # Earlier notes on this list: the jeffh/intfloat, granite-embedding and
    # bge-large models produced an error with cs3000.
    models = [
        "bge-m3:latest",
        "snowflake-arctic-embed2:latest",
        "jina/jina-embeddings-v2-base-de:latest",
        "paraphrase-multilingual:latest",
        "jeffh/intfloat-multilingual-e5-large-instruct:f32",
        "granite-embedding:278m",
        "bge-large:latest",
    ]

    # For these models the short name is everything before the ":" tag; for
    # all others it is everything before the first "-".
    cut_at_tag = ("bge-m3:latest", "bge-large:latest")

    # `model`, `model_name` and `chunk_strat` are read as globals by
    # load_and_store, so they must be assigned before each call.
    chunk_strat = "row_cs1000_co0"
    for model in models:
        separator = ":" if model in cut_at_tag else "-"
        model_name = model.split(separator)[0].replace("/", "_")
        vectorstore_dir = f"../embeddings/{model_name}/{chunk_strat}_faiss_index"
        load_and_store(csv_files, vectorstore_dir)

bge-m3:latest
bge-m3:latest
Vector store saved to ../embeddings/bge-m3/row_cs1000_co0_faiss_index
snowflake-arctic-embed2:latest
snowflake-arctic-embed2:latest
Vector store saved to ../embeddings/snowflake/row_cs1000_co0_faiss_index
jina/jina-embeddings-v2-base-de:latest
jina/jina-embeddings-v2-base-de:latest
Vector store saved to ../embeddings/jina_jina/row_cs1000_co0_faiss_index
paraphrase-multilingual:latest
paraphrase-multilingual:latest
Vector store saved to ../embeddings/paraphrase/row_cs1000_co0_faiss_index
jeffh/intfloat-multilingual-e5-large-instruct:f32
jeffh/intfloat-multilingual-e5-large-instruct:f32
Vector store saved to ../embeddings/jeffh_intfloat/row_cs1000_co0_faiss_index
granite-embedding:278m
granite-embedding:278m
Vector store saved to ../embeddings/granite/row_cs1000_co0_faiss_index
bge-large:latest
bge-large:latest
Vector store saved to ../embeddings/bge-large/row_cs1000_co0_faiss_index


## HuggingFaceEmbeddings

In [23]:
import os
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

def load_and_store(
    csv_paths,
    vectorstore_path,
    model_name,
    chunk_strat,
    *,
    chunk_size=3000,
    chunk_overlap=0,
    device="cuda",
):
    """Embed every row of the given CSV files with a HuggingFace model into FAISS.

    Each CSV row is rendered as one comma-separated, single-quoted record and
    wrapped in a ``Document``. The documents are split with a
    ``RecursiveCharacterTextSplitter``, dumped to a text file for inspection,
    embedded with ``HuggingFaceEmbeddings`` and stored in a FAISS index.

    Args:
        csv_paths: Iterable of CSV file paths to index.
        vectorstore_path: Directory the FAISS index is saved to.
        model_name: HuggingFace model id passed to ``HuggingFaceEmbeddings``.
            It is also used verbatim as the sub-directory under
            ``../data/chunks/``, so an id containing "/" produces nested
            directories.
        chunk_strat: Label used in the chunk dump's file name.
        chunk_size: Maximum chunk size given to the text splitter.
        chunk_overlap: Overlap between chunks given to the text splitter.
        device: Device string forwarded to the model via ``model_kwargs``.

    Returns:
        None. If no documents could be built (all files skipped or empty),
        nothing is written and the function returns early.
    """
    # These columns are both required in every CSV and, in this order, make up
    # the text of each document.
    required_columns = [
        "Field Name (de)",
        "Field Name (en)",
        "Element/Attribute Name",
        "Datatype",
        "Definition (de)",
        "Definition (en)",
        "Original ILCD Format Definition (en)",
    ]
    documents = []

    for csv_path in csv_paths:
        df = pd.read_csv(csv_path)

        missing_columns = [
            column for column in required_columns if column not in df.columns
        ]
        if missing_columns:
            for column in missing_columns:
                print(
                    f"Warning: Missing column '{column}' in {csv_path}. Skipping this file."
                )
            # Skip the whole file. Previously the `continue` lived inside the
            # column loop, so the file was processed anyway despite the warning.
            continue

        # Schema type is the file name without directory and extension.
        schema_type = csv_path.split("/")[-1].split(".")[0]

        # Convert each row to a Document
        for _, row in df.iterrows():
            content = ",".join(
                f"'{row.get(column, '')}'" for column in required_columns
            )
            documents.append(
                Document(
                    page_content=content.strip(),
                    metadata={"source": csv_path, "schema_type": schema_type},
                )
            )

    if not documents:
        # Nothing to split or embed; avoid loading the model or handing an
        # empty list to FAISS.
        print("Warning: No documents were created. Vector store was not saved.")
        return

    # Split into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    all_splits = text_splitter.split_documents(documents)

    # Ensure the directory exists
    output_dir = f"../data/chunks/{model_name}"
    os.makedirs(output_dir, exist_ok=True)

    # Write the split chunks to a text file with metadata
    with open(
        f"../data/chunks/{model_name}/chunk_{chunk_strat}.txt", "w", encoding="utf-8"
    ) as file:
        for i, chunk in enumerate(all_splits):
            file.write(f"Chunk {i+1}:\n")
            file.write(chunk.page_content + "\n")
            file.write("Metadata:\n")
            file.write(str(chunk.metadata) + "\n")
            file.write("-" * 50 + "\n")

    # Create HuggingFaceEmbeddings for the chosen model; trust_remote_code lets
    # the model repository's own code run when the model requires it.
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"trust_remote_code": True, "device": device},
    )

    # Create the FAISS vector store from the chunked documents
    vectorstore = FAISS.from_documents(all_splits, embedding=embeddings)
    vectorstore.save_local(vectorstore_path)
    print(f"Vector store saved to {vectorstore_path}")


if __name__ == "__main__":
    # Schema-specific CSV files to index; the commented-out entries are
    # currently excluded from this run.
    csv_files = [
        "../data/csv/EPD_DataSet.csv",
        # "../data/csv/EPD_FlowDataSet.csv",
        # "../data/csv/ILCD_FlowPropertyDataSet.csv",
        # "../data/csv/ILCD_LCIAMethodDataSet.csv",
    ]

    # HuggingFace model ids to build an index for.
    models = [
        # "jinaai/jina-embeddings-v3",
        "HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5"
    ]

    # Label for the chunking strategy (alternative used elsewhere: row_cs1000_co0).
    chunk_strat = "row_cs3000_co0"

    for model in models:
        # The index directory uses the model id with "/" replaced by "_";
        # load_and_store itself receives the unmodified model id.
        huggingface_model_name = "_".join(model.split("/"))
        vectorstore_dir = (
            f"../embeddings/{huggingface_model_name}/{chunk_strat}_faiss_index"
        )
        load_and_store(csv_files, vectorstore_dir, model, chunk_strat)


Vector store saved to ../embeddings/HIT-TMG_KaLM-embedding-multilingual-mini-instruct-v1.5/row_cs3000_co0_faiss_index


In [25]:
from sentence_transformers import SentenceTransformer


# Smoke test: embed two sample sentences with the KaLM model through
# sentence-transformers directly (no LangChain wrapper) and print the vectors.
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5')   # Do NOT set trust_remote_code
# Set the model's maximum sequence length to 512.
model.max_seq_length = 512

# Instruction-style prefix handed to encode() through its `prompt` argument.
prompt = "Instruct: Classifying the category of french news. \n Query: "
embeddings = model.encode(
    sentences, 
    prompt=prompt,
    normalize_embeddings=True,
    batch_size=256, 
    show_progress_bar=True
    )
print(embeddings)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[[-0.04516602  0.01708984  0.04370117 ... -0.01989746 -0.00160217
  -0.01098633]
 [-0.04272461  0.01599121  0.04150391 ... -0.02661133  0.00037384
  -0.00601196]]


In [21]:
from langchain_huggingface import HuggingFaceEmbeddings

# embeddings = HuggingFaceEmbeddings(model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5", model_kwargs = {'device': 'cuda'})
# Try jina-embeddings-v3 through the LangChain HuggingFaceEmbeddings wrapper,
# on CPU and with the model repository's custom code enabled.
# NOTE: the output recorded for this cell is
# "RuntimeError: FlashAttention is not installed ...".
embeddings = HuggingFaceEmbeddings(
    model_name="jinaai/jina-embeddings-v3",
    model_kwargs={
        "trust_remote_code": True,
        "device": "cpu",
    },
)

# Embed a single query string.
text = "This is a test document."
query_result = embeddings.embed_query(text)

# show only the first 100 characters of the stringified vector
print(str(query_result)[:100] + "...")

RuntimeError: FlashAttention is not installed. To proceed with training, please install FlashAttention. For inference, you have two options: either install FlashAttention or disable it by setting use_flash_attn=False when loading the model.

In [3]:
from transformers import AutoModel, AutoTokenizer
# Load the jina-embeddings-v3 model and its tokenizer via transformers,
# with trust_remote_code=True so the repository's custom code is allowed to run.
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True)

In [11]:
from transformers import AutoModel

# Experiment: load jina-embeddings-v3 with flash attention disabled and call
# `encode` on the loaded model for a set of multilingual sentences.
# NOTE: the output recorded for this cell is
# "AttributeError: 'Qwen2Model' object has no attribute 'encode'".

# Initialize the model
model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True, use_flash_attn=False)

# The same sentence in six languages.
texts = [
    "Follow the white rabbit.",  # English
    "Sigue al conejo blanco.",  # Spanish
    "Suis le lapin blanc.",  # French
    "跟着白兔走。",  # Chinese
    "اتبع الأرنب الأبيض.",  # Arabic
    "Folge dem weißen Kaninchen.",  # German
]

# When calling the `encode` function, you can choose a `task` based on the use case:
# 'retrieval.query', 'retrieval.passage', 'separation', 'classification', 'text-matching'
# Alternatively, you can choose not to pass a `task`, and no specific LoRA adapter will be used.
embeddings = model.encode(texts, task="text-matching")

# Compute similarities
# (dot product of the first two embeddings, i.e. English vs. Spanish)
print(embeddings[0] @ embeddings[1].T)


AttributeError: 'Qwen2Model' object has no attribute 'encode'

In [8]:
from sentence_transformers import SentenceTransformer

# Load jina-embeddings-v3 through sentence-transformers, passing
# use_flash_attn=False to the underlying model via model_kwargs.
# The recorded output of this cell is an embedding array, so this route worked.
model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True, model_kwargs={"use_flash_attn": False})


# The same task name is passed both as `task` and as `prompt_name`.
task = "retrieval.query"
embeddings = model.encode(
    ["What is the weather like in Berlin today?"],
    task=task,
    prompt_name=task,
)

print(embeddings)

[[-0.08300781  0.05957031 -0.04174805 ...  0.0559082  -0.04833984
  -0.0177002 ]]


In [1]:
from langchain_huggingface import HuggingFaceEmbeddings

# embeddings = HuggingFaceEmbeddings(model_name="HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5")
# Try jina-embeddings-v3 through the LangChain HuggingFaceEmbeddings wrapper
# without specifying a device.
# NOTE: the output recorded for this cell is
# "RuntimeError: FlashAttention is not installed ...".
embeddings = HuggingFaceEmbeddings(model_name="jinaai/jina-embeddings-v3", model_kwargs={"trust_remote_code":True})

# Embed a single query string.
text = "This is a test document."
query_result = embeddings.embed_query(text)

# show only the first 100 characters of the stringified vector
print(str(query_result)[:100] + "...")

RuntimeError: FlashAttention is not installed. To proceed with training, please install FlashAttention. For inference, you have two options: either install FlashAttention or disable it by setting use_flash_attn=False when loading the model.

In [6]:
from transformers import AutoModel
from transformers.utils import is_flash_attn_2_available

# Experiment: request the "flash_attention_2" attention implementation when
# loading jina-embeddings-v3.
# NOTE: the output recorded for this cell is
# "ValueError: XLMRobertaLoRA does not support Flash Attention 2.0 yet ...".
model = AutoModel.from_pretrained(
    "jinaai/jina-embeddings-v3",
    trust_remote_code=True,
    attn_implementation="flash_attention_2"
)

# model = AutoModel.from_pretrained("jinaai/jina-embeddings-v3", trust_remote_code=True, use_flash_attn=False)

# Bare expression: as the last statement of a notebook cell its value is
# displayed as the cell result.
is_flash_attn_2_available()

ValueError: XLMRobertaLoRA does not support Flash Attention 2.0 yet. Please request to add support where the model is hosted, on its model hub page: https://huggingface.co/jinaai/jina-embeddings-v3/discussions/new or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/new

In [None]:
# Probe a single Ollama embedding model: embed one short German string and
# report the vector length. Swap the active line to test another model.
# NOTE: OllamaEmbeddings is not imported in this cell; it relies on an
# earlier cell having imported it.
# embeddings = OllamaEmbeddings(model="jeffh/intfloat-multilingual-e5-large-instruct:f32")
embeddings = OllamaEmbeddings(model="granite-embedding:278m")
# embeddings = OllamaEmbeddings(model="bge-large:latest")
try:
    result = embeddings.embed_documents(["Grüß dich."])
    # result holds one vector per input document; its length is the dimension.
    print(f"Embedding Dimension: {len(result[0])}")
    print(result)
except Exception as e:
    # Report any failure instead of raising it.
    print(f"Error: {e}")


Embedding Dimension: 768
[[-0.050539244, 0.04628803, -0.01797338, 0.03784683, 0.047730733, -0.03977373, 0.05454334, 0.04838282, 0.0069133127, 0.07339246, 0.02529023, 0.012348013, 0.035170067, -0.022228982, 0.0047577126, 0.03388236, 0.06368775, -0.026149891, -0.019046243, -0.0131056495, -0.009665934, 0.04692083, 0.0016999977, 0.06646937, 0.009140642, -0.034553282, 0.005663029, -0.0015759732, -0.08860464, 0.024826564, -0.003854021, 0.025727188, -0.017487092, 0.024928547, -0.029344574, -0.031072438, 0.03936611, 0.0063371826, -0.0017795764, 0.01630557, -0.030776525, 0.012538749, 0.021919202, 0.043259766, 0.0064417617, 0.0021398494, -0.055609055, 0.039181855, -0.010168613, 0.012661748, -0.0595976, 0.013703481, -0.016492944, -0.019992718, -0.007435035, 0.04991906, 0.03846667, 0.058389414, -0.059062816, 0.031050215, 0.001882077, 0.017884448, 0.03563421, -0.02947485, 0.02305669, 0.016973201, 0.04456293, -0.017746933, -0.033560168, 0.024379155, 0.005557521, 0.010576169, 0.048951235, 0.06729507,

# Chunk per `eDoc ID`

In [5]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings
import re

def load_and_store(csv_paths, vectorstore_path):
    """Embed CSV rows grouped by major ``eDoc ID`` section into a FAISS index.

    Rows of each CSV are grouped by the first two dash-separated levels of
    their ``eDoc ID``. Every group becomes one ``Document`` whose text holds one
    comma-separated, single-quoted record per row. The documents are split with
    a ``RecursiveCharacterTextSplitter`` (chunk size 1000, no overlap), dumped
    to ``../data/chunks/all_chunks_output_csv02.txt`` for inspection, embedded
    with the Ollama model ``bge-m3:latest`` and stored in a FAISS index.

    Args:
        csv_paths: Iterable of CSV file paths to index.
        vectorstore_path: Directory the FAISS index is saved to.

    Returns:
        None. If no documents could be built (all files skipped or empty),
        nothing is written and the function returns early.
    """
    # Local import: this cell does not import `os` at the top.
    import os

    # Columns that make up the text of each record, in output order.
    content_columns = [
        "Field Name (de)",
        "Field Name (en)",
        "Element/Attribute Name",
        "Datatype",
        "Definition (de)",
        "Definition (en)",
        "Original ILCD Format Definition (en)",
    ]
    # "eDoc ID" is needed in addition, for the section-based grouping.
    required_columns = content_columns + ["eDoc ID"]

    def extract_major_section(edoc_id):
        """Return the first two dash-separated levels of an eDoc ID (e.g. 1-3)."""
        parts = edoc_id.split("-")
        if len(parts) > 1:
            return "-".join(parts[:2])
        return edoc_id

    documents = []

    for csv_path in csv_paths:
        df = pd.read_csv(csv_path)

        missing_columns = [
            column for column in required_columns if column not in df.columns
        ]
        if missing_columns:
            for column in missing_columns:
                print(
                    f"Warning: Missing column '{column}' in {csv_path}. Skipping this file."
                )
            # Skip the whole file. Previously the `continue` lived inside the
            # column loop, so a file without "eDoc ID" went on to raise
            # KeyError in the grouping step below.
            continue

        # Schema type is the file name without directory and extension.
        schema_type = csv_path.split("/")[-1].split(".")[0]

        # IDs are stringified first so non-string values can be split too.
        df["Major Section"] = df["eDoc ID"].apply(
            lambda value: extract_major_section(str(value))
        )

        # One document per major section, one line per row of that section.
        for section_id, group in df.groupby("Major Section"):
            content = "\n".join(
                ",".join(f"'{row.get(column, '')}'" for column in content_columns)
                for _, row in group.iterrows()
            )
            documents.append(
                Document(
                    page_content=content.strip(),
                    metadata={
                        "source": csv_path,
                        "schema_type": schema_type,
                        "section_id": section_id,
                    },
                )
            )

    if not documents:
        # Nothing to split or embed; avoid handing an empty list to FAISS.
        print("Warning: No documents were created. Vector store was not saved.")
        return

    # Split into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    all_splits = text_splitter.split_documents(documents)

    # Ensure the output directory exists before opening the file
    os.makedirs("../data/chunks", exist_ok=True)

    # Write the split chunks to a text file with metadata
    with open("../data/chunks/all_chunks_output_csv02.txt", "w", encoding="utf-8") as file:
        for i, chunk in enumerate(all_splits):
            file.write(f"Chunk {i+1}:\n")
            file.write(chunk.page_content + "\n")
            file.write("Metadata:\n")
            file.write(str(chunk.metadata) + "\n")
            file.write("-" * 50 + "\n")

    # Create and save FAISS vector store
    embeddings = OllamaEmbeddings(model="bge-m3:latest")
    vectorstore = FAISS.from_documents(all_splits, embedding=embeddings)
    vectorstore.save_local(vectorstore_path)
    print(f"Vector store saved to {vectorstore_path}")

if __name__ == "__main__":
    # Destination directory of the section-based bge-m3 index.
    vectorstore_dir = "../embeddings/bge-m3_csv02_faiss_index"

    # All four schema-specific CSV files go into the same index.
    csv_files = [
        "../data/csv/EPD_DataSet.csv",
        "../data/csv/EPD_FlowDataSet.csv",
        "../data/csv/ILCD_FlowPropertyDataSet.csv",
        "../data/csv/ILCD_LCIAMethodDataSet.csv",
    ]

    load_and_store(csv_paths=csv_files, vectorstore_path=vectorstore_dir)

Vector store saved to ../embeddings/bge-m3_csv02_faiss_index
