In [9]:
import os
import sys
import time
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from pypdf import PdfReader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), "../.."))
sys.path.append(ROOT_DIR)

from app.config.config import CONFIG
from experiments.embeddings.test_func.func_embeding import compute_coverage

In [10]:
verbose = CONFIG["verbose"]

In [11]:
# Define the folder where the PDFs are located
folder_path = "./data"
pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith(".pdf")]

docs = []

# Check for available PDFs
if not pdf_files:
    print("❌ No PDF files found in the folder.")
else:
    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)

        if verbose:
            print(f"\n📖 Reading the file: {pdf_file}")

        loader = PyPDFLoader(pdf_path)
        documents = loader.load()

        docs.extend(documents)


📖 Reading the file: BOE-A-2023-12203.pdf


In [12]:
#metadata 
print(docs[0].metadata)

{'producer': 'Antenna House PDF Output Library 6.6.1477 (Linux64)', 'creator': 'eBOE', 'creationdate': '2023-05-24T19:01:31+01:00', 'keywords': 'LEY 12/2023 de 24/05/2023;JEFATURA DEL ESTADO;BOE-A-2023-12203;BOE 124 de 2023;12203;25/05/2023', 'moddate': '2023-05-24T19:14:05+02:00', 'trapped': '/False', 'subject': 'BOE-A-2023-12203', 'author': 'JEFATURA DEL ESTADO', 'title': 'Disposición 12203 del BOE núm. 124 de 2023', 'source': './data\\BOE-A-2023-12203.pdf', 'total_pages': 61, 'page': 0, 'page_label': '1'}


In [13]:
# Split the documents into smaller chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CONFIG["text_splitter"]["chunk_size"],
      chunk_overlap=CONFIG["text_splitter"]["chunk_overlap"], 
      add_start_index=CONFIG["text_splitter"]["add_start_index"]
)
all_splits = text_splitter.split_documents(docs)

if verbose:
  print(f"we have divided the text into {len(all_splits)} parts with an overlap of {CONFIG["text_splitter"]["chunk_overlap"]} words.")

we have divided the text into 312 parts with an overlap of 200 words.


In [14]:
# models to test

model_names = [
    "all-MiniLM-L6-v2",          
    "BAAI/bge-small-en-v1.5",                 
    "sentence-transformers/all-MiniLM-L12-v2",  
    "intfloat/e5-small-v2",                     
    "thenlper/gte-small",                     
    "jinaai/jina-embeddings-v2-small-en",      
    "mixedbread-ai/mxbai-embed-large-v1",     
    "nomic-ai/nomic-embed-text-v1",            
]


In [15]:
# queries in Spanish to evaluate models

queries = [
    "¿Cuál es el objetivo del documento?",
    "¿Qué dice sobre los plazos?",
    "¿Qué organismos participan?",
    "¿Quien tiene  la competencia plena en materia de vivienda?",
    "¿Qué dice sobre la sostenibilidad?",
    "¿Qué dice sobre los parques públicos de vivienda?",
    "¿Que presidente del Gobierno firma el documento?"
]

In [16]:
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)

# list to store results
results_data = []

for model_name in model_names:
    embeddings = HuggingFaceEmbeddings(model_name=model_name,
                                       model_kwargs={"trust_remote_code": True})
    vectorstore = FAISS.from_documents(docs, embeddings)

    result_path = os.path.join(results_dir, f"{model_name.replace('/', '_')}.txt")
    with open(result_path, "w", encoding="utf-8") as f:
        f.write(f"Model: {model_name}\n")
        f.write("=" * 60 + "\n")

        for query in queries:
            start_time = time.time()
            docs_retrieved = vectorstore.similarity_search(query, k=1)
            elapsed = round(time.time() - start_time, 4)

            fragment = docs_retrieved[0].page_content.strip()

            # Cosine similarity between question and retrieved fragment
            query_vec = embeddings.embed_query(query)
            frag_vec = embeddings.embed_query(fragment)
            sim = cosine_similarity(
                np.array(query_vec).reshape(1, -1),
                np.array(frag_vec).reshape(1, -1)
            )[0][0]
            sim = round(sim, 4)

            coverage = compute_coverage(query, fragment)

            #save results in the file
            f.write(f"\n Query: {query}\n")
            f.write(f" Time: {elapsed} s\n")
            f.write(f"Cosine similarity : {sim}\n")
            f.write(f"Coverage: {coverage}\n")
            f.write(f"Most similar fragment:\n{fragment[:1000]}\n")

            # save results in a list for further processing
            results_data.append({
            "Model": model_name,
            "Query": query,
            "Time (s)": elapsed,
            "Cosine Similarity": sim,
            "Coverage": coverage
            })

    print(f"Results saved in: {result_path}")

Results saved in: results\all-MiniLM-L6-v2.txt
Results saved in: results\BAAI_bge-small-en-v1.5.txt
Results saved in: results\sentence-transformers_all-MiniLM-L12-v2.txt
Results saved in: results\intfloat_e5-small-v2.txt
Results saved in: results\thenlper_gte-small.txt
Results saved in: results\jinaai_jina-embeddings-v2-small-en.txt
Results saved in: results\mixedbread-ai_mxbai-embed-large-v1.txt


pytorch_model.bin:   0%|          | 0.00/547M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
<All keys matched successfully>


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

Results saved in: results\nomic-ai_nomic-embed-text-v1.txt


In [17]:
df_results = pd.DataFrame(results_data)

In [19]:
df_results

Unnamed: 0,Model,Query,Time (s),Cosine Similarity,Coverage
0,all-MiniLM-L6-v2,¿Cuál es el objetivo del documento?,0.007,0.5518,0.3333
1,all-MiniLM-L6-v2,¿Qué dice sobre los plazos?,0.009,0.4942,0.2
2,all-MiniLM-L6-v2,¿Qué organismos participan?,0.007,0.508,0.0
3,all-MiniLM-L6-v2,¿Quien tiene la competencia plena en materia ...,0.008,0.6324,0.6667
4,all-MiniLM-L6-v2,¿Qué dice sobre la sostenibilidad?,0.007,0.5003,0.2
5,all-MiniLM-L6-v2,¿Qué dice sobre los parques públicos de vivienda?,0.009,0.7198,0.5
6,all-MiniLM-L6-v2,¿Que presidente del Gobierno firma el documento?,0.008,0.4751,0.4286
7,BAAI/bge-small-en-v1.5,¿Cuál es el objetivo del documento?,0.017,0.7269,0.3333
8,BAAI/bge-small-en-v1.5,¿Qué dice sobre los plazos?,0.014,0.6602,0.4
9,BAAI/bge-small-en-v1.5,¿Qué organismos participan?,0.015,0.7152,0.0
