# RAG, paso 1: Encontrar documentos similares usando embeddings

Este notebook arma un índice para buscar los documentos de entrenamiento más cercanos a un documento a clasificar.
Utiliza un modelo de embedding de frases para transformar textos en embeddings. El modelo de embedding usado aquí (SBERT) es diferente del modelo usado en el próximo paso para determinar de qué clase es un texto. Podríamos usar el mismo modelo en ambos pasos, pero los modelos autorregresivos no vienen con un modelo de embedding de frases precalculado, y armar uno requiere más memoria que la disponible en los notebooks gratuitos de Colab.


## Instalar dependencias

In [None]:
!pip install --no-cache-dir tqdm
!pip install --no-cache-dir unstructured[md]
!pip install --no-cache-dir pandas
!pip install --no-cache-dir torch
!pip install --no-cache-dir transformers
!pip install --no-cache-dir accelerate
!pip install --no-cache-dir bitsandbytes
!pip install --no-cache-dir langchain
!pip install --no-cache-dir sentence-transformers
!pip install --no-cache-dir faiss-gpu
!pip install --no-cache-dir ragatouille

Collecting unstructured[md]
  Downloading unstructured-0.14.7-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting filetype (from unstructured[md])
  Downloading filetype-1.2.0-py2.py3-none-any.whl (19 kB)
Collecting python-magic (from unstructured[md])
  Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Collecting emoji (from unstructured[md])
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json (from unstructured[md])
  Downloading dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting python-iso639 (from unstructured[md])
  Downloading python_iso639-2024.4.27-py3-none-any.whl (274 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.7/274.7 kB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[?25hC

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple

from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
import warnings
import re

In [None]:
def clear_huggingface_model_cache():
    """Delete the local Hugging Face model cache to free disk space.

    Intended for when downloaded models have filled up the disk. The cache
    location is read from ``transformers.TRANSFORMERS_CACHE``. If that
    directory does not exist there is nothing to delete, so the function
    reports it and returns instead of raising ``FileNotFoundError``.
    """
    # Imports are local so the helper only pulls them in when it is actually called.
    import os
    import shutil
    from transformers import TRANSFORMERS_CACHE

    if not os.path.isdir(TRANSFORMERS_CACHE):
        print("No huggingface model cache found at", TRANSFORMERS_CACHE)
        return

    print("Deleting huggingface model cache at", TRANSFORMERS_CACHE)
    shutil.rmtree(TRANSFORMERS_CACHE)

In [None]:
# Name of the sentence-embedding model (SBERT) used to embed the documents.
# The commented-out name and URL at the end of the line are an alternative model kept for reference.
EMBEDDING_MODEL_NAME = "Santp98/SBERT-bert-base-spanish-wwm-cased-2023-11-13-22-45" #"4i-ai/Llama-2-7b-alpaca-es"  # https://huggingface.co/4i-ai/Llama-2-7b-alpaca-es

# Toggle for running the embedding model on the GPU; DEVICE is the device
# string derived from it ("cuda" when USE_GPU is true, otherwise "cpu").
USE_GPU=True
DEVICE = "cuda" if USE_GPU else "cpu"

## El modelo de embedding de frase trabaja con un tamaño fijo de tokens.

 No podemos armar embeddings de frase con más tokens que ese límite.

In [1]:
# Instantiate the embedding model only to report its max_seq_length (in tokens).
# NOTE: the imports cell (SentenceTransformer) and the cell defining
# EMBEDDING_MODEL_NAME must be run before this one, otherwise it fails with a NameError.
print(f"El largo máximo de una frase soportada por el modelo de embedding de frase es: {SentenceTransformer(EMBEDDING_MODEL_NAME).max_seq_length} tokens.")


NameError: name 'SentenceTransformer' is not defined

## Cargar los documentos de entrenamiento

In [None]:
import glob
import os.path
from typing import List
from langchain_core.documents import Document
from langchain_community.document_loaders import UnstructuredFileLoader


class DirectoryLoader:
    """Load every file under a directory that matches a glob pattern.

    Each matching file is handed to ``doc_loader`` and the documents it
    returns are concatenated into a single list.
    """

    def __init__(self, directory_path: str, glob_pattern: str = "**/*.*", mode: str = "single", doc_loader=UnstructuredFileLoader):
        """
        Initialize the loader with a directory path and a glob pattern.
        :param directory_path: Path to the directory containing files to load.
        :param glob_pattern: Glob pattern, relative to ``directory_path``, selecting the files to load.
            ``**`` matches any number of nested directories (including none).
        :param mode: Mode forwarded to the document loader ('single', 'elements', or 'paged').
        :param doc_loader: Loader class; it is instantiated as ``doc_loader(file_path=..., mode=...)``
            for every matching file and must expose a ``load()`` method.
        """
        self.directory_path = directory_path
        self.glob_pattern = glob_pattern
        self.mode = mode
        self.doc_loader = doc_loader

    def load(self) -> List[Document]:
        """
        Load all files matching the glob pattern in the directory using the configured loader.
        :return: List of Document objects loaded from the files, in sorted path order.
        :raises FileNotFoundError: If ``directory_path`` does not exist.
        """
        if not os.path.exists(self.directory_path):
            raise FileNotFoundError(f"Directory '{self.directory_path}' cannot be found.")

        # Keep "/" as the joiner so the resulting source paths look the same as before.
        full_glob_pattern = f"{self.directory_path}/{self.glob_pattern}"

        documents: List[Document] = []
        # recursive=True is what lets "**" span nested directories; without it
        # "**" behaves exactly like "*" and only one directory level is matched.
        # Sorting makes the load order (and anything built from it) reproducible.
        for file_path in sorted(glob.glob(full_glob_pattern, recursive=True)):
            # The pattern can also match directories (e.g. names containing a dot);
            # only regular files can be handed to the document loader.
            if not os.path.isfile(file_path):
                continue
            loader = self.doc_loader(file_path=file_path, mode=self.mode)
            documents.extend(loader.load())
        return documents

#### Cargar documentos. Asumimos que dentro del directorio 'docs' hay un directorio con el nombre de cada categoría y, dentro de cada uno, los documentos de esa categoría en formato Markdown.

In [None]:
# Load every document under "docs" with the Markdown loader.
# mode="elements" can be used instead if the title and the body should be separate parts of the document.
loader = DirectoryLoader("docs",  doc_loader=UnstructuredMarkdownLoader,  mode="single")
docs = loader.load()

Guardar la categoría de cada documento en la metadata del documento

In [None]:
# Captures the category name: the single directory directly below "docs/" in a
# document's source path. "[^/]+" stops at the first slash, so a nested path such
# as "docs/cat/sub/file.md" still yields "cat" rather than "cat/sub".
extract_cat = re.compile(r"docs/([^/]+)/")

In [None]:
# Store each document's category (taken from its own source path) in its metadata.
for doc in docs:
    # Match against this document's path; indexing docs[0] here would tag
    # every document with the category of the first one.
    cat_match = extract_cat.search(doc.metadata["source"])
    if cat_match is None:
        # Fail with the offending path instead of an opaque AttributeError on None.
        raise ValueError(f"Cannot extract a category from source path: {doc.metadata['source']!r}")
    doc.metadata["categoria"] = cat_match.group(1)

## Tokenizar los documentos de entrenamiento.

In [None]:
# Hierarchical separator list tailored to Markdown documents, listed from the
# coarsest structural break down to the empty string.
# Copied from LangChain's MarkdownTextSplitter class.
# NOTE(review): the first five entries are written as regex patterns; confirm the
# splitter that consumes this list is configured to treat separators as regexes.
MARKDOWN_SEPARATORS = [
    # Structural Markdown markers
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    # Plain-text fallbacks
    "\n\n",
    "\n",
    " ",
    "",
]

def split_documents(
    chunk_size: int,
    knowledge_base: List[Document],
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List[Document]:
    """
    Split documents into chunks of at most `chunk_size` tokens.

    Chunk length is measured with the Hugging Face tokenizer named by
    `tokenizer_name`, and consecutive chunks overlap by a tenth of
    `chunk_size`. Chunks whose text duplicates an earlier chunk are dropped;
    the first occurrence (and its metadata) is the one kept.

    :param chunk_size: Maximum chunk length, in tokens.
    :param knowledge_base: Documents to split.
    :param tokenizer_name: Name of the tokenizer used to count tokens.
    :return: The de-duplicated chunks, in the order they were produced.
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=chunk_size // 10,
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # MARKDOWN_SEPARATORS contains regex patterns (e.g. "\n#{1,6} "). Without
        # this flag the splitter escapes them and looks for the literal text,
        # so the heading/rule separators would never match.
        is_separator_regex=True,
    )

    # Keep only the first chunk seen for each distinct text.
    seen_texts = set()
    unique_chunks = []
    for chunk in text_splitter.split_documents(knowledge_base):
        if chunk.page_content not in seen_texts:
            seen_texts.add(chunk.page_content)
            unique_chunks.append(chunk)

    return unique_chunks

In [None]:
# Tokenizer of the embedding model, loaded at notebook level.
# (split_documents loads the tokenizer again by name; it does not use this variable.)
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)

# Split the loaded documents into de-duplicated chunks of at most 512 tokens.
docs_processed = split_documents(
    512,  # We choose a chunk size adapted to our model
    docs,
    tokenizer_name=EMBEDDING_MODEL_NAME,
)

## Guardar los documentos de entrenamiento en una base de datos vectorial (FAISS)

In [None]:
# Build the LangChain embedding wrapper around the sentence-embedding model.
# Any warning emitted while constructing it is silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    embedding_model = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        multi_process=True,
        model_kwargs={"device": DEVICE},  # "cuda" or "cpu", as chosen by USE_GPU
        encode_kwargs={"normalize_embeddings": True},  # Set to `True` for cosine similarity
    )

In [None]:

# Embed every chunk with embedding_model and index the vectors in FAISS,
# using the cosine distance strategy.
VECTOR_DATABASE = FAISS.from_documents(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

In [None]:
def find_similar_docs(vector_db:FAISS, doc_txt:str, top_k:int=5) -> List[Document]:
    """Return the `top_k` indexed documents most similar to `doc_txt`.

    :param vector_db: FAISS vector store to search.
    :param doc_txt: Text to find similar documents for.
    :param top_k: Number of documents to return.
    :return: The documents returned by the store's similarity search.
    """
    # Use the arguments, not notebook globals: the query text is `doc_txt`
    # and the number of results is `top_k`.
    return vector_db.similarity_search(query=doc_txt, k=top_k)

In [None]:
# Example query: look up the 5 chunks closest to a short sentence and print
# the source file each one came from.
doc_to_find_similar = "La reina esta triste"
for similar_doc in find_similar_docs(VECTOR_DATABASE, doc_to_find_similar, 5):
   print(similar_doc.metadata["source"])

docs/1.txt
