In [1]:
from transformers import AutoTokenizer, AutoModel

# Cargar el tokenizador y el modelo de embeddings de Hugging Face
model_name = "sentence-transformers/all-MiniLM-L6-v2"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [2]:
import torch

prefix = 'summary:'
max_input_length = 1024
max_target_length = 128

# Define the CustomEmbeddings class
class CustomEmbeddingsFineTunePGC:
    """
    To work with Chrome Vector Database 
    """
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def embed_documents(self, texts):
        # Tokenizar los textos
        inputs = self.tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
        # Pasar los tokens por el modelo para obtener los embeddings
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings.tolist()
    
    def embed_query(self, text):
        return self.embed_documents([text])[0]

# Define the CustomEmbeddings class
class CustomEmbeddingsRagPGC:
    """
    To work with Chrome Vector Database 
    """
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def embed_documents(self,examples):
        inputs = [prefix + '\n' + bid + ch for bid,ch in zip(examples["book_id"],examples['chapter'])] # This seems to be custom?
        # print(inputs)
        model_inputs = self.tokenizer(inputs, max_length=max_input_length, truncation=True)

        # Setup the tokenizer for targets
        labels = self.tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs
    
    def embed_query(self, text):
        return self.embed_documents([text])[0]
    
tokCustomEmbeds = CustomEmbeddingsRagPGC(model_name)

In [3]:
from datasets import load_dataset
print("Loading dataset...")
dataset = load_dataset("kmfoda/booksum", split="train")
print("loaded!")

Loading dataset...
loaded!


In [4]:
dataset

Dataset({
    features: ['bid', 'is_aggregate', 'source', 'chapter_path', 'summary_path', 'book_id', 'summary_id', 'content', 'summary', 'chapter', 'chapter_length', 'summary_name', 'summary_url', 'summary_text', 'summary_analysis', 'summary_length', 'analysis_length'],
    num_rows: 9600
})

In [5]:
tokCustomEmbeds.embed_documents(dataset)

In [18]:
from langchain_community.vectorstores import Chroma

# List[List[float]]
# Crear un vector store en Chroma
vectorstore = Chroma.from_texts(texts=documents,embedding=tokCustomEmbeds,persist_directory='data-test')




In [19]:
retriever = vectorstore.as_retriever()

In [6]:
# Ejemplo de consulta
query = "¿Cuál es la capital de Francia?"

# Generar el embedding para la consulta
query_embedding = tokCustomEmbeds.embed_documents([query])

In [25]:
results = vectorstore.search(query = query, search_type='similarity',k=1)
results

[Document(page_content='La capital de Francia es París.')]

In [23]:
retriever.invoke(query,k=1)

[Document(page_content='La capital de Francia es París.'),
 Document(page_content='La capital de Francia es París.'),
 Document(page_content='La capital de Francia es París.'),
 Document(page_content='El clima hoy es soleado.')]

In [None]:
# vectorstore.add_texts(documents)

In [None]:


# # Recuperar documentos similares desde Chroma
# results = vectorstore.search(embedding=query_embedding.numpy(), top_k=3)

# # Mostrar resultados
# for result in results:
#     print(f"Texto: {result['text']}, Similaridad: {result['similarity']}")
