In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
# Modelo de hugging Face
model_name = "sentence-transformers/all-MiniLM-L6-v2"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:


# Define the CustomEmbeddings class
class CustomEmbeddings:
    """
    To work with Chrome Vector Database 
    """
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def embed_documents(self, texts):
        # Tokenizar los textos
        inputs = self.tokenizer(texts.page_content, return_tensors="pt", padding=True, truncation=True)
        # Pasar los tokens por el modelo para obtener los embeddings
        with torch.no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings.tolist()
    
    def embed_query(self, text):
        return self.embed_documents([text])[0]
    
CE = CustomEmbeddings(model_name)

In [1]:
from datasets import load_dataset
print("Loading dataset...")
dataset = load_dataset("kmfoda/booksum", split="train")

  from .autonotebook import tqdm as notebook_tqdm


Loading dataset...


In [4]:
dataset[0]

{'bid': 27681,
 'is_aggregate': True,
 'source': 'cliffnotes',
 'chapter_path': 'all_chapterized_books/27681-chapters/chapters_1_to_2.txt',
 'summary_path': 'finished_summaries/cliffnotes/The Last of the Mohicans/section_1_part_0.txt',
 'book_id': 'The Last of the Mohicans.chapters 1-2',
 'summary_id': 'chapters 1-2',
 'content': None,
 'summary': '{"name": "Chapters 1-2", "url": "https://web.archive.org/web/20201101053205/https://www.cliffsnotes.com/literature/l/the-last-of-the-mohicans/summary-and-analysis/chapters-12", "summary": "Before any characters appear, the time and geography are made clear. Though it is the last war that England and France waged for a country that neither would retain, the wilderness between the forces still has to be overcome first. Thus it is in 1757, in the New York area between the head waters of the Hudson River and Lake George to the north. Because only two years earlier General Braddock was disgracefully routed by a handful of French and Indians, the 

In [38]:
# from langchain_core.documents.base import Document
def preprocessing_pgc(dataset):
    documents = []
    for data in dataset:
        text = ''
        if type(data)==dict:
            text = 'summarize:\n' + data['book_id'] + data['chapter']
        elif type(data)==str:
            prefix = 'summarize: ' if 'summarize' not in data else ''
            text = prefix + data
        else:
            raise NotImplementedError
        documents.append(Document(text))
    return documents
print("loaded!, preprocessing:")
processed_dataset = preprocessing_pgc(dataset)
processed_dataset[:5]

loaded!, preprocessing:


 Document(page_content='summarize:\nThe Last of the Mohicans.chapter 3\n  "Before these fields were shorn and tilled,\n    Full to the brim our rivers flowed;\n  The melody of waters filled\n    The fresh and boundless wood;\n  And torrents dashed, and rivulets played,\n    And fountains spouted in the shade."\n\n  BRYANT.\n\n\nLeaving the unsuspecting Heyward and his confiding companions to\npenetrate still deeper into a forest that contained such treacherous\ninmates, we must use an author\'s privilege, and shift the scene a few\nmiles to the westward of the place where we have last seen them.\n\nOn that day, two men were lingering on the banks of a small but rapid\nstream, within an hour\'s journey of the encampment of Webb, like those\nwho awaited the appearance of an absent person, or the approach of some\nexpected event. The vast canopy of woods spread itself to the margin of\nthe river overhanging the water, and shadowing its dark current with a\ndeeper hue. The rays of the sun 

In [43]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
# processed_splits = text_splitter.split_documents(processed_dataset)

In [46]:
# processed_splits[0].page_content

'summarize:\nThe Last of the Mohicans.chapters 1-2\n  "Mine ear is open, and my heart prepared:\n  The worst is worldly loss thou canst unfold:\n  Say, is my kingdom lost?"\n\n  SHAKESPEARE.'

In [44]:
embeds = CE.embed_documents(processed_splits[:2])
embeds

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
del embeds

In [5]:
from langchain_community.vectorstores import Chroma

# List[List[float]]
# Crear un vector store en Chroma
vectorstore = Chroma.from_texts(texts=processed_dataset[:10],embedding=CE,persist_directory='data-test')

In [26]:
query = 'summarize The Last of the Mohicans.chapters 1-2'

In [24]:
# results = vectorstore.search(query = [query], search_type='mmr',k=1)
# results

In [27]:
retriever = vectorstore.as_retriever()
retriever.invoke(query,k=1)

[Document(page_content='summarize:\nThe Last of the Mohicans.chapters 13-14\n  "I\'ll seek a readier path."\n\n  PARNELL.\n\n\nThe route taken by Hawkeye lay across those sandy plains, relieved by\noccasional valleys and swells of land, which had been traversed by their\nparty on the morning of the same day, with the baffled Magua for their\nguide. The sun had now fallen low towards the distant mountains; and as\ntheir journey lay through the interminable forest, the heat was no\nlonger oppressive. Their progress, in consequence, was proportionate;\nand long before the twilight gathered about them, they had made good\nmany toilsome miles on their return.\n\nThe hunter, like the savage whose place he filled, seemed to select\namong the blind signs of their wild route, with a species of instinct,\nseldom abating his speed, and never pausing to deliberate. A rapid and\noblique glance at the moss on the trees, with an occasional upward gaze\ntowards the setting sun, or a steady but passing