In [4]:
import os
import spacy
import PyPDF2

# Project base path: the parent directory of the current working directory
# (assumes the notebook is launched from a sub-folder of the project — TODO confirm)
base_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Load the "en_core_web_sm" spaCy pipeline once; it is used further down to
# tokenize the extracted PDF text and to filter out stop words.
nlp = spacy.load("en_core_web_sm")


In [5]:
def extract_text_from_pdf(pdf_path, page_separator=""):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path of the PDF file to read.
        page_separator: String inserted between the text of consecutive
            pages. Defaults to "" (pages concatenated directly), which is
            the original behaviour; pass a newline to keep the last word of
            one page from fusing with the first word of the next.

    Returns:
        The text of all pages, in page order, joined by ``page_separator``.
        A page for which no text is extracted contributes an empty string.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction must happen while the file is still open.
        # `or ""` guards against a page whose extraction yields None, which
        # would otherwise abort the whole document with a TypeError.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # Join once instead of growing a string with += inside the loop.
    return page_separator.join(page_texts)



In [19]:
# Path of the paper inside the project tree, and its raw extracted text
pdf_path = os.path.join(
    base_path,
    'docs',
    'deeplearning',
    'attention_is_all_you_need.pdf',
)
pdf_text = extract_text_from_pdf(pdf_path)

# Clean the text: run it through the spaCy pipeline and keep every token
# that is not a stop word, re-joined with single spaces
doc = nlp(pdf_text)
cleaned_text = " ".join(tok.text for tok in doc if not tok.is_stop)

In [7]:
cleaned_text

'Attention Need \n Ashish Vaswani\x03 \n Google Brain \n avaswani@google.comNoam Shazeer\x03 \n Google Brain \n noam@google.comNiki Parmar\x03 \n Google Research \n nikip@google.comJakob Uszkoreit\x03 \n Google Research \n usz@google.com \n Llion Jones\x03 \n Google Research \n llion@google.comAidan N. Gomez\x03y \n University Toronto \n aidan@cs.toronto.eduŁukasz Kaiser\x03 \n Google Brain \n lukaszkaiser@google.com \n Illia Polosukhin\x03z \n illia.polosukhin@gmail.com \n Abstract \n dominant sequence transduction models based complex recurrent \n convolutional neural networks include encoder decoder . best \n performing models connect encoder decoder attention \n mechanism . propose new simple network architecture , Transformer , \n based solely attention mechanisms , dispensing recurrence convolutions \n entirely . Experiments machine translation tasks models \n superior quality parallelizable requiring signiﬁcantly \n time train . model achieves 28.4 BLEU WMT 2014 English- \n - Ge

In [8]:
# Naive sentence split on the literal ". " — this also breaks after initials
# such as "N. " (see "Aidan N. Gomez" in the cleaned text), so the resulting
# pieces are only approximate sentences.
sentences = cleaned_text.split(". ")

In [20]:
len(sentences)

373

# Character Splitting

In [7]:
text = "This is the text I would like to chunk up. It is the example text for this exercise"

chunk_size = 35  # Characters

# Slice the text into consecutive, non-overlapping windows of chunk_size
# characters; the final window may be shorter than chunk_size
chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
chunks

['This is the text I would like to ch',
 'unk up. It is the example text for ',
 'this exercise']

In [10]:
chunk_size = 1048  # Characters

# Same fixed-width slicing as the toy example, applied to the cleaned paper
# text: consecutive, non-overlapping windows of chunk_size characters
chunks = [
    cleaned_text[start:start + chunk_size]
    for start in range(0, len(cleaned_text), chunk_size)
]
chunks

['Attention Need \n Ashish Vaswani\x03 \n Google Brain \n avaswani@google.comNoam Shazeer\x03 \n Google Brain \n noam@google.comNiki Parmar\x03 \n Google Research \n nikip@google.comJakob Uszkoreit\x03 \n Google Research \n usz@google.com \n Llion Jones\x03 \n Google Research \n llion@google.comAidan N. Gomez\x03y \n University Toronto \n aidan@cs.toronto.eduŁukasz Kaiser\x03 \n Google Brain \n lukaszkaiser@google.com \n Illia Polosukhin\x03z \n illia.polosukhin@gmail.com \n Abstract \n dominant sequence transduction models based complex recurrent \n convolutional neural networks include encoder decoder . best \n performing models connect encoder decoder attention \n mechanism . propose new simple network architecture , Transformer , \n based solely attention mechanisms , dispensing recurrence convolutions \n entirely . Experiments machine translation tasks models \n superior quality parallelizable requiring signiﬁcantly \n time train . model achieves 28.4 BLEU WMT 2014 English- \n - G

In [11]:
len(chunks)

33

In [12]:

from langchain.text_splitter import CharacterTextSplitter

# separator='' makes the splitter cut purely by character count, and
# strip_whitespace=False keeps leading/trailing spaces inside each chunk,
# so the result matches the manual 35-character slicing
text_splitter = CharacterTextSplitter(
    chunk_size=35,
    chunk_overlap=0,
    separator='',
    strip_whitespace=False,
)


In [13]:
text_splitter.create_documents([text])

[Document(page_content='This is the text I would like to ch'),
 Document(page_content='unk up. It is the example text for '),
 Document(page_content='this exercise')]

In [15]:
# Same character-count splitter, now with 512-character chunks, applied to
# the cleaned paper text
text_splitter = CharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=0,
    separator='',
    strip_whitespace=False,
)
text_splitter.create_documents([cleaned_text])

[Document(page_content='Attention Need \n Ashish Vaswani\x03 \n Google Brain \n avaswani@google.comNoam Shazeer\x03 \n Google Brain \n noam@google.comNiki Parmar\x03 \n Google Research \n nikip@google.comJakob Uszkoreit\x03 \n Google Research \n usz@google.com \n Llion Jones\x03 \n Google Research \n llion@google.comAidan N. Gomez\x03y \n University Toronto \n aidan@cs.toronto.eduŁukasz Kaiser\x03 \n Google Brain \n lukaszkaiser@google.com \n Illia Polosukhin\x03z \n illia.polosukhin@gmail.com \n Abstract \n dominant sequence transduction models based complex recurrent \n co'),
 Document(page_content='nvolutional neural networks include encoder decoder . best \n performing models connect encoder decoder attention \n mechanism . propose new simple network architecture , Transformer , \n based solely attention mechanisms , dispensing recurrence convolutions \n entirely . Experiments machine translation tasks models \n superior quality parallelizable requiring signiﬁcantly \n time train .

In [9]:

from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader

# LlamaIndex sentence splitter (sizes are presumably measured in tokens —
# TODO confirm against the SentenceSplitter documentation)
splitter = SentenceSplitter(chunk_size=200, chunk_overlap=15)

# Load the same PDF through LlamaIndex's file reader
documents = SimpleDirectoryReader(input_files=[pdf_path]).load_data()

In [10]:
# Split the loaded documents into nodes using the splitter configured above
nodes = splitter.get_nodes_from_documents(documents)

In [11]:
nodes[0]

TextNode(id_='2630c59a-9fc7-487a-8931-39836067af27', embedding=None, metadata={'page_label': '1', 'file_name': 'attention_is_all_you_need.pdf', 'file_path': 'c:\\Users\\User\\Documents\\Cursos\\llm_zoomcamp_final_project\\docs\\deeplearning\\attention_is_all_you_need.pdf', 'file_type': 'application/pdf', 'file_size': 2201700, 'creation_date': '2024-08-25', 'last_modified_date': '2023-07-06'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='191928a0-2e22-4758-a151-a8852a53d894', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'attention_is_all_you_need.pdf', 'file_path': 'c:\\Users\\User\\Documents\\Cursos\\llm_zoomcamp_final_project\\docs\\deeplearning\\attention_is_all_you_n

## Recursive Character Text Splitting

Let's jump a level of complexity.

The problem with Level #1 (character splitting) is that we don't take into account the structure of our document at all. We simply split by a fixed number of characters.

The Recursive Character Text Splitter helps with this. With it, we'll specify a series of separators which will be used to split our docs.

LangChain's default separators are listed below. Let's take a look at them one by one.

* "\n\n" - Double new line, or most commonly paragraph breaks
* "\n" - New lines
* " " - Spaces
* "" - Characters

I'm not sure why a period (".") isn't included on the list, perhaps it is not universal enough? If you know, let me know.

This is the Swiss Army knife of splitters and my first choice when mocking up a quick application. If you don't know which splitter to start with, this is a good first bet.

Let's try it out

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [13]:
# Multi-paragraph sample passage for the recursive splitter demo.
# NOTE: this rebinds `text`, replacing the short example string defined in
# the character-splitting section.
text = """
One of the most important things I didn't understand about the world when I was a child is the degree to which the returns for performance are superlinear.

Teachers and coaches implicitly told us the returns were linear. "You get out," I heard a thousand times, "what you put in." They meant well, but this is rarely true. If your product is only half as good as your competitor's, you don't get half as many customers. You get no customers, and you go out of business.

It's obviously true that the returns for performance are superlinear in business. Some think this is a flaw of capitalism, and that if we changed the rules it would stop being true. But superlinear returns for performance are a feature of the world, not an artifact of rules we've invented. We see the same pattern in fame, power, military victories, knowledge, and even benefit to humanity. In all of these, the rich get richer. [1]
"""

In [14]:

# Small chunks (65 characters, no overlap) so the split points are easy to see
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=65,
    chunk_overlap=0,
)

In [15]:
text_splitter.create_documents([text])

[Document(page_content="One of the most important things I didn't understand about the"),
 Document(page_content='world when I was a child is the degree to which the returns for'),
 Document(page_content='performance are superlinear.'),
 Document(page_content='Teachers and coaches implicitly told us the returns were linear.'),
 Document(page_content='"You get out," I heard a thousand times, "what you put in." They'),
 Document(page_content='meant well, but this is rarely true. If your product is only'),
 Document(page_content="half as good as your competitor's, you don't get half as many"),
 Document(page_content='customers. You get no customers, and you go out of business.'),
 Document(page_content="It's obviously true that the returns for performance are"),
 Document(page_content='superlinear in business. Some think this is a flaw of'),
 Document(page_content='capitalism, and that if we changed the rules it would stop being'),
 Document(page_content='true. But superlinear returns for

In [21]:

# Recursive splitter on the raw (uncleaned) PDF text, with 450-character
# chunks and no overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
)
text_splitter.create_documents([pdf_text])

[Document(page_content='Attention Is All You Need\nAshish Vaswani\x03\nGoogle Brain\navaswani@google.comNoam Shazeer\x03\nGoogle Brain\nnoam@google.comNiki Parmar\x03\nGoogle Research\nnikip@google.comJakob Uszkoreit\x03\nGoogle Research\nusz@google.com\nLlion Jones\x03\nGoogle Research\nllion@google.comAidan N. Gomez\x03y\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser\x03\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin\x03z\nillia.polosukhin@gmail.com\nAbstract'),
 Document(page_content='The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions'),
 Document(page_content='entirely. Experiments on two machine translation tasks show these mod

In [20]:
pdf_text

'Attention Is All You Need\nAshish Vaswani\x03\nGoogle Brain\navaswani@google.comNoam Shazeer\x03\nGoogle Brain\nnoam@google.comNiki Parmar\x03\nGoogle Research\nnikip@google.comJakob Uszkoreit\x03\nGoogle Research\nusz@google.com\nLlion Jones\x03\nGoogle Research\nllion@google.comAidan N. Gomez\x03y\nUniversity of Toronto\naidan@cs.toronto.eduŁukasz Kaiser\x03\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin\x03z\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer,\nbased solely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to\nbe superior in quality while being more parallelizable and requirin