In [29]:
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import os
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter


In [30]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\ernan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [31]:
def extract_text_from_pdfs(directory):
    pdf_texts = []
    for filename in os.listdir(directory):
        print(f"Processing: {filename}")
        if filename.endswith('.pdf'):
            file_path = os.path.join(directory, filename)

            with open(file_path, 'rb') as file:
                reader = PdfReader(file)
                print(f"{reader.metadata}\n")
                text = ""
                number_of_pages = len(reader.pages)
                for page_num in range(number_of_pages):
                    page = reader.pages[page_num]
                    text += page.extract_text()
                    pdf_texts.append(text)
    return pdf_texts


In [37]:
directory = '../data'

# Extract text from PDF files
pdf_texts = extract_text_from_pdfs(directory)

print (len(pdf_texts))
count = 0
for text in pdf_texts:
    print(f"Length of text {count}: {len(text)}")
    count += 1


Processing: 2005.11401v4.pdf
{'/Author': '', '/CreationDate': 'D:20210413004838Z', '/Creator': 'LaTeX with hyperref', '/Keywords': '', '/ModDate': 'D:20210413004838Z', '/PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', '/Producer': 'pdfTeX-1.40.21', '/Subject': '', '/Title': '', '/Trapped': '/False'}

Processing: 2106.11517v1.pdf
{'/Producer': 'dvips + GPL Ghostscript GIT PRERELEASE 9.22', '/CreationDate': "D:20210622204906-04'00'", '/ModDate': "D:20210622204906-04'00'", '/Creator': 'LaTeX with hyperref', '/Title': 'A template for the arxiv style', '/Subject': 'q-bio.NC, q-bio.QM', '/Author': 'David S. Hippocampus, Elias D. Striatum', '/Keywords': 'First keyword, Second keyword, More'}

Processing: 2210.02627v1.pdf
{'/Author': '', '/CreationDate': 'D:20221007003320Z', '/Creator': 'LaTeX with hyperref', '/Keywords': '', '/ModDate': 'D:20221007003320Z', '/PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 

In [33]:
sample_text = pdf_texts 

In [41]:
print("\n4. Sentence Tokenization using NLTK")
for text in pdf_texts:
    sentences = sent_tokenize(text)



4. Sentence Tokenization using NLTK


In [44]:
import pandas as pd

data = []
print("\n5. Word Tokenization using NLTK")
for text in pdf_texts:
    data_item = {}
    words = word_tokenize(text)
    data_item['processed'] = words
    data_item['length'] = len(words)
    data_item['original'] = text
    data_item['original length'] = len(text)
    data.append(data_item)

data_df = pd.DataFrame(data)
data_df.columns = ['Processed', 'len', 'Original', 'len']
data_df


5. Word Tokenization using NLTK


Unnamed: 0,Processed,len,Original,len.1
0,"[Retrieval-Augmented, Generation, for, Knowled...",479,Retrieval-Augmented Generation for\nKnowledge-...,2900
1,"[Retrieval-Augmented, Generation, for, Knowled...",1308,Retrieval-Augmented Generation for\nKnowledge-...,7490
2,"[Retrieval-Augmented, Generation, for, Knowled...",2050,Retrieval-Augmented Generation for\nKnowledge-...,11148
3,"[Retrieval-Augmented, Generation, for, Knowled...",2912,Retrieval-Augmented Generation for\nKnowledge-...,15354
4,"[Retrieval-Augmented, Generation, for, Knowled...",3731,Retrieval-Augmented Generation for\nKnowledge-...,19910
5,"[Retrieval-Augmented, Generation, for, Knowled...",4504,Retrieval-Augmented Generation for\nKnowledge-...,24020
6,"[Retrieval-Augmented, Generation, for, Knowled...",5333,Retrieval-Augmented Generation for\nKnowledge-...,28351
7,"[Retrieval-Augmented, Generation, for, Knowled...",5971,Retrieval-Augmented Generation for\nKnowledge-...,31537
8,"[Retrieval-Augmented, Generation, for, Knowled...",6697,Retrieval-Augmented Generation for\nKnowledge-...,35661
9,"[Retrieval-Augmented, Generation, for, Knowled...",7374,Retrieval-Augmented Generation for\nKnowledge-...,39439


In [45]:
print("\n10. Recursive Character Text Splitter for RAG Pipelines")

data = []
print("\n5. Word Tokenization using NLTK")
rec_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=20)
for text in pdf_texts:
    data_item = {}
    words = rec_splitter.split_text(text)
    data_item['processed'] = words
    data_item['length'] = len(words)
    data_item['original'] = text
    data_item['original length'] = len(text)
    data.append(data_item)

data_df = pd.DataFrame(data)
data_df.columns = ['Processed', 'len', 'Original', 'len']
data_df




10. Recursive Character Text Splitter for RAG Pipelines

5. Word Tokenization using NLTK


Unnamed: 0,Processed,len,Original,len.1
0,[Retrieval-Augmented Generation for\nKnowledge...,36,Retrieval-Augmented Generation for\nKnowledge-...,2900
1,[Retrieval-Augmented Generation for\nKnowledge...,93,Retrieval-Augmented Generation for\nKnowledge-...,7490
2,[Retrieval-Augmented Generation for\nKnowledge...,143,Retrieval-Augmented Generation for\nKnowledge-...,11148
3,[Retrieval-Augmented Generation for\nKnowledge...,191,Retrieval-Augmented Generation for\nKnowledge-...,15354
4,[Retrieval-Augmented Generation for\nKnowledge...,253,Retrieval-Augmented Generation for\nKnowledge-...,19910
5,[Retrieval-Augmented Generation for\nKnowledge...,301,Retrieval-Augmented Generation for\nKnowledge-...,24020
6,[Retrieval-Augmented Generation for\nKnowledge...,356,Retrieval-Augmented Generation for\nKnowledge-...,28351
7,[Retrieval-Augmented Generation for\nKnowledge...,395,Retrieval-Augmented Generation for\nKnowledge-...,31537
8,[Retrieval-Augmented Generation for\nKnowledge...,453,Retrieval-Augmented Generation for\nKnowledge-...,35661
9,[Retrieval-Augmented Generation for\nKnowledge...,501,Retrieval-Augmented Generation for\nKnowledge-...,39439


: 

In [38]:

# 4. Sentence Tokenization using NLTK

# 5. Word Tokenization using NLTK

# 10. Recursive Character Text Splitter for RAG Pipelines

# 11. Token Text Splitter for RAG Pipelines
print("\n11. Token Text Splitter for RAG Pipelines")
token_splitter = TokenTextSplitter(chunk_size=50, chunk_overlap=10)
token_splits = token_splitter.split_text(sample_text)
print(token_splits)

# Summary of Splitting Techniques
print("\nSummary of Different Text Splitting Techniques Completed.")



4. Sentence Tokenization using NLTK


TypeError: expected string or bytes-like object, got 'list'

In [8]:
import os
from pypdf import PdfReader

file_path = "../data/2005.11401v4.pdf" 
reader = PdfReader(file_path)
number_of_pages = len(reader.pages)
page = reader.pages[0]
text = page.extract_text()
print(len(text))    


2900
{'/Author': '', '/CreationDate': 'D:20210413004838Z', '/Creator': 'LaTeX with hyperref', '/Keywords': '', '/ModDate': 'D:20210413004838Z', '/PTEX.Fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', '/Producer': 'pdfTeX-1.40.21', '/Subject': '', '/Title': '', '/Trapped': '/False'}



In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250, chunk_overlap=0
)
doc_splits = text_splitter.split_documents([text])

AttributeError: 'str' object has no attribute 'page_content'