In [None]:
pip install pypdf nltk spacy

In [None]:
!python -m spacy download en_core_web_sm 

In [None]:
!python -m spacy download en_core_web_trf

In [2]:
# importing required modules 
from pypdf import PdfReader 

# creating a pdf reader object 
reader = PdfReader('../embed_docs/Player_s Handbook.pdf') 

# printing number of pages in pdf file 
print(len(reader.pages)) 


336



In [None]:

# getting a specific page from the pdf file 
page = reader.pages[52] 

# extracting text from page 
text = page.extract_text() 
print(text) 

In [2]:
from pypdf import PdfReader 
import nltk

file = '../embed_docs/Player_s Handbook.pdf'

def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    The file is opened in binary mode and handed to ``PdfReader``; each page's
    ``extract_text()`` result is collected in page order and the pieces are
    joined with a single space.
    """
    with open(file_path, 'rb') as pdf_stream:
        document = PdfReader(pdf_stream)
        page_texts = [pdf_page.extract_text() for pdf_page in document.pages]
        combined = " ".join(page_texts)
    return combined

# Extract the full text of the PDF and report its length in characters
text = extract_text_from_pdf(file)
print(len(text))

2001684


In [3]:
# Print a fixed character window (indices 31015 up to 33037) of the extracted
# text for manual inspection.
sample = text[31015:33037]
print(sample)

you make is going to be modified 
based on your character’s abilities. A tough character has 
a better chance of surviving a wyvern’s poison sting. A 
perceptive character is more likely to notice bugbears 
sneaking up from behind. A stupid character is not as 
likely to find a secret door that leads to a hidden treasure 
chamber. Your ability scores tell you what your modifiers are for rolls such as these. 
Your character has six abilities: Strength (abbreviated Str), Dex-
terity (Dex), Constitution (Con), Intelligence (Int), Wisdom (Wis), and Charisma (Cha). Each of your character’s above-average abilities gives you a benefit on certain die rolls, and each below-average ability gives you a disadvantage on other die rolls. When creating your character, you roll your scores randomly, assign them to the abilities as you like, and raise and lower them according to the character’s race. Later, you can increase them as your character advances in experience.  
ABILITY SCORES 
To create an a

In [4]:
# Splitting Text into Sentences
def split_text_into_sentences(text):
    """Split ``text`` into sentences and return whatever ``nltk.sent_tokenize`` produces."""
    return nltk.sent_tokenize(text)

sentences = split_text_into_sentences(text)
print(len(sentences))

16731


This gets us text, sure, but what do we do with it?

In [21]:
import ollama
from pprint import pprint
# Ask Ollama which models it reports and print the name of each one.
model_list = ollama.list()
for model in model_list['models']:
    model_name = model['model']  # Each entry is subscripted with the 'model' key to get its name
    pprint(model_name)

'mistral:7b'
'mistral:latest'
'mxbai-embed-large:v1'
'phi:2.7b'


Let's try summarizing the handbook page by page, using the page objects from the PDF reader.

In [39]:
# importing required modules 
from pypdf import PdfReader 

# creating a pdf reader object 
reader = PdfReader('../embed_docs/Player_s Handbook.pdf') 
pages = reader.pages
page = pages[42]
print(page.extract_text(extraction_mode="layout"))

Rotated text discovered. Output will be incomplete.


                     Wholeness of Body (Su): At 7th level or higher, a monk can heal                                Ex-Monks
                 her own wounds. She can heal a number of hit points of damage                                         A monk who becomes nonlawful cannot gain new levels as a monk
                 equal to twice her current monk level each day, and she can spread                                    but retains all monk abilities.
                 this healing out among several uses.                                                                      Like a member of any other class, a monk may be a multiclass
                     Improved Evasion (Ex): At 9th level, a monk’s evasion ability                                     character, but multiclass monks face a special restriction. A monk
                 improves. She still takes no damage on a successful Reflex saving                                     who gains a new class or (if already multiclass) raise

In [None]:
import ollama
from pprint import pprint
from typing import List 

def summarize_from_pages(pages, model='mistral:7b',
                         task="Summarize this in one to three sentences.") -> List:
    """Run ``task`` against the layout-mode text of each page and collect the replies.

    Args:
        pages: Iterable of page objects exposing
            ``extract_text(extraction_mode=...)``.
        model: Name of the Ollama model to query. Defaults to the model this
            function previously had hard-coded.
        task: Instruction appended to every prompt. Defaults to the
            previously hard-coded summarization request.

    Returns:
        A list holding the unmodified ``ollama.generate`` result for each
        page, in page order. An empty ``pages`` gives an empty list.
    """
    responses = []
    for page in pages:
        page_text = page.extract_text(extraction_mode="layout")
        responses.append(
            ollama.generate(
                model=model,
                prompt=f"Using the following context: {page_text}, perform this task: {task}",
            )
        )
    return responses

task_response = summarize_from_pages(pages)


In [55]:

pprint(task_response[42])

{'context': [733,
             16289,
             28793,
             28705,
             9616,
             272,
             2296,
             2758,
             28747,
             359,
             355,
             943,
             12115,
             409,
             302,
             16250,
             325,
             5173,
             1329,
             1794,
             28705,
             28787,
             362,
             2184,
             442,
             4337,
             28725,
             264,
             1326,
             28729,
             541,
             21385,
             359,
             1417,
             28705,
             1529,
             28733,
             7115,
             2285,
             13,
             359,
             559,
             1216,
             24741,
             28723,
             985,
             541,
             21385,
             264,
             1474,
             302,
             4067,
             3569

In [None]:
# Show only the value stored under the 'response' key of each collected reply.
for task in task_response:
    pprint(task['response'])


Let's build a pipeline

In [15]:
pip install text_chunker tqdm numpy

Note: you may need to restart the kernel to use updated packages.


In [None]:
from pypdf import PdfReader 
from tqdm import tqdm

file = '../embed_docs/Player_s Handbook.pdf'

def extract_text_from_pdf(file_path):
    """Read the PDF at ``file_path`` and return all page text joined by single spaces."""
    with open(file_path, 'rb') as handle:
        pieces = []
        for pdf_page in PdfReader(handle).pages:
            pieces.append(pdf_page.extract_text())
        return " ".join(pieces)

# Extract the full text of the PDF
text = extract_text_from_pdf(file)

In [16]:
import numpy as np
import spacy

nlp = spacy.load('en_core_web_sm')
nlp.max_length = 30000000


def process(text):
    """Split ``text`` into sentences with the module-level ``nlp`` and embed them.

    Args:
        text: The string passed to ``nlp``.

    Returns:
        A ``(sents, vecs)`` tuple. ``sents`` is the list of sentence spans
        from ``doc.sents``. ``vecs`` is an array with one row per sentence
        holding ``sent.vector`` divided by ``sent.vector_norm``. A sentence
        whose norm is zero gets an all-zero row instead of NaN/inf values.
        When ``text`` yields no sentences, ``vecs`` is an empty array of
        shape ``(0, 0)`` rather than ``np.stack`` raising ``ValueError``.
    """
    doc = nlp(text)
    sents = list(doc.sents)
    if not sents:
        # np.stack refuses an empty sequence, so short-circuit here.
        return sents, np.empty((0, 0))

    rows = []
    for sent in sents:
        norm = sent.vector_norm
        # Dividing by a zero norm would poison later dot products with NaN.
        rows.append(sent.vector / norm if norm else np.zeros_like(sent.vector))
    vecs = np.stack(rows)

    return sents, vecs

def cluster_text(sents, vecs, threshold):
    """Group consecutive sentence indices into clusters.

    Sentence ``i`` opens a new cluster when the dot product of ``vecs[i]``
    and ``vecs[i-1]`` is below ``threshold``; otherwise it joins the cluster
    of the previous sentence.

    Args:
        sents: Sequence of sentences; only its length is used.
        vecs: Indexable collection of vectors, one per sentence.
        threshold: Minimum dot product for two neighbours to stay together.

    Returns:
        A list of clusters, each a list of consecutive integer indices into
        ``sents``. Empty input gives an empty list, so no cluster ever refers
        to an index that does not exist.
    """
    if len(sents) == 0:
        return []

    clusters = [[0]]
    for i in range(1, len(sents)):
        if np.dot(vecs[i], vecs[i - 1]) < threshold:
            clusters.append([])
        clusters[-1].append(i)

    return clusters

def clean_text(text):
    """Placeholder cleaning hook: currently hands ``text`` back unchanged."""
    # No cleaning steps exist yet; add them here.
    cleaned = text
    return cleaned

# Accumulators for the accepted chunks: each text appended to final_texts has
# its character length appended to clusters_lens at the same time.
clusters_lens = []
final_texts = []

# Sentence-split and embed the whole extracted text; 0.3 is the similarity
# threshold for the first clustering pass.
threshold = 0.3
sents, vecs = process(text)

# First pass: group consecutive sentences using the threshold above.
clusters = cluster_text(sents, vecs, threshold)

for cluster in clusters:
    # Rebuild the cluster's text by joining its sentences with single spaces.
    cluster_txt = clean_text(' '.join([sents[i].text for i in cluster]))
    cluster_len = len(cluster_txt)
    
    # Clusters shorter than 60 characters are discarded.
    if cluster_len < 60:
        continue
    
    # Clusters longer than 3000 characters are re-processed and re-clustered
    # with a stricter threshold.
    elif cluster_len > 3000:
        # NOTE(review): this rebinds the module-level `threshold`. `clusters`
        # was already computed above, so this loop is unaffected, but the name
        # holds 0.6 once a long cluster has been seen.
        threshold = 0.6
        sents_div, vecs_div = process(cluster_txt)
        reclusters = cluster_text(sents_div, vecs_div, threshold)
        
        for subcluster in reclusters:
            div_txt = clean_text(' '.join([sents_div[i].text for i in subcluster]))
            div_len = len(div_txt)
            
            # NOTE(review): sub-clusters outside 60..3000 characters are
            # dropped, so their text never reaches final_texts.
            if div_len < 60 or div_len > 3000:
                continue
            
            clusters_lens.append(div_len)
            final_texts.append(div_txt)
            
    else:
        # Length is within 60..3000 characters: keep the cluster as it is.
        clusters_lens.append(cluster_len)
        final_texts.append(cluster_txt)

In [22]:
import ollama
from pprint import pprint
from typing import List 

def summarize_from_chunks(chunks, model='mistral:7b',
                          task="Summarize this in one to three sentences.") -> List:
    """Run ``task`` against each text chunk and collect the replies.

    Each chunk is interpolated into the prompt as-is. The previous version
    looped over ``enumerate(chunks)``, so the prompt contained the repr of an
    ``(index, text)`` tuple instead of the text.

    Args:
        chunks: Iterable of text chunks.
        model: Name of the Ollama model to query. Defaults to the model this
            function previously had hard-coded.
        task: Instruction appended to every prompt. Defaults to the
            previously hard-coded summarization request.

    Returns:
        A list holding the unmodified ``ollama.generate`` result for each
        chunk, in input order. An empty ``chunks`` gives an empty list.
    """
    responses = []
    for chunk in chunks:
        responses.append(
            ollama.generate(
                model=model,
                prompt=f"Using the following context: {chunk}, perform this task: {task}",
            )
        )
    return responses

task_response = summarize_from_chunks(final_texts)


In [32]:
import ollama
from pprint import pprint
from typing import List 
from tqdm import tqdm

def chunk_and_summarize(chunks, model='mistral:7b',
                        task="Summarize this in one to three sentences.") -> List:
    """Summarize each text chunk and pair every summary with its source chunk.

    The previous version looped over ``enumerate(chunks)``, so an
    ``(index, text)`` tuple was both sent in the prompt and stored under
    ``'chunk'``. The chunk is now used directly. Passing ``chunks`` itself to
    ``tqdm`` also lets the progress bar show a total for sized inputs.

    Args:
        chunks: Iterable of text chunks.
        model: Name of the Ollama model to query. Defaults to the model this
            function previously had hard-coded.
        task: Instruction appended to every prompt. Defaults to the
            previously hard-coded summarization request.

    Returns:
        A list of dicts in input order. Each has ``'summary'`` (the
        ``'response'`` value of the model's reply) and ``'chunk'`` (the chunk
        that was summarized).
    """
    summaries_and_chunks = []
    for chunk in tqdm(chunks):
        summary = ollama.generate(
            model=model,
            prompt=f"Using the following context: {chunk}, perform this task: {task}",
        )
        summaries_and_chunks.append({'summary': summary['response'], 'chunk': chunk})
    return summaries_and_chunks

s_and_c = chunk_and_summarize(final_texts)

3988it [51:13,  1.30it/s]


In [None]:
pprint(s_and_c[350])
