In [2]:
import pdfplumber
import tiktoken

In [61]:
def pdf_to_token_chunks(pdf_path, tokenizer, max_tokens=100):
    """
    Extracts the text of a PDF file and splits it into token-limited chunks.

    Every page is read with pdfplumber, pages that yield no text are skipped,
    and the combined text is passed to `chunking`, which does the splitting.

    Parameters:
    - pdf_path (str): Path to the input PDF file.
    - tokenizer (tiktoken.Encoding): A tiktoken tokenizer instance.
    - max_tokens (int): Maximum number of tokens per chunk.

    Returns:
    - chunks (list): List of text chunks, as returned by `chunking`.
    """
    # Extract the text of every page while the PDF is still open
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]

    # Drop pages without extractable text and join once instead of growing a
    # string page by page. The newline after each page keeps the last line of
    # one page from running into the first line of the next.
    full_text = "".join(text + "\n" for text in page_texts if text)

    return chunking(full_text, tokenizer, max_tokens=max_tokens)



In [63]:
def text_to_token_chunks(txt_path, tokenizer, max_tokens=100):
    """
    Reads a TXT file and splits its content into token-limited chunks.

    The file is read in full as UTF-8 and the text is passed to `chunking`,
    which does the splitting.

    Parameters:
    - txt_path (str): Path to the input txt file.
    - tokenizer (tiktoken.Encoding): A tiktoken tokenizer instance.
    - max_tokens (int): Maximum number of tokens per chunk.

    Returns:
    - chunks (list): List of text chunks, as returned by `chunking`.
    """
    # Read the full text from the TXT file
    with open(txt_path, "r", encoding="utf-8") as txt:
        full_text = txt.read()

    return chunking(full_text, tokenizer, max_tokens=max_tokens)



In [62]:
def chunking(full_text, tokenizer, max_tokens=100):
    """
    Packs the lines of a text into chunks of roughly `max_tokens` tokens.

    The text is split on newlines and whole lines are added greedily to the
    current chunk; a line is never split. The current chunk is closed as soon
    as adding the next line would push its token count past `max_tokens`.

    Parameters:
    - full_text (str): Text to split into chunks.
    - tokenizer: Object with an `encode(str)` method returning a sized
      sequence of tokens (e.g. a tiktoken.Encoding).
    - max_tokens (int): Maximum number of tokens per chunk.

    Returns:
    - chunks (list): List of text chunks. Each chunk is stripped of
      surrounding whitespace and ends with exactly one newline. Chunks that
      would contain only whitespace are not emitted.

    Notes:
    - A single line with more than `max_tokens` tokens is kept whole, so the
      chunk holding it exceeds the limit.
    - Tokens are counted line by line; the newlines joining the lines of a
      chunk are not included in the count.
    """
    # The result list is local so every call starts empty and calls do not
    # share state with each other or with the notebook's global namespace.
    chunks = []
    current_lines = []
    current_tokens = 0

    for line in full_text.split('\n'):
        line_tokens = len(tokenizer.encode(line))

        # Close the current chunk if this line would overflow it. Nothing is
        # closed while the chunk is still empty: an oversized first line must
        # not produce an empty chunk in front of it.
        if current_lines and current_tokens + line_tokens > max_tokens:
            chunk_text = '\n'.join(current_lines).strip()
            if chunk_text:
                chunks.append(chunk_text + '\n')  # Ensure the chunk ends with a newline
            current_lines = []
            current_tokens = 0

        # Add the line to the current chunk
        current_lines.append(line)
        current_tokens += line_tokens

    # Add the last chunk if any text remains
    chunk_text = '\n'.join(current_lines).strip()
    if chunk_text:
        chunks.append(chunk_text + '\n')

    return chunks


In [64]:
def save_chunks_to_file(chunks, output_path):
    """
    Writes text chunks to a UTF-8 file, each followed by a "---" separator line.

    Parameters:
    - chunks (iterable of str): Text chunks to write, in order.
    - output_path (str): Path of the file to write; it is opened in 'w' mode,
      so existing content is replaced.
    """
    # Separator appended after every chunk, for readability
    separator = "\n---\n"

    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.writelines(piece + separator for piece in chunks)

In [65]:
# Input: script PDF to be chunked (relative path)
pdf_path = "data/raw/deadpool/deadpool-2016.pdf"

# Output file for tokenized chunks
output_path = "data/chunks/deadpool/deadpool_script_token_chunks.txt"

# Maximum number of tokens per chunk
max_tokens = 512  # Adjust as needed

In [66]:
# Initialize the tokenizer: tiktoken's "cl100k_base" encoding, which the
# chunking helpers use to count the tokens of each line
tokenizer = tiktoken.get_encoding("cl100k_base")


In [67]:
# Chunk the script PDF into pieces of at most `max_tokens` tokens
deadpool_token_chunks = pdf_to_token_chunks(pdf_path, tokenizer, max_tokens=max_tokens)


In [68]:
# Number of chunks produced
len(deadpool_token_chunks)

174

In [71]:
# Inspect the second chunk (index 1) to sanity-check the chunk boundaries
print(deadpool_token_chunks[1])

DEADPOOL
It’s like Christmas Day, Dopinder. Been
waiting one thousand eight hundred twenty-
two days, three hours...
(checks ‘Adventure Time’
watch)
...and thirty-six minutes for this shit.
(CONTINUED)
Deadpool Final Shooting Script 11/16/15 2.
1 CONTINUED: 1
DEADPOOL turns himself RIGHT-SIDE-UP in the front seat. He
is YOKED to the gills and ARMED to the teeth. TWIN KATANAS.
TWIN DESERT EAGLE .50 CALIBER PISTOLS.
Deadpool grabs Dopinder’s OPEN BAG of CORN NUTS. Dopinder
isn’t quick enough to stop him. Deadpool gazes out the
window onto the city - a teeming, sooty urban sprawl that
looks almost... pre-post-apocalyptic.
Deadpool turns up his MASK. Dopinder catches a GLIMPSE of
the bottom of a SCARRED face. And quickly looks AWAY.
Deadpool eats the CORN NUTS. CRUNCH. CRUNCH. Points.
DEADPOOL (CONT’D)
Nice.
Dopinder eyes his DAFFODIL DAYDREAM AIR FRESHENER and takes a
deep breath through his nose.
DOPINDER
Smells good, no?
DEADPOOL
Not the Daffodil Daydream. The girl.
A PICTURE of a young

In [43]:
# Input: plain-text file to be chunked (relative path); presumably the
# movie's subtitles, judging by the file name
txt_path = "data/raw/deadpool/deadpool_2016_subs.txt"

In [44]:
# Chunk the text file with the same tokenizer and token limit.
# NOTE(review): this rebinds `deadpool_token_chunks`, the name that also holds
# the PDF-derived chunks, so whichever cell ran last determines its content.
deadpool_token_chunks = text_to_token_chunks(txt_path, tokenizer, max_tokens=max_tokens)


In [47]:
# Inspect the fourth chunk (index 3) to sanity-check the chunk boundaries
print(deadpool_token_chunks[3])

- (MACHINE-GUN FIRING)
- Oh!

(GROANS) Four...

- (GUN FIRES)
- (GROANS)

Gotcha.

(GRUNTS)

Right up main street.

Three, two!

Stupid! Worth it.

(GUNFIRE CONTINUES)

(GUN FIRES)

(ALL GROANING)

(CLINKS)

(SNIFFING)

Ah!

I'm touching myself tonight.

Francis!

Francis...

What the shit-biscuit!

Where you at, Francis?

(GROANING)

(GROANING LOUDLY)

You're not Francis.

Really?
Rolling up the sleeves?

(GROANS)

WADE:
You're probably thinking,

"My boyfriend said
this was a superhero movie...

"but that guy in the red suit
just turned

"that other guy
into a fucking kabab!"

Well, I may be super,
but I'm no hero.

And yeah, technically,
this is a murder.

But some of the best love
stories start with a murder.

And that's exactly
what this is, a love story.

And to tell it right...

I gotta take you back
to long before

I squeezed this ass
into red spandex.

MERCHANT: Look, would it help
if I slow it down for you?

I didn't order the pizza.

Is this 7348 Red Ledge Drive?
Are you Mr.