In [11]:
import pdfplumber
import os
import re
import pandas as pd


## <span style="color: aquamarine;"> Functions</span>

### <span style="color: yellow">Text Extraction</span>


In [12]:
def extract_text_pdfplumber(path):
    """Extract the text of every page of a PDF into a single string.

    Each page that yields text is followed by a ``--- Page N ---`` marker
    line, where N is the 1-based page number. Pages for which
    ``extract_text()`` returns nothing are skipped entirely (no marker is
    written for them). The total page count is printed as a side effect.

    Args:
        path: Location of the PDF; passed unchanged to ``pdfplumber.open``.

    Returns:
        The concatenated page texts and markers, or ``""`` when no page
        produced any text.
    """
    pieces = []

    with pdfplumber.open(path) as pdf:
        pages = pdf.pages
        print(f"Total pages: {len(pages)}")

        for number, page in enumerate(pages, start=1):
            content = page.extract_text()
            if not content:
                # Nothing extractable on this page: skip it, marker included.
                continue
            pieces.append(content)
            pieces.append(f"\n--- Page {number} ---\n")

    return "".join(pieces)


### <span style="color: yellow">Text Cleaning</span>


In [13]:
def clean_text(text, remove_page_markers=True):
    """Normalize raw PDF-extracted text.

    Steps, in order:
      1. Optionally drop the ``--- Page N ---`` marker lines.
      2. Remove ``Page X of Y`` headers/footers (case-insensitive) and lines
         that consist only of digits (typically bare page numbers).
      3. Collapse every run of spaces and/or tabs into a single space.
      4. Reduce three or more consecutive line breaks to one blank line.
      5. Replace characters that commonly come out of PDF extraction wrong
         (null bytes, private-use bullet glyph, curly quotes).
      6. Strip leading and trailing whitespace.

    The original and cleaned lengths are printed as a side effect.

    Args:
        text: Raw text to clean. ``None`` or ``""`` is treated as "nothing to
            clean".
        remove_page_markers: When True (default), remove the
            ``--- Page N ---`` marker lines.

    Returns:
        The cleaned text, or ``""`` when ``text`` is empty/None.
    """
    if not text:
        print("No text to clean")
        return ""

    original_length = len(text)
    print(f"Original length: {original_length:,} characters")

    # Remove page markers if requested
    if remove_page_markers:
        text = re.sub(r'\n--- Page \d+ ---\n', '\n', text)

    # Remove common headers / footers
    text = re.sub(r'Page \d+ of \d+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)

    # Collapse horizontal whitespace in ONE pass. Collapsing spaces first and
    # converting tabs afterwards left runs such as "a \t b" as three spaces.
    text = re.sub(r'[ \t]+', ' ', text)

    # Fix newlines (keep paragraph breaks, remove excessive ones)
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)  # Max 2 newlines

    # Remove special characters that might cause issues (optional)
    # Uncomment if you see weird symbols
    # text = re.sub(r'[^\w\s.,!?;:()\-\'\"$%@#&/\\]', '', text)

    # Fix common PDF extraction issues
    replacements = (
        ('\x00', ''),       # Null characters
        ('\uf0b7', '•'),    # Private-use bullet glyph
        ('\u2018', "'"),    # Left single quote
        ('\u2019', "'"),    # Right single quote / apostrophe
        ('\u201c', '"'),    # Left double quote
        ('\u201d', '"'),    # Right double quote
    )
    for broken, fixed in replacements:
        text = text.replace(broken, fixed)

    # Strip leading/trailing whitespace
    text = text.strip()

    cleaned_length = len(text)
    print(f"Cleaned length: {cleaned_length:,} characters")
    print(f"Removed: {original_length - cleaned_length:,} characters")

    return text

### <span style="color: yellow">Text Chunking</span>

#### <span style="color: pink">This step splits the text into chunks of approximately `chunk_size` words. Parameters: `text` (the text to chunk), `chunk_size` (target number of words per chunk), and `overlap` (number of words shared between consecutive chunks).</span>

In [14]:
def chunk_text(text, chunk_size=600, overlap=100):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops as soon as a chunk reaches the end of the text, which avoids a
    trailing chunk made up solely of words already present in the previous
    chunk.

    Args:
        text: Text to chunk; it is tokenized with ``str.split()``.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces); an
        empty list when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size`` (the window would
            never advance, or would skip words).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap

    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        if end >= len(words):
            # This chunk already contains the final word; another window
            # would only repeat part of the overlap.
            break
    return chunks


In [15]:
# Quick sanity check of the chunker on synthetic input: the 5-word sentence
# repeated 1,000 times gives 5,000 words, chunked with an 80-word stride
# (chunk_size 100 minus overlap 20).
sample_text = "This is a sample text. " * 1000
sample_chunks = chunk_text(
    sample_text,
    chunk_size=100,
    overlap=20,
)
print(f"Created {len(sample_chunks)} chunks from sample text")

Created 63 chunks from sample text


### <span style="color: yellow">Storing Documents</span>


In [16]:
def create_document_store(cleaned_texts, company_name="delta", chunk_size=600, overlap=100):
    """Chunk every document and collect the chunks into one DataFrame.

    Each document is split with ``chunk_text``; every resulting chunk becomes
    one row. Chunk ids are numbered consecutively across all documents,
    starting at ``chunk_001`` (zero-padded to at least three digits).
    Progress and a summary are printed as a side effect.

    Args:
        cleaned_texts: Mapping of source-file name -> cleaned document text.
        company_name: Value stored in the ``company`` column of every row.
        chunk_size: Forwarded to ``chunk_text`` (target words per chunk).
        overlap: Forwarded to ``chunk_text`` (words shared between chunks).

    Returns:
        A DataFrame with the columns ``chunk_id``, ``company``,
        ``source_file`` and ``chunk_text``. The columns are present even when
        no chunks were produced, so column lookups on the result do not fail
        for empty input.
    """
    columns = ['chunk_id', 'company', 'source_file', 'chunk_text']
    all_chunks = []

    print("\nCreating document store...")
    print("="*60)

    for source_file, text in cleaned_texts.items():
        print(f"\nProcessing: {source_file}")

        # Chunk the text
        chunks = chunk_text(text, chunk_size=chunk_size, overlap=overlap)
        print(f"  Created {len(chunks)} chunks")

        # One record per chunk; the id continues across documents.
        for chunk in chunks:
            all_chunks.append({
                'chunk_id': f"chunk_{len(all_chunks) + 1:03d}",
                'company': company_name,
                'source_file': source_file,
                'chunk_text': chunk,
            })

    # Explicit columns keep the schema stable even when there are no rows.
    df = pd.DataFrame(all_chunks, columns=columns)

    print("\n" + "="*60)
    print("✓ Document store created!")
    print(f"  Total chunks: {len(df)}")
    print(f"  Total documents: {len(cleaned_texts)}")
    print("="*60)

    return df

## <span style="color: aquamarine;"> Processing Files</span>

In [17]:
# Short document name -> path of the source PDF under input/.
# Insertion order is the processing order: 10-K reports, ESG reports, then
# the major-holders summary.
input_files = dict(
    annual_report_2020="input/10K Report 2020.pdf",
    annual_report_2021="input/10K Report 2021.pdf",
    annual_report_2022="input/10K Report 2022.pdf",
    annual_report_2023="input/10K Report 2023.pdf",
    annual_report_2024="input/10K Report 2024.pdf",
    esg_report_2020="input/ESG Report 2020.pdf",
    esg_report_2021="input/ESG Report 2021.pdf",
    esg_report_2022="input/ESG Report 2022.pdf",
    esg_report_2023="input/ESG Report 2023.pdf",
    esg_report_2024="input/ESG Report 2024.pdf",
    major_holders="input/Major Holders Summary.pdf",
)

In [18]:
# End-to-end pipeline: extract + clean every PDF, chunk the cleaned texts into
# a document store, print summary statistics, and persist the store.

_RULE = "=" * 60


def _print_banner(title, leading_blank=True):
    """Print `title` framed by two 60-character rules, optionally after a blank line."""
    print(f"\n{_RULE}" if leading_blank else _RULE)
    print(title)
    print(_RULE)


cleaned_texts = {}
os.makedirs("output", exist_ok=True)

# Step 1: extract and clean each PDF, saving the cleaned text to output/
_print_banner("STEP 1: EXTRACTING AND CLEANING PDFs", leading_blank=False)

for file_name, file_path in input_files.items():
    _print_banner(f"Processing: {file_name}")

    # Extract
    raw_text = extract_text_pdfplumber(file_path)

    # Clean
    print("\nCleaning...")
    cleaned_text = clean_text(raw_text)
    cleaned_texts[file_name] = cleaned_text

    # Save cleaned text
    clean_output_path = f"output/{file_name}_clean.txt"
    with open(clean_output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(cleaned_text)
    print(f"✓ Saved as {file_name}_clean.txt")

print("\n✓ All files extracted and cleaned!")

# Step 2: chunk the cleaned texts into the document store
_print_banner("STEP 2: CREATING DOCUMENT STORE (CHUNKING)")

document_store = create_document_store(
    cleaned_texts,
    company_name="delta",
    chunk_size=600,  # Adjust between 500-1000
    overlap=100,     # Adjust overlap as needed
)

# Summary statistics
_print_banner("DOCUMENT STORE STATISTICS")

print(f"\nTotal chunks: {len(document_store)}")
print(f"\nChunks per source file:")
print(document_store['source_file'].value_counts().sort_index())

# Word count of every chunk, computed once and reused for all three figures.
words_per_chunk = document_store['chunk_text'].str.split().str.len()
print(f"\nAverage chunk length: {words_per_chunk.mean():.0f} words")
print(f"Min chunk length: {words_per_chunk.min()} words")
print(f"Max chunk length: {words_per_chunk.max()} words")

# Sample rows and a text preview
_print_banner("SAMPLE CHUNKS")
print(document_store.head(3))

_print_banner("SAMPLE CHUNK TEXT")
print(f"\nFirst 500 characters of chunk_001:")
print(document_store.loc[0, 'chunk_text'][:500])

# Step 3: persist the document store
_print_banner("STEP 3: SAVING DOCUMENT STORE")

os.makedirs("data/delta", exist_ok=True)

# Save as CSV
csv_path = "data/delta/chunks.csv"
document_store.to_csv(csv_path, index=False)
print(f"✓ Saved to CSV: {csv_path}")

# Save as pickle (better for Python)
pkl_path = "data/delta/chunks.pkl"
document_store.to_pickle(pkl_path)
print(f"✓ Saved to Pickle: {pkl_path}")

_print_banner("✓ PIPELINE COMPLETE!")
print(f"\nYou now have:")
print(f"  - {len(cleaned_texts)} cleaned text files in 'output/'")
print(f"  - {len(document_store)} chunks in 'data/delta/chunks.csv'")
print(f"  - Document store ready for retrieval")

STEP 1: EXTRACTING AND CLEANING PDFs

Processing: annual_report_2020
Total pages: 125

Cleaning...
Original length: 456,922 characters
Cleaned length: 454,904 characters
Removed: 2,018 characters
✓ Saved as annual_report_2020_clean.txt

Processing: annual_report_2021
Total pages: 123

Cleaning...
Original length: 438,207 characters
Cleaned length: 436,221 characters
Removed: 1,986 characters
✓ Saved as annual_report_2021_clean.txt

Processing: annual_report_2022
Total pages: 115

Cleaning...
Original length: 410,076 characters
Cleaned length: 408,226 characters
Removed: 1,850 characters
✓ Saved as annual_report_2022_clean.txt

Processing: annual_report_2023
Total pages: 109

Cleaning...
Original length: 381,346 characters
Cleaned length: 379,599 characters
Removed: 1,747 characters
✓ Saved as annual_report_2023_clean.txt

Processing: annual_report_2024
Total pages: 104

Cleaning...
Original length: 363,871 characters
Cleaned length: 362,206 characters
Removed: 1,665 characters
✓ Saved 

### <span style="color: yellow;"> Adjusting Chunk Sizes</span>

In [19]:
# Rebuild the store with two alternative chunking configurations so the
# resulting chunk counts can be compared with the 600-word store built above.

# Smaller chunks: 500 words, 50-word overlap
document_store_small = create_document_store(
    cleaned_texts,
    chunk_size=500,
    overlap=50,
)

# Larger chunks: 1000 words, 150-word overlap
document_store_large = create_document_store(
    cleaned_texts,
    chunk_size=1000,
    overlap=150,
)

# Compare the number of chunks produced by each configuration
print(
    f"Small chunks (500 words): {len(document_store_small)} chunks",
    f"Medium chunks (600 words): {len(document_store)} chunks",
    f"Large chunks (1000 words): {len(document_store_large)} chunks",
    sep="\n",
)


Creating document store...

Processing: annual_report_2020
  Created 155 chunks

Processing: annual_report_2021
  Created 149 chunks

Processing: annual_report_2022
  Created 139 chunks

Processing: annual_report_2023
  Created 129 chunks

Processing: annual_report_2024
  Created 123 chunks

Processing: esg_report_2020
  Created 51 chunks

Processing: esg_report_2021
  Created 54 chunks

Processing: esg_report_2022
  Created 74 chunks

Processing: esg_report_2023
  Created 62 chunks

Processing: esg_report_2024
  Created 52 chunks

Processing: major_holders
  Created 2 chunks

✓ Document store created!
  Total chunks: 990
  Total documents: 11

Creating document store...

Processing: annual_report_2020
  Created 82 chunks

Processing: annual_report_2021
  Created 79 chunks

Processing: annual_report_2022
  Created 74 chunks

Processing: annual_report_2023
  Created 69 chunks

Processing: annual_report_2024
  Created 65 chunks

Processing: esg_report_2020
  Created 27 chunks

Processin

In [None]:
# # Load the document store
# df = pd.read_csv("data/delta/chunks.csv")

# # Check structure
# print("DataFrame Info:")
# print(df.info())

# print("\nFirst few rows:")
# print(df.head())

# print("\nSample chunk:")
# print(f"\nChunk ID: {df.loc[0, 'chunk_id']}")
# print(f"Company: {df.loc[0, 'company']}")
# print(f"Source: {df.loc[0, 'source_file']}")
# print(f"Text preview: {df.loc[0, 'chunk_text'][:300]}...")

In [None]:
#Convert to text files
# raw_annual_report_2020 = extract_text_pdfplumber("input/10K Report 2020.pdf")
# raw_annual_report_2021 = extract_text_pdfplumber("input/10K Report 2021.pdf")
# raw_annual_report_2022 = extract_text_pdfplumber("input/10K Report 2022.pdf")
# raw_annual_report_2023 = extract_text_pdfplumber("input/10K Report 2023.pdf")
# raw_annual_report_2024 = extract_text_pdfplumber("input/10K Report 2024.pdf")

# raw_esg_report_2020 = extract_text_pdfplumber("input/ESG Report 2020.pdf")
# raw_esg_report_2021 = extract_text_pdfplumber("input/ESG Report 2021.pdf")
# raw_esg_report_2022 = extract_text_pdfplumber("input/ESG Report 2022.pdf")
# raw_esg_report_2023 = extract_text_pdfplumber("input/ESG Report 2023.pdf")
# raw_esg_report_2024 = extract_text_pdfplumber("input/ESG Report 2024.pdf")

# raw_major_holders = extract_text_pdfplumber("input/Major Holders Summary.pdf")

Total pages: 125
Total pages: 123
Total pages: 115
Total pages: 109
Total pages: 104
Total pages: 63
Total pages: 73
Total pages: 91
Total pages: 66
Total pages: 59
Total pages: 2


In [None]:

# filename = os.path.splitext(os.path.basename(annual_report_2020))[0]
# clean_output_path = f"output/{annual_report_2020}_clean.txt"

# with open(clean_output_path, 'w', encoding='utf-8') as f:
#     f.write(clean_text(annual_report_2020))

# print(f"\nCleaned text saved to: {clean_output_path}")