<a href="https://colab.research.google.com/github/devbhise/pdf_extractor/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install necessary libraries
!pip install PyMuPDF transformers sentence-transformers spacy

import fitz  # PyMuPDF for PDF parsing
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import spacy

# Initialize models
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
nlp = spacy.load("en_core_web_sm")
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Define keywords for different sections (customize based on investor focus)
keywords = {
    "growth_prospects": ["growth", "outlook", "prospects"],
    "business_changes": ["acquisition", "strategy", "restructuring", "business changes"],
    "key_triggers": ["risk", "opportunity", "trigger", "impact"],
    "material_effects": ["financial performance", "earnings", "profit", "revenue"]
}

# Function to parse PDF and split into chunks
def parse_pdf_to_chunks(file_path, chunk_size=1000):
    text_chunks = []
    with fitz.open(file_path) as pdf:
        text = ""
        for page_num in range(pdf.page_count):
            text += pdf[page_num].get_text("text")
            if len(text) >= chunk_size:
                text_chunks.append(text)
                text = ""
        if text:  # Add any remaining text
            text_chunks.append(text)
    return text_chunks

# Function to identify key themes in each chunk based on similarity
def detect_theme(chunk, keywords):
    theme = {}
    chunk_embeddings = embedder.encode(chunk, convert_to_tensor=True)

    for section, kw_list in keywords.items():
        matched_text = ""
        for keyword in kw_list:
            keyword_embedding = embedder.encode(keyword, convert_to_tensor=True)
            cosine_scores = util.cos_sim(keyword_embedding, chunk_embeddings).flatten()
            threshold = 0.3  # Adjust based on keyword relevance
            matched_text += " ".join([chunk.splitlines()[i] for i in range(len(cosine_scores)) if cosine_scores[i] > threshold])
        theme[section] = matched_text
    return theme

# Summarization function for each chunk
def summarize_chunk(chunk, min_length=50, max_length=150):
    try:
        summary = summarizer(chunk, min_length=min_length, max_length=max_length, do_sample=False)
        return summary[0]['summary_text']
    except Exception as e:
        return "Summary not available"

# Extract entities and relevant insights
def extract_entities_and_sentiment(chunk):
    doc = nlp(chunk)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Main function to process large document in chunks
def analyze_pdf_in_chunks(file_path):
    chunks = parse_pdf_to_chunks(file_path)
    report = {}

    for i, chunk in enumerate(chunks):
        theme_content = detect_theme(chunk, keywords)
        chunk_summary = summarize_chunk(chunk)
        entities = extract_entities_and_sentiment(chunk)

        # Organize output by chunk and theme
        report[f"Chunk_{i+1}"] = {
            "summary": chunk_summary,
            "themes": theme_content,
            "entities": entities
        }

    return report

# Path to PDF
file_path = '/content/SJS Transcript Call.pdf'  # Update with your file path

# Run analysis
report = analyze_pdf_in_chunks(file_path)

# Display report
for chunk, data in report.items():
    print(f"\n--- {chunk} ---")
    print("Summary:", data["summary"])
    for theme, content in data["themes"].items():
        print(f"{theme.capitalize()}:", content[:300] + "...")  # Preview each theme
    print("Entities:", data["entities"])


Collecting PyMuPDF
  Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.13-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.24.13


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


--- Chunk_1 ---
Summary:  transcripts of Analysts/Investor Meet/ Earnings Call of Q1 FY 2023-24 held on July 27, 2023. You are requested to kindly take the same on record. For S.J.S. Enterprises Limited, Thabraz Hushain W. is the Company Secretary and Compliance Officer.
Growth_prospects:  ...
Business_changes: ...
Key_triggers: ...
Material_effects:  ...
Entities: [('August 03', 'DATE'), ('2023', 'DATE'), ('National Stock Exchange of India Limited \nExchange', 'ORG'), ('5th Floor', 'FAC'), ('G Block', 'ORG'), ('Bandra', 'PERSON'), ('Kurla Complex', 'PERSON'), ('Bandra', 'PERSON'), ('Mumbai', 'GPE'), ('051', 'CARDINAL'), ('SJS', 'ORG'), ('BSE Limited \nCorporate Relationship Department', 'ORG'), ('2nd Floor', 'ORG'), ('Rotunda Building', 'PERSON'), ('P.J. Towers', 'ORG'), ('Dalal Street', 'FAC'), ('Mumbai', 'GPE'), ('400', 'CARDINAL'), ('ISIN', 'ORG'), ('Dear Sir/Madam', 'WORK_OF_ART'), ('Q1', 'CARDINAL'), ('2023-24', 'DATE'), ('the Analysts/Investor Meet/ Earnings Call of Q1 FY', 'EV