<a href="https://colab.research.google.com/github/elifbeyzatok00/GRI_Detection_and_Merger/blob/main/Untitled16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.9 (from pymupdf)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.9 pymupdf-1.24.9


In [4]:
!pip install spacy



In [12]:
# -*- coding: utf-8 -*-
"""NER with Spacy for GRI Index Tables and Page References in PDFs"""

import fitz  # PyMuPDF
import os
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding

# Function to convert color to string
def get_color_str(color):
    return f"rgb({color[0]*255:.0f},{color[1]*255:.0f},{color[2]*255:.0f})"

# Function to extract text and annotations from PDF
def extract_text_and_annotations(pdf_path):
    doc = fitz.open(pdf_path)
    all_text = ""
    entities = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        page_text = page.get_text()
        all_text += page_text

        annotations = page.annots()
        if annotations:
            for annot in annotations:
                if annot.type[0] == 8:  # Highlight annotation
                    color = annot.colors.get('stroke', annot.colors.get('fill', None))
                    if color:
                        color_str = get_color_str(color)
                        text = annot.info.get('content', '')  # Extract highlighted text
                        start = page_text.find(text)
                        if start != -1:
                            end = start + len(text)
                            entity = (start, end, "GRI_Index" if color_str == 'rgb(255,240,102)' else "Page_Ref")
                            entities.append(entity)

    return all_text, entities

# Function to convert data to SpaCy format
def convert_to_spacy_format(text, entities):
    return [(text, {"entities": entities})]

# Prepare training data
directory_path = '/content/labeled'  # Update this path to your labeled dataset directory
training_data = []

pdf_files = [f for f in os.listdir(directory_path) if f.endswith('.pdf')]

for pdf_file in pdf_files:
    pdf_path = os.path.join(directory_path, pdf_file)
    text, entities = extract_text_and_annotations(pdf_path)
    training_data += convert_to_spacy_format(text, entities)

# Load pre-trained model
nlp = spacy.blank("en")

# Create or get NER component
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Add labels
ner.add_label("GRI_Index")
ner.add_label("Page_Ref")
# 'O' is not typically added as a label in SpaCy, entities are not labeled with 'O'

# Training loop
optimizer = nlp.begin_training()

for i in range(10):
    losses = {}
    batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        for text, annotations in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5, losses=losses)
    print(f"Losses at iteration {i}: {losses}")

# Save the model
nlp.to_disk("path_to_save_model")

# Using the trained model
def extract_entities_from_pdf(pdf_path, model):
    doc = fitz.open(pdf_path)
    gri_index_tables = []
    page_references = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")
        spacy_doc = model(text)
        for ent in spacy_doc.ents:
            if ent.label_ == "GRI_Index":
                gri_index_tables.append((page_num + 1, ent.text))
            elif ent.label_ == "Page_Ref":
                page_references.append((page_num + 1, ent.text))

    return gri_index_tables, page_references

# Example usage
unlabeled_pdf_path = '/content/351121_1.pdf'
gri_index_tables, page_references = extract_entities_from_pdf(unlabeled_pdf_path, nlp)

print("GRI Index Tables:")
for page, table in gri_index_tables:
    print(f"Page {page}:\n Table {table}\n")

print("Page References:")
for page, ref in page_references:
    print(f"Page {page}:\n Ref {ref}\n")


Losses at iteration 0: {'ner': 17121.597686767578}
Losses at iteration 1: {'ner': 16628.05062866211}
Losses at iteration 2: {'ner': 15841.18440246582}
Losses at iteration 3: {'ner': 14848.84033203125}
Losses at iteration 4: {'ner': 13868.981246948242}
Losses at iteration 5: {'ner': 12242.894348144531}
Losses at iteration 6: {'ner': 10309.296249389648}
Losses at iteration 7: {'ner': 8076.370872497559}
Losses at iteration 8: {'ner': 5410.864458084106}
Losses at iteration 9: {'ner': 2814.908390045166}
GRI Index Tables:
Page References:
