<a href="https://colab.research.google.com/github/elifbeyzatok00/GRI_Detection_and_Merger/blob/main/Copy_of_GRITable_PageRef_Detect_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyMuPDF



In [None]:
import fitz  # PyMuPDF
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_predict
import joblib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def get_color_str(color):
    """Format a float colour sequence as an ``rgb(r,g,b)`` string.

    The first three components are each multiplied by 255 and rendered with
    zero decimal places; any further components are ignored.
    """
    red, green, blue = (color[idx] * 255 for idx in range(3))
    return "rgb({:.0f},{:.0f},{:.0f})".format(red, green, blue)


In [None]:
def extract_highlighted_text_by_color(pdf_path):
    """Collect the text under every highlight annotation, grouped by colour.

    Args:
        pdf_path: Path of the PDF to scan.

    Returns:
        dict mapping a colour string (as produced by ``get_color_str``) to a
        list of strings, one per highlighted quad, in page order. Highlights
        without a usable colour are skipped.
    """
    highlighted_text_by_color = {}

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for annot in page.annots() or ():
                if annot.type[0] != 8:  # 8 is the highlight annotation type
                    continue

                # The 'stroke' key may be present with an empty value, so test
                # the value itself (not key membership) before falling back to
                # the fill colour.
                colors = annot.colors or {}
                color = colors.get('stroke') or colors.get('fill')
                if not color or len(color) < 3:
                    continue  # get_color_str needs three components
                texts = highlighted_text_by_color.setdefault(get_color_str(color), [])

                # Vertices come as a flat list of points, four per quad; an
                # annotation without vertices contributes no text. Any trailing
                # group of fewer than four points is ignored.
                vertices = annot.vertices or []
                for start in range(0, len(vertices) - 3, 4):
                    rect = fitz.Quad(vertices[start:start + 4]).rect
                    words = page.get_text("words", clip=rect)
                    words.sort(key=lambda w: (w[1], w[0]))  # sort by y, then by x
                    texts.append(" ".join(w[4] for w in words))

    return highlighted_text_by_color

In [None]:
# Extract training data from highlighted PDFs
def prepare_training_data(directory_path, color_labels=None):
    """Build a labelled text corpus from the highlighted PDFs in a directory.

    Args:
        directory_path: Directory whose PDF files are scanned (non-recursive).
        color_labels: Optional mapping from highlight colour string to class
            label. Defaults to yellow -> 'GRI_Index' and blue -> 'Page_Ref'.
            Highlights in any other colour are ignored.

    Returns:
        A ``(training_data, labels)`` tuple of equal-length lists: the
        highlighted text snippets and their class labels.
    """
    if color_labels is None:
        color_labels = {
            'rgb(255,240,102)': 'GRI_Index',  # GRI Index Table
            'rgb(143,222,249)': 'Page_Ref',   # Page references in GRI Index
        }

    training_data = []
    labels = []

    # os.listdir order is arbitrary; sorting keeps the sample order (and thus
    # any seeded train/test split made from it) the same on every machine.
    # The extension check is case-insensitive so '.PDF' files are not missed.
    pdf_files = sorted(
        f for f in os.listdir(directory_path) if f.lower().endswith('.pdf')
    )

    for pdf_file in pdf_files:
        pdf_path = os.path.join(directory_path, pdf_file)
        highlights_by_color = extract_highlighted_text_by_color(pdf_path)

        for color, texts in highlights_by_color.items():
            label = color_labels.get(color)
            if label is None:
                continue
            training_data.extend(texts)
            labels.extend([label] * len(texts))

    return training_data, labels

# Train Model

In [None]:
# Train the model
def train_model(training_data, labels, model_path='gri_model.pkl'):
    """Fit a TF-IDF + logistic-regression text classifier and save it to disk.

    Args:
        training_data: List of text snippets.
        labels: Class label for each snippet, same length as ``training_data``.
        model_path: File the fitted pipeline is written to with ``joblib.dump``.

    Returns:
        The fitted scikit-learn pipeline.
    """
    # The default token pattern only keeps tokens of two or more characters,
    # which would erase single-digit page references (e.g. "7") entirely.
    # Accept one-character tokens so those samples still produce features.
    vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    classifier = LogisticRegression()
    model = make_pipeline(vectorizer, classifier)
    model.fit(training_data, labels)
    joblib.dump(model, model_path)
    return model

In [None]:
# Predict using the model
def predict_with_model(pdf_path, model_path='gri_model.pkl'):
    """Classify every page of a PDF with a saved model.

    Args:
        pdf_path: Path of the PDF to classify.
        model_path: Path of the model file to load with ``joblib.load``.

    Returns:
        A ``(gri_index_tables, page_references)`` tuple. Each is a list of
        ``(page_number, page_text)`` pairs (page numbers are 1-based) for the
        pages predicted as 'GRI_Index' and 'Page_Ref' respectively.

    Note:
        If the loaded model only knows these two labels, every page ends up
        in one of the two lists; there is no "neither" outcome.
    """
    # joblib.load unpickles arbitrary objects: only load model files you trust.
    model = joblib.load(model_path)

    # Read all page texts first so the document is closed promptly, even on error.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]

    gri_index_tables = []
    page_references = []
    if not page_texts:
        return gri_index_tables, page_references  # nothing to classify

    # One batch call instead of one predict() call per page.
    predictions = model.predict(page_texts)

    for page_number, (text, label) in enumerate(zip(page_texts, predictions), start=1):
        if label == 'GRI_Index':
            gri_index_tables.append((page_number, text))
        elif label == 'Page_Ref':
            page_references.append((page_number, text))

    return gri_index_tables, page_references


In [None]:
# Example usage: build the labelled corpus from the highlighted PDFs, then
# hold out 20% of the samples for evaluation.
directory_path = '/content/labeled_dataset'  # Directory containing the training PDFs
training_data, labels = prepare_training_data(directory_path)
X_train, X_test, y_train, y_test = train_test_split(
    training_data,
    labels,
    test_size=0.2,
    random_state=42,  # fixed seed for the shuffle
)

In [None]:
# Fit the classifier on the training split only; the test split stays unseen.
model = train_model(training_data=X_train, labels=y_train)


## Check training_data and labels

In [None]:
import pandas as pd

# Put every snippet next to its label so the corpus can be inspected as a table.
df = pd.DataFrame(dict(text=training_data, label=labels))
print(df)


                                             text      label
0                                              15   Page_Ref
1                                              39   Page_Ref
2                                              15   Page_Ref
3                                              19   Page_Ref
4                                              15   Page_Ref
...                                           ...        ...
6664                            and economic area  GRI_Index
6665     Some of the main contractors had a total  GRI_Index
6666  of six stop orders and five fines totalling  GRI_Index
6667    of S$50,000. Lessons learnt, if any, were  GRI_Index
6668             shared across the business units  GRI_Index

[6669 rows x 2 columns]


In [None]:
# Rows whose label is GRI_Index.
gri_index_df = df.loc[df['label'].eq('GRI_Index')]
print(gri_index_df)

                                                   text      label
85                                       GRI content in  GRI_Index
86    We prepared this report in accordance with the...  GRI_Index
87                                GRI 102-55 standards.  GRI_Index
88    GRI Standard/Disclosure Page number(s)/Locatio...  GRI_Index
89                            1. Organizational profile  GRI_Index
...                                                 ...        ...
6664                                  and economic area  GRI_Index
6665           Some of the main contractors had a total  GRI_Index
6666        of six stop orders and five fines totalling  GRI_Index
6667          of S$50,000. Lessons learnt, if any, were  GRI_Index
6668                   shared across the business units  GRI_Index

[5366 rows x 2 columns]


In [None]:
# Keep the Page_Ref rows under their own name so gri_index_df is not
# overwritten with rows of a different label.
page_ref_df = df[df['label'] == 'Page_Ref']
print(page_ref_df)

     text     label
0      15  Page_Ref
1      39  Page_Ref
2      15  Page_Ref
3      19  Page_Ref
4      15  Page_Ref
...   ...       ...
6304   17  Page_Ref
6305   34  Page_Ref
6306   38  Page_Ref
6307  51-  Page_Ref
6308   52  Page_Ref

[1303 rows x 2 columns]


# Evaluate Model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score # import the necessary functions

def evaluate_model(model, X_test, y_test):
    """Print a classification report, confusion matrix and accuracy score.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out inputs passed to ``model.predict``.
        y_test: True labels corresponding to ``X_test``.
    """
    predicted = model.predict(X_test)

    summary = classification_report(y_test, predicted)
    matrix = confusion_matrix(y_test, predicted)
    score = accuracy_score(y_test, predicted)

    lines = [
        f"Classification Report:\n{summary}",
        f"Confusion Matrix:\n{matrix}",
        f"Accuracy: {score}",
    ]
    print("\n".join(lines))

In [None]:
# Score the fitted model on the held-out split.
evaluate_model(model=model, X_test=X_test, y_test=y_test)

Classification Report:
              precision    recall  f1-score   support

   GRI_Index       0.94      0.97      0.95      1066
    Page_Ref       0.85      0.77      0.81       268

    accuracy                           0.93      1334
   macro avg       0.90      0.87      0.88      1334
weighted avg       0.92      0.93      0.93      1334

Confusion Matrix:
[[1029   37]
 [  61  207]]
Accuracy: 0.9265367316341829


In [None]:
# Predict on a new PDF, using the model file at predict_with_model's default path.
new_pdf_path = '/content/unlabeled_dataset/351121_1.pdf'
gri_index_tables, page_references = predict_with_model(pdf_path=new_pdf_path)

In [None]:
# Show the full text of every page classified as GRI index content.
print("GRI Index Tables:")
for page_number, page_text in gri_index_tables:
    print(f"Page {page_number}:\n Table {page_text}\n")


GRI Index Tables:
Page 1:
 Table O N W A R D S
A N D 
U P W A R D S
Sustainability Report 2016


Page 2:
 Table 2
A B O U T  T H I S  R E P O R T
SUSTAINABILITY REPORT 2016  
The Sustainability Report 2016 of ista International 
GmbH is ista’s seventh sustainabilty report. It also repre-
sents the progress report for the Global Compact of the 
United Nations. In addition, we are reporting the infor-
mation in our Sustainability Report for the first time in 
accordance with the German Sustainability Code. The 
relevant declaration can be viewed  here.  
G4-17  
FRAME OF REFERENCE
The report includes 50 companies worldwide and is based 
on the key figures for the 2016 calendar year. 
GUIDELINES AND REPORT PREPARATION 
In our reporting, we comply with the internationally  
recognised guidelines of the Global Reporting Initiative 
(GRI). We report in accordance with the version GRI G4 
and satisfy the “in-accordance - core” option. This report 
has also been verified by the Materiality Dis

In [None]:
# Show the full text of every page classified as a page reference.
print("Page References:")
for page_number, page_text in page_references:
    print(f"Page {page_number}:\n Ref {page_text}\n")

Page References:
