<a href="https://colab.research.google.com/github/elifbeyzatok00/3D_Modelling_MyHead/blob/main/Another_copy_of_GRITable_PageRef_Detect_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install PyMuPDF


Collecting PyMuPDF
  Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.9 (from PyMuPDF)
  Downloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.9-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyMuPDFb-1.24.9-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.24.9 PyMuPDFb-1.24.9


In [3]:
import fitz  # PyMuPDF
import os
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_predict
import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


# Extracting and Preparing Data

In [5]:
def get_color_str(color):
    """Format a colour given as 0..1 float components as an ``rgb(r,g,b)`` string.

    Args:
        color: Indexable holding at least three numeric components
            (red, green, blue); only the first three are read.

    Returns:
        A string such as ``'rgb(255,240,102)'``: each component is multiplied
        by 255 and rendered with zero decimal places.
    """
    scaled = [color[index] * 255 for index in range(3)]
    return "rgb({:.0f},{:.0f},{:.0f})".format(*scaled)


def extract_highlighted_text_by_color(pdf_path):
    """Collect the text under every highlight annotation, grouped by colour.

    Args:
        pdf_path: Path of the PDF file to scan.

    Returns:
        dict mapping a colour string (as produced by ``get_color_str``) to a
        list of text snippets, one snippet per highlighted quad, in page
        order. Highlights without a usable colour are skipped.
    """
    highlighted_text_by_color = {}

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # `or ()` keeps the original guard against a falsy annots() result.
            for annot in page.annots() or ():
                if annot.type[0] != fitz.PDF_ANNOT_HIGHLIGHT:
                    continue

                # Prefer the stroke colour and fall back to fill when stroke is
                # missing or empty, instead of crashing in get_color_str.
                colors = annot.colors or {}
                color = colors.get('stroke') or colors.get('fill')
                if not color or len(color) < 3:
                    # No RGB triple available, so this highlight cannot be bucketed.
                    continue

                snippets = highlighted_text_by_color.setdefault(get_color_str(color), [])

                # Vertices come as a flat list of points, four points per quad.
                quadpoints = annot.vertices or []
                for start in range(0, len(quadpoints), 4):
                    rect = fitz.Quad(quadpoints[start:start + 4]).rect
                    words = page.get_text("words", clip=rect)
                    words.sort(key=lambda w: (w[1], w[0]))  # sort by y, then by x
                    snippets.append(" ".join(w[4] for w in words))

    return highlighted_text_by_color

def prepare_training_data(directory_path):
    """Build (texts, labels) training lists from the annotated PDFs in a directory.

    Highlighted snippets are labelled by their highlight colour; afterwards the
    full text of every page is added as ``'Outside_Entity'`` unless that exact
    (stripped) text is already present in the collected samples.

    Args:
        directory_path: Directory containing the labeled ``.pdf`` files.

    Returns:
        Tuple ``(training_data, labels)`` of two equally long lists.
    """
    # Highlight colour -> class label. Highlights in any other colour are ignored.
    label_by_color = {
        'rgb(255,240,102)': 'GRI_Standard',  # GRI Standard
        'rgb(143,222,249)': 'Page_Ref',      # Page references of GRI Standard
    }

    training_data = []
    labels = []
    seen_texts = set()  # mirrors training_data for O(1) duplicate checks

    def add_sample(text, label):
        training_data.append(text)
        labels.append(label)
        seen_texts.add(text)

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded train/test split) reproducible.
    pdf_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.pdf'))

    for pdf_file in pdf_files:
        pdf_path = os.path.join(directory_path, pdf_file)

        highlights_by_color = extract_highlighted_text_by_color(pdf_path)
        for color, texts in highlights_by_color.items():
            label = label_by_color.get(color)
            if label is None:
                continue
            for text in texts:
                add_sample(text, label)

        # Adding normal text.
        # NOTE(review): whole-page text is added for every page, including pages
        # that contain the highlighted snippets above -- confirm this is intended.
        with fitz.open(pdf_path) as doc:  # closed on exit, unlike a bare open()
            for page in doc:
                page_text = page.get_text("text").strip()
                if page_text not in seen_texts:  # Avoid duplication
                    add_sample(page_text, 'Outside_Entity')

    return training_data, labels

# Extract the samples once; the inspection cells below read `training_data` and `labels`.
directory_path = '/content/labeled_dataset'  # Update this path to your labeled dataset directory
training_data, labels = prepare_training_data(directory_path)


In [6]:
# Debug: Print sample of training data and labels
# Slicing (rather than indexing 0..4) avoids an IndexError when fewer than 5 samples exist.
for text, label in zip(training_data[:5], labels[:5]):
    print(f"Text: {text}, Label: {label}")

Text: G4-1 Statement from most senior decision maker., Label: GRI_Standard
Text: G4-2 Key impacts, risks and opportunities, Label: GRI_Standard
Text: G4-3 Name of the organisation., Label: GRI_Standard
Text: G4-4 Primary brands, products and services, Label: GRI_Standard
Text: G4-5 Location of the organisation‘s headquarters., Label: GRI_Standard


### Check training data and labels

In [7]:
import pandas as pd

# One row per sample: the extracted text next to its class label.
df = pd.DataFrame(dict(text=training_data, label=labels))
print(df)


                                                  text           label
0      G4-1 Statement from most senior decision maker.    GRI_Standard
1            G4-2 Key impacts, risks and opportunities    GRI_Standard
2                       G4-3 Name of the organisation.    GRI_Standard
3           G4-4 Primary brands, products and services    GRI_Standard
4    G4-5 Location of the organisation‘s headquarters.    GRI_Standard
..                                                 ...             ...
490  •\n•\nWe will not tolerate the use, possession...  Outside_Entity
491  WPP Human Rights Policy Statement\nIntroductio...  Outside_Entity
492  Our policy\nTo embed our commitment to protect...  Outside_Entity
493  Cover illustration \nby Pablo S. Herrero \nBre...  Outside_Entity
494                                            wpp.com  Outside_Entity

[495 rows x 2 columns]


In [8]:
# Rows whose label is 'GRI_Standard'
GRI_Standard_df = df.loc[df['label'].eq('GRI_Standard')]
print(GRI_Standard_df)

                                                  text         label
0      G4-1 Statement from most senior decision maker.  GRI_Standard
1            G4-2 Key impacts, risks and opportunities  GRI_Standard
2                       G4-3 Name of the organisation.  GRI_Standard
3           G4-4 Primary brands, products and services  GRI_Standard
4    G4-5 Location of the organisation‘s headquarters.  GRI_Standard
..                                                 ...           ...
210  including advertising, promotion, and sponsorship  GRI_Standard
211                               by type of outcomes.  GRI_Standard
212               G4-PR8 Total number of substantiated  GRI_Standard
213          complaints regarding breaches of customer  GRI_Standard
214               privacy and losses of customer data.  GRI_Standard

[215 rows x 2 columns]


In [9]:
# Rows whose label is 'Page_Ref'
Page_Ref_df = df.loc[df['label'].eq('Page_Ref')]
print(Page_Ref_df)

               text     label
215          page 2  Page_Ref
216          page 8  Page_Ref
217         page 11  Page_Ref
218         page 11  Page_Ref
219         page 14  Page_Ref
..              ...       ...
362         page 33  Page_Ref
363  pages 90 to 91  Page_Ref
364         page 29  Page_Ref
365         page 34  Page_Ref
366         page 15  Page_Ref

[152 rows x 2 columns]


In [11]:
# Rows whose label is 'Outside_Entity'
Outside_Entity_df = df.loc[df['label'].eq('Outside_Entity')]
print(Outside_Entity_df)

                                                  text           label
367                    Sustainability Report 2016/2017  Outside_Entity
368  Introduction\n2\t\nFrom our CEO\n4\t\nWho we a...  Outside_Entity
369  WPP is the world’s leading communications serv...  Outside_Entity
370  To say that 2016 was a year of political uncer...  Outside_Entity
371  We are collaborating internally and with the o...  Outside_Entity
..                                                 ...             ...
490  •\n•\nWe will not tolerate the use, possession...  Outside_Entity
491  WPP Human Rights Policy Statement\nIntroductio...  Outside_Entity
492  Our policy\nTo embed our commitment to protect...  Outside_Entity
493  Cover illustration \nby Pablo S. Herrero \nBre...  Outside_Entity
494                                            wpp.com  Outside_Entity

[128 rows x 2 columns]


# Train Model

In [12]:
# Train the model
def train_model(training_data, labels, model_path='gri_model.pkl'):
    """Fit a TF-IDF + logistic-regression text classifier and persist it.

    Args:
        training_data: Iterable of raw text samples.
        labels: Class label for each sample, in the same order.
        model_path: Where to save the fitted pipeline with joblib. Defaults to
            ``'gri_model.pkl'``, the default that ``predict_with_model`` loads.
            Pass ``None`` to skip saving.

    Returns:
        The fitted scikit-learn pipeline.
    """
    model = make_pipeline(TfidfVectorizer(), LogisticRegression())
    model.fit(training_data, labels)
    if model_path is not None:
        joblib.dump(model, model_path)
    return model


In [13]:
# Predict using the model
def predict_with_model(pdf_path, model_path='gri_model.pkl'):
    """Classify every page of a PDF with a saved model and group pages by label.

    Args:
        pdf_path: Path of the PDF to classify.
        model_path: Path of the joblib-saved pipeline to load.

    Returns:
        Tuple ``(GRI_Standards, page_references, outside_entities)``. Each item
        is a list of ``(page_number, page_text)`` tuples with 1-based page
        numbers, for pages predicted as 'GRI_Standard', 'Page_Ref' and
        'Outside_Entity' respectively. Pages with any other label are dropped.
    """
    # joblib.load unpickles the file: only load model files from a trusted source.
    model = joblib.load(model_path)

    # Read all page texts up front so the document can be closed promptly.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") for page in doc]

    GRI_Standards = []
    page_references = []
    outside_entities = []
    bucket_by_label = {
        'GRI_Standard': GRI_Standards,
        'Page_Ref': page_references,
        'Outside_Entity': outside_entities,
    }

    # NOTE(review): the whole page text is classified as a single sample; in this
    # notebook the highlighted training samples are short snippets -- confirm
    # that page-level prediction is the intended granularity.
    if page_texts:  # predict() rejects an empty batch, so skip empty documents
        predictions = model.predict(page_texts)  # one batched call for all pages
        for page_number, (text, label) in enumerate(zip(page_texts, predictions), start=1):
            bucket = bucket_by_label.get(label)
            if bucket is not None:
                bucket.append((page_number, text))

    return GRI_Standards, page_references, outside_entities


In [14]:
# Example usage
directory_path = '/content/labeled_dataset'  # Directory containing the training PDFs
# `training_data` and `labels` were already extracted from this directory in the
# data-preparation cell; re-parsing every PDF here only repeated that work.
X_train, X_test, y_train, y_test = train_test_split(training_data, labels, test_size=0.2, random_state=42)


In [15]:
# Fit on the training split only; train_model also saves the fitted pipeline to 'gri_model.pkl'.
model = train_model(X_train, y_train)


# Evaluate Model

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score # import the necessary functions

def evaluate_model(model, X_test, y_test):
    """Print the classification report, confusion matrix and accuracy of a model.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test samples passed to ``model.predict``.
        y_test: True labels for ``X_test``.

    Returns:
        None. The three metrics are written to standard output.
    """
    predicted = model.predict(X_test)

    # Compute every metric before printing anything, then emit them in order.
    sections = (
        ("Classification Report:\n", classification_report(y_test, predicted)),
        ("Confusion Matrix:\n", confusion_matrix(y_test, predicted)),
        ("Accuracy: ", accuracy_score(y_test, predicted)),
    )
    for heading, value in sections:
        print(f"{heading}{value}")


In [17]:
# Score the fitted model on the held-out split (test_size=0.2 above).
evaluate_model(model, X_test, y_test)


Classification Report:
                precision    recall  f1-score   support

  GRI_Standard       0.96      1.00      0.98        46
Outside_Entity       1.00      0.94      0.97        31
      Page_Ref       1.00      1.00      1.00        22

      accuracy                           0.98        99
     macro avg       0.99      0.98      0.98        99
  weighted avg       0.98      0.98      0.98        99

Confusion Matrix:
[[46  0  0]
 [ 2 29  0]
 [ 0  0 22]]
Accuracy: 0.9797979797979798


In [20]:
# Predict on a new PDF
# predict_with_model reloads the pipeline from its default path ('gri_model.pkl'),
# not from the in-memory `model` variable.
new_pdf_path = '/content/unlabeled_dataset/352031_1.pdf'
GRI_Standards, page_references, outside_entities = predict_with_model(new_pdf_path)


In [24]:
# Each entry is a (page_number, page_text) tuple; a blank line separates entries.
print("GRI Standards:")
for page_entry in GRI_Standards:
    print(page_entry, end="\n\n")


GRI Standards:
(1, '1\nCorporate Responsibility Report 2016\n2016\nresponsibility \nreport\nCorporate \n')

(6, '6\nCorporate Responsibility Report 2016\nHow we govern \nOur board\nThe Grieg Star Group Board of Directors is lead by Chair Elisabeth Grieg and meets \nregularly in Bergen and Oslo to stake out the long term strategies for the company. \nSince last year we have had some changes in the composition of the board. Bjørn Ga-\nbriel Reed and Rune Birkeland have left the board, and Susanne Munch Thore sat a \nshort period. In June 2017 Michelle Williams entered as Board Member. Board mem-\nbers abstain from board discussions if potential conflict of interests occur. Board \nmembers are recruited using independent executive search.\nThe Board of Directors comprises:\nHow we do it\nElisabeth Grieg \n(Chair and owner)\nCamilla Grieg \n(CEO and owner)\nKai Grøtterud\nMichelle Williams\nDidrik Munch\nGRI 4:1\nGRI 4:2\nGRI 4:3\nGRI 4:4\nGRI 4:6\nGRI 4.9\n \n')

(28, '28\nCorporate Respo

In [26]:
# Each entry is a (page_number, page_text) tuple; a blank line separates entries.
print("Page References:")
for page_entry in page_references:
    print(page_entry, end="\n\n")


Page References:
(2, '2\nCorporate Responsibility Report 2016\nPage 3..........................................From our Chair\nPage 4 & 5...................................Vision, Mission and Values\nPage 6 & 7....................................How we govern\nPage 8 & 9...................................Ethics and anti-corruption\nPage 10-12.\n..................................How we do it\nPage 13.........................................Our vessels\nPage 14-19.\n..................................Our people\nPage 20-21..................................Safety\nPage 22-23.\n.................................How we communicate\nPage 24-31..................................Environment\nPage 32-33..................................Grieg Foundation and local contributions\nPage 34........................................Our history\nPage 35........................................Our fleet\nPage 36-39.\n.................................GRI Index\nContent\nAll images Grieg Star copyright, except 

In [28]:
# Each entry is a (page_number, page_text) tuple; a blank line separates entries.
print("Outside Entities:")
for page_entry in outside_entities:
    print(page_entry, end="\n\n")

Outside Entities:
(3, '3\nCorporate Responsibility Report 2016\nElisabeth Grieg\nChair, Board of Directors\nThe world around us is changing at a pace \nwe’re hardly able to capture, and sometimes in \ndirections difficult to foresee. Yet, we must do \nour best to understand, because the future of \nthe maritime industry is indisputably inter-\ntwined with the deeper changes in geopolitics, \nworld economy and human development.   \nA constant state of change has always been a \nfeature of our industry, and we know very well \nthat there are moments when more radical \nshifts will occur. I believe we’re at the thresh-\nold of such a moment right now. Driving this \nrevolution are threats from climate change and \nprofound social instability, combined with op-\nportunities produced by the staggering rise of \nurban middle-class and \nbreathtaking techno-\nlogical developments in \nan ever more globalized \nand knowledge-driven \neconomy.  Together, \nthey will fundamentally \nimpact and 