In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the input PDFs used
# further down are read from paths under this mount point.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install --upgrade pymupdf



In [None]:
import csv
import re
import sys

import pymupdf

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF as one string.

    The document at ``pdf_path`` is opened with PyMuPDF inside a ``with``
    block, each page is loaded by index in document order, and the per-page
    results of ``get_text()`` are concatenated with no separator.
    """
    with pymupdf.open(pdf_path) as pdf:
        page_texts = [pdf.load_page(index).get_text() for index in range(len(pdf))]
    return "".join(page_texts)

In [None]:
def split_text_by_pattern_text1(text):
    r"""
    Cut ``text`` into one segment per article heading (strict variant).

    A heading is recognised by this sequence: a newline, the word 'Article',
    a run of whitespace, a number from 1 to 99 without a leading zero,
    another run of at least one whitespace character, a newline, and an
    uppercase ASCII letter. Because ``\s`` matches any whitespace, the runs
    may contain newlines or tabs as well as spaces.

    So 'Article 1 ' followed by a line starting with 'Subject' is a heading,
    while 'Article 1' with nothing between the digit and the line break is
    not.

    Each segment starts at a heading and runs up to the next heading (or the
    end of ``text``), with surrounding whitespace stripped. Anything before
    the first heading is dropped. Returns an empty list when no heading is
    found.
    """
    pattern = r'(\nArticle\s+([1-9][0-9]?)\s+\n[A-Z])'

    # Offsets at which each heading (including its leading newline) begins.
    starts = [hit.start(1) for hit in re.finditer(pattern, text)]
    if not starts:
        return []

    # Every segment ends where the next one begins; the last ends with the text.
    ends = starts[1:] + [len(text)]
    return [text[begin:end].strip() for begin, end in zip(starts, ends)]

In [None]:
def split_text_by_pattern_text2(text):
    r"""
    Cut ``text`` into one segment per article heading (relaxed variant).

    A heading is recognised by this sequence: a newline, the word 'Article',
    a run of whitespace, a number from 1 to 99 without a leading zero,
    optional whitespace, a newline, and an uppercase ASCII letter. The
    whitespace between the number and the line break is optional here
    (``\s*`` rather than ``\s+``), so both 'Article 1' and 'Article 1 '
    followed by a line starting with a capital letter are headings.

    Each segment starts at a heading and runs up to the next heading (or the
    end of ``text``), with surrounding whitespace stripped. Anything before
    the first heading is dropped. Returns an empty list when no heading is
    found.
    """
    pattern = r'(\nArticle\s+([1-9][0-9]?)\s*\n[A-Z])'

    # Offsets at which each heading (including its leading newline) begins.
    starts = [hit.start(1) for hit in re.finditer(pattern, text)]
    if not starts:
        return []

    # Every segment ends where the next one begins; the last ends with the text.
    ends = starts[1:] + [len(text)]
    return [text[begin:end].strip() for begin, end in zip(starts, ends)]


In [None]:
import re

def preprocess_text(articles):
    r"""
    Normalise the whitespace of every string in ``articles``.

    Each run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
    single space, and leading/trailing whitespace is removed. Newlines are
    therefore replaced by a space rather than deleted outright.

    Returns a new list; the input list is left untouched.
    """
    # `\s+` already covers newlines, so a single substitution does the whole job.
    return [re.sub(r'\s+', ' ', raw).strip() for raw in articles]

In [None]:
def order_initial_articles(articles):
    """
    Return the elements of ``articles`` rearranged into a fixed sequence.

    Using 1-based positions in the input list, the output takes positions
    1-19, then 21-24, then 27-32, and finally 20, 25 and 26. The input must
    hold at least 32 elements, otherwise an IndexError is raised; elements
    beyond position 32 are not included in the result.
    """
    one_based_sequence = [*range(1, 20), *range(21, 25), *range(27, 33), 20, 25, 26]

    # Convert to zero-based indices while picking the elements.
    return [articles[position - 1] for position in one_based_sequence]

In [None]:
def order_final_articles(articles):
    """
    Return the elements of ``articles`` rearranged into a fixed sequence.

    Using 1-based positions in the input list, the output takes:
    1-3, 5, 7, 8, 10, 11, 14, 15, 16, 18, 19, 20, 22, 23, 24, 25, 26,
    28-32, 34, 36, 37, 38, 39, and then 4, 6, 9, 12, 13, 17, 21, 27, 33, 35.

    The input must hold at least 39 elements, otherwise an IndexError is
    raised; elements beyond position 39 are not included in the result.

    NOTE(review): an earlier description of this function listed positions
    1-5 at the front and no 4 in the trailing group. The sequence implemented
    here places 4 in the trailing group - confirm which one is intended.
    """
    one_based_sequence = [
        *range(1, 4),
        5, 7, 8, 10, 11, 14, 15, 16, 18, 19, 20, 22, 23, 24, 25, 26,
        *range(28, 33),
        34, 36, 37, 38, 39,
        4, 6, 9, 12, 13, 17, 21, 27, 33, 35,
    ]

    # Convert to zero-based indices while picking the elements.
    return [articles[position - 1] for position in one_based_sequence]

In [None]:
# Locations of the two PDF versions on the mounted Google Drive.
pdf1_path = '/content/drive/MyDrive/research project/initial_doc_uncut.pdf'
pdf2_path = '/content/drive/MyDrive/research project/final_document.pdf'

# Step 1: pull the raw text out of each PDF.
text1 = extract_text_from_pdf(pdf1_path)
text2 = extract_text_from_pdf(pdf2_path)

# Step 2: cut each text into per-article segments. The first document uses the
# strict heading pattern, the second the relaxed one.
unpreprocessed_articles1 = split_text_by_pattern_text1(text1)
unpreprocessed_articles2 = split_text_by_pattern_text2(text2)

# Step 3: collapse whitespace inside every segment.
articles1 = preprocess_text(unpreprocessed_articles1)
articles2 = preprocess_text(unpreprocessed_articles2)

# Step 4: rearrange the segments into the fixed, document-specific sequences.
# These calls raise IndexError if fewer segments were found than the
# sequences expect (32 for the first document, 39 for the second).
ordered_articles1 = order_initial_articles(articles1)
ordered_articles2 = order_final_articles(articles2)

In [None]:
# Display the reordered segments of the first (initial) document for inspection.
ordered_articles1

['Article 1 Subject matter 1. This Directive lays down rules (a) on obligations for companies regarding actual and potential human rights adverse impacts and environmental adverse impacts, with respect to their own operations, the operations of their subsidiaries, and the value chain operations carried out by entities with whom the company has an established business relationship and (b) on liability for violations of the obligations mentioned above. The nature of business relationships as ‘established’ shall be reassessed periodically, and at least every 12 months. 2. This Directive shall not constitute grounds for reducing the level of protection of human rights or of protection of the environment or the protection of the climate provided for by the law of Member States at the time of the adoption of this Directive. 3. This Directive shall be without prejudice to obligations in the areas of human rights, protection of the environment and climate change under other Union legislative a

In [None]:
# Display the reordered segments of the second (final) document for inspection.
ordered_articles2

['Article 1 Subject matter 1. This Directive lays down rules on: (a) obligations for companies regarding actual and potential human rights adverse impacts and environmental adverse impacts, with respect to their own operations, the operations of their subsidiaries, and the operations carried out by their business partners in the chains of activities of those companies; (b) liability for violations of the obligations as referred to in point (a); and (c) the obligation for companies to adopt and put into effect a transition plan for climate change mitigation which aims to ensure, through best efforts, compatibility of the business model and of the strategy of the company with the transition to a sustainable economy and with the limiting of global warming to 1,5 oC in line with the Paris Agreement. 2. This Directive shall not constitute grounds for reducing the level of protection of human, employment and social rights, or of protection of the environment or of protection of the climate p

In [None]:
import difflib

# Sample data: hard-coded Article 1 text, one single-element list per document version (initial and final)
initial_doc_text = [
    "Article 1 Subject matter 1. This Directive lays down rules (a) on obligations for companies regarding actual and potential human rights adverse impacts and environmental adverse impacts, with respect to their own operations, the operations of their subsidiaries, and the value chain operations carried out by entities with whom the company has an established business relationship and (b) on liability for violations of the obligations mentioned above. The nature of business relationships as ‘established’ shall be reassessed periodically, and at least every 12 months. 2. This Directive shall not constitute grounds for reducing the level of protection of human rights or of protection of the environment or the protection of the climate provided for by the law of Member States at the time of the adoption of this Directive. 3. This Directive shall be without prejudice to obligations in the areas of human rights, protection of the environment and climate change under other Union legislative acts. If the provisions of this Directive conflict with a provision of another Union legislative act pursuing the same objectives and providing for more extensive or more specific obligations, the provisions of the other Union legislative act shall prevail to the extent of the conflict and shall apply to those specific obligations."
]

final_doc_text = [
    "Article 1 Subject matter 1. This Directive lays down rules on: (a) obligations for companies regarding actual and potential human rights adverse impacts and environmental adverse impacts, with respect to their own operations, the operations of their subsidiaries, and the operations carried out by their business partners in the chains of activities of those companies; (b) liability for violations of the obligations as referred to in point (a); and (c) the obligation for companies to adopt and put into effect a transition plan for climate change mitigation which aims to ensure, through best efforts, compatibility of the business model and of the strategy of the company with the transition to a sustainable economy and with the limiting of global warming to 1,5 oC in line with the Paris Agreement. 2. This Directive shall not constitute grounds for reducing the level of protection of human, employment and social rights, or of protection of the environment or of protection of the climate provided for by the national law of the Member States or by the collective agreements applicable at the time of the adoption of this Directive. 3. This Directive shall be without prejudice to obligations in the areas of human, employment and social rights, and of protection of the environment and climate change under other Union legislative acts. If a provision of this Directive conflicts with a provision of another Union legislative act pursuing the same objectives and providing for more extensive or more specific obligations, the provision of that other Union legislative act shall prevail to the extent of the conflict and shall apply as regards those specific obligations."
]

# Function to compare texts and generate differences
def compare_texts(initial_text, final_text):
    """Return the items that differ between two sequences of strings.

    Both arguments are handed to ``difflib.Differ().compare``. Entries only
    present in ``initial_text`` are reported as ``('subtraction', item)``,
    entries only present in ``final_text`` as ``('addition', item)``, in the
    order the differ emits them. Unchanged entries and the differ's '? '
    hint lines are left out.
    """
    # Two-character prefixes emitted by Differ, mapped to the label we report.
    labels = {'- ': 'subtraction', '+ ': 'addition'}

    return [
        (labels[entry[:2]], entry[2:])
        for entry in difflib.Differ().compare(initial_text, final_text)
        if entry[:2] in labels
    ]

# Generate word-level differences between the two versions of Article 1
differences = compare_texts(initial_doc_text[0].split(), final_doc_text[0].split())

# Output the differences as CSV on stdout. The article text contains commas,
# so rows are written through csv.writer, which quotes such fields; joining
# the fields with bare commas would produce rows with a varying column count.
csv_out = csv.writer(sys.stdout, lineterminator="\n")
csv_out.writerow(["Article", "Type of Difference", "Old Version", "New Version", "Difference"])
for change_type, text in differences:
    article_num = "Article 1"
    # Only the side on which the change occurred carries the full article text.
    old_version = initial_doc_text[0] if change_type == 'subtraction' else ''
    new_version = final_doc_text[0] if change_type == 'addition' else ''
    diff_text = text
    csv_out.writerow([article_num, change_type, old_version, new_version, diff_text])

Article,Type of Difference,Old Version,New Version,Difference
Article 1,addition,,Article 1 Subject matter 1. This Directive lays down rules on: (a) obligations for companies regarding actual and potential human rights adverse impacts and environmental adverse impacts, with respect to their own operations, the operations of their subsidiaries, and the operations carried out by their business partners in the chains of activities of those companies; (b) liability for violations of the obligations as referred to in point (a); and (c) the obligation for companies to adopt and put into effect a transition plan for climate change mitigation which aims to ensure, through best efforts, compatibility of the business model and of the strategy of the company with the transition to a sustainable economy and with the limiting of global warming to 1,5 oC in line with the Paris Agreement. 2. This Directive shall not constitute grounds for reducing the level of protection of human, employment and soci

In [None]:
# Scratch cell: experiment with a fragment-by-fragment, word-highlighted comparison of Article 1

import pandas as pd
from difflib import Differ

# Input texts
initial_article1 = '''Article 1 Subject matter 1. This Directive lays down rules (a) on obligations for companies regarding actual and potential human rights adverse impacts and environmental adverse impacts, with respect to their own operations, the operations of their subsidiaries, and the value chain operations carried out by entities with whom the company has an established business relationship and (b) on liability for violations of the obligations mentioned above. The nature of business relationships as ‘established’ shall be reassessed periodically, and at least every 12 months. 2. This Directive shall not constitute grounds for reducing the level of protection of human rights or of protection of the environment or the protection of the climate provided for by the law of Member States at the time of the adoption of this Directive. 3. This Directive shall be without prejudice to obligations in the areas of human rights, protection of the environment and climate change under other Union legislative acts. If the provisions of this Directive conflict with a provision of another Union legislative act pursuing the same objectives and providing for more extensive or more specific obligations, the provisions of the other Union legislative act shall prevail to the extent of the conflict and shall apply to those specific obligations.'''

final_article1 = '''Article 1 Subject matter 1. This Directive lays down rules on: (a) obligations for companies regarding actual and potential human rights adverse impacts and environmental adverse impacts, with respect to their own operations, the operations of their subsidiaries, and the operations carried out by their business partners in the chains of activities of those companies; (b) liability for violations of the obligations as referred to in point (a); and (c) the obligation for companies to adopt and put into effect a transition plan for climate change mitigation which aims to ensure, through best efforts, compatibility of the business model and of the strategy of the company with the transition to a sustainable economy and with the limiting of global warming to 1,5 oC in line with the Paris Agreement. 2. This Directive shall not constitute grounds for reducing the level of protection of human, employment and social rights, or of protection of the environment or of protection of the climate provided for by the national law of the Member States or by the collective agreements applicable at the time of the adoption of this Directive. 3. This Directive shall be without prejudice to obligations in the areas of human, employment and social rights, and of protection of the environment and climate change under other Union legislative acts. If a provision of this Directive conflicts with a provision of another Union legislative act pursuing the same objectives and providing for more extensive or more specific obligations, the provision of that other Union legislative act shall prevail to the extent of the conflict and shall apply as regards those specific obligations.'''

# Function to highlight differences
def highlight_diff(old_text, new_text):
    """Mark the words that differ between two strings.

    Both strings are split on whitespace and compared word by word with
    ``Differ``. Returns a 3-tuple of space-joined strings:

    1. the old text, with words missing from the new text wrapped in ``**``;
    2. the new text, with words missing from the old text wrapped in ``**``;
    3. the added words only (removed words are not part of this value).
    """
    old_words, new_words, added_words = [], [], []

    for entry in Differ().compare(old_text.split(), new_text.split()):
        # Every Differ entry is a one-character code, a space, then the word.
        marker, token = entry[0], entry[2:]
        if marker == ' ':
            old_words.append(token)
            new_words.append(token)
        elif marker == '-':
            old_words.append(f'**{token}**')
        elif marker == '+':
            new_words.append(f'**{token}**')
            added_words.append(token)
        # '?' hint entries carry no word and are skipped.

    return ' '.join(old_words), ' '.join(new_words), ' '.join(added_words)

# Split each article into pieces: keep the text up to the first newline, split
# it on '. ' and drop the first piece (everything before the first '. ').
# NOTE(review): '. ' also occurs at sentence ends and right after paragraph
# numbers (e.g. '2. '), so the pieces are sentence fragments - including bare
# numbers such as '2' - rather than whole numbered paragraphs.
initial_paragraphs = initial_article1.split('\n')[0].split('. ')[1:]
final_paragraphs = final_article1.split('\n')[0].split('. ')[1:]

# Compare the pieces pairwise by index and collect one record per difference.
# NOTE(review): pairing by index assumes both versions split into matching
# pieces; when one side has extra sentences the later pairs may hold unrelated
# text - confirm this is the intended comparison.
results = []
article_num = "1"

for i in range(max(len(initial_paragraphs), len(final_paragraphs))):
    # An index past the end of the shorter list yields an empty string.
    old_text = initial_paragraphs[i].strip() if i < len(initial_paragraphs) else ''
    new_text = final_paragraphs[i].strip() if i < len(final_paragraphs) else ''

    if old_text and not new_text:
        # Piece exists only in the initial version.
        results.append({
            'Article': article_num,
            'Type': 'subtraction',
            'Old Version': old_text,
            'New Version': '',
            'Different Portion': old_text
        })
    elif new_text and not old_text:
        # Piece exists only in the final version.
        results.append({
            'Article': article_num,
            'Type': 'addition',
            'Old Version': '',
            'New Version': new_text,
            'Different Portion': new_text
        })
    elif old_text != new_text:
        # Both sides have text and it differs: mark the changed words.
        # 'Different Portion' holds only the words added in the new version.
        old_highlighted, new_highlighted, diff_portion = highlight_diff(old_text, new_text)
        results.append({
            'Article': article_num,
            'Type': 'edit',
            'Old Version': old_highlighted,
            'New Version': new_highlighted,
            'Different Portion': diff_portion
        })

# Build a DataFrame from the records and write it to a CSV file in the
# current working directory.
df = pd.DataFrame(results)
df.to_csv('article1_comparison.csv', index=False)

# Print the full table as plain text (no truncation of rows or columns).
print(df.to_string())

  Article         Type                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              Old Version                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         