In [None]:
!pip install requests beautifulsoup4



In [None]:
import requests
from bs4 import BeautifulSoup

def get_website_text(url, timeout=30):
    """Download a web page and return the text extracted from its HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout unless one is passed, so without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        The string produced by ``BeautifulSoup.get_text()`` for the page, or
        an empty string if fetching or parsing fails (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn HTTP error statuses into exceptions

        # Parse with the built-in 'html.parser' and return the text unmodified.
        return BeautifulSoup(response.text, 'html.parser').get_text()
    except Exception as e:
        # Deliberately best-effort: report the failure and return an empty
        # string so the calling cell keeps running.
        print(f"Error fetching {url}: {e}")
        return ""

In [None]:
import re

def extract_initial_article_sections(text):
    """Split text into chunks that each begin at an "Article N" heading.

    A heading is the word "Article", one regular space, a number from 1 to 99
    and a newline. Each chunk runs from its heading up to (but not including)
    the next heading, or to the end of the text for the last one. Anything
    before the first heading is ignored.

    Returns:
        A list of strings, one per heading found, in order of appearance.
    """
    # DOTALL lets '.' cross newlines, so a chunk can span many lines.
    section_regex = r'(Article [1-9][0-9]?\n.*?(?=Article [1-9][0-9]?\n|$))'
    return [hit.group(1) for hit in re.finditer(section_regex, text, re.DOTALL)]

In [None]:
def extract_final_article_sections(text):
    """Split text into chunks that each begin at an "Article<NBSP>N" heading.

    How the pattern reads:
      * ``Article\\xa0``  - the literal word followed by a non-breaking space
        (U+00A0), not a regular space.
      * ``[1-9][0-9]?``  - an article number from 1 to 99.
      * ``\\n``           - the newline that ends the heading line.
      * ``.*?``          - lazily, everything up to the next heading of the
        same form or the end of the text (``$``).

    Returns:
        A list of strings, one per heading found, in order of appearance.
    """
    # DOTALL is needed so that '.' also consumes the newlines inside a section.
    matcher = re.compile(
        r'(Article\xa0[1-9][0-9]?\n.*?(?=Article\xa0[1-9][0-9]?\n|$))',
        re.DOTALL,
    )
    return [found.group(1) for found in matcher.finditer(text)]

In [None]:
# EUR-Lex HTML pages for the initial and the final text.
initial_url = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:52022PC0071'
final_url = 'https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=OJ:L_202401760'

# Download each page; get_website_text returns "" if a fetch fails.
initial_text = get_website_text(initial_url)
final_text = get_website_text(final_url)

In [None]:
# initial_text
# final_text

In [None]:
# Split the initial text into article sections (headings of the form "Article N" with a regular space).
extracted_sections_initial = extract_initial_article_sections(initial_text)

# Bare last expression: the notebook renders the resulting list as the cell output.
extracted_sections_initial

['Article 1\n\n\nSubject matter\n\n\n1.This Directive lays down rules \n\n\n(a)on obligations for companies regarding actual and potential human rights adverse impacts and environmental adverse impacts, with respect to their own operations, the operations of their subsidiaries, and the value chain operations carried out by\xa0entities with whom the company has an established business relationship and \n\n\n(b)on liability for violations of the obligations mentioned above. \n\n\n\n\nThe nature of business relationships as ‘established’ shall be reassessed periodically, and at least every 12 months.\n\n\n2.This Directive shall not constitute grounds for reducing the level of protection of human rights\xa0or of protection of the environment\xa0or the protection of the climate provided for by the law of Member States at the time of the adoption of this Directive.\n\n\n3.This Directive shall be without prejudice to obligations in the areas of human rights, protection of the environment and 

In [None]:
# Split the final text into article sections; the headings matched here use a non-breaking space after "Article".
extracted_sections_final = extract_final_article_sections(final_text)
extracted_sections_final

['Article\xa01\n\nSubject matter\n\n\n1.\xa0\xa0\xa0This Directive lays down rules on:\n\n\n\n\n\n\n(a)\n\n\nobligations for companies regarding actual and potential human rights adverse impacts and environmental adverse impacts, with respect to their own operations, the operations of their subsidiaries, and the operations carried out by their business partners in the chains of activities of those companies;\n\n\n\n\n\n\n\n\n\n\n(b)\n\n\nliability for violations of the obligations as referred to in point (a); and\n\n\n\n\n\n\n\n\n\n\n(c)\n\n\nthe obligation for companies to adopt and put into effect a\xa0transition plan for climate change mitigation which aims to ensure, through best efforts, compatibility of the business model and of the strategy of the company with the transition to a\xa0sustainable economy and with the limiting of global warming to 1,5\xa0oC in line with the Paris Agreement.\n\n\n\n\n\n\n2.\xa0\xa0\xa0This Directive shall not constitute grounds for reducing the leve