In [1]:
import os
import pdfplumber
import re
import pandas as pd
import numpy as np
from pymongo import MongoClient
from datetime import datetime

In [2]:
selected_documents = [
    "Ireland â__ s National Inventory Report.pdf",
    "Irelandâ__s Climate Change Assessment Volume 2..pdf",
    "Report, Climate Change_ A Cross-Party Consensus for Action.pdf",
    "Climate Change and Sustainability in the Agriculture and Food Sectors.pdf",
    "Environmental Protection Agency â__ Ireland's Environment â__ An Integrated Assessment 2020.pdf",
    "Compliance with the Nitrates Directive_ Implications for Ireland.pdf",
    "Irelandâ__s Climate Change Assessment Volume 3..pdf",
    "Our Rural Future - Rural Development Policy 2021 - 2025.pdf"
]

pdf_folder = r"C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Publication"

## 1. Ireland 's National Inventory Report.pdf

In [4]:
document_1 = "Ireland â__ s National Inventory Report.pdf"
pdf_path = os.path.join(pdf_folder, document_1)

In [5]:
extracted_text = {}

try:
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        for i in range(20, 345):
            page = pdf.pages[i]
            text = page.extract_text()
    
            if text:
                lines = text.strip().split("\n")  
                cleaned_text = "\n".join(lines[:-1]) 
                extracted_text[f"Page {i+1}"] = cleaned_text

except Exception as e:
    print(f"Error processing {pdf_file}: {e}")

In [6]:
for page, text in list(extracted_text.items())[:5]:  
    print(f"\nExtracted text from {page}:\n{'='*50}\n{text}\n{'='*50}\n")


Extracted text from Page 21:
Chapter 1 Introduction
Background and Context
This report constitutes Ireland’s National Inventory Report (NIR), for the years 1990-2022, as required
under the United Nations Framework Convention on Climate Change.
The objective of the NIR is to describe the methodologies, input data, background information and
the entire process of inventory compilation for greenhouse gases and to give explanations for any
improvements and recalculations of the inventories reported in previous submissions. The report is a
key component of the UN review process which assesses the transparency, completeness and overall
quality of the inventories from Annex I Parties.
Introduction and Reporting Requirements under the UNFCCC
The United Nations Framework Convention on Climate Change (UNFCCC) (Articles 4 and 12), hereafter
referred to as the Convention, requires Annex I Parties to develop, publish and make available to the
Conference of the Parties (COP), the Convention’s imple

In [7]:
for page, text in list(extracted_text.items())[:5]:  
    print(f"\nExtracted text from {page}:\n{'='*50}")
    
    paragraphs = re.split(r'(?<=\.)\n', text) 
    
    for i, para in enumerate(paragraphs):
        print(f"\n[Paragraph {i+1}]\n{para.strip()}\n{'-'*50}")

    print('='*50)


Extracted text from Page 21:

[Paragraph 1]
Chapter 1 Introduction
Background and Context
This report constitutes Ireland’s National Inventory Report (NIR), for the years 1990-2022, as required
under the United Nations Framework Convention on Climate Change.
--------------------------------------------------

[Paragraph 2]
The objective of the NIR is to describe the methodologies, input data, background information and
the entire process of inventory compilation for greenhouse gases and to give explanations for any
improvements and recalculations of the inventories reported in previous submissions. The report is a
key component of the UN review process which assesses the transparency, completeness and overall
quality of the inventories from Annex I Parties.
--------------------------------------------------

[Paragraph 3]
Introduction and Reporting Requirements under the UNFCCC
The United Nations Framework Convention on Climate Change (UNFCCC) (Articles 4 and 12), hereafter
referred t

In [8]:
def process_paragraphs(extracted_text):
    paragraphs_per_page = {}
    for page, text in extracted_text.items():
        paragraphs = re.split(r'(?<=\.)\n', text)
        paragraphs_per_page[page] = paragraphs
    merged_paragraphs = merge_pages(paragraphs_per_page)
    return merged_paragraphs

def merge_pages(paragraphs_per_page):
    merged_paragraphs = []
    pages = list(paragraphs_per_page.keys())
    for i in range(len(pages) - 1):
        current_page_paragraphs = paragraphs_per_page[pages[i]]
        next_page_paragraphs = paragraphs_per_page[pages[i + 1]]
        if current_page_paragraphs and not current_page_paragraphs[-1].endswith('.'):
            merged_paragraph = current_page_paragraphs[-1] + " " + next_page_paragraphs[0]
            current_page_paragraphs[-1] = merged_paragraph
            next_page_paragraphs = next_page_paragraphs[1:]
        merged_paragraphs.append((pages[i], current_page_paragraphs))
        if next_page_paragraphs:
            merged_paragraphs.append((pages[i + 1], next_page_paragraphs))
    return merged_paragraphs

In [9]:
merged_paragraphs = process_paragraphs(extracted_text)

In [144]:
data = []
for page_num, paragraphs in merged_paragraphs:
    for para_num, para in enumerate(paragraphs, start=1):
        data.append((page_num, para_num, para))

para = pd.DataFrame(data, columns=["Page Number", "Paragraph Number", "Paragraph"])

document_name = [document_1] * len(para)
para['Document Name'] = document_name
para['Author'] = 'Environmental Protection Agency'
para['Year'] = 2024
print(para.head(5))

  Page Number  Paragraph Number  \
0     Page 21                 1   
1     Page 21                 2   
2     Page 21                 3   
3     Page 21                 4   
4     Page 21                 5   

                                           Paragraph  \
0  Chapter 1 Introduction\nBackground and Context...   
1  The objective of the NIR is to describe the me...   
2  Introduction and Reporting Requirements under ...   
3  The NIR is compiled according to the structure...   
4  In addition, detailed documentation of methods...   

                                 Document Name  \
0  Ireland â__ s National Inventory Report.pdf   
1  Ireland â__ s National Inventory Report.pdf   
2  Ireland â__ s National Inventory Report.pdf   
3  Ireland â__ s National Inventory Report.pdf   
4  Ireland â__ s National Inventory Report.pdf   

                            Author  Year  
0  Environmental Protection Agency  2024  
1  Environmental Protection Agency  2024  
2  Environmental Prote

## 2. Ireland's Climate Change Assessment Volume 2.pdf

In [11]:
document_2 = "Irelandâ__s Climate Change Assessment Volume 2..pdf"
pdf_path_2 = os.path.join(pdf_folder, document_2)

In [19]:
extracted_text_2 = {}

try:
    with pdfplumber.open(pdf_path_2) as pdf:
        total_pages = len(pdf.pages)
        for i in range(7, 187):
            page = pdf.pages[i]
            text = page.extract_text()
    
            if text:
                lines = text.strip().split("\n")  
                cleaned_text = "\n".join(lines[:-1]) 
                extracted_text_2[f"Page {i+1}"] = cleaned_text

except Exception as e:
    print(f"Error processing {pdf_file}: {e}")

In [20]:
for page, text in list(extracted_text_2.items())[:5]:  
    print(f"\nExtracted text from {page}:\n{'='*50}\n{text}\n{'='*50}\n")


Extracted text from Page 8:
2 Environmental Protection Agency
0502
yb
ytilartueN
etamilC
gniveihcA
Climate Neutral Ireland
Policy Evidence Economics Society
We have the political We know there are We know there are We must bring
ambition and are in the technical and feasible economic opportunities everyone with us;
process of translating pathways in energy but but also costs and citizens, communities,
that into policies evidence in agriculture impacts that must be businesses, farmers,
and land use is limited managed etc.
Figure SPM.1 Ireland’s strategy towards climate neutrality1.
A. The starting point: greenhouse gas emissions and climate policy in
Ireland
Since the Industrial Revolution, enhanced levels of atmospheric greenhouse gases, particularly carbon dioxide, have changed
the Earth’s energy balance, resulting in less heat being lost to space. This is causing global warming, which is observed as
increased global average temperatures, changes in precipitation patterns, mean sea l

In [23]:
def clean_pdf_dict(pdf_text_dict):
    cleaned_dict = {}

    for page, text in pdf_text_dict.items():
        lines = text.split("\n")  
        cleaned_lines = []
        skip_next_n = 0  

        for i, line in enumerate(lines):
            if skip_next_n > 0:
                skip_next_n -= 1
                continue  

            if re.search(r'^\d*\s*Environmental Protection Agency\s*\d*$', line.strip()):
                skip_next_n = 5 
                continue  

            unwanted_texts = {"Achieving", "Climate", "Neutrality", "by", "2050", "0502",
                              "yb", "ytilartueN", "etamilC", "gniveihcA"}
            if line.strip() in unwanted_texts:
                continue  

            cleaned_lines.append(line)

        cleaned_dict[page] = "\n".join(cleaned_lines)

    return cleaned_dict

In [24]:
extracted_text_2_cleaned = clean_pdf_dict(extracted_text_2)

for page, text in extracted_text_2_cleaned.items():
    print(f"{page}:\n{text}\n")

Page 8:
Climate Neutral Ireland
Policy Evidence Economics Society
We have the political We know there are We know there are We must bring
ambition and are in the technical and feasible economic opportunities everyone with us;
process of translating pathways in energy but but also costs and citizens, communities,
that into policies evidence in agriculture impacts that must be businesses, farmers,
and land use is limited managed etc.
Figure SPM.1 Ireland’s strategy towards climate neutrality1.
A. The starting point: greenhouse gas emissions and climate policy in
Ireland
Since the Industrial Revolution, enhanced levels of atmospheric greenhouse gases, particularly carbon dioxide, have changed
the Earth’s energy balance, resulting in less heat being lost to space. This is causing global warming, which is observed as
increased global average temperatures, changes in precipitation patterns, mean sea level rise and changes in the character of
weather extremes.
Greenhouse gases are composed of

In [25]:
for page, text in list(extracted_text_2_cleaned.items())[:5]:  
    print(f"\nExtracted text from {page}:\n{'='*50}")
    
    paragraphs = re.split(r'(?<=\.)\n', text) 
    
    for i, para in enumerate(paragraphs):
        print(f"\n[Paragraph {i+1}]\n{para.strip()}\n{'-'*50}")

    print('='*50)


Extracted text from Page 8:

[Paragraph 1]
Climate Neutral Ireland
Policy Evidence Economics Society
We have the political We know there are We know there are We must bring
ambition and are in the technical and feasible economic opportunities everyone with us;
process of translating pathways in energy but but also costs and citizens, communities,
that into policies evidence in agriculture impacts that must be businesses, farmers,
and land use is limited managed etc.
--------------------------------------------------

[Paragraph 2]
Figure SPM.1 Ireland’s strategy towards climate neutrality1.
--------------------------------------------------

[Paragraph 3]
A. The starting point: greenhouse gas emissions and climate policy in
Ireland
Since the Industrial Revolution, enhanced levels of atmospheric greenhouse gases, particularly carbon dioxide, have changed
the Earth’s energy balance, resulting in less heat being lost to space. This is causing global warming, which is observed as
increase

In [26]:
merged_paragraphs_2 = process_paragraphs(extracted_text_2_cleaned)

In [31]:
data2 = []
for page_num, paragraphs in merged_paragraphs_2:
    for para_num, para in enumerate(paragraphs, start=1):
        data2.append((page_num, para_num, para))

para2 = pd.DataFrame(data2, columns=["Page Number", "Paragraph Number", "Paragraph"])

document_name_2 = [document_2] * len(para2)
para2['Document Name'] = document_name_2
para2['Author'] = 'Environmental Protection Agency'
para2['Year'] = 2023
print(para2.head(5))

  Page Number  Paragraph Number  \
0      Page 8                 1   
1      Page 8                 2   
2      Page 8                 3   
3      Page 8                 4   
4      Page 8                 5   

                                           Paragraph  \
0  Climate Neutral Ireland\nPolicy Evidence Econo...   
1  Figure SPM.1 Ireland’s strategy towards climat...   
2  A. The starting point: greenhouse gas emission...   
3  Greenhouse gases are composed of several diffe...   
4  To address climate change, both Ireland and th...   

                                       Document Name  \
0  Irelandâ__s Climate Change Assessment Volume 2...   
1  Irelandâ__s Climate Change Assessment Volume 2...   
2  Irelandâ__s Climate Change Assessment Volume 2...   
3  Irelandâ__s Climate Change Assessment Volume 2...   
4  Irelandâ__s Climate Change Assessment Volume 2...   

                            Author  Year  
0  Environmental Protection Agency  2023  
1  Environmental Protection A

## 3. Report of the Joint Committee on Climate Action Climate Change: A Cross-Party Consensus for Action

In [32]:
document_3 = "Report, Climate Change_ A Cross-Party Consensus for Action.pdf"
pdf_path_3 = os.path.join(pdf_folder, document_3)

In [34]:
extracted_text_3 = {}

try:
    with pdfplumber.open(pdf_path_3) as pdf:
        total_pages = len(pdf.pages)
        for i in range(13, 136):
            page = pdf.pages[i]
            text = page.extract_text()
    
            if text:
                lines = text.strip().split("\n")  
                cleaned_text = "\n".join(lines[:-1]) 
                extracted_text_3[f"Page {i+1}"] = cleaned_text

except Exception as e:
    print(f"Error processing {document_3}: {e}")

In [35]:
for page, text in list(extracted_text_3.items())[:5]:  
    print(f"\nExtracted text from {page}:\n{'='*50}\n{text}\n{'='*50}\n")


Extracted text from Page 14:
Joint Committee on Climate Action | Climate change: a cross-party consensus for action 8
1.2 Citizens’ Assembly Recommendation
The Citizens’ Assembly recommended as follows:
R1: 97% of Members recommend that to ensure climate change is at the centre of policy-making in
Ireland, as a matter of urgency a new or existing independent body should be resourced
appropriately, operate in an open and transparent manner, and be given a broad range of new
functions and powers in legislation to urgently address climate change.
Such functions and powers should include, but not be limited to:
1. To examine any legislative proposals, it considers relevant to its functions and to report
publicly its views on any implications in relation to climate change; the relevant Minister
must respond publicly to the views expressed in a [any] report prior to the progress of the
particular legislative proposal;
2. To propose national sectoral targets for emissions reductions, to be i

In [42]:
def clean_footer_3(pdf_text_dict):
    cleaned_dict = {}

    footer_pattern = re.compile(r'Joint Committee on Climate Action \| Climate change: a cross-party consensus for action \d+\n?', re.MULTILINE)

    for page, text in pdf_text_dict.items():
        cleaned_text = re.sub(footer_pattern, '', text)
        cleaned_dict[page] = cleaned_text.strip()
    
    return cleaned_dict

In [43]:
extracted_text_3_cleaned = clean_footer_3(extracted_text_3)

for page, text in extracted_text_3_cleaned.items():
    print(f"{page}:\n{text}\n")

Page 14:
1.2 Citizens’ Assembly Recommendation
The Citizens’ Assembly recommended as follows:
R1: 97% of Members recommend that to ensure climate change is at the centre of policy-making in
Ireland, as a matter of urgency a new or existing independent body should be resourced
appropriately, operate in an open and transparent manner, and be given a broad range of new
functions and powers in legislation to urgently address climate change.
Such functions and powers should include, but not be limited to:
1. To examine any legislative proposals, it considers relevant to its functions and to report
publicly its views on any implications in relation to climate change; the relevant Minister
must respond publicly to the views expressed in a [any] report prior to the progress of the
particular legislative proposal;
2. To propose national sectoral targets for emissions reductions, to be implemented by the
Oireachtas, with regular review and reporting cycles;
3. To institute proceedings in any Cou

In [44]:
for page, text in list(extracted_text_3_cleaned.items())[:5]:  
    print(f"\nExtracted text from {page}:\n{'='*50}")
    
    paragraphs = re.split(r'(?<=\.)\n', text) 
    
    for i, para in enumerate(paragraphs):
        print(f"\n[Paragraph {i+1}]\n{para.strip()}\n{'-'*50}")

    print('='*50)


Extracted text from Page 14:

[Paragraph 1]
1.2 Citizens’ Assembly Recommendation
The Citizens’ Assembly recommended as follows:
R1: 97% of Members recommend that to ensure climate change is at the centre of policy-making in
Ireland, as a matter of urgency a new or existing independent body should be resourced
appropriately, operate in an open and transparent manner, and be given a broad range of new
functions and powers in legislation to urgently address climate change.
--------------------------------------------------

[Paragraph 2]
Such functions and powers should include, but not be limited to:
1. To examine any legislative proposals, it considers relevant to its functions and to report
publicly its views on any implications in relation to climate change; the relevant Minister
must respond publicly to the views expressed in a [any] report prior to the progress of the
particular legislative proposal;
2. To propose national sectoral targets for emissions reductions, to be implement

In [45]:
merged_paragraphs_3 = process_paragraphs(extracted_text_3_cleaned)

In [46]:
data3 = []
for page_num, paragraphs in merged_paragraphs_3:
    for para_num, para in enumerate(paragraphs, start=1):
        data3.append((page_num, para_num, para))

para3 = pd.DataFrame(data3, columns=["Page Number", "Paragraph Number", "Paragraph"])

document_name_3 = [document_3] * len(para3)
para3['Document Name'] = document_name_3
para3['Author'] = 'Joint Committee on Climate Action'
para3['Year'] = 2019
print(para3.head(5))

  Page Number  Paragraph Number  \
0     Page 14                 1   
1     Page 14                 2   
2     Page 14                 3   
3     Page 14                 4   
4     Page 14                 5   

                                           Paragraph  \
0  1.2 Citizens’ Assembly Recommendation\nThe Cit...   
1  Such functions and powers should include, but ...   
2  Recommendation 1 (R1) of the Citizens’ Assembl...   
3  As will be clear from the proposals in this ch...   
4  1.3 Strengthening the statutory basis for sett...   

                                       Document Name  \
0  Report, Climate Change_ A Cross-Party Consensu...   
1  Report, Climate Change_ A Cross-Party Consensu...   
2  Report, Climate Change_ A Cross-Party Consensu...   
3  Report, Climate Change_ A Cross-Party Consensu...   
4  Report, Climate Change_ A Cross-Party Consensu...   

                              Author  Year  
0  Joint Committee on Climate Action  2019  
1  Joint Committee on Cli

## 4. Climate Change and Sustainability in the Agriculture and Food Sectors.pdf

In [47]:
document_4 = "Climate Change and Sustainability in the Agriculture and Food Sectors.pdf"
pdf_path_4 = os.path.join(pdf_folder, document_4)

In [51]:
extracted_text_4 = {}

try:
    with pdfplumber.open(pdf_path_4) as pdf:
        total_pages = len(pdf.pages)
        for i in range(12, 66):
            page = pdf.pages[i]
            text = page.extract_text()
    
            if text:
                lines = text.strip().split("\n")  
                cleaned_text = "\n".join(lines[:-1]) 
                extracted_text_4[f"Page {i+1}"] = cleaned_text

except Exception as e:
    print(f"Error processing {document_4}: {e}")

In [52]:
for page, text in list(extracted_text_4.items())[:5]:  
    print(f"\nExtracted text from {page}:\n{'='*50}\n{text}\n{'='*50}\n")


Extracted text from Page 13:
An Comhchoiste um Thalmhaíocht, Bia agus Muir Joint Committee on Agriculture, Food and the Marine
13. The Committee recommends that the Organic Farming Scheme continue to be included in
Irish environmental measures, including the Common Agricultural Policy post-2020. An
enhanced and expanded organic farming scheme, focused on emission-reducing practices,
may be of benefit in addressing Irish agricultural emissions.
14. The Committee recommends that a Communications Strategy be developed which highlights
the scale of Irish food production in the dairy and beef sectors, in particular stressing the
levels of efficiency achieved.
15. The Committee recognises the merit of a Single Environmental Area between Ireland and
Northern Ireland, as well as the need for a commonality in approach in addressing GHG
emissions form the island of Ireland as a whole. The Committee recommends an identification
of Member States of the EU, the EEA and the OECD facing similar chal

In [53]:
def clean_footer_4(pdf_text_dict):
    cleaned_dict = {}

    footer_pattern = re.compile(r'An Comhchoiste um Thalmhaíocht, Bia agus Muir Joint Committee on Agriculture, Food and the Marine', re.MULTILINE)

    for page, text in pdf_text_dict.items():
        cleaned_text = re.sub(footer_pattern, '', text)
        cleaned_dict[page] = cleaned_text.strip()
    
    return cleaned_dict

In [54]:
extracted_text_4_cleaned = clean_footer_4(extracted_text_4)

for page, text in extracted_text_4_cleaned.items():
    print(f"{page}:\n{text}\n")

Page 13:
13. The Committee recommends that the Organic Farming Scheme continue to be included in
Irish environmental measures, including the Common Agricultural Policy post-2020. An
enhanced and expanded organic farming scheme, focused on emission-reducing practices,
may be of benefit in addressing Irish agricultural emissions.
14. The Committee recommends that a Communications Strategy be developed which highlights
the scale of Irish food production in the dairy and beef sectors, in particular stressing the
levels of efficiency achieved.
15. The Committee recognises the merit of a Single Environmental Area between Ireland and
Northern Ireland, as well as the need for a commonality in approach in addressing GHG
emissions form the island of Ireland as a whole. The Committee recommends an identification
of Member States of the EU, the EEA and the OECD facing similar challenges, with the
objective of establishing channels of knowledge exchange.
16. The Committee recommends that ways in wh

In [55]:
for page, text in list(extracted_text_4_cleaned.items())[:5]:  
    print(f"\nExtracted text from {page}:\n{'='*50}")
    
    paragraphs = re.split(r'(?<=\.)\n', text) 
    
    for i, para in enumerate(paragraphs):
        print(f"\n[Paragraph {i+1}]\n{para.strip()}\n{'-'*50}")

    print('='*50)


Extracted text from Page 13:

[Paragraph 1]
13. The Committee recommends that the Organic Farming Scheme continue to be included in
Irish environmental measures, including the Common Agricultural Policy post-2020. An
enhanced and expanded organic farming scheme, focused on emission-reducing practices,
may be of benefit in addressing Irish agricultural emissions.
--------------------------------------------------

[Paragraph 2]
14. The Committee recommends that a Communications Strategy be developed which highlights
the scale of Irish food production in the dairy and beef sectors, in particular stressing the
levels of efficiency achieved.
--------------------------------------------------

[Paragraph 3]
15. The Committee recognises the merit of a Single Environmental Area between Ireland and
Northern Ireland, as well as the need for a commonality in approach in addressing GHG
emissions form the island of Ireland as a whole. The Committee recommends an identification
of Member States of

In [59]:
merged_paragraphs_4 = process_paragraphs(extracted_text_4_cleaned)
print(len(merged_paragraphs_4))

104


In [60]:
data4 = []
for page_num, paragraphs in merged_paragraphs_4:
    for para_num, para in enumerate(paragraphs, start=1):
        data4.append((page_num, para_num, para))

para4 = pd.DataFrame(data4, columns=["Page Number", "Paragraph Number", "Paragraph"])

document_name_4 = [document_4] * len(para4)
para4['Document Name'] = document_name_4
para4['Author'] = 'Joint Committee on Agriculture, Food and the Marine'
para4['Year'] = 2018
print(para4.head(5))

  Page Number  Paragraph Number  \
0     Page 13                 1   
1     Page 13                 2   
2     Page 13                 3   
3     Page 13                 4   
4     Page 13                 5   

                                           Paragraph  \
0  13. The Committee recommends that the Organic ...   
1  14. The Committee recommends that a Communicat...   
2  15. The Committee recognises the merit of a Si...   
3  16. The Committee recommends that ways in whic...   
4  17. The Committee recommends that resourcing b...   

                                       Document Name  \
0  Climate Change and Sustainability in the Agric...   
1  Climate Change and Sustainability in the Agric...   
2  Climate Change and Sustainability in the Agric...   
3  Climate Change and Sustainability in the Agric...   
4  Climate Change and Sustainability in the Agric...   

                                              Author  Year  
0  Joint Committee on Agriculture, Food and the M...  

## 5. Environmental Protection Agency Ireland's Environment An Integrated Assessment 2020.pdf

In [61]:
document_5 = "Environmental Protection Agency â__ Ireland's Environment â__ An Integrated Assessment 2020.pdf"
pdf_path_5 = os.path.join(pdf_folder, document_5)

In [80]:
import fitz  
def extract_text_pymupdf(pdf_path, start_page, end_page):
    extracted_data = []

    with fitz.open(pdf_path) as doc:
        for i in range(start_page - 1, end_page):
            page = doc[i]
            text = page.get_text("text")

            if text:
                paragraphs = [p.strip() for p in re.split(r'\n\s*\n+', text) if p.strip()]

                for para_num, para in enumerate(paragraphs, start=1):
                    extracted_data.append((i + 1, para_num, para))  

    df = pd.DataFrame(extracted_data, columns=["Page Number", "Paragraph Number", "Paragraph"])
    return df

In [81]:
extracted_text_5 = extract_text_pymupdf(pdf_path_5, 24, 451)
extracted_text_5

     Page Number  Paragraph Number  \
0             24                 1   
1             25                 1   
2             26                 1   
3             27                 1   
4             28                 1   
..           ...               ...   
454          447                 1   
455          448                 1   
456          449                 1   
457          450                 1   
458          451                 1   

                                             Paragraph  
0                              Chapter 1\nIntroduction  
1    Chapter 1: Introduction\nIntroduction\n1.  Int...  
2    Ireland’s Environment – An Integrated Assessme...  
3    Chapter 1: Introduction\nFigure 1.1  Ireland i...  
4    Ireland’s Environment – An Integrated Assessme...  
..                                                 ...  
454  Chapter 16: Conclusions\nSYSTEM CHANGE – DELIV...  
455  Ireland’s Environment – An Integrated Assessme...  
456  Chapter 16: Conclusions\n

In [84]:
def clean_headers_5(text):
    text = text.replace("Ireland’s Environment – An Integrated Assessment 2020", "")
    text = re.sub(r'Chapter \d+: .*\n?', '', text)
    return text.strip()

In [89]:
extracted_text_5["Paragraph"] = extracted_text_5["Paragraph"].apply(clean_headers_5)

In [100]:
extracted_text_5['Paragraph'][:5]

0                              Chapter 1\nIntroduction
1    Introduction\n1.  Introduction\nToday’s enviro...
2    A positive outcome from the national response ...
3    Figure 1.1  Ireland in the pandemic – environm...
4    4.\t natural capital investment for ecosystem ...
Name: Paragraph, dtype: object

In [102]:
document_name_5 = [document_5] * len(extracted_text_5)
extracted_text_5['Document Name'] = document_name_5
extracted_text_5['Author'] = 'Environmental Protection Agency'
extracted_text_5['Year'] = 2020
extracted_text_5

Unnamed: 0,Page Number,Paragraph Number,Paragraph,Document Name,Author,Year
0,24,1,Chapter 1\nIntroduction,Environmental Protection Agency â__ Ireland's ...,Environmental Protection Agency,2020
1,25,1,Introduction\n1. Introduction\nToday’s enviro...,Environmental Protection Agency â__ Ireland's ...,Environmental Protection Agency,2020
2,26,1,A positive outcome from the national response ...,Environmental Protection Agency â__ Ireland's ...,Environmental Protection Agency,2020
3,27,1,Figure 1.1 Ireland in the pandemic – environm...,Environmental Protection Agency â__ Ireland's ...,Environmental Protection Agency,2020
4,28,1,4.\t natural capital investment for ecosystem ...,Environmental Protection Agency â__ Ireland's ...,Environmental Protection Agency,2020
...,...,...,...,...,...,...
454,447,1,SYSTEM CHANGE – DELIVERY ON SECTORAL AND SOCIE...,Environmental Protection Agency â__ Ireland's ...,Environmental Protection Agency,2020
455,448,1,SYSTEM CHANGE – DELIVERY ON SECTORAL AND SOCIE...,Environmental Protection Agency â__ Ireland's ...,Environmental Protection Agency,2020
456,449,1,SYSTEM CHANGE – DELIVERY ON SECTORAL AND SOCIE...,Environmental Protection Agency â__ Ireland's ...,Environmental Protection Agency,2020
457,450,1,"5. Conclusions\nOverall, Ireland needs a nati...",Environmental Protection Agency â__ Ireland's ...,Environmental Protection Agency,2020


## 6. Compliance with the Nitrates Directive Implications for Ireland.pdf

In [103]:
document_6 = "Compliance with the Nitrates Directive_ Implications for Ireland.pdf"
pdf_path_6 = os.path.join(pdf_folder, document_6)

In [111]:
extracted_text_6 = {}

try:
    with pdfplumber.open(pdf_path_6) as pdf:
        total_pages = len(pdf.pages)
        for i in range(8, 66):
            page = pdf.pages[i]
            text = page.extract_text()
    
            if text:
                lines = text.strip().split("\n")  
                cleaned_text = "\n".join(lines[:-1]) 
                extracted_text_6[f"Page {i+1}"] = cleaned_text

except Exception as e:
    print(f"Error processing {document_6}: {e}")

In [117]:
for page, text in list(extracted_text_6.items())[:5]:  
    print(f"\nExtracted text from {page}:\n{'='*50}\n{text}\n{'='*50}\n")


Extracted text from Page 9:
Compliance with the Nitrates Directive: Implications for Ireland
1. Current Policy Context
Directive 2000/60/EC of the European Parliament and of the Council (the “Water
Framework Directive”) was introduced in October 2000. The stated purpose of the
Water Framework Directive is to “establish a framework for the protection of inland
surface waters, transitional waters, coastal waters and groundwater”1 throughout the
European Union.
The Water Framework Directive requires European Union Member States to achieve
both “good surface water status”,2 in respect of surface waters, and “good groundwater
status”3 in respect of groundwater, in their respective jurisdictions by 2027.4 The
Committee notes that in the context of the Water Framework Directive, “good surface
water status” means the status achieved by a surface water body when both its
ecological status and its chemical status are at least “good”.5
Article 3 of the Water Framework Directive requires each Eur

In [118]:
def clean_footer_6(pdf_text_dict):
    cleaned_dict = {}

    footer_pattern = re.compile(r"Comhlíonadh na Treorach um Níotráití: Na Ciallachais d'Éirinn", re.MULTILINE)

    for page, text in pdf_text_dict.items():
        cleaned_text = re.sub(footer_pattern, '', text)
        cleaned_dict[page] = cleaned_text.strip()
    
    return cleaned_dict

In [119]:
extracted_text_6_cleaned = clean_footer_6(extracted_text_6)

for page, text in extracted_text_6_cleaned.items():
    print(f"{page}:\n{text}\n")

Page 9:
Compliance with the Nitrates Directive: Implications for Ireland
1. Current Policy Context
Directive 2000/60/EC of the European Parliament and of the Council (the “Water
Framework Directive”) was introduced in October 2000. The stated purpose of the
Water Framework Directive is to “establish a framework for the protection of inland
surface waters, transitional waters, coastal waters and groundwater”1 throughout the
European Union.
The Water Framework Directive requires European Union Member States to achieve
both “good surface water status”,2 in respect of surface waters, and “good groundwater
status”3 in respect of groundwater, in their respective jurisdictions by 2027.4 The
Committee notes that in the context of the Water Framework Directive, “good surface
water status” means the status achieved by a surface water body when both its
ecological status and its chemical status are at least “good”.5
Article 3 of the Water Framework Directive requires each European Union Member
St

In [120]:
merged_paragraphs_6 = process_paragraphs(extracted_text_6_cleaned)
print(len(merged_paragraphs_6))

109


In [121]:
data6 = []
for page_num, paragraphs in merged_paragraphs_6:
    for para_num, para in enumerate(paragraphs, start=1):
        data6.append((page_num, para_num, para))

para6 = pd.DataFrame(data6, columns=["Page Number", "Paragraph Number", "Paragraph"])

document_name_6 = [document_6] * len(para6)
para6['Document Name'] = document_name_6
para6['Author'] = 'Joint Committee on Agriculture, Food and the Marine'
para6['Year'] = 2024
print(para6.head(5))

  Page Number  Paragraph Number  \
0      Page 9                 1   
1      Page 9                 2   
2      Page 9                 3   
3      Page 9                 4   
4      Page 9                 5   

                                           Paragraph  \
0  Compliance with the Nitrates Directive: Implic...   
1  The Water Framework Directive requires Europea...   
2                       2 ibid, Article 4 1.(a)(ii).   
3                       3 ibid, Article 4 1.(b)(ii).   
4  4 The original deadlines for the achievement o...   

                                       Document Name  \
0  Compliance with the Nitrates Directive_ Implic...   
1  Compliance with the Nitrates Directive_ Implic...   
2  Compliance with the Nitrates Directive_ Implic...   
3  Compliance with the Nitrates Directive_ Implic...   
4  Compliance with the Nitrates Directive_ Implic...   

                                              Author  Year  
0  Joint Committee on Agriculture, Food and the M...  

## 7. Irelandâ__s Climate Change Assessment Volume 3..pdf

In [124]:
document_7 = "Irelandâ__s Climate Change Assessment Volume 3..pdf"
pdf_path_7 = os.path.join(pdf_folder, document_7)

In [125]:
extracted_text_7 = {}

try:
    with pdfplumber.open(pdf_path_7) as pdf:
        total_pages = len(pdf.pages)
        for i in range(7, 236):
            page = pdf.pages[i]
            text = page.extract_text()
    
            if text:
                lines = text.strip().split("\n")  
                cleaned_text = "\n".join(lines[:-1]) 
                extracted_text_7[f"Page {i+1}"] = cleaned_text

except Exception as e:
    print(f"Error processing {document_7}: {e}")

In [126]:
for page, text in list(extracted_text_7.items())[:5]:  
    print(f"\nExtracted text from {page}:\n{'='*50}\n{text}\n{'='*50}\n")


Extracted text from Page 8:
6 Environmental Protection Agency
etamilC
erutuF
s’dnalerI
rof
deraperP
gnieB
A. The current state of play on adaptation
A.1 Climate change is happening now and we need to adapt. Volume 1 shows that climate change is happening
now. Extreme weather events, together with sea level rise and coastal erosion highlight an adaptation deficit in
Ireland. Actions taken today to reduce vulnerability and exposure and increase resilience will shape the future and
should be seen as an investment rather than a short-term cost. {Chapter 1}
Flooding in Enniscorthy, Co. Wexford, in 2015, just one of many recent extreme events that highlight exposure, vulnerability
and the need for adaptation. Source: Reproduced with permission from Wexford Hub (https://wexfordhub.com/enniscorthy-
impassable/).
A.2 Adaptation offers opportunities and multiple benefits. Climate change impacts will unfold alongside
other social, environmental and economic challenges and development objectives.

In [128]:
def clean_pdf_dict_7(pdf_text_dict):
    cleaned_dict = {}

    for page, text in pdf_text_dict.items():
        lines = text.split("\n")  
        cleaned_lines = []
        skip_next_n = 0  

        for i, line in enumerate(lines):
            if skip_next_n > 0:
                skip_next_n -= 1
                continue  

            if re.search(r'^\d*\s*Environmental Protection Agency\s*\d*$', line.strip()):
                skip_next_n = 5 
                continue  

            unwanted_texts = {"Being", "Prepared", "for", "Ireland’s", "Future", "Climate",
                              "etamilC", "erutuF", "s’dnalerI", "rof","deraperP","gnieB"}
            if line.strip() in unwanted_texts:
                continue  

            cleaned_lines.append(line)

        cleaned_dict[page] = "\n".join(cleaned_lines)

    return cleaned_dict

In [129]:
extracted_text_7_cleaned = clean_pdf_dict_7(extracted_text_7)

for page, text in extracted_text_7_cleaned.items():
    print(f"{page}:\n{text}\n")

Page 8:
A. The current state of play on adaptation
A.1 Climate change is happening now and we need to adapt. Volume 1 shows that climate change is happening
now. Extreme weather events, together with sea level rise and coastal erosion highlight an adaptation deficit in
Ireland. Actions taken today to reduce vulnerability and exposure and increase resilience will shape the future and
should be seen as an investment rather than a short-term cost. {Chapter 1}
Flooding in Enniscorthy, Co. Wexford, in 2015, just one of many recent extreme events that highlight exposure, vulnerability
and the need for adaptation. Source: Reproduced with permission from Wexford Hub (https://wexfordhub.com/enniscorthy-
impassable/).
A.2 Adaptation offers opportunities and multiple benefits. Climate change impacts will unfold alongside
other social, environmental and economic challenges and development objectives. Project Ireland 2040 sets a
pathway for realising national priorities for urban renewal, housing, 

In [130]:
for page, text in list(extracted_text_7_cleaned.items())[:5]:  
    print(f"\nExtracted text from {page}:\n{'='*50}")
    
    paragraphs = re.split(r'(?<=\.)\n', text) 
    
    for i, para in enumerate(paragraphs):
        print(f"\n[Paragraph {i+1}]\n{para.strip()}\n{'-'*50}")

    print('='*50)


Extracted text from Page 8:

[Paragraph 1]
A. The current state of play on adaptation
A.1 Climate change is happening now and we need to adapt. Volume 1 shows that climate change is happening
now. Extreme weather events, together with sea level rise and coastal erosion highlight an adaptation deficit in
Ireland. Actions taken today to reduce vulnerability and exposure and increase resilience will shape the future and
should be seen as an investment rather than a short-term cost. {Chapter 1}
Flooding in Enniscorthy, Co. Wexford, in 2015, just one of many recent extreme events that highlight exposure, vulnerability
and the need for adaptation. Source: Reproduced with permission from Wexford Hub (https://wexfordhub.com/enniscorthy-
impassable/).
--------------------------------------------------

[Paragraph 2]
A.2 Adaptation offers opportunities and multiple benefits. Climate change impacts will unfold alongside
other social, environmental and economic challenges and development objectiv

In [131]:
merged_paragraphs_7 = process_paragraphs(extracted_text_7_cleaned)

In [133]:
data7 = []
for page_num, paragraphs in merged_paragraphs_7:
    for para_num, para in enumerate(paragraphs, start=1):
        data7.append((page_num, para_num, para))

para7 = pd.DataFrame(data7, columns=["Page Number", "Paragraph Number", "Paragraph"])

document_name_7 = [document_7] * len(para7)
para7['Document Name'] = document_name_7
para7['Author'] = 'Environmental Protection Agency'
para7['Year'] = 2023
print(para7.head(5))

  Page Number  Paragraph Number  \
0      Page 8                 1   
1      Page 8                 2   
2      Page 8                 3   
3      Page 9                 1   
4      Page 9                 2   

                                           Paragraph  \
0  A. The current state of play on adaptation\nA....   
1  A.2 Adaptation offers opportunities and multip...   
2  Ireland’s population is also expected to grow ...   
3  Assigning governmental responsibilities for ma...   
4  A8.2 While many sectors have developed adaptat...   

                                       Document Name  \
0  Irelandâ__s Climate Change Assessment Volume 3...   
1  Irelandâ__s Climate Change Assessment Volume 3...   
2  Irelandâ__s Climate Change Assessment Volume 3...   
3  Irelandâ__s Climate Change Assessment Volume 3...   
4  Irelandâ__s Climate Change Assessment Volume 3...   

                            Author  Year  
0  Environmental Protection Agency  2023  
1  Environmental Protection A

## 8. Our Rural Future - Rural Development Policy 2021 - 2025.pdf

In [134]:
document_8 = "Our Rural Future - Rural Development Policy 2021 - 2025.pdf"
pdf_path_8 = os.path.join(pdf_folder, document_8)

In [135]:
extracted_text_8 = extract_text_pymupdf(pdf_path_8, 10, 102)
extracted_text_8

Unnamed: 0,Page Number,Paragraph Number,Paragraph
0,10,1,6\nOur Rural Future represents a new milestone...
1,11,1,7\n“Our Vision is for a thriving rural \nIrela...
2,12,1,8\nInvestment in transport infrastructure and ...
3,13,1,"9\nconnectivity, town centre regeneration, the..."
4,13,2,Optimising the opportunities for rural \ncommu...
...,...,...,...
230,98,1,94\nThis approach recognises that every area i...
231,99,1,95\nImplementation And Oversight\nOur Rural Fu...
232,100,1,"96\nThe Government will work to enhance, devel..."
233,101,1,97\nActions to Achieve Our Ambition \nTo suppo...


In [137]:
def clean_headers_8(text):
    text = text.replace("Our Rural Future    |    Rural Development Policy 2021-2025", "")
    text = re.sub(r'Chapter \d+: .*\n?', '', text)
    return text.strip()

In [138]:
extracted_text_8["Paragraph"] = extracted_text_8["Paragraph"].apply(clean_headers_8)

In [139]:
extracted_text_8['Paragraph'][:5]

0    6\nOur Rural Future represents a new milestone...
1    7\n“Our Vision is for a thriving rural \nIrela...
2    8\nInvestment in transport infrastructure and ...
3    9\nconnectivity, town centre regeneration, the...
4    Optimising the opportunities for rural \ncommu...
Name: Paragraph, dtype: object

In [140]:
document_name_8 = [document_8] * len(extracted_text_8)
extracted_text_8['Document Name'] = document_name_8
extracted_text_8['Author'] = 'Government of Ireland'
extracted_text_8['Year'] = 2021
extracted_text_8

Unnamed: 0,Page Number,Paragraph Number,Paragraph,Document Name,Author,Year
0,10,1,6\nOur Rural Future represents a new milestone...,Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021
1,11,1,7\n“Our Vision is for a thriving rural \nIrela...,Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021
2,12,1,8\nInvestment in transport infrastructure and ...,Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021
3,13,1,"9\nconnectivity, town centre regeneration, the...",Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021
4,13,2,Optimising the opportunities for rural \ncommu...,Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021
...,...,...,...,...,...,...
230,98,1,94\nThis approach recognises that every area i...,Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021
231,99,1,95\nImplementation And Oversight\nOur Rural Fu...,Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021
232,100,1,"96\nThe Government will work to enhance, devel...",Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021
233,101,1,97\nActions to Achieve Our Ambition \nTo suppo...,Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021


In [136]:
extracted_text_8.to_excel(r'para8.xlsx')

# Combining all dataframes

In [143]:
for i, df in enumerate(dataframes):
    print(f"Element {i}: Type {type(df)}")

Element 0: Type <class 'str'>
Element 1: Type <class 'pandas.core.frame.DataFrame'>
Element 2: Type <class 'pandas.core.frame.DataFrame'>
Element 3: Type <class 'pandas.core.frame.DataFrame'>
Element 4: Type <class 'pandas.core.frame.DataFrame'>
Element 5: Type <class 'pandas.core.frame.DataFrame'>
Element 6: Type <class 'pandas.core.frame.DataFrame'>
Element 7: Type <class 'pandas.core.frame.DataFrame'>


In [146]:
dataframes = [para, para2, para3, para4, extracted_text_5, para6, para7, extracted_text_8]
allPublications = pd.concat(dataframes, ignore_index = True)
allPublications

Unnamed: 0,Page Number,Paragraph Number,Paragraph,Document Name,Author,Year
0,Page 21,1,Chapter 1 Introduction\nBackground and Context...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024
1,Page 21,2,The objective of the NIR is to describe the me...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024
2,Page 21,3,Introduction and Reporting Requirements under ...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024
3,Page 21,4,The NIR is compiled according to the structure...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024
4,Page 21,5,"In addition, detailed documentation of methods...",Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024
...,...,...,...,...,...,...
9719,98,1,94\nThis approach recognises that every area i...,Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021
9720,99,1,95\nImplementation And Oversight\nOur Rural Fu...,Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021
9721,100,1,"96\nThe Government will work to enhance, devel...",Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021
9722,101,1,97\nActions to Achieve Our Ambition \nTo suppo...,Our Rural Future - Rural Development Policy 20...,Government of Ireland,2021


In [147]:
def insert_dataframe_to_mongo(df: pd.DataFrame, db_name: str, collection_name: str, uri: str = "mongodb://localhost:27017/"):
    client = MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    
    data = df.applymap(lambda x: float(x) if isinstance(x, (np.float32, np.float64)) else x).to_dict(orient='records')
    
    if data:
        collection.insert_many(data)
        print(f"Inserted {len(data)} records into {db_name}.{collection_name}")
    else:
        print("DataFrame is empty. No records inserted.")
    
    client.close()

In [148]:
insert_dataframe_to_mongo(allPublications, "publications", "all_selected_publications")

  data = df.applymap(lambda x: float(x) if isinstance(x, (np.float32, np.float64)) else x).to_dict(orient='records')


Inserted 9724 records into publications.all_selected_publications
