In [1]:
import pdfplumber
import pandas as pd
import os
import re

In [2]:
folder_path = r'C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Action Plans'

# Action Plan for Rural Development

In [6]:
pdf_path = r'C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Action Plans\Action Plan for Rural Development..pdf'
tables = []

In [7]:
def find_pages_with_tables(pdf_path):
    pages_with_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start = 1):
            extracted_table = page.extract_table()
            if extracted_table:
                pages_with_tables.append(page_num)
    print(f"\n Pages with tables: {pages_with_tables}")
    return pages_with_tables

In [8]:
pages_with_table = find_pages_with_tables(pdf_path)


 Pages with tables: [1, 2, 11, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51, 52, 53, 54, 57, 58, 59, 60, 62]


In [59]:
REQUIRED_COLUMNS = ['No', 'Action', 'Timeline', 'Responsible Bodies']

def normalize_columns(columns):
    return [col.strip().replace("No.","No") if col else "" for col in columns]
def extract_valid_tables(pdf_path):
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start = 1):
            extracted_table = page.extract_table()
            if extracted_table:
                headers = extracted_table[0]
                normalized_headers = normalize_columns(headers)
                if normalized_headers == REQUIRED_COLUMNS:
                    df = pd.DataFrame(extracted_table[1:], columns = normalized_headers)
                    df['Page'] = page_num
                    tables.append(df)
                else:
                    print(f"Skipping table on page {page_num}")
        if tables:
            final_df = pd.concat(tables, ignore_index = True)
            print("\n Extracted Tables: \n", final_df)
            return final_df
        else:
            print(f"No matching tables found.")
            return None

In [8]:
final_df = extract_valid_tables(pdf_path)

Skipping table on page 1
Skipping table on page 2
Skipping table on page 11
Skipping table on page 18
Skipping table on page 28
Skipping table on page 29
Skipping table on page 40
Skipping table on page 41
Skipping table on page 48
Skipping table on page 52
Skipping table on page 53
Skipping table on page 54
Skipping table on page 62

 Extracted Tables: 
       No                                             Action   Timeline  \
0      1  Support over 600 towns and villages through an...  2017-2019   
1      2  Develop and pilot an initiative to encourage i...    Q3 2017   
2      3  Complementing the Town & Village Renewal Schem...    Q2 2017   
3      4  Finalise and implement the recommendations of ...  2017-2019   
4      5  As part of Rebuilding Ireland, the Action Plan...  2017-2021   
..   ...                                                ...        ...   
247  272  Introduce a trial to lower the water level on ...       2017   
248  273  Evaluate the benefits from any short and

In [16]:
final_df['No'] = pd.to_numeric(final_df['No'], errors = 'coerce')
final_df = final_df.dropna(subset = ['No']).copy()
final_df['No'] = final_df['No'].astype(int)
missing_records = set(range(final_df['No'].min(), final_df['No'].max() + 1)) - set(final_df['No'])
missing_records = sorted(missing_records)
print(missing_records)

[37, 38, 165, 166, 208, 209, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 244, 245, 246, 247, 264]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['No'] = pd.to_numeric(final_df['No'], errors = 'coerce')


In [9]:
final_df.to_excel(r'Action Plan for Rural Development.xlsx')

# Ireland’s Second One Health National Action Plan on Antimicrobial Resistance 2021 - 2025

In [2]:
pdf_path_4 = r'C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Action Plans\Ireland’s Second One Health National Action Plan on Antimicrobial Resistance 2021 - 2025.pdf'
tables_4 = []

In [13]:
REQUIRED_COLUMNS_4_1 = ['No', 'Activity', 'Timeframe']
REQUIRED_COLUMNS_4_2 = ['Strategic Interventions/Actions', 'Responsibility','Timeframe']

def normalize_headers_4(headers):
    return [col.strip().lower() if col else "" for col in headers]
    
def extract_valid_tables_4(pdf_path):
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            extracted_table = page.extract_table()
            if extracted_table:
                headers = extracted_table[0]

                normalized_headers = normalize_headers(headers)
                normalized_required_1 = normalize_headers(REQUIRED_COLUMNS_4_1)
                normalized_required_2 = normalize_headers(REQUIRED_COLUMNS_4_2)

                print(f"Page {page_num} Headers: {headers}")

                if normalized_headers == normalized_required_1 or normalized_headers == normalized_required_2:
                    df = pd.DataFrame(extracted_table[1:], columns=headers)
                    df['Page'] = page_num
                    tables.append(df)
                else:
                    print(f"Skipping table on page {page_num} due to header mismatch.")

        if tables:
            final_df = pd.concat(tables, ignore_index=True)
            print("\n Extracted Tables: \n", final_df)
            return final_df
        else:
            print("No matching tables found.")
            return None


In [14]:
final_df_4 = extract_valid_tables_4(pdf_path_4)

Page 2 Headers: ['2021-2025', '']
Skipping table on page 2 due to header mismatch.
Page 3 Headers: ['2021-2025', '']
Skipping table on page 3 due to header mismatch.
Page 4 Headers: ['2021-2025', '']
Skipping table on page 4 due to header mismatch.
Page 5 Headers: ['2021-2025', '']
Skipping table on page 5 due to header mismatch.
Page 6 Headers: ['2021-2025', '']
Skipping table on page 6 due to header mismatch.
Page 7 Headers: ['2021-2025', '']
Skipping table on page 7 due to header mismatch.
Page 8 Headers: ['', '2021-2025']
Skipping table on page 8 due to header mismatch.
Page 9 Headers: ['', '2021-2025']
Skipping table on page 9 due to header mismatch.
Page 10 Headers: ['', '2021-2025']
Skipping table on page 10 due to header mismatch.
Page 11 Headers: ['']
Skipping table on page 11 due to header mismatch.
Page 12 Headers: ['', '2021-2025']
Skipping table on page 12 due to header mismatch.
Page 13 Headers: ['', '2021-2025']
Skipping table on page 13 due to header mismatch.
Page 14 H

In [15]:
final_df_4.to_excel(r'Ireland’s Second One Health National Action Plan on Antimicrobial Resistance 2021 - 2025.xlsx')

# A Healthy Weight for Ireland - Obesity Policy and Action Plan 2016-2025

In [16]:
pdf_path_5 = r'C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Action Plans\A Healthy Weight for Ireland - Obesity Policy and Action Plan 2016-2025.pdf'
tables_5 = []

In [23]:
REQUIRED_COLUMNS_5 = ['Ref.', 'Action', 'Lead Responsible', 'Partners', 'Timeframe']

def normalize_headers_5(headers):
    """Normalize headers by stripping spaces and converting to lowercase."""
    return [col.strip().lower() if col else "" for col in headers]

def extract_valid_tables_5(pdf_path):
    tables = []
    last_valid_headers = None
    last_df = None 

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            extracted_tables = page.extract_tables()
            
            if not extracted_tables:
                print(f"No tables found on page {page_num}")
                continue

            for table_num, extracted_table in enumerate(extracted_tables, start=1):
                headers = extracted_table[0]
                
                if all(headers):
                    normalized_headers = normalize_headers_5(headers)
                    normalized_required = normalize_headers_5(REQUIRED_COLUMNS_5)

                    print(f"Page {page_num}, Table {table_num} Headers: {headers}") 
                    if normalized_headers == normalized_required:
                        df = pd.DataFrame(extracted_table[1:], columns=headers)
                        df['Page'] = page_num
                        tables.append(df)
                        last_valid_headers = headers
                        last_df = df
                    else:
                        print(f"Skipping table {table_num} on page {page_num} due to header mismatch.")
                
                else:
                    if last_valid_headers and len(extracted_table[0]) == len(last_valid_headers):
                        print(f"Appending continued table on page {page_num}")
                        df = pd.DataFrame(extracted_table, columns=last_valid_headers)
                        df['Page'] = page_num
                        
                        if last_df is not None:
                            last_df = pd.concat([last_df, df], ignore_index=True)  
                            tables[-1] = last_df 
                        else:
                            tables.append(df)
                    else:
                        print(f"Skipping unrecognized table on page {page_num} due to column mismatch.")

        if tables:
            final_df = pd.concat(tables, ignore_index=True)
            print("\n Extracted Tables: \n", final_df)
            return final_df
        else:
            print("No matching tables found.")
            return None

In [24]:
final_df_5 = extract_valid_tables_5(pdf_path_5)

Skipping unrecognized table on page 1 due to column mismatch.
Page 1, Table 2 Headers: ['A Healthy Weight\nfor Ireland']
Skipping table 2 on page 1 due to header mismatch.
No tables found on page 2
No tables found on page 3
No tables found on page 4
Skipping unrecognized table on page 5 due to column mismatch.
No tables found on page 6
Skipping unrecognized table on page 7 due to column mismatch.
No tables found on page 8
Skipping unrecognized table on page 9 due to column mismatch.
No tables found on page 10
Page 11, Table 1 Headers: ['AA HHeeaalltthhyy WWeeiigghhtt ffoorr IIrreellaanndd:: OObbeessiittyy PPoolliiccyy aanndd AAccttiioonn PPllaann 22001166 -- 22002255']
Skipping table 1 on page 11 due to header mismatch.
No tables found on page 12
No tables found on page 13
No tables found on page 14
No tables found on page 15
No tables found on page 16
No tables found on page 17
No tables found on page 18
No tables found on page 19
No tables found on page 20
No tables found on page 21


In [25]:
final_df_5.to_excel(r'A Healthy Weight for Ireland - Obesity Policy and Action Plan 2016-2025.xlsx')

# Ireland's 4th National Biodiversity Plan 2023 to 2030

In [2]:
pdf_path_6 = r"C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Action Plans\Ireland's 4th National Biodiversity Plan 2023 to 2030.pdf"
tables_6 = []

In [24]:
REQUIRED_COLUMNS_6 = ['Target', 'Action\nnumber', 'Action', 'Partner(s)', 'Indicator']

def normalize_headers_6(headers):
    """Normalize headers by stripping spaces and converting to lowercase."""
    return [col.strip().lower() if col else "" for col in headers]

def extract_valid_tables_6(pdf_path):
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            extracted_table = page.extract_table()
            if extracted_table:
                headers = extracted_table[0]

                normalized_headers = normalize_headers_6(headers)
                normalized_required = normalize_headers_6(REQUIRED_COLUMNS_6)

                print(f"Page {page_num} --- Column Headers: {normalized_headers} ---  Required Headers: {normalized_required}")

                if normalized_headers == normalized_required:
                    df = pd.DataFrame(extracted_table[1:], columns=headers)
                    df['Page'] = page_num
                    tables.append(df)
                else:
                    print(f"Skipping table on page {page_num} due to header mismatch.")

        if tables:
            final_df = pd.concat(tables, ignore_index=True)
            print("\n Extracted Tables: \n", final_df)
            return final_df
        else:
            print("No matching tables found.")
            return None

In [25]:
final_df_6 = extract_valid_tables_6(pdf_path_6)

Page 3 --- Column Headers: ['', '', '', ''] ---  Required Headers: ['target', 'action\nnumber', 'action', 'partner(s)', 'indicator']
Skipping table on page 3 due to header mismatch.
Page 10 --- Column Headers: ['the five objectives\nobjective 1: objective 2:\nadopt a whole-of- meet urgent\ngovernment, whole- conservation and\nof-society approach restoration needs\nto biodiversity\nproposed actions include supporting actions will build on\ncapacity and resource reviews existing conservation measures.', '', '', ''] ---  Required Headers: ['target', 'action\nnumber', 'action', 'partner(s)', 'indicator']
Skipping table on page 10 due to header mismatch.
Page 11 --- Column Headers: ['', 'objective 3: objective 4: objective 5:\nsecure nature’s enhance the evidence strengthen ireland’s\ncontribution to base for action on contribution to\npeople biodiversity international\nbiodiversity initiatives\nactions highlight the relationship this objective focuses on biodiversity collaboration with oth

In [26]:
final_df_6.to_excel(r"Ireland's 4th National Biodiversity Plan 2023 to 2030.xlsx")

# Ireland’s National Action Plan on Antimicrobial Resistance 2017 – 2020

In [27]:
pdf_path_7 = r"C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Action Plans\Ireland’s National Action Plan on Antimicrobial Resistance 2017 – 2020.pdf"
tables_7 = []

In [34]:
def extract_all_tables_7(pdf_path):
    tables_dict = {}

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            extracted_tables = page.extract_tables()
            
            if extracted_tables:
                for i, table in enumerate(extracted_tables):
                    print(f"\n🔍 Raw Table {i+1} from Page {page_num}:")
                    for row in table:
                        print(row)
                    df = pd.DataFrame(table)
                    tables_dict[f"Page_{page_num}_Table_{i+1}"] = df
                    print(f"✅ Table {i+1} extracted from Page {page_num}")
            else:
                print(f"⚠️ No table found on Page {page_num}")

    if tables_dict:
        print(f"\n📄 Extracted tables from {len(tables_dict)} pages.")
    else:
        print("⚠️ No tables found in the document.")

    return tables_dict

In [35]:
tables_7 = extract_all_tables_7(pdf_path_7)


🔍 Raw Table 1 from Page 1:
['', '']
✅ Table 1 extracted from Page 1
⚠️ No table found on Page 2
⚠️ No table found on Page 3
⚠️ No table found on Page 4

🔍 Raw Table 1 from Page 5:
['Ministerial Foreword 6\nDepartmental Foreword 7\nKey concepts and definitions 8\n1. Introduction 10\n2. Situational analyses and assessment – Human Health 15\n3. Situational analyses and assessment – Animal Health 38\n4. Situational analyses and assessment – Environmental Health 50\n5. Governance and responsibilities for implementation of\nIreland’s National Action Plan on Antimicrobial Resistance 2017-2020 53\n6. Ireland’s 3-Year Plan - Priority Strategic Interventions and Activities 58', None]
['', 'Strategic Objective 1: Improve awareness and knowledge of antimicrobial resistance 65']
['', 'Strategic Objective 2: Enhance surveillance of antibiotic resistance and antibiotic use 71']
['', 'Strategic Objective 3: Reduce the spread of infection and disease 77']
['', 'Strategic Objective 4: Optimise the use 

In [37]:
print(tables_7)
print(f"Type: {type(tables_7)}, Length: {len(tables_7)}")

{'Page_1_Table_1':   0 1
0    , 'Page_5_Table_1':                                                    0  \
0  Ministerial Foreword 6\nDepartmental Foreword ...   
1                                                      
2                                                      
3                                                      
4                                                      
5                                                      
6  Appendix 1: National Interdepartmental Antimic...   

                                                   1  
0                                               None  
1  Strategic Objective 1: Improve awareness and k...  
2  Strategic Objective 2: Enhance surveillance of...  
3  Strategic Objective 3: Reduce the spread of in...  
4  Strategic Objective 4: Optimise the use of ant...  
5  Strategic Objective 5: Promote research and su...  
6                                               None  , 'Page_6_Table_1':                   0                       

In [40]:
def combine_tables(tables_dict, page_numbers):
    selected_tables = []
    for key, df in tables_dict.items():
        page_num = int(key.split("_")[1])
        if page_num in page_numbers:
            selected_tables.append(df)
            print(f"Adding {key} to final DataFrame")
    if selected_tables:
        final_df = pd.concat(selected_tables, ignore_index = True)
        print("\n Combined DataFrame created successfully.")
        return final_df
    else:
        print("No tables")
        return None

In [49]:
selected_pages = [69,70,71,72,74,75,76,77,78,80,81,82,83,86,87,88,89,90,91,92,93]
final_df_7 = combine_tables(tables_7, selected_pages)

Adding Page_69_Table_1 to final DataFrame
Adding Page_70_Table_1 to final DataFrame
Adding Page_71_Table_1 to final DataFrame
Adding Page_72_Table_1 to final DataFrame
Adding Page_74_Table_1 to final DataFrame
Adding Page_75_Table_1 to final DataFrame
Adding Page_76_Table_1 to final DataFrame
Adding Page_77_Table_1 to final DataFrame
Adding Page_78_Table_1 to final DataFrame
Adding Page_80_Table_1 to final DataFrame
Adding Page_81_Table_1 to final DataFrame
Adding Page_82_Table_1 to final DataFrame
Adding Page_83_Table_1 to final DataFrame
Adding Page_86_Table_1 to final DataFrame
Adding Page_87_Table_1 to final DataFrame
Adding Page_88_Table_1 to final DataFrame
Adding Page_89_Table_1 to final DataFrame
Adding Page_90_Table_1 to final DataFrame
Adding Page_91_Table_1 to final DataFrame
Adding Page_91_Table_2 to final DataFrame
Adding Page_92_Table_1 to final DataFrame
Adding Page_93_Table_1 to final DataFrame

 Combined DataFrame created successfully.


In [50]:
final_df_7

Unnamed: 0,0,1,2,3
0,Strategic Objective 1\nImprove knowledge and a...,,,
1,Strategic interventions,Activities,Responsible,Timeline
2,1.1. Design and\nimplement awareness\nstrategi...,1.1.1 Develop and deliver\nAMR and Infection P...,"DoH, HSE Health &\nWellbeing, HPSC, HSE\nNatio...",Priority 1\nPriority 2\nPriority 1
3,1.2. Education of\nHealthcare workers and\nthe...,1.2. Establish a competency\nframework for AMR...,"HSE Health &\nWellbeing, HPSC, HSE\nNational D...",Priority 1\nPriority 1\nPriority 2\nPriority 2
4,,1.2.5 Undergraduate and\nPost-graduate core cu...,,Priority 2\nPriority 3\nPriority 1
...,...,...,...,...
71,Strategic interventions,Activities,Responsible,Timeline
72,5.1 Carry out health\neconomic analysis of\nco...,5.1.1 Measure all evaluable\ncosts of HCAI & A...,"DoH\nDoH, HSE",Priority 1\nPriority 3
73,5.2 Research in relation\nto development of\nb...,5.2.1. Develop mechanisms\nto facilitate resea...,"DoH, HSE, HRB\nDoH, HSE, HRB HIQA\nHIQA",Priority 3\nPriority 3\nPriority 1
74,5.3 Research in relation\nto development of\nb...,5.3.1 Discuss promotion of\nfurther research i...,"HRB, SFI,\nEnterprise Ireland,\nPharmaceutical...",Priority 1\nPriority 2


In [51]:
final_df_7.to_excel(r"Ireland’s National Action Plan on Antimicrobial Resistance 2017 – 2020.xlsx")

# National Biodiversity Action Plan 2017 - 2021

In [52]:
pdf_path_8 = r"C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Action Plans\National Biodiversity Action Plan 2017 - 2021.pdf"
tables_8 = []

In [53]:
REQUIRED_COLUMNS_8 = ['Action', 'Timeframe', 'Lead/key partners', 'Performance indicators']

def normalize_headers_8(headers):
    return [col.strip().lower() if col else "" for col in headers]
    
def extract_valid_tables_8(pdf_path):
    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            extracted_table = page.extract_table()
            if extracted_table:
                headers = extracted_table[0]

                normalized_headers = normalize_headers_8(headers)
                normalized_required = normalize_headers_8(REQUIRED_COLUMNS_8)

                print(f"Page {page_num} Headers: {headers}")

                if normalized_headers == normalized_required:
                    df = pd.DataFrame(extracted_table[1:], columns=headers)
                    df['Page'] = page_num
                    tables.append(df)
                else:
                    print(f"Skipping table on page {page_num} due to header mismatch.")

        if tables:
            final_df = pd.concat(tables, ignore_index=True)
            print("\n Extracted Tables: \n", final_df)
            return final_df
        else:
            print("No matching tables found.")
            return None

In [54]:
final_df_8 = extract_valid_tables_8(pdf_path_8)

Page 14 Headers: ['In 2015, revenue from overseas tourism amounted to €6 billion13.\nIreland’s natural, unspoilt environment was cited by 86% of visitors\nsurveyed in 2015 as an important reason for visiting Ireland14.']
Skipping table on page 14 due to header mismatch.
Page 29 Headers: ['', '']
Skipping table on page 29 due to header mismatch.
Page 30 Headers: ['Action', 'Timeframe', 'Lead/key partners', 'Performance indicators']
Page 31 Headers: ['Action', 'Timeframe', 'Lead/key partners', 'Performance indicators']
Page 32 Headers: ['Action', 'Timeframe', 'Lead/key partners', 'Performance indicators']
Page 33 Headers: ['Action', 'Timeframe', 'Lead/key partners', 'Performance indicators']
Page 36 Headers: ['', None]
Skipping table on page 36 due to header mismatch.
Page 38 Headers: ['Action', 'Timeframe', 'Lead/key partners', 'Performance indicators']
Page 39 Headers: ['Action', 'Timeframe', 'Lead/key partners', 'Performance indicators']
Page 40 Headers: ['Action', 'Timeframe', 'Lead/

In [55]:
final_df_8.to_excel(r"National Biodiversity Action Plan 2017 - 2021.xlsx")

# Realising our Rural Potential - Action Plan for Rural Development

In [56]:
pdf_path_9 = r"C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Action Plans\Realising our Rural Potential - Action Plan for Rural Development.pdf"

In [60]:
final_df_9 = extract_valid_tables(pdf_path_9)

Skipping table on page 1
Skipping table on page 2
Skipping table on page 11
Skipping table on page 18
Skipping table on page 28
Skipping table on page 29
Skipping table on page 40
Skipping table on page 41
Skipping table on page 48
Skipping table on page 52
Skipping table on page 53
Skipping table on page 54
Skipping table on page 62

 Extracted Tables: 
       No                                             Action   Timeline  \
0      1  Support over 600 towns and villages through an...  2017-2019   
1      2  Develop and pilot an initiative to encourage i...    Q3 2017   
2      3  Complementing the Town & Village Renewal Schem...    Q2 2017   
3      4  Finalise and implement the recommendations of ...  2017-2019   
4      5  As part of Rebuilding Ireland, the Action Plan...  2017-2021   
..   ...                                                ...        ...   
247  272  Introduce a trial to lower the water level on ...       2017   
248  273  Evaluate the benefits from any short and

In [62]:
final_df_9.to_excel(r"Realising our Rural Potential - Action Plan for Rural Development.xlsx")