In [54]:
import json
import re

import fitz  # PyMuPDF
import pandas as pd
import pdfplumber
from docx import Document

In [14]:
def extract_missions(pdf_path, start_page=54):
    """
    Finds the pages of a PDF whose first text line is a "MISSION <n>" heading.

    Args:
        pdf_path (str): Path to the PDF file.
        start_page (int): 1-based page number at which scanning starts.
            Values below 1 are treated as 1.

    Returns:
        pd.DataFrame: One row per matching page with the columns "MISSION"
            (the digits of the heading, kept as a string), "MISSION STATEMENT"
            (every remaining line of the page joined with single spaces) and
            "PAGE NUMBER" (1-based).
    """
    missions = []

    with fitz.open(pdf_path) as doc:
        # Clamp so that a start_page of 0 or less cannot become a negative page index.
        first_index = max(start_page - 1, 0)
        for page_num in range(first_index, len(doc)):
            text = doc[page_num].get_text("text")
            lines = text.split("\n")

            # A heading line plus at least one further line is required.
            if len(lines) < 2:
                continue

            mission_match = re.search(r"MISSION\s*(\d+)", lines[0], re.IGNORECASE)
            if mission_match:
                mission_number = mission_match.group(1)
                mission_statement = " ".join(lines[1:]).strip()
                missions.append([mission_number, mission_statement, page_num + 1])

    return pd.DataFrame(missions, columns=["MISSION", "MISSION STATEMENT", "PAGE NUMBER"])

In [16]:
# NOTE(review): machine-specific absolute path; the PDF must exist at this location for the cell to run.
doc_path = r"C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Strategies\Food Vision 2030.pdf"
df = extract_missions(doc_path, start_page=54)
# Discard the matches found on pages 181 and 182, then renumber the rows 0..n-1 so that
# row lookups by integer label in later cells cannot hit a gap in the index.
df = df[~df["PAGE NUMBER"].isin([181, 182])].reset_index(drop=True)
print(df)

  MISSION                                  MISSION STATEMENT  PAGE NUMBER
0       1  A Climate Smart,  Environmentally  Sustainable...           54
1       2  Viable & Resilient  Primary Producers  with En...           92
2       3  Food which is Safe,  Nutritious and  Appealing...          126
3       4  An Innovative,  Competitive &  Resilient Agri-...          145


In [21]:
def _mission_position_for_page(mission_pages, page_number):
    """Return the position of the mission whose page range contains page_number."""
    if not mission_pages:
        raise IndexError("missions_df has no rows; cannot assign a mission to a goal")
    for pos in range(len(mission_pages) - 1):
        if mission_pages[pos] <= page_number < mission_pages[pos + 1]:
            return pos
    # No bounded range matched: use the last mission. This also covers pages
    # that come before the first mission page.
    return len(mission_pages) - 1


def extract_goals(pdf_path, missions_df):
    """
    Finds the pages of a PDF that start with a "GOAL <n>" heading and attaches
    the mission each of them falls under.

    Args:
        pdf_path (str): Path to the PDF file.
        missions_df (pd.DataFrame): Missions with the columns "MISSION",
            "MISSION STATEMENT" and "PAGE NUMBER". Rows are read by position,
            so the frame's index labels do not matter. The range comparison
            expects the rows to be ordered by ascending page number.

    Returns:
        pd.DataFrame: Columns "MISSION", "MISSION STATEMENT", "GOAL",
            "GOAL STATEMENT" and "PAGE NUMBER" (1-based), sorted by
            "MISSION" then "GOAL" with a fresh 0..n-1 index.

    Raises:
        IndexError: If a goal page is found but missions_df has no rows.
    """
    # Read the mission columns once, by position, instead of label lookups per page.
    mission_pages = missions_df["PAGE NUMBER"].tolist()
    mission_numbers = missions_df["MISSION"].tolist()
    mission_statements = missions_df["MISSION STATEMENT"].tolist()

    goals = []

    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            text = doc[page_num].get_text("text")
            # Blank lines are dropped so the heading is the first remaining line.
            lines = [line.strip() for line in text.split("\n") if line.strip()]
            if len(lines) < 2:
                continue

            goal_match = re.match(r"GOAL\s*(\d+)", lines[0], re.IGNORECASE)
            if not goal_match:
                continue

            goal_number = int(goal_match.group(1))
            goal_statement = " ".join(lines[1:]).strip()

            pos = _mission_position_for_page(mission_pages, page_num + 1)
            goals.append([
                int(mission_numbers[pos]),
                mission_statements[pos],
                goal_number,
                goal_statement,
                page_num + 1,
            ])

    goals_df = pd.DataFrame(goals, columns=["MISSION", "MISSION STATEMENT", "GOAL", "GOAL STATEMENT", "PAGE NUMBER"])
    goals_df = goals_df.sort_values(by=["MISSION", "GOAL"]).reset_index(drop=True)

    return goals_df

In [37]:
# Collect every page that starts with a "GOAL <n>" heading and tag it with its mission.
goals_df = extract_goals(doc_path, df)

In [38]:
def clean_statements(df, columns=("MISSION STATEMENT", "GOAL STATEMENT")):
    """
    Strips a trailing two- or three-digit number from the given text columns.

    Only a number of exactly two or three digits is removed, together with the
    whitespace in front of it. Longer numbers such as a year ("... 2030") are
    left intact.

    Args:
        df (pd.DataFrame): Frame holding the statement columns. Not modified.
        columns (iterable of str): Names of the columns to clean.

    Returns:
        pd.DataFrame: A copy of df with the listed columns cleaned. Missing
            values stay missing; other non-string values are converted to str.
    """
    # The lookbehind stops the match from starting inside a longer digit run.
    trailing_number = re.compile(r"\s*(?<!\d)\d{2,3}$")

    def remove_trailing_numbers(text):
        if not isinstance(text, str):
            is_missing = (
                text is None
                or text is pd.NA
                or (isinstance(text, float) and text != text)  # NaN
            )
            if is_missing:
                return text
            text = str(text)
        return trailing_number.sub("", text)

    df = df.copy()
    for col in columns:
        df[col] = df[col].map(remove_trailing_numbers)

    return df

In [39]:
goals_df = clean_statements(goals_df)
# Hand-entered row for Mission 4 / Goal 1 on page 148
# (presumably not picked up by the automatic extraction; confirm against the PDF).
goals_df.loc[len(goals_df)] = [4, "An Innovative, Competitive & Resilient Agri-Food Sector, Driven by Technology and Talent", 1, "Move to a Challenge-Focused Innovation System", 148]
goals_df = (
    # Drop the goal headings matched on pages 56, 95, 128 and 147.
    goals_df[~goals_df["PAGE NUMBER"].isin([56, 95, 128, 147])]
    # Re-running this cell appends the hand-entered row again; keep a single copy.
    .drop_duplicates(subset=["MISSION", "GOAL", "PAGE NUMBER"])
    .sort_values(by=["MISSION", "GOAL"])
    .reset_index(drop=True)
)
goals_df

Unnamed: 0,MISSION,MISSION STATEMENT,GOAL,GOAL STATEMENT,PAGE NUMBER
0,1,"A Climate Smart, Environmentally Sustainable...",1,Develop a Climate Neutral Agri- Food System so...,58
1,1,"A Climate Smart, Environmentally Sustainable...",2,Restore and Enhance Biodiversity,65
2,1,"A Climate Smart, Environmentally Sustainable...",3,Protect High Status Sites and Contribute to th...,69
3,1,"A Climate Smart, Environmentally Sustainable...",4,"Develop Diverse, Multi-Functional Forests",73
4,1,"A Climate Smart, Environmentally Sustainable...",5,Enhance the Environmental Sustainability of th...,77
5,1,"A Climate Smart, Environmentally Sustainable...",6,"Embed the Agri-Food Sector in the Circular, Re...",81
6,1,"A Climate Smart, Environmentally Sustainable...",7,Strengthen and Invest in Origin Green and Othe...,87
7,2,Viable & Resilient Primary Producers with En...,1,Improve the Competitiveness and Productivity o...,96
8,2,Viable & Resilient Primary Producers with En...,2,Improve the Creation and Equitable Distributio...,108
9,2,Viable & Resilient Primary Producers with En...,3,Increase Primary Producer Diversification & Re...,114


In [49]:
import fitz  # PyMuPDF
import pandas as pd

def extract_goal_pages(pdf_path, goals_df):
    """
    Extracts all text between the start and end pages for each goal.

    A goal's text starts on its own "PAGE NUMBER" and ends on the page before
    the next row's "PAGE NUMBER". The last row runs to the end of the document.
    Rows are read by position, in the order given, so the frame's index labels
    do not matter.

    Args:
        pdf_path (str): Path to the PDF file.
        goals_df (pd.DataFrame): DataFrame containing extracted goals, with the
            columns "MISSION", "MISSION STATEMENT", "GOAL", "GOAL STATEMENT"
            and "PAGE NUMBER" (1-based).

    Returns:
        pd.DataFrame: DataFrame containing goal information with full extracted pages
            ("START PAGE", "END PAGE" and "EXTRACTED TEXT" are added).
    """
    # Read every column once, by position, instead of label lookups per row.
    missions = goals_df["MISSION"].tolist()
    mission_statements = goals_df["MISSION STATEMENT"].tolist()
    goal_numbers = goals_df["GOAL"].tolist()
    goal_statements = goals_df["GOAL STATEMENT"].tolist()
    start_pages = [int(page) for page in goals_df["PAGE NUMBER"].tolist()]
    goal_count = len(start_pages)

    extracted_data = []

    with fitz.open(pdf_path) as doc:
        for i in range(goal_count):
            start_page = start_pages[i]
            end_page = start_pages[i + 1] - 1 if i + 1 < goal_count else len(doc)

            # Pages are 1-based in goals_df and 0-based in the document.
            page_texts = [
                doc[page_num].get_text("text")
                for page_num in range(start_page - 1, end_page)
            ]
            full_text = "\n".join(page_texts).strip()

            extracted_data.append([
                missions[i],
                mission_statements[i],
                goal_numbers[i],
                goal_statements[i],
                start_page,
                end_page,
                full_text
            ])
            print(f"Extracted pages {start_page}-{end_page} for Goal {goal_numbers[i]}")  # Debugging

    return pd.DataFrame(extracted_data, columns=[
        "MISSION", "MISSION STATEMENT", "GOAL", "GOAL STATEMENT", "START PAGE", "END PAGE", "EXTRACTED TEXT"
    ])


In [77]:
# Pull the page text belonging to each goal; the function prints one progress line per goal.
actions_df = extract_goal_pages(doc_path, goals_df)
# Preview the first three rows.
actions_df.head(3)

Extracted pages 58-64 for Goal 1
Extracted pages 65-68 for Goal 2
Extracted pages 69-72 for Goal 3
Extracted pages 73-76 for Goal 4
Extracted pages 77-80 for Goal 5
Extracted pages 81-86 for Goal 6
Extracted pages 87-95 for Goal 7
Extracted pages 96-107 for Goal 1
Extracted pages 108-113 for Goal 2
Extracted pages 114-119 for Goal 3
Extracted pages 120-128 for Goal 4
Extracted pages 129-132 for Goal 1
Extracted pages 133-136 for Goal 2
Extracted pages 137-140 for Goal 3
Extracted pages 141-147 for Goal 4
Extracted pages 148-150 for Goal 1
Extracted pages 151-154 for Goal 2
Extracted pages 155-157 for Goal 3
Extracted pages 158-162 for Goal 4
Extracted pages 163-166 for Goal 5
Extracted pages 167-169 for Goal 6
Extracted pages 170-192 for Goal 7


Unnamed: 0,MISSION,MISSION STATEMENT,GOAL,GOAL STATEMENT,START PAGE,END PAGE,EXTRACTED TEXT
0,1,"A Climate Smart, Environmentally Sustainable...",1,Develop a Climate Neutral Agri- Food System so...,58,64,GOAL 1\nDevelop a Climate Neutral Agri-\nFood ...
1,1,"A Climate Smart, Environmentally Sustainable...",2,Restore and Enhance Biodiversity,65,68,GOAL 2\nRestore and \nEnhance Biodiversity\n6...
2,1,"A Climate Smart, Environmentally Sustainable...",3,Protect High Status Sites and Contribute to th...,69,72,GOAL 3\nProtect High Status Sites and \nContri...


In [78]:
def extract_action_details(text):
    """
    Extracts the ACTION number and ACTION statement from the given text.

    An action is any "ACTION <digits>" marker (case-insensitive). Its statement
    is the first sentence after the marker, i.e. the text up to and including
    the first ".", "!" or "?". A marker with no sentence terminator after it is
    skipped. If the same number occurs more than once, the last occurrence that
    has a sentence wins.

    Args:
        text (str): Text to scan. Any non-string value (for example None or
            NaN from an empty cell) is treated as containing no actions.

    Returns:
        dict: Keys are ACTION numbers (as strings) and values are ACTION statements.
    """
    action_dict = {}

    # Allows use with Series.apply on columns that contain missing values.
    if not isinstance(text, str):
        return action_dict

    # "ACTION" followed by whitespace and a number
    action_pattern = re.compile(r'\bACTION\s+(\d+)', re.IGNORECASE)
    # Shortest run of text that ends in a sentence terminator
    sentence_pattern = re.compile(r'([^.!?]+[.!?])')

    for match in action_pattern.finditer(text):
        action_number = match.group(1)
        remaining_text = text[match.end():].strip()  # Text after the ACTION number

        sentence_match = sentence_pattern.search(remaining_text)
        if sentence_match:
            action_dict[action_number] = sentence_match.group(1).strip()

    return action_dict

In [79]:
# Parse each goal's text into a dict mapping action number to the first sentence after its "ACTION <n>" marker.
actions_df["EXTRACTED ACTIONS"] = actions_df["EXTRACTED TEXT"].apply(extract_action_details)

In [80]:
# One record per (goal row, action) pair: the goal's columns plus "ACTION" and
# "ACTION STATEMENT". Rows whose "EXTRACTED ACTIONS" is not a dict contribute nothing.
rows = [
    {**goal_row.to_dict(), "ACTION": action_number, "ACTION STATEMENT": action_text}
    for _, goal_row in actions_df.iterrows()
    if isinstance(goal_row["EXTRACTED ACTIONS"], dict)
    for action_number, action_text in goal_row["EXTRACTED ACTIONS"].items()
]

# The per-goal dict is no longer needed once it has been expanded into rows.
df_final = (
    pd.DataFrame(rows)
    .drop(columns=["EXTRACTED ACTIONS"])
    .reset_index(drop=True)
)

print(df_final)

     MISSION                                  MISSION STATEMENT  GOAL  \
0          1  A Climate Smart,  Environmentally  Sustainable...     1   
1          1  A Climate Smart,  Environmentally  Sustainable...     1   
2          1  A Climate Smart,  Environmentally  Sustainable...     1   
3          1  A Climate Smart,  Environmentally  Sustainable...     1   
4          1  A Climate Smart,  Environmentally  Sustainable...     1   
..       ...                                                ...   ...   
209        4  An Innovative,  Competitive &  Resilient Agri-...     7   
210        4  An Innovative,  Competitive &  Resilient Agri-...     7   
211        4  An Innovative,  Competitive &  Resilient Agri-...     7   
212        4  An Innovative,  Competitive &  Resilient Agri-...     7   
213        4  An Innovative,  Competitive &  Resilient Agri-...     7   

                                        GOAL STATEMENT  START PAGE  END PAGE  \
0    Develop a Climate Neutral Agri- Food S

In [81]:
# Preview the first two action rows.
df_final.head(2)

Unnamed: 0,MISSION,MISSION STATEMENT,GOAL,GOAL STATEMENT,START PAGE,END PAGE,EXTRACTED TEXT,ACTION,ACTION STATEMENT
0,1,"A Climate Smart, Environmentally Sustainable...",1,Develop a Climate Neutral Agri- Food System so...,58,64,GOAL 1\nDevelop a Climate Neutral Agri-\nFood ...,1,Immediately implement the ‘Ag \nClimatise’ Roa...
1,1,"A Climate Smart, Environmentally Sustainable...",1,Develop a Climate Neutral Agri- Food System so...,58,64,GOAL 1\nDevelop a Climate Neutral Agri-\nFood ...,2,Produce detailed plans by Q2 2022 to \nmanage ...


In [82]:

def extract_action_content(row):
    """
    Returns the text that follows an action's statement, up to the next action.

    The statement is located in the row's text as a literal substring (first
    occurrence). The content runs from the end of that occurrence to the next
    "ACTION <digits>" marker, or to the end of the text when no marker follows.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            "EXTRACTED TEXT" and "ACTION STATEMENT".

    Returns:
        str or None: The stripped content, or None when the text or statement
            is not a string or the statement does not occur in the text.
    """
    text = row["EXTRACTED TEXT"]
    action_statement = row["ACTION STATEMENT"]

    # Missing cells (None / NaN) cannot be searched; report "no content".
    if not isinstance(text, str) or not isinstance(action_statement, str):
        return None
    action_statement = action_statement.strip()

    # Plain substring search: the statement is literal text, not a pattern.
    statement_pos = text.find(action_statement)
    if statement_pos == -1:
        return None

    start_idx = statement_pos + len(action_statement)  # Content starts after the statement
    tail = text[start_idx:]

    # Stop at the next "ACTION <n>" marker, if there is one.
    next_action_match = re.search(r'\bACTION\s+\d+', tail)
    if next_action_match:
        tail = tail[:next_action_match.start()]

    return tail.strip()

In [84]:
# For every action row, capture the text between its statement and the next "ACTION <n>" marker.
df_final["ACTION CONTENT"] = df_final.apply(extract_action_content, axis=1)

# Display the three action columns
print(df_final[["ACTION", "ACTION STATEMENT", "ACTION CONTENT"]])

    ACTION                                   ACTION STATEMENT  \
0        1  Immediately implement the ‘Ag \nClimatise’ Roa...   
1        2  Produce detailed plans by Q2 2022 to \nmanage ...   
2        3  Update Ag Climatise, as required, to \nensure ...   
3        4                         Roll out ‘Carbon Farming’.   
4        5  Ireland will play a leading role in \nshaping ...   
..     ...                                                ...   
209      5  Multiple sources of finance and funding \n(inc...   
210      6  DAFM should lead a multi-stakeholder \nproject...   
211      7  All stakeholders should work together \nto ide...   
212      8  The agri-food sector needs to engage \nwith ch...   
213      9  Build on the experience of the national \ndial...   

                                        ACTION CONTENT  
0    ‘Ag Climatise’ sets a vision for a ‘climate ne...  
1    Ag Climatise makes clear that any increase in ...  
2    Ag Climatise is only a first step in agricu

In [86]:
def remove_newlines(df, columns):
    """
    Replaces every run of line breaks in the given text columns with one space
    and trims leading/trailing whitespace.

    Args:
        df (pd.DataFrame): Frame holding the columns. Not modified.
        columns (str or list of str): Column name(s) to clean. The columns
            must support the pandas `.str` accessor.

    Returns:
        pd.DataFrame: A copy of df with the listed columns cleaned.
    """
    if isinstance(columns, str):
        columns = [columns]

    # Work on a copy so the caller's frame keeps its original text.
    cleaned = df.copy()
    for col in columns:
        cleaned[col] = cleaned[col].str.replace(r'\n+', ' ', regex=True).str.strip()
    return cleaned

# Text columns in which line breaks are collapsed into single spaces.
columns_to_clean = ["MISSION STATEMENT", "GOAL STATEMENT", "EXTRACTED TEXT", "ACTION STATEMENT", "ACTION CONTENT"]

# The cleaned table is kept under the name `action_content`.
action_content = remove_newlines(df_final, columns_to_clean)

In [87]:
# Preview the cleaned table.
action_content.head()

Unnamed: 0,MISSION,MISSION STATEMENT,GOAL,GOAL STATEMENT,START PAGE,END PAGE,EXTRACTED TEXT,ACTION,ACTION STATEMENT,ACTION CONTENT
0,1,"A Climate Smart, Environmentally Sustainable...",1,Develop a Climate Neutral Agri- Food System so...,58,64,GOAL 1 Develop a Climate Neutral Agri- Food Sy...,1,Immediately implement the ‘Ag Climatise’ Road...,‘Ag Climatise’ sets a vision for a ‘climate ne...
1,1,"A Climate Smart, Environmentally Sustainable...",1,Develop a Climate Neutral Agri- Food System so...,58,64,GOAL 1 Develop a Climate Neutral Agri- Food Sy...,2,Produce detailed plans by Q2 2022 to manage t...,Ag Climatise makes clear that any increase in ...
2,1,"A Climate Smart, Environmentally Sustainable...",1,Develop a Climate Neutral Agri- Food System so...,58,64,GOAL 1 Develop a Climate Neutral Agri- Food Sy...,3,"Update Ag Climatise, as required, to ensure c...",Ag Climatise is only a first step in agricultu...
3,1,"A Climate Smart, Environmentally Sustainable...",1,Develop a Climate Neutral Agri- Food System so...,58,64,GOAL 1 Develop a Climate Neutral Agri- Food Sy...,4,Roll out ‘Carbon Farming’.,Ag Climatise commits to a pilot scheme for on-...
4,1,"A Climate Smart, Environmentally Sustainable...",1,Develop a Climate Neutral Agri- Food System so...,58,64,GOAL 1 Develop a Climate Neutral Agri- Food Sy...,5,Ireland will play a leading role in shaping h...,As research progresses on the different charac...


In [88]:
# Write the cleaned table to "actions.xlsx"; the relative path resolves against the current working directory.
action_content.to_excel(r"actions.xlsx")