In [78]:
import fitz
import pandas as pd
import re
import openai
from datetime import datetime

In [64]:
def extract_three_column_text(pdf_path):
    """Extract each page's text from a PDF, reading the page as three equal-width columns.

    Text blocks are assigned to the left, middle or right third of the page
    according to their left edge (x0), ordered top-to-bottom inside each
    column, and the three columns are then concatenated left to right.

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        list[str]: One string per page, with the page's blocks joined by newlines.
    """
    extracted_pages = []

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            image_positions = []
            for img in page.get_images(full=True):
                try:
                    # NOTE(review): get_image_bbox is documented to take the full
                    # get_images() item or the image name; passing the xref
                    # (img[0]) presumably raises ValueError for every image, so
                    # no boxes are collected and the filter below never fires.
                    # Left unchanged because enabling it would also drop text
                    # placed over full-page background images -- confirm intent.
                    bbox = page.get_image_bbox(img[0])
                except ValueError:
                    continue
                if bbox:
                    image_positions.append(bbox)

            column_width = page.rect.width / 3
            columns = ([], [], [])

            # No pre-sort is needed: each column is fully ordered below.
            for block in page.get_text("blocks"):
                # Only the first five fields (x0, y0, x1, y1, text) are used.
                x0, y0, _x1, _y1, text = block[:5]

                # Skip blocks whose top edge lies inside the vertical extent of an image.
                if any(img_bbox[1] <= y0 <= img_bbox[3] for img_bbox in image_positions):
                    continue

                if x0 < column_width:
                    columns[0].append((y0, text))
                elif x0 < 2 * column_width:
                    columns[1].append((y0, text))
                else:
                    columns[2].append((y0, text))

            page_lines = []
            for column in columns:
                column.sort()  # top-to-bottom; ties on y0 fall back to the text
                page_lines.extend(text for _, text in column)

            extracted_pages.append("\n".join(page_lines))

    return extracted_pages

In [66]:
# Extract the strategy PDF page by page and print every page for a visual check.
pdf_text = extract_three_column_text(
    r"C:\Users\0132499s\OneDrive - National University of Ireland, Galway\Documents\Documents\Strategies\Food Vision 2030.pdf"
)
for page_number, page_content in enumerate(pdf_text, start=1):
    # Header line, page text, then a 50-character rule between pages.
    print(f"Page {page_number}:", page_content, "=" * 50, sep="\n")

Page 1:
Food Vision
2030

A World Leader in 
Sustainable Food Systems

i

Page 2:
© Department of Agriculture, Food and the Marine

Original images provided by Bord Iascaigh Mhara,  Bord Bia and Jack Caffrey || The Pimlico Project.

ii

Page 3:
CONTENTS

Foreword	
2

Stakeholder Committee	
6

Executive Summary	
8

Introduction and Context 	
30

Strategic Vision and Structure	
50

Mission 1  
A Climate Smart, Environmentally Sustainable  
Agri-Food Sector	
52

Mission 2  
Viable & Resilient Primary Producers with  
Enhanced Wellbeing	
90

Mission 3  
Food which is Safe, Nutritious and Appealing,  
Trusted and Valued at Home and Abroad	
124

Mission 4  
An Innovative, Competitive & Resilient Agri-Food  
Sector, Driven by Technology and Talent	
143

Monitoring and Implementation Framework	
170

Abbreviations	
184

1

Page 4:
FOREWORD

The 2020s should be the Sustainability Decade for the Irish agri-food sector.  After eighteen months of work, this is the 
conclusion of the 2030 Agri-Food 

In [67]:
def create_dataframe(pdf_text):
    """Split page texts into paragraphs and return them as one row per paragraph.

    A paragraph is a run of text delimited by one or more blank lines; line
    breaks inside a paragraph are replaced by single spaces.

    Args:
        pdf_text: Iterable of page strings, in page order.

    Returns:
        pandas.DataFrame with columns ``"Page Number"`` (1-based) and
        ``"Paragraph"``. Both columns are present even when no paragraph is
        found, so downstream column lookups keep working on empty input.
    """
    columns = ["Page Number", "Paragraph"]
    rows = []

    for page_number, page_text in enumerate(pdf_text, start=1):
        # Blank (or whitespace-only) lines separate paragraphs.
        for para in re.split(r"\n\s*\n", page_text.strip()):
            if para.strip():
                rows.append({"Page Number": page_number, "Paragraph": para.replace("\n", " ")})

    # Passing `columns` pins the schema; without it an empty list yields a frame with no columns.
    return pd.DataFrame(rows, columns=columns)

In [68]:
# One row per paragraph; print the row count, then preview the first rows.
df = create_dataframe(pdf_text)
print(df.shape[0])
df.head()

2664


Unnamed: 0,Page Number,Paragraph
0,1,Food Vision 2030
1,1,A World Leader in Sustainable Food Systems
2,1,i
3,2,"© Department of Agriculture, Food and the Marine"
4,2,Original images provided by Bord Iascaigh Mhar...


In [69]:
def cleaning_paragraph(df):
    """Return a cleaned copy of ``df`` with noisy paragraphs removed.

    A paragraph is dropped when it is not a string, contains no ASCII letter,
    or is shorter than five characters after stripping. Kept paragraphs have
    every character other than ASCII letters, digits, whitespace, ``€`` and
    parentheses removed, runs of three or more whitespace characters
    collapsed to one space, and surrounding whitespace stripped.

    The input frame is left untouched, so re-running the calling cell gives
    the same result.

    Args:
        df: DataFrame with a ``"Paragraph"`` column.

    Returns:
        A new DataFrame with the cleaned ``"Paragraph"`` column, rejected rows
        removed and a fresh 0-based index.
    """
    def clean_text(text):
        # Missing values (None/NaN) cannot be cleaned; treat them as rejected.
        if not isinstance(text, str):
            return None
        if not re.search(r'[a-zA-Z]', text):
            return None
        if len(text.strip()) < 5:
            return None
        text = re.sub(r"[^a-zA-Z0-9\s€()]", "", text)
        text = re.sub(r"\s{3,}", " ", text)
        return text.strip()

    # assign() builds a new frame instead of writing into the caller's one.
    cleaned = df.assign(Paragraph=df["Paragraph"].apply(clean_text))
    return cleaned.dropna(subset=["Paragraph"]).reset_index(drop=True)

In [74]:
# Drop page numbers and other noise, then report how many paragraphs remain.
df_cleaned = cleaning_paragraph(df)
print(df_cleaned.shape[0])
df_cleaned.head()

2223


Unnamed: 0,Page Number,Paragraph
0,1,Food Vision 2030
1,1,A World Leader in Sustainable Food Systems
2,2,Department of Agriculture Food and the Marine
3,2,Original images provided by Bord Iascaigh Mhar...
4,3,CONTENTS


In [73]:
def get_all_stakeholder_from_text(statement):
    """Ask the chat model to list the stakeholder organisations named in ``statement``.

    Args:
        statement: Paragraph text to analyse.

    Returns:
        str: The model's reply with surrounding whitespace stripped. The
        prompt asks for ``**Stakeholder Name**`` / ``**Category**`` /
        ``**Mention in Statement**`` entries, or the sentence
        ``'No stakeholders identified in this statement.'``.
        That same sentence is returned without calling the API when
        ``statement`` is not a string or is blank.
        ``'Error processing the statement.'`` is returned when the request or
        the response handling raises; the error is printed, not re-raised.
    """
    # Nothing to analyse: skip the request and answer in the prompt's own "no result" wording.
    if not isinstance(statement, str) or not statement.strip():
        return "No stakeholders identified in this statement."

    try:
        system_prompt = (
            # The first two literals form one sentence; they previously joined as "organisations.based".
            "You are an assistant designed to analyze text and identify stakeholders organisations "
            "based on text and then categorize them into groups.\n\n"
            "When analyzing a statement:\n"
            "- Identify possible stakeholder organisations that may fall under any of the categories.\n"
            "- Do not extract names of individuals, titles, government or designations (e.g., 'Government', 'Mr. Cotter', 'Minister', 'Deputy Pringle').\n"
            "- If no valid stakeholders are mentioned in the statement, respond with: 'No stakeholders identified in this statement.'\n\n"
            "Provide your output in the following structured format:\n"
            "1. **Stakeholder Name**: (name of the identified stakeholder)\n"
            "2. **Category**: (name of the group the stakeholder belongs to, please respond with only the most suitable category name.)\n"
            "3. **Mention in Statement**: (exact sentence or phrase from the statement mentioning the stakeholder)\n\n"
            "If no stakeholders are identified, respond with: 'No stakeholders identified in this statement.'"
        )

        user_prompt = (
            "Analyze the following statement and identify stakeholders organisations:\n\n"
            f"Statement:\n\"{statement}\"\n\n"
            "List all stakeholders mentioned in the statement along with their categories. Do not extract names of individuals, titles, government or designations."
        )

        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=400,
            temperature=0.2
        )
        return response.choices[0].message.content.strip()

    except Exception as e:
        # Deliberate best-effort: one failed request must not abort a long batch run.
        # Callers have to filter out this sentinel themselves.
        print(f"Error extracting names: {e}")
        return "Error processing the statement."

In [79]:
# One model request per paragraph; the recorded run below took roughly 53 minutes.
print("Start Time: ", datetime.now())
df_cleaned["Extracted Stakeholders All"] = df_cleaned["Paragraph"].apply(
    get_all_stakeholder_from_text
)
print("End Time: ", datetime.now())

Start Time:  2025-03-21 07:27:48.775654
End Time:  2025-03-21 08:21:05.104021


In [84]:
# Keep only paragraphs for which the model returned stakeholders. Besides the
# "no stakeholders" reply, also drop the sentinel that
# get_all_stakeholder_from_text returns when a request failed; otherwise failed
# rows are carried forward as if they named a stakeholder.
df_filtered = df_cleaned[
    ~df_cleaned['Extracted Stakeholders All'].isin([
        'No stakeholders identified in this statement.',
        'Error processing the statement.',
    ])
].reset_index(drop=True)
df_filtered.head()

Unnamed: 0,Page Number,Paragraph,Extracted Stakeholders All
0,2,Department of Agriculture Food and the Marine,1. **Stakeholder Name**: Department of Agricul...
1,2,Original images provided by Bord Iascaigh Mhar...,1. **Stakeholder Name**: Bord Iascaigh Mhara ...
2,4,The 2020s should be the Sustainability Decade ...,1. **Stakeholder Name**: 2030 AgriFood Strateg...
3,4,The Stakeholder Committees terms of reference ...,1. **Stakeholder Name**: Stakeholder Committee...
4,4,In addition to how the market shaped the strat...,1. **Stakeholder Name**: European Green Deal (...


In [85]:
def flatten_stakeholders(df):
    """Parse the model replies into ``Stakeholder``, ``Category`` and ``Mention`` columns.

    Only the first ``**Stakeholder Name**`` / ``**Category**`` /
    ``**Mention in Statement**`` entry of each reply is extracted; any further
    stakeholders listed in the same reply are ignored. A field that cannot be
    found (or a reply that is not a string) yields ``None``.

    The input frame is not modified.

    Args:
        df: DataFrame with an ``'Extracted Stakeholders All'`` column holding
            the raw model replies.

    Returns:
        A new DataFrame with the three parsed columns appended and the
        ``'Extracted Stakeholders All'`` column removed. Row count and index
        are the same as in ``df``.
    """
    # A value runs to the end of its line; \Z also accepts a field on the
    # last line of the reply, which has no trailing newline.
    stakeholder_pattern = re.compile(r'\*\*Stakeholder Name\*\*: (.*?)(?:\n|\Z)')
    category_pattern = re.compile(r'\*\*Category\*\*: (.*?)(?:\n|\Z)')
    quoted_mention_pattern = re.compile(r'\*\*Mention in Statement\*\*: "(.*?)"')
    # Fallback for replies that do not wrap the mention in double quotes.
    bare_mention_pattern = re.compile(r'\*\*Mention in Statement\*\*: (.*?)(?:\n|\Z)')

    def first_group(pattern, text):
        match = pattern.search(text)
        return match.group(1) if match else None

    def extract_stakeholder_info(text):
        if not isinstance(text, str):
            return (None, None, None)

        mention = first_group(quoted_mention_pattern, text)
        if mention is None:
            mention = first_group(bare_mention_pattern, text)

        return (
            first_group(stakeholder_pattern, text),
            first_group(category_pattern, text),
            mention,
        )

    parsed_rows = [extract_stakeholder_info(text) for text in df['Extracted Stakeholders All']]

    # drop() returns a new frame, so the caller's frame keeps its original columns.
    result = df.drop(columns=['Extracted Stakeholders All'])
    for position, column in enumerate(['Stakeholder', 'Category', 'Mention']):
        # Built column by column so an empty input still gets all three columns.
        result[column] = pd.Series(
            [row[position] for row in parsed_rows], index=df.index, dtype=object
        )

    return result

In [86]:
# Parse each raw model reply into Stakeholder / Category / Mention columns.
# flatten_stakeholders uses re.search, so only the first entry of a reply is kept.
stakeholder_documents_flat = flatten_stakeholders(df_filtered)
stakeholder_documents_flat.head()

Unnamed: 0,Page Number,Paragraph,Stakeholder,Category,Mention
0,2,Department of Agriculture Food and the Marine,Department of Agriculture Food and the Marine,Government Agency,Department of Agriculture Food and the Marine
1,2,Original images provided by Bord Iascaigh Mhar...,Bord Iascaigh Mhara,Fisheries Organisation,Original images provided by Bord Iascaigh Mhara
2,4,The 2020s should be the Sustainability Decade ...,2030 AgriFood Strategy Committee,Agricultural Organizations,this is the conclusion of the 2030 AgriFood St...
3,4,The Stakeholder Committees terms of reference ...,Stakeholder Committees,Advisory Groups,The Stakeholder Committees terms of reference ...
4,4,In addition to how the market shaped the strat...,European Green Deal (EGD),Environmental Policy Initiative,The European Green Deal (EGD) including the Fa...


In [87]:
def summarise(data):
    """Add a one-line ``Context`` summary of how each stakeholder appears in its paragraph.

    One chat-completion request is made per row that has a stakeholder. Rows
    whose stakeholder is missing or blank are given
    ``'No relevant context found.'`` without a request.

    The input frame is not modified.

    Args:
        data: DataFrame with ``'Stakeholder'`` and ``'Paragraph'`` columns.

    Returns:
        A new DataFrame equal to ``data`` plus a ``'Context'`` column holding
        the model's stripped reply for each row.

    Raises:
        Whatever ``openai.chat.completions.create`` raises; a failed request
        aborts the run.
    """
    # The system prompt does not depend on the row, so build it once.
    system_prompt = (
        "You are a summarization tool that extracts, in very few words, the context in which a stakeholder is mentioned, "
        "based strictly on the provided paragraph.\n"
        "Return only a single concise summary focusing on the context in which the stakeholders is mentioned in the text .\n"
        "Do not include any other stakeholders in the summary, and do not generate responses for entities that are not explicitly mentioned in relation to the stakeholder.\n"
        "If no meaningful mention of stakeholder is found in the paragraph, return 'No relevant context found'."
    )

    def summarise_row(stakeholder, para):
        """Request the context summary for one stakeholder/paragraph pair."""
        user_prompt = (
            "The following paragraph contains references to multiple stakeholders. "
            "Your task is to summarize the context in which the **Primary Stakeholder** is mentioned only. "
            "Do not extract or include mentions of any other stakeholders. "
            "If no direct mention is found of the  stakeholder, respond with 'No relevant context found'.\n\n"
            f"Stakeholder: {stakeholder}\n"
            f"Context: {para}\n\n"
            "Provide a one-line summary of the context in the text:\n"
            "Summary:"
        )

        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=400,
            temperature=0.2
        )
        return response.choices[0].message.content.strip()

    thoughts = []
    for stakeholder, para in zip(data['Stakeholder'], data['Paragraph']):
        # A missing stakeholder would be sent to the model as the literal "None".
        if pd.isna(stakeholder) or not str(stakeholder).strip():
            thoughts.append('No relevant context found.')
            continue
        thoughts.append(summarise_row(stakeholder, para))

    # assign() returns a new frame rather than adding a column to the caller's one.
    return data.assign(Context=thoughts)

In [88]:
# Add a one-line "Context" summary per stakeholder row.
# summarise() sends one chat-completion request for every row it processes.
context_df = summarise(stakeholder_documents_flat)

In [90]:
# Drop rows where the model found no usable context. The prompt asks for
# 'No relevant context found' (no period) while replies often add one, so
# compare after trimming whitespace and any trailing period.
is_no_context = (
    context_df['Context'].astype(str).str.strip().str.rstrip('.')
    == 'No relevant context found'
)
context_df_filtered = context_df[~is_no_context].reset_index(drop=True)
context_df_filtered.head()

Unnamed: 0,Page Number,Paragraph,Stakeholder,Category,Mention,Context
0,2,Original images provided by Bord Iascaigh Mhar...,Bord Iascaigh Mhara,Fisheries Organisation,Original images provided by Bord Iascaigh Mhara,Original images provided by Bord Iascaigh Mhara.
1,4,The 2020s should be the Sustainability Decade ...,2030 AgriFood Strategy Committee,Agricultural Organizations,this is the conclusion of the 2030 AgriFood St...,The 2030 AgriFood Strategy Committee concluded...
2,4,The Stakeholder Committees terms of reference ...,Stakeholder Committees,Advisory Groups,The Stakeholder Committees terms of reference ...,The Stakeholder Committees are tasked with dev...
3,4,In addition to how the market shaped the strat...,European Green Deal (EGD),Environmental Policy Initiative,The European Green Deal (EGD) including the Fa...,The European Green Deal (EGD) signifies a majo...
4,4,We anticipate that the market and policy envir...,agrifood sector,Industry,the agrifood sector over the next decade,The agrifood sector will face a significantly ...


In [91]:
# Write the final table to an Excel workbook in the current working directory.
# NOTE(review): the file name says "2023" while the source PDF is "Food Vision 2030" -- confirm the name is intended.
context_df_filtered.to_excel(r"Food Vision 2023.xlsx")