In [21]:
from pymongo import MongoClient
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import openai
from datetime import datetime
import re
import asyncio
import json 
import ast

# Read selected publications data

In [2]:
client = MongoClient("mongodb://localhost:27017/")
db = client["publications"]
coll = db["all_selected_publications"]

In [3]:
documents = pd.DataFrame(coll.find())
documents.head()

Unnamed: 0,_id,Page Number,Paragraph Number,Paragraph,Document Name,Author,Year
0,67b7667ac805a37c09fcd8e2,Page 21,1,Chapter 1 Introduction\nBackground and Context...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024
1,67b7667ac805a37c09fcd8e3,Page 21,2,The objective of the NIR is to describe the me...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024
2,67b7667ac805a37c09fcd8e4,Page 21,3,Introduction and Reporting Requirements under ...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024
3,67b7667ac805a37c09fcd8e5,Page 21,4,The NIR is compiled according to the structure...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024
4,67b7667ac805a37c09fcd8e6,Page 21,5,"In addition, detailed documentation of methods...",Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024


# Create term document matrix for each of the paragraphs in the publication data

In [4]:
def create_tdm(df):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(df["Paragraph"].fillna(""))
    feature_names = vectorizer.get_feature_names_out()
    tdm_list = []
    for row in tfidf_matrix:
        term_scores = {feature_names[i]: row[0, i] for i in row.nonzero()[1]}
        tdm_list.append(term_scores)
    df["tdm"] = tdm_list
    return df

In [5]:
documents = create_tdm(documents)
documents.head()

Unnamed: 0,_id,Page Number,Paragraph Number,Paragraph,Document Name,Author,Year,tdm
0,67b7667ac805a37c09fcd8e2,Page 21,1,Chapter 1 Introduction\nBackground and Context...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024,"{'chapter': 0.15916505128470432, 'introduction..."
1,67b7667ac805a37c09fcd8e3,Page 21,2,The objective of the NIR is to describe the me...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024,"{'background': 0.1794641387812835, 'and': 0.12..."
2,67b7667ac805a37c09fcd8e4,Page 21,3,Introduction and Reporting Requirements under ...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024,"{'introduction': 0.05931178205837574, 'and': 0..."
3,67b7667ac805a37c09fcd8e5,Page 21,4,The NIR is compiled according to the structure...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024,"{'and': 0.12684540899326205, 'national': 0.103..."
4,67b7667ac805a37c09fcd8e6,Page 21,5,"In addition, detailed documentation of methods...",Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024,"{'and': 0.05671909986558157, 'for': 0.07193373..."


# Query and Rank the paragraphs which contain dairy, cattle and milk using term document matrix.

In [6]:
def rank_dairy(df, terms = ["dairy", "milk", "cattle"]):
    rankings = []
    for tdm_dict in df["tdm"]:
        if isinstance(tdm_dict, dict):
            score = sum(tdm_dict.get(term, 0) for term in terms)
        else:
            score = 0
        rankings.append(score)
    df["Dairy Rank"] = rankings
    df_filtered = df[df["Dairy Rank"] > 0].copy()
    df_filtered = df_filtered.sort_values(by = "Dairy Rank", ascending = False).reset_index(drop = True)
    return df_filtered    

In [7]:
dairy_documents = rank_dairy(documents)
dairy_documents.head()
print(len(dairy_documents))

435


In [8]:
dairy_documents.head()

Unnamed: 0,_id,Page Number,Paragraph Number,Paragraph,Document Name,Author,Year,tdm,Dairy Rank
0,67b7667ac805a37c09fcdee2,Page 173,2,The livestock types relevant for Ireland are a...,Ireland â__ s National Inventory Report.pdf,Environmental Protection Agency,2024,"{'ireland': 0.10717409480625864, 'for': 0.0784...",0.977478
1,67b7667ac805a37c09fce92a,Page 118,3,4\nFigure 6.3 summarises the total CH emission...,Irelandâ__s Climate Change Assessment Volume 2...,Environmental Protection Agency,2023,"{'and': 0.03470422193982509, 'this': 0.1231478...",0.625523
2,67b7667ac805a37c09fce92e,Page 118,4,4\nFigure 6.3 summarises the total CH emission...,Irelandâ__s Climate Change Assessment Volume 2...,Environmental Protection Agency,2023,"{'and': 0.049350503659450366, 'this': 0.116746...",0.593009
3,67b7667ac805a37c09fcf011,Page 15,2,"The dairy sector is particularly important, wi...",Climate Change and Sustainability in the Agric...,"Joint Committee on Agriculture, Food and the M...",2018,"{'and': 0.1056734909370533, 'this': 0.03749820...",0.580111
4,67b7667ac805a37c09fcf015,Page 15,2,"The dairy sector is particularly important, wi...",Climate Change and Sustainability in the Agric...,"Joint Committee on Agriculture, Food and the M...",2018,"{'and': 0.1056734909370533, 'this': 0.03749820...",0.580111


# Do the same thing but using ChatGPT
## But this is taking too much time since it is processing 9000 records and maybe the other method is just as correct and acceptable. So, skipping this for now.

In [9]:
def identify_dairy(df, topcis = ["dairy", "milk", "cattle"]):

    system_prompt = (
        "You are an assistant trained to analyze text and identify specific dairy topics mentioned in the text. "
        "Your task is to carefully examine the provided text and identify significant mentions of the dairy topics. "
        "Only consider a topic to be 'mentioned' if it is central to the discussion or if the text provides"
        "some meaningful information or opinion about the topic."
        "Consider a dairy topic to be mentioned if the text revolves around the terms: Dairy, Milk, and Cattle."
    )
    
    def extract_topics_from_text(text):

        user_prompt = (
            f"The topics to identify are:\n{topics}\n\n"
            "Below is a statement. Identify which topics, dairy, milk, or cattle are significantly mentioned. "
            "Provide a concise list of only the topics that are clearly relevant to dairy, milk, cattle. "
            "If none are relevant, return an empty list.\n\n"
            f"Statement: {text}"
        )
        
        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini", 
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=400,  
                temperature=0.2
            )
            topics_mentioned = response.choices[0].message.content.strip()
            return eval(topics_mentioned) if topics_mentioned.strip().startswith("[") else []
        except Exception as e:
            print(f"Error processing text: {e}")
            return []

    df["Dairy ChatGPT"] = df["Paragraph"].apply(extract_topics_from_text)
    return df

In [10]:
'''
print("Start Time: ", datetime.now())
dairy_topics_documents = identify_dairy(documents)
print(dairy_topics_documents)
print("End Time: ", datetime.now())
'''

'\nprint("Start Time: ", datetime.now())\ndairy_topics_documents = identify_dairy(documents)\nprint(dairy_topics_documents)\nprint("End Time: ", datetime.now())\n'

In [11]:
'''
dairy_topics_documents_filtered = dairy_topics_documents[dairy_topics_documents["Dairy ChatGPT"].apply(lambda x:x != [])]
print(len(dairy_topics_documents_filtered))
'''

'\ndairy_topics_documents_filtered = dairy_topics_documents[dairy_topics_documents["Dairy ChatGPT"].apply(lambda x:x != [])]\nprint(len(dairy_topics_documents_filtered))\n'

# Identify the topics, Emission, Carbon and Livelihood from the dairy identified paragraphs using ChatGPT.

In [12]:
topics = ['carbon', 'neutrality', 'emissions', 'abatement', 
          'water', 'waterways', 'rivers', 'nitrates', 'eutrophication', 'effluent', 
          'discharge', 'inherit', 'nitrogen efficient', 'nitrogen fixing', 
          'succession', 'rural development', 'community', 'social sustainability', 'society', 'economic viability']

In [13]:
def identify_topics(df, topics):

    system_prompt = (
        "You are an assistant trained to analyze text and identify specific topics mentioned in the text. "
        "Your task is to carefully examine the provided text and identify significant mentions of the topics. "
        "Only consider a topic to be 'mentioned' if it is central to the discussion or if the text provides"
        "some meaningful information or opinion about the topic."
    )
    
    def extract_topics_from_text(text):

        user_prompt = (
            f"The topics to identify are:\n{topics}\n\n"
            "Below is a statement. Identify which topics are significantly mentioned. "
            "Provide a concise list of only the topics that are clearly relevant "
            "If none are relevant, return an empty list.\n\n"
            f"Statement: {text}"
        )
        
        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini", 
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=400,  
                temperature=0.2
            )
            topics_mentioned = response.choices[0].message.content.strip()
            return eval(topics_mentioned) if topics_mentioned.strip().startswith("[") else []
        except Exception as e:
            print(f"Error processing text: {e}")
            return []

    df["Topics"] = df["Paragraph"].apply(extract_topics_from_text)
    return df

In [14]:
topics_documents = identify_topics(dairy_documents, topics)
print(topics_documents)

                          _id Page Number  Paragraph Number  \
0    67b7667ac805a37c09fcdee2    Page 173                 2   
1    67b7667ac805a37c09fce92a    Page 118                 3   
2    67b7667ac805a37c09fce92e    Page 118                 4   
3    67b7667ac805a37c09fcf011     Page 15                 2   
4    67b7667ac805a37c09fcf015     Page 15                 2   
..                        ...         ...               ...   
430  67b7667ac805a37c09fcd9a9     Page 48                 1   
431  67b7667ac805a37c09fcd9b4     Page 53                 1   
432  67b7667ac805a37c09fcd9b3     Page 52                 1   
433  67b7667ac805a37c09fcd9ab     Page 50                 1   
434  67b7667ac805a37c09fcd9b5     Page 54                 1   

                                             Paragraph  \
0    The livestock types relevant for Ireland are a...   
1    4\nFigure 6.3 summarises the total CH emission...   
2    4\nFigure 6.3 summarises the total CH emission...   
3    The da

# Identify all the stakeholder from text using ChatGPT

In [15]:
def get_all_stakeholder_from_text(statement):
    try:
        
        system_prompt = (
            "You are an assistant designed to analyze text and identify stakeholders organisations."
            "based on text and then categorize them into groups.\n\n"
            "When analyzing a statement:\n"
            "- Identify possible stakeholder organisations that may fall under any of the categories.\n"
            "- Do not extract names of individuals, titles, government or designations (e.g., 'Government', 'Mr. Cotter', 'Minister', 'Deputy Pringle').\n"
            "- If no valid stakeholders are mentioned in the statement, respond with: 'No stakeholders identified in this statement.'\n\n"
            "Provide your output in the following structured format:\n"
            "1. **Stakeholder Name**: (name of the identified stakeholder)\n"
            "2. **Category**: (name of the group the stakeholder belongs to, please respond with only the most suitable category name.)\n"
            "3. **Mention in Statement**: (exact sentence or phrase from the statement mentioning the stakeholder)\n\n"
            "If no stakeholders are identified, respond with: 'No stakeholders identified in this statement.'"
        )

        user_prompt = (
            f"Analyze the following statement and identify stakeholders organisations:\n\n"
            f"Statement:\n\"{statement}\"\n\n"
            "List all stakeholders mentioned in the statement along with their categories. Do not extract names of individuals, titles, government or designations."
        )

        response = openai.chat.completions.create(
            model="gpt-4o-mini", 
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=400,  
            temperature=0.2
        )
        content =  response.choices[0].message.content.strip()
        return content

    except Exception as e:
        print(f"Error extracting names: {e}")
        return "Error processing the statement."

In [16]:
print("Start Time: ", datetime.now())
topics_documents["Extracted Stakeholders All"] = topics_documents.apply(
    lambda row: get_all_stakeholder_from_text(
        row["Paragraph"]),
    axis=1
)
print("End Time: ", datetime.now())

Start Time:  2025-02-26 12:15:14.201472


KeyboardInterrupt: 

In [None]:
stakeholder_documents = topics_documents[topics_documents["Extracted Stakeholders All"].apply(lambda x: x is not None and 'No stakeholders identified' not in x)]
print(stakeholder_documents)

In [None]:
def flatten_stakeholders(df):
    def extract_stakeholder_info(text):
        stakeholder_pattern = r'\*\*Stakeholder Name\*\*: (.*?)\n'
        category_pattern = r'\*\*Category\*\*: (.*?)\n'
        mention_pattern = r'\*\*Mention in Statement\*\*: "(.*?)"'
        
        stakeholder_match = re.search(stakeholder_pattern, text)
        category_match = re.search(category_pattern, text)
        mention_match = re.search(mention_pattern, text)
    
        stakeholder = stakeholder_match.group(1) if stakeholder_match else None
        category = category_match.group(1) if category_match else None
        mention = mention_match.group(1) if mention_match else None

        return pd.Series([stakeholder, category, mention])
    
    df[['Stakeholder', 'Category', 'Mention']] = df['Extracted Stakeholders All'].apply(extract_stakeholder_info)
    
    df = df.drop(columns=['Extracted Stakeholders All'])
    
    return df

In [None]:
stakeholder_documents_flat = flatten_stakeholders(stakeholder_documents)
stakeholder_documents_flat = stakeholder_documents_flat.drop_duplicates(subset = ["Stakeholder","Mention"])
print(stakeholder_documents_flat)

In [None]:
new_stakeholders = list(set(list(stakeholder_documents_flat['Stakeholder'])))
print(type(new_stakeholders))
new_stakeholders = pd.DataFrame(new_stakeholders)
new_stakeholders.to_excel('allStake.xlsx')

In [None]:
valid_stakeholders = pd.read_excel(r'allStake.xlsx')
print(valid_stakeholders)

In [None]:
stakeholder_documents_flat.to_excel(r'stakeholder_documents_flat.xlsx')

# Identify secondary stakeholder or the object of the sentence using ChatGPT.

In [None]:
def get_stakeholder_from_text_primary_secondary(statement):
    try:
        system_prompt = (
            "You are an expert in information extraction. Your task is to identify stakeholders in a given sentence. "
            "There are two types of stakeholders: \n"
            "**Primary Stakeholder(s)**: These are the main entities that drive, fund, lead, or are responsible for the action or decision described in the text. "
            "They are often government bodies, organizations, institutions, or authorities. If multiple primary stakeholders are present, identify all of them. \n"
            "**Secondary Stakeholder(s) (Object)**: These are the entities that are affected by, receive funding/support, or are involved in the action described. "
            "These could be programmes, policies, companies, projects, or groups of individuals. If multiple secondary stakeholders exist, list them all. \n"
            "When analyzing a statement:\n"
            "- Do not extract names of individuals, titles, government or designations (e.g., 'Government', 'Mr. Cotter', 'Minister', 'Deputy Pringle').\n"
            "- If no valid stakeholders are mentioned in the statement, respond with: 'No stakeholders identified in this statement.'\n\n"
            "Provide your output in the following structured format, as a list of lists, for each pair of Primary and its corresponding stakeholder:\n"
            "[[Primary Stakeholder Name: (name of the identified primary stakeholder), Secondary Stakeholder Name: (name of the identified secondary stakeholder)],[Primary Stakeholder Name 2: (name of the identified primary stakeholder), Secondary Stakeholder Name 2: (name of the identified secondary stakeholder)]]\n"
            "If multiple stakeholder sets / pairs are identified then put it in a new list and inside the outside list.\n"
            "If no stakeholders are identified, respond with: 'No stakeholders identified in this statement.'"
        )

        user_prompt = (
            f"Analyze the following statement and identify stakeholders organisations based on the provided text\n\n"
            f"Statement:\n\"{statement}\"\n\n"
            "List all stakeholders mentioned in the statement along with their categories. Do not extract names of individuals, titles, government or designations."
        )

        response = openai.chat.completions.create(
            model="gpt-4o-mini", 
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=400,  
            temperature=0.05
        )

        content = response.choices[0].message.content.strip()
        return content

    except Exception as e:
        print(f"Error extracting names: {e}")
        return "Error processing the statement."

In [None]:
topics_documents["Extracted Stakeholders"] = topics_documents.apply(lambda row: get_stakeholder_from_text_primary_secondary(row["Paragraph"]),axis=1)

In [None]:
topics_documents.to_excel(r'topics_documents.xlsx')

In [33]:
topics_documents_read = pd.read_excel(r'flattened_both_df.xlsx')

In [34]:
def parse_stakeholders(stakeholder_str):
    stakeholder_str = stakeholder_str.strip('"')
    matches = re.findall(r"Primary Stakeholder Name: (.*?), Secondary Stakeholder Name: (.*?)]", stakeholder_str)
    stakeholders = [{"Primary Stakeholder": primary.strip(), "Secondary Stakeholder": secondary.strip()} for primary, secondary in matches]
    return stakeholders

expanded_rows = []
for _, row in topics_documents_read.iterrows():
    stakeholders = parse_stakeholders(row["Extracted Stakeholders"])
    for stakeholder in stakeholders:
        new_row = row.copy()
        new_row["Primary Stakeholder"] = stakeholder["Primary Stakeholder"]
        new_row["Secondary Stakeholder"] = stakeholder["Secondary Stakeholder"]
        expanded_rows.append(new_row)

new_df = pd.DataFrame(expanded_rows).drop(columns=["Extracted Stakeholders", "_id","tdm","Dairy Rank","Topics","Extracted Stakeholders All"])
new_df.reset_index(drop=True, inplace=True)
print(new_df)

     Page Number  Paragraph Number  \
0       Page 118                 4   
1       Page 118                 4   
2        Page 15                 2   
3        Page 15                 2   
4        Page 15                 2   
...          ...               ...   
1005         360                 1   
1006         360                 1   
1007         336                 1   
1008         336                 1   
1009         336                 1   

                                              Paragraph  \
0     4\nFigure 6.3 summarises the total CH emission...   
1     4\nFigure 6.3 summarises the total CH emission...   
2     The dairy sector is particularly important, wi...   
3     The dairy sector is particularly important, wi...   
4     The dairy sector is particularly important, wi...   
...                                                 ...   
1005  Microbiological contamination\nThe most import...   
1006  Microbiological contamination\nThe most import...   
1007  With r

In [58]:
def summarise(data):
    thoughts = []
    
    for _, row in data.iterrows():
        primary_stakeholder = row['Primary Stakeholder']
        secondary_stakeholder = row['Secondary Stakeholder']
        para = row['Paragraph']
     
        system_prompt = (
            "You are a summarization tool that extracts, in very few words, the context in which a primary stakeholder is mentioned alongside a secondary stakeholder, "
            "based strictly on the provided paragraph.\n"
            "Return only a single concise summary focusing on the interaction or relationship between the given primary and secondary stakeholders.\n"
            "Do not include any other stakeholders in the summary, and do not generate responses for entities that are not explicitly mentioned in relation to both the primary and secondary stakeholders.\n"
            "If no meaningful relationship between the primary and secondary stakeholders is found in the paragraph, return 'No relevant context found'."
        )
        
        user_prompt = (
            f"The following paragraph contains references to multiple stakeholders. "
            f"Your task is to summarize the context in which the **Primary Stakeholder** is mentioned specifically with the **Secondary Stakeholder** only. "
            f"Do not extract or include mentions of any other stakeholders. "
            f"If no direct connection is found between the two stakeholders, respond with 'No relevant context found'.\n\n"
            f"Primary Stakeholder: {primary_stakeholder}\n"
            f"Secondary Stakeholder: {secondary_stakeholder}\n"
            f"Context: {para}\n\n"
            "Provide a one-line summary of their interaction or context in the text:\n"
            "Summary:"
        )
             
        response = openai.chat.completions.create(
            model="gpt-4o-mini", 
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=400,  
            temperature=0.2
        )
    
        result = response.choices[0].message.content.strip()
        thoughts.append(result)

    data['Context'] = thoughts
    return data

In [59]:
context_df = summarise(new_df)

In [60]:
context_df.head()

Unnamed: 0,Page Number,Paragraph Number,Paragraph,Document Name,Author,Year,Primary Stakeholder,Secondary Stakeholder,Context,Tag
0,Page 118,4,4\nFigure 6.3 summarises the total CH emission...,Irelandâ__s Climate Change Assessment Volume 2...,Environmental Protection Agency,2023,Climate Change Advisory Council,Irish agriculture,No relevant context found.,Reports
1,Page 118,4,4\nFigure 6.3 summarises the total CH emission...,Irelandâ__s Climate Change Assessment Volume 2...,Environmental Protection Agency,2023,Central Statistics Office,Irish agriculture,The Central Statistics Office provides data on...,Reports
2,Page 15,2,"The dairy sector is particularly important, wi...",Climate Change and Sustainability in the Agric...,"Joint Committee on Agriculture, Food and the M...",2018,Irish dairy sector,Irish dairy herd,The Irish dairy sector is highlighted as cruci...,Positive
3,Page 15,2,"The dairy sector is particularly important, wi...",Climate Change and Sustainability in the Agric...,"Joint Committee on Agriculture, Food and the M...",2018,ICMSA,Irish beef and dairy export markets,ICMSA highlights the total value of Irish beef...,Reports
4,Page 15,2,"The dairy sector is particularly important, wi...",Climate Change and Sustainability in the Agric...,"Joint Committee on Agriculture, Food and the M...",2018,"Department of Agriculture, Food and the Marine",Irish beef and dairy export markets,"The Department of Agriculture, Food and the Ma...",Reports


In [61]:
def tag_summary(df):
    system_prompt = (
        "You are a tagging assistant that classifies the relationship between a Primary and Secondary Stakeholder based strictly on the provided context and paragraph.\n"
        "Your task is to assign the most relevant **single-word** tag that accurately describes the nature of their interaction.\n"
        "Tags should reflect the core action or relationship expressed in the context, avoiding broad or vague classifications.\n"
        "You may use the following suggested tags, but you are not restricted to them: \n"
        "Positive, Negative, Neutral, Recommendation, Collaboration, Reports, Aims, Inspects, Mentions, Manages, Funds, Implements, Regulates, Supports, Criticizes, Publishes, Proposes, Advises, Evaluates, Monitors, etc.\n"
        "You **must not** assign a tag that does not align with the actual meaning of the context.\n"
        "Ensure the assigned tag accurately represents the specific action or relationship depicted in the context."
    )
    
    tags = []

    for _, row in df.iterrows():
        primary_stakeholder = row['Primary Stakeholder']
        secondary_stakeholder = row['Secondary Stakeholder']
        para = row['Paragraph']
        context = row["Context"]

        user_prompt = (
            f"The following is a description of the interaction between the **Primary Stakeholder** and the **Secondary Stakeholder** based on the given context and paragraph.\n"
            f"Your task is to classify this interaction into the **most accurate** single-word tag.\n\n"
            f"Primary Stakeholder: {primary_stakeholder}\n"
            f"Secondary Stakeholder: {secondary_stakeholder}\n"
            f"Context: {context}\n"
            f"Paragraph: {para}\n\n"
            "Consider the following guidelines:\n"
            "1. **Reports** – Use this tag **only if** the context describes a factual presentation of statistics, research, or findings without any additional action or decision-making.\n"
            "2. **Aims** – If the stakeholder expresses a specific goal or intention for the future.\n"
            "3. **Implements** – If the stakeholder takes action to introduce or enforce a policy, rule, or system.\n"
            "4. **Regulates** – If the stakeholder enforces compliance with laws, guidelines, or industry standards.\n"
            "5. **Supports** – If the stakeholder provides assistance, backing, or encouragement.\n"
            "6. **Criticizes** – If the stakeholder expresses concerns, objections, or disapproval.\n"
            "7. **Publishes** – If the stakeholder is responsible for making official reports, statistics, or information public.\n"
            "8. **Proposes** – If the stakeholder suggests a new policy, plan, or change.\n"
            "9. **Advises** – If the stakeholder gives recommendations or guidance.\n"
            "10. **Evaluates / Monitors** – If the stakeholder assesses or tracks performance, compliance, or outcomes.\n\n"
            "You are not restricted to the above options and may assign another tag if it better fits the context.\n"
            "**Select exactly one tag that best represents the relationship in the context:**"
        )

        response = openai.chat.completions.create(
            model="gpt-4o-mini", 
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=400,  
            temperature=0.2
        )

        tag = response.choices[0].message.content.strip()
        tags.append(tag)
    
    df['Tag'] = tags
    return df


In [62]:
tagged = tag_summary(context_df)

In [63]:
new_df.to_excel(r'primary_secondary.xlsx')