In [1]:
from pymongo import MongoClient
import pandas as pd
from rapidfuzz import fuzz, process
from itertools import permutations
import openai
from tqdm import tqdm
import time
import math
from datetime import datetime
import re

In [4]:
doc_list = pd.read_excel(r'C:\Jupyter Notebook\Relevant Docs\final transcripts.xlsx')
doc_list = doc_list['Doc'].tolist()
topHundred = [s.lower().replace('.pdf','') for s in doc_list[:100]]
print(len(topHundred))

100


In [6]:
client = MongoClient('mongodb://localhost:27017/')
db_pdf = client['transcripts']
collection_pdf = db_pdf['P1_speaker_speech']
data = list(collection_pdf.find({"file_name" : {"$in" : topHundred}}))
data = pd.DataFrame(data)
print(data.head())

                        _id  \
0  67117806b775f29fa83977da   
1  67117806b775f29fa83977db   
2  67117806b775f29fa83977dc   
3  67117806b775f29fa83977dd   
4  67117807b775f29fa83977de   

                                           file_name   ID  Year Sliding_year  \
0  joint committee on climate action debate - wed...  234  2018    2018-2020   
1  joint committee on climate action debate - wed...  234  2018    2018-2020   
2  joint committee on climate action debate - wed...  234  2018    2018-2020   
3  joint committee on climate action debate - wed...  234  2018    2018-2020   
4  joint committee on climate action debate - wed...  234  2018    2018-2020   

                  Speaker                                         Exact Text  \
0                Chairman  I welcome members of the committee and viewers...   
1  Ms Justice Mary Laffoy  I am grateful to the Chairman and members of t...   
2                Chairman  I thank Ms Justice Laffoy for all her work and...   
3  Ms Justic

In [7]:
topics = ['carbon', 'neutrality', 'emissions', 'abatement', 
          'water', 'waterways', 'rivers', 'nitrates', 'eutrophication', 'effluent', 
          'discharge', 'inherit', 'nitrogen efficient', 'nitrogen fixing', 
          'succession', 'rural development', 'community', 'social sustainability', 'society', 'economic viability']

In [8]:
def identify_topics(df, topics):

    system_prompt = (
        "You are an assistant trained to analyze text and identify specific topics mentioned in the text. "
        "Your task is to carefully examine the provided text and identify significant mentions of the topics. "
        "Only consider a topic to be 'mentioned' if it is central to the discussion or if the text provides"
        "some meaningful information or opinion about the topic."
    )
    
    def extract_topics_from_text(text):

        user_prompt = (
            f"The topics to identify are:\n{topics}\n\n"
            "Below is a statement. Identify which topics are significantly mentioned. "
            "Provide a concise list of only the topics that are clearly relevant "
            "If none are relevant, return an empty list.\n\n"
            f"Statement: {text}"
        )
        
        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini", 
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                max_tokens=400,  
                temperature=0.2
            )
            topics_mentioned = response.choices[0].message.content.strip()
            return eval(topics_mentioned) if topics_mentioned.strip().startswith("[") else []
        except Exception as e:
            print(f"Error processing text: {e}")
            return []

    df["Topics"] = df["Exact Text"].apply(extract_topics_from_text)
    return df

In [9]:
result_df = identify_topics(data, topics)
print(result_df)

                            _id  \
0      67117806b775f29fa83977da   
1      67117806b775f29fa83977db   
2      67117806b775f29fa83977dc   
3      67117806b775f29fa83977dd   
4      67117807b775f29fa83977de   
...                         ...   
31100  672cd5e2106e3b7eac9fca5f   
31101  672cd5e2106e3b7eac9fca60   
31102  672cd5e2106e3b7eac9fca61   
31103  672cd5e2106e3b7eac9fca62   
31104  672cd5e2106e3b7eac9fca63   

                                               file_name   ID  Year  \
0      joint committee on climate action debate - wed...  234  2018   
1      joint committee on climate action debate - wed...  234  2018   
2      joint committee on climate action debate - wed...  234  2018   
3      joint committee on climate action debate - wed...  234  2018   
4      joint committee on climate action debate - wed...  234  2018   
...                                                  ...  ...   ...   
31100        seanad éireann debate - tuesday, 5 apr 2022  513  2022   
31101      

In [10]:
def summarise(refs, context):
     
     prompt = (
         f" The attached list comprises Topics extracted from the source text. "
         f" For each item in the list, provide a one line summary of the position expressed by the speaker of the text in relation to the item, agriculture and sustainability "
         f" Refer specifically to the text only and do not generalise beyond the statements in the text."
         f" Do not include any feedback in your answer \n "
         f" List :\n{refs}\n\n"
         f" Source Text :\n{context}\n\n"
         " Returned items:"
     )
     
     system_prompt = (
        "You are a tool that summarises, in one line, the speaker's position on each of the listed Topics, that have been extracted from the attached text. "
        "When processing a list of topics, return the results as a list of summaries. "
        "Each summary must correspond to the topic in the same order as they are provided in the input list. "
        "Ensure each summary is concise, specific, and does not include additional context beyond what is explicitly stated in the text. "
        "If a topic is not discussed in the text, indicate 'No position expressed' for that topic."
    ) 
     
     response = openai.chat.completions.create(
            model="gpt-4o-mini", 
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}
            ],
            max_tokens=400,  
            temperature=0.2
        )

     return response.choices[0].message.content.strip()

In [11]:
def extract_thoughts_summary(row, summarise_function):
    speaker = row['Speaker']
    topics = row['Topics']
    context = row['Exact Text']

    if not topics or not context:
        return {}

    system_prompt = (
        "You are an AI assistant tasked with analyzing the stance of a speaker on specific topics. "
        "Your job is to extract the speaker's thoughts or stance on each topic mentioned in the provided context."
    )
    user_prompt_template = (
        f"Speaker: {speaker}\n"
        f"Topic: {topics}\n"
        f"Context: {context}\n\n"
        "Analyze the speaker's stance on the topic mentioned and provide:\n"
        "1. The extracted thought or summary of their stance."
    )

    thoughts_dict = {}

    for topic in topics:
        try:
            user_prompt = user_prompt_template.format(speaker=speaker, topic=topic, context=context)
            
            response = summarise_function(system_prompt, user_prompt)

            thought_summary = "No specific thought expressed."
            if isinstance(response, str):
                lines = response.split("\n")
                for line in lines:
                    if line.lower().startswith("1."):
                        thought_summary = line.split("1.", 1)[-1].strip()

            thoughts_dict[topic] = {
                "Thought": thought_summary or "No specific thought expressed."
            }

        except Exception as e:
            print(f"Error processing topic '{topic}' for Speaker '{speaker}': {e}")
            thoughts_dict[topic] = {
                "Thought": "Error processing thought."
            }

    return thoughts_dict

In [12]:
data['Topic Thoughts'] = data.apply(lambda row: extract_thoughts_summary(row, summarise), axis = 1)

In [13]:
data.head()

Unnamed: 0,_id,file_name,ID,Year,Sliding_year,Speaker,Exact Text,Category,Text,term_document_matrix,term_document_matrix_dept,extracted_pol,Extracted Stakeholders,Mapped Formal Policy,Topic,Exact Text For Combined,Topics,Topic Thoughts
0,67117806b775f29fa83977da,joint committee on climate action debate - wed...,234,2018,2018-2020,Chairman,I welcome members of the committee and viewers...,"Climate Action, Environment and Communication",members committee viewers ceedgs Oirechts dy s...,"{'17': 1, '2009': 1, 'absolute': 1, 'act': 1, ...","{'17(2)(l': 1, '2009': 1, 'a': 6, 'absolute': ...",Citizens’ Assembly,Citizens’ Assembly,,,,[],{}
1,67117806b775f29fa83977db,joint committee on climate action debate - wed...,234,2018,2018-2020,Ms Justice Mary Laffoy,I am grateful to the Chairman and members of t...,"Climate Action, Environment and Communication",Chirmn members committee vittion ddress dy wor...,"{'10': 1, '11': 1, '13': 4, '15': 1, '17': 1, ...","{'1': 4, '10': 1, '11': 1, '13': 4, '15': 1, '...","Citizens’ Assembly, national dialogue on clima...","Citizens’ Assembly, national dialogue on clima...",,,,[],{}
2,67117806b775f29fa83977dc,joint committee on climate action debate - wed...,234,2018,2018-2020,Chairman,I thank Ms Justice Laffoy for all her work and...,"Climate Action, Environment and Communication",thnk Lffoy work synops recommendtions orr thre...,"{'able': 1, 'address': 1, 'all': 1, 'an': 1, '...","{'able': 1, 'address': 1, 'all': 1, 'an': 1, '...",,,,,,[emissions],{'emissions': {'Thought': 'The speaker acknowl...
3,67117806b775f29fa83977dd,joint committee on climate action debate - wed...,234,2018,2018-2020,Ms Justice Mary Laffoy,The Chairman is referring to one of the exampl...,"Climate Action, Environment and Communication",Chirmn one exmples ommendtion body function po...,"{'action': 1, 'an': 2, 'and': 2, 'as': 2, 'be'...","{'1': 1, 'a': 1, 'action': 1, 'an': 2, 'and': ...",,,,,,[],{}
4,67117807b775f29fa83977de,joint committee on climate action debate - wed...,234,2018,2018-2020,Ms Justice Mary Laffoy,Such an injunction would be an order for the S...,"Climate Action, Environment and Communication",junction would orr Stte job body would court t...,"{'action': 1, 'ages': 1, 'an': 2, 'and': 2, 'a...","{'a': 2, 'action': 1, 'ages': 1, 'an': 2, 'and...",,,,,,[],{}


In [14]:
def clean_policy_thoughts(thoughts_dict):
    if isinstance(thoughts_dict, dict):
        return {policy: thought for policy, thought in thoughts_dict.items() if thought not in ["No specific thought expressed", "No position expressed"]}
    return thoughts_dict

data['Topic Thoughts'] = data['Topic Thoughts'].apply(clean_policy_thoughts)

In [15]:
def flatten_thoughts_dataframe(df, speaker_col="Speaker", thoughts_col="Topic Thoughts"):

    flattened_rows = []

    for _, row in df.iterrows():
        speaker = row[speaker_col]
        thoughts_data = row[thoughts_col]

        if isinstance(thoughts_data, dict):
            for topic, details in thoughts_data.items():
                thought = details.get("Thought", "No specific thought expressed.")
                flattened_rows.append((speaker, topic, thought))

    flattened_df = pd.DataFrame(flattened_rows, columns=["Speaker", "Topics", "Topic Thoughts"])
    return flattened_df

In [16]:
flattened_data = flatten_thoughts_dataframe(data)
print(flattened_data)

                      Speaker                 Topics  \
0                    Chairman              emissions   
1      Ms Justice Mary Laffoy                 carbon   
2      Ms Justice Mary Laffoy              emissions   
3      Ms Justice Mary Laffoy              abatement   
4          Deputy Paul Murphy                 carbon   
...                       ...                    ...   
11190  Senator  Aisling Dolan                  water   
11191  Senator  Aisling Dolan              waterways   
11192  Senator  Aisling Dolan                 rivers   
11193  Senator  Aisling Dolan              community   
11194  Senator  Aisling Dolan  social sustainability   

                                          Topic Thoughts  
0      The speaker acknowledges the potential for the...  
1      The speaker supports a balanced approach to ca...  
2      The speaker supports a balanced approach to ag...  
3      The speaker supports a balanced approach to ag...  
4                               

In [17]:
def extract_sentiment_from_thoughts(df):
    system_prompt = (
        "You are a sentiment analysis assistant. Your task is to classify the sentiment of a speaker's thought "
        "on a specific policy into one of three categories: Support, Oppose, or Neutral. The sentiment should be "
        "based only on the given thought, and the response must be a single word: Support, Oppose, or Neutral."
    )
    
    sentiments = []

    for _, row in df.iterrows():
        speaker = row['Speaker']
        topics = row["Topics"]
        thought = row["Topic Thoughts"]

        user_prompt = (
            f"Topic: {topics}\n"
            f"Thought: {thought}\n\n"
            "Based on the thought, identify the sentiment as one of the following options:\n"
            "1. Support: If the speaker's words express agreement, endorsement, or positivity toward the topics.\n"
            "2. Oppose: If the speaker's words express disagreement, rejection, or negativity toward the topics.\n"
            "3. Neutral: If the speaker's words do not clearly indicate support or opposition.\n\n"
            "Provide your response as exactly one word: Support, Oppose, or Neutral."
        )

        response = openai.chat.completions.create(
            model="gpt-4o-mini", 
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            max_tokens=400,  
            temperature=0.2
        )

        sentiment = response.choices[0].message.content.strip()
        sentiments.append(sentiment)
    
    df['Sentiment'] = sentiments
    return df


In [18]:
sentiment_data = extract_sentiment_from_thoughts(flattened_data)

In [26]:
sentiment_data

Unnamed: 0,Speaker,Topics,Topic Thoughts,Sentiment
0,Mr. Aaron Forde,sustainability,The speaker supports sustainability initiative...,Support
1,Mr. Aaron Forde,economic viability,The speaker supports sustainability initiative...,Support
2,Chairman,carbon,No position expressed,Neutral
3,Dr. James Glynn,carbon,The speaker supports the use of carbon capture...,Support
4,Dr. James Glynn,neutrality,The speaker supports the use of carbon capture...,Support
...,...,...,...,...
373,Mr. Leo Clancy,abatement,No position expressed,Neutral
374,Ms Alexa Toomey,carbon,No position expressed on carbon.,Neutral
375,Ms Alexa Toomey,neutrality,No position expressed on carbon.,Neutral
376,Ms Alexa Toomey,emissions,No position expressed,Neutral


In [19]:
Dairy = ['dairy', 'milk', 'cattle'] 
Carbon = ['carbon', 'neutrality', 'emissions', 'abatement', 'nitrogen efficient', 'nitrogen fixing', 'sustainability'] 
Water = ['water', 'waterways', 'rivers', 'nitrates', 'eutrophication', 'effluent', 'discharge']
Livelihood = ['inherit', 'succession', 'rural development', 'community', 'social sustainability', 'society', 'economic viability']

def categorize_topic(topics):
    if topics in Dairy:
        return 'Dairy'
    elif topics in Carbon:
        return 'Carbon'
    elif topics in Water:
        return 'Water'
    elif topics in Livelihood:
        return 'Livelihood'
    else:
        return 'Other'

sentiment_data['Topic Category'] = sentiment_data['Topics'].apply(categorize_topic)
sentiment_data = sentiment_data[sentiment_data['Topic Thoughts'] != 'No position expressed']

print(sentiment_data)

                      Speaker                 Topics  \
0                    Chairman              emissions   
1      Ms Justice Mary Laffoy                 carbon   
2      Ms Justice Mary Laffoy              emissions   
3      Ms Justice Mary Laffoy              abatement   
9          Deputy Paul Murphy                 carbon   
...                       ...                    ...   
11190  Senator  Aisling Dolan                  water   
11191  Senator  Aisling Dolan              waterways   
11192  Senator  Aisling Dolan                 rivers   
11193  Senator  Aisling Dolan              community   
11194  Senator  Aisling Dolan  social sustainability   

                                          Topic Thoughts Sentiment  \
0      The speaker acknowledges the potential for the...   Neutral   
1      The speaker supports a balanced approach to ca...   Support   
2      The speaker supports a balanced approach to ag...   Support   
3      The speaker supports a balanced approach

In [20]:
def process_thoughts(thought):
    if ':' in thought:
        return thought.split(':', 1)[1].strip()
    else:
        return thought

sentiment_data.loc[:, 'Topic Thoughts'] = sentiment_data['Topic Thoughts'].apply(process_thoughts)


In [21]:
sp = 'Mr. Martin Keane'
t = 'Mr. Keane supports the dairy industry and emphasizes the importance of quality assurance in milk production.'

print(' '.join(sp.split()[:2]))

title = sp.split()[0]
lastname = sp.split()[-1]
title_last_name = title + ' ' + lastname
print(title_last_name)

Mr. Martin
Mr. Keane


In [22]:
def replace_speaker(thought, speaker):
    title = speaker.split()[0]
    lastname = speaker.split()[-1]
    title_last_name = title + ' ' + lastname
    if speaker in thought:
        return thought.replace(speaker, 'The speaker')
    elif title_last_name in thought:
        return thought.replace(title_last_name, 'The speaker')
    else:
        return thought

sentiment_data.loc[:, 'Topic Thoughts'] = sentiment_data.apply(lambda row: replace_speaker(row['Topic Thoughts'], row['Speaker']), axis=1)

print(sentiment_data)

                      Speaker                 Topics  \
0                    Chairman              emissions   
1      Ms Justice Mary Laffoy                 carbon   
2      Ms Justice Mary Laffoy              emissions   
3      Ms Justice Mary Laffoy              abatement   
9          Deputy Paul Murphy                 carbon   
...                       ...                    ...   
11190  Senator  Aisling Dolan                  water   
11191  Senator  Aisling Dolan              waterways   
11192  Senator  Aisling Dolan                 rivers   
11193  Senator  Aisling Dolan              community   
11194  Senator  Aisling Dolan  social sustainability   

                                          Topic Thoughts Sentiment  \
0      The speaker acknowledges the potential for the...   Neutral   
1      The speaker supports a balanced approach to ca...   Support   
2      The speaker supports a balanced approach to ag...   Support   
3      The speaker supports a balanced approach

In [23]:
sentiment_data

Unnamed: 0,Speaker,Topics,Topic Thoughts,Sentiment,Topic Category
0,Chairman,emissions,The speaker acknowledges the potential for the...,Neutral,Carbon
1,Ms Justice Mary Laffoy,carbon,The speaker supports a balanced approach to ca...,Support,Carbon
2,Ms Justice Mary Laffoy,emissions,The speaker supports a balanced approach to ag...,Support,Carbon
3,Ms Justice Mary Laffoy,abatement,The speaker supports a balanced approach to ag...,Support,Carbon
9,Deputy Paul Murphy,carbon,No position expressed on carbon.,Neutral,Carbon
...,...,...,...,...,...
11190,Senator Aisling Dolan,water,No position expressed,Neutral,Water
11191,Senator Aisling Dolan,waterways,No position expressed,Neutral,Water
11192,Senator Aisling Dolan,rivers,No position expressed,Neutral,Water
11193,Senator Aisling Dolan,community,No position expressed,Neutral,Livelihood


In [26]:
sentiment_data.to_excel(r'C:\Jupyter Notebook\Extractions\outputs\refined_data_topics.xlsx', index=False)

In [24]:
def insert_dataframe_into_mongodb(mongo_uri, db_name, collection_name, df):
    client = MongoClient(mongo_uri)
    db = client[db_name]
    collection = db[collection_name]

    data = df.to_dict(orient="records")

    if data:
        result = collection.insert_many(data)
        print(f"{len(result.inserted_ids)} documents inserted successfully.")
        return len(result.inserted_ids)
    else:
        print("No data to insert.")
        return 0

In [25]:
insert_dataframe_into_mongodb('mongodb://localhost:27017/', 'extraction', 'transcript_topics', sentiment_data)

8618 documents inserted successfully.


8618