# Extracting Speaker and Texts and Topic Modelling

## The Speaker names and thier corresponding Texts are extracted from the Public Account transcripts, and arranged / categorized by 2 year intervals, followed by Topic Modelling using NMF.

### Importing Libraries

In [1]:
import fitz
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
import os
from pymongo import MongoClient
from langdetect import detect
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

### Reading Files Names and Years

#### Reading the files from the source folder and joining with the MongoDB Metadata Collection to get the year of publish, and categorising each file in a two-year interval category.

In [2]:
def extract_year_pdf_files(base_folder, db_url, db_name, collection_name):
    pdf_files = []
    
    # Recursively traverse through all subfolders
    for root, dirs, files in os.walk(base_folder):
        for file in files:
            if file.endswith('.pdf'):
                pdf_files.append(os.path.join(root, file))
    
    pdf_df = pd.DataFrame([os.path.splitext(os.path.basename(f))[0] for f in pdf_files], columns=['file_name'])
    
    # Connect to MongoDB
    client = MongoClient(db_url)
    db = client[db_name]
    collection = db[collection_name]
    
    # Fetch data from MongoDB collection
    mongo_data = list(collection.find({}, {'_id': 0, 'Title': 1, 'Publish_Year': 1, 'ID': 1}))
    mongo_df = pd.DataFrame(mongo_data)
    mongo_df['Year'] = mongo_df['Publish_Year']
    
    # Normalize file names for merging
    pdf_df['file_name'] = pdf_df['file_name'].str.strip().str.lower()
    mongo_df['Title'] = mongo_df['Title'].str.strip().str.lower()
    
    # Merge DataFrames
    merged_df = pd.merge(pdf_df, mongo_df, left_on='file_name', right_on='Title')
    columns_keep = ['file_name', 'ID', 'Year']
    final_df = merged_df[columns_keep]
    final_df = final_df.sort_values(by='Year')
    
    # Define sliding window function
    def get_sliding_window(year):
        start_year = (year // 2) * 2
        return f"{start_year}-{start_year + 2}"
    
    # Apply sliding window function
    final_df['Sliding_year'] = final_df['Year'].apply(get_sliding_window)
    
    return final_df

In [3]:
folder_path = r'C:\Users\0132499s\Documents\Documents\Transcript\test'
db_url = 'mongodb://localhost:27017/'
db_name = 'foodsystems'
collection_name = 'document_metadata'

files_with_year = extract_year_pdf_files(folder_path, db_url, db_name, collection_name)
print(files_with_year)

                                     file_name   ID  Year Sliding_year
0  seanad éireann debate - monday, 21 jun 2021  425  2021    2020-2022


### Extract file data

#### Extracting File data by parsing each file from the folder and applying another method extract_speaker_text to extract individual speaker and their corresponding texts.

In [4]:
def all_document(root_folder):
    all_data = []
    
    # Recursively traverse through all subfolders
    for root, dirs, files in os.walk(root_folder):
        for filename in files:
            if filename.endswith('.pdf'):
                pdf_path = os.path.join(root, filename)
                extracted_data_index = extract_topics_from_index(pdf_path)
                final_topics, normalized_topics, line_breaks, up_to_line_break_final, normalized_up_to_line_break = zip(*extracted_data_index)
                df = extract_speaker_text(pdf_path, speaker_titles, final_topics, normalized_topics)
                #df = extract_speaker_text(pdf_path, speaker_titles, final_topics, normalized_topics, line_breaks, up_to_line_break_final, normalized_up_to_line_break)
                category = os.path.basename(root)  # Get the subfolder name as category
                df['Category'] = category  # Add the category column
                all_data.append(df)
    
    # Concatenate all DataFrames into a single DataFrame
    combined_df = pd.concat(all_data, ignore_index=True)
    return combined_df

#### Merging the year categories dataframe with the extracted Speaker and Text dataframe, in order to categorise each speaker speeach and thier texts in two-year intervals.

In [5]:
def merge_text_year(text_df, year_df):
    text_df['file_name'] = text_df['file_name'].str.lower()
    combined_df = year_df.merge(text_df, right_on = 'file_name', left_on = 'file_name')
    return combined_df

#### Compiling a list of titles with which the names of speakers starts. This list will be used in extract_speaker_text function to extract the names of the speakers. The string matching will start from these titles and will end at :, which will comprise of the complete name of the Speaker.

In [6]:
speaker_titles = ['Chairman',
'Dr.',
'Mr.',
'Minister',
'Deputy',
'Ms',
'An Leas-Cheann Comhairle',
'Acting Chairman',
'An Ceann Comhairle',
'Senator',
'Co-Chairman',
'An Cathaoirleach',
'An Leas-Chathaoirleach',
'Acting Chairperson',
'Professor',
'Vice',
'Clerk',
'Deputies',
'Comptroller',
'Audit',
'General',
'The Taoiseach',
'(Deputy',
'The Tánaiste']

### Extracting the topics from the first page

In [7]:
def normalize_text(text):
    # Remove all spaces and newlines for comparison purposes
    return re.sub(r'\s+', '', text).strip()

def extract_topics_from_index(pdf_path):
    doc = fitz.open(pdf_path)
    first_page_text = doc[0].get_text("text")
    
    # Updated regex pattern to capture topics correctly, including apostrophes and handling line breaks
    index_pattern = re.compile(r"\d{2}/\d{2}/\d{4}[A-Z0-9]*\d+\s*(.+?)(?=\s+\d{2}/\d{2}/\d{4}|$)", re.DOTALL)
    topics = index_pattern.findall(first_page_text)
    
    # Cleaning and formatting the extracted topics
    def clean_topic(topic):
        topic = re.sub(r'[^A-Za-zÁáÉéÍíÓóÚú0-9\s\-\(\)\[\]:,\'’]', '', topic).strip()
        topic = re.sub(r'(?<=\D)(?=\d)', ' ', topic)
        topic = re.sub(r'\s*\d+\s*$', '', topic).strip()  # Remove trailing page numbers
        topic = re.sub(r'\s+', ' ', re.sub(r'(?<=\w)-\s+(?=\d)', '-', topic))
        return topic
    
    cleaned_topics = [clean_topic(topic) for topic in topics]
    final_topics = [clean_topic(topic) for topic in cleaned_topics]
    
    # Normalize the topics for comparison
    normalized_topics = [normalize_text(topic) for topic in final_topics]
    
    # Determine if a topic has a line break and extract the string up until the line break
    line_breaks = [1 if '\n' in topic else 0 for topic in topics]
    up_to_line_break = [topic.split('\n')[0] if '\n' in topic else topic for topic in topics]
    up_to_line_break_final = [clean_topic(topic) for topic in up_to_line_break]
    normalized_up_to_line_break = [normalize_text(topic) for topic in up_to_line_break_final]

    return list(zip(final_topics, normalized_topics, line_breaks, up_to_line_break_final, normalized_up_to_line_break))


In [8]:
pdf_p = r'C:\Users\0132499s\Documents\Documents\Transcript\test\Seanad Éireann debate - Monday, 21 Jun 2021.pdf'
topics = extract_topics_from_index(pdf_p)
print(topics)
topics_df =pd.DataFrame(topics)

[('Gnó an tSeanaid - Business of Seanad', 'GnóantSeanaid-BusinessofSeanad', 0, 'Gnó an tSeanaid - Business of Seanad', 'GnóantSeanaid-BusinessofSeanad'), ('Nithe i dtosach suíonna - Commencement Matters', 'Nitheidtosachsuíonna-CommencementMatters', 0, 'Nithe i dtosach suíonna - Commencement Matters', 'Nitheidtosachsuíonna-CommencementMatters'), ('Travel Documents', 'TravelDocuments', 0, 'Travel Documents', 'TravelDocuments'), ('School Accommodation', 'SchoolAccommodation', 0, 'School Accommodation', 'SchoolAccommodation'), ('Job Creation', 'JobCreation', 0, 'Job Creation', 'JobCreation'), ('Pharmacy Services', 'PharmacyServices', 0, 'Pharmacy Services', 'PharmacyServices'), ('Employment Support Services', 'EmploymentSupportServices', 0, 'Employment Support Services', 'EmploymentSupportServices'), ('Defence Forces', 'DefenceForces', 0, 'Defence Forces', 'DefenceForces'), ('Gnó an tSeanaid - Business of Seanad', 'GnóantSeanaid-BusinessofSeanad', 0, 'Gnó an tSeanaid - Business of Seanad',

In [9]:
s = 'Covid- 19 Pandemic'
print(s.split)

<built-in method split of str object at 0x00000276EDB08870>


In [31]:
def normalize_text(text):
    return re.sub(r'\s+', ' ', text).strip()

t = '''
Circular Economy, Waste Management (Amendment) and Minerals Development 
(Amendment) Bill 2022: Referral to Select Committee
'''
t2 = 'Circular Economy Waste Management (Amendment) and Minerals Development (Amendment) Bill 2022: Re ferral to Select Committee'
print(normalize_text(t))
print(normalize_text(t2))

Circular Economy, Waste Management (Amendment) and Minerals Development (Amendment) Bill 2022: Referral to Select Committee
Circular Economy Waste Management (Amendment) and Minerals Development (Amendment) Bill 2022: Re ferral to Select Committee


#### Extracting the Speaker and their texts from the PDF files.

In [11]:
def extract_speaker_text(pdf_path, speaker_titles, topics, normalized_topics):
    data = []
    speaker_pattern = re.compile(r"^\s*(" + "|".join(re.escape(title) for title in speaker_titles) + r")\s*[^:]*:")
    unwanted_pattern1 = re.compile(r"\d{2}/\d{2}/\d{4}[A-Z0-9]*\d(?=[A-Za-z.])")
    unwanted_pattern2 = re.compile(r"\d{2}/\d{2}/\d{4}[A-Z0-9]*\s*\d+")
    time_pattern = re.compile(r"\b\d{1,2} o’clock\b", re.IGNORECASE)
    attached_time_pattern = re.compile(r"(\w*)\d{1,2} o’clock(\w*)", re.IGNORECASE)
    current_topic = None
    file_name = os.path.splitext(os.path.basename(pdf_path))[0]
    pending_speaker = None
    pending_speech = ""

    doc = fitz.open(pdf_path)
    for page_num, page in enumerate(doc, start=1):
        text = page.get_text("text")
        text = time_pattern.sub("", text)
        text = attached_time_pattern.sub(lambda m: (m.group(1) or '') + (m.group(2) or ''), text)
        lines = text.split('\n')

        for i, line in enumerate(lines):
            line = unwanted_pattern1.sub("", line).strip()
            line = unwanted_pattern2.sub("", line).strip()
            normalized_line = normalize_text(line)
            for original_topic, normalized_topic in zip(topics, normalized_topics):
                if normalized_line == normalized_topic:
                    if pending_speaker:
                        pending_speech = unwanted_pattern1.sub("", pending_speech).strip()
                        pending_speech = unwanted_pattern2.sub("", pending_speech).strip()
                        data.append({"file_name": file_name, "Topic": current_topic, "Speaker": pending_speaker, "Exact Text": pending_speech})
                        pending_speaker = None
                        pending_speech = ""
                    current_topic = original_topic
                    break
            else:
                match = speaker_pattern.match(line)
                if match:
                    if pending_speaker:
                        pending_speech = unwanted_pattern1.sub("", pending_speech).strip()
                        pending_speech = unwanted_pattern2.sub("", pending_speech).strip()
                        data.append({"file_name": file_name, "Topic": current_topic, "Speaker": pending_speaker, "Exact Text": pending_speech})
                    pending_speaker = match.group().strip(":")
                    pending_speech = line.split(":", 1)[1].strip()
                else:
                    if pending_speaker:
                        pending_speech += " " + line.strip()
                    else:
                        if i + 1 < len(lines) and lines[i + 1].strip().startswith("("):
                            line += " " + lines[i + 1].strip()
                            match = speaker_pattern.match(line)
                            if match:
                                pending_speaker = match.group().strip(":")
                                pending_speech = line.split(":", 1)[1].strip()
                                continue

    if pending_speaker:
        pending_speech = unwanted_pattern1.sub("", pending_speech).strip()
        pending_speech = unwanted_pattern2.sub("", pending_speech).strip()
        data.append({"file_name": file_name, "Topic": current_topic, "Speaker": pending_speaker, "Exact Text": pending_speech.strip()})

    df = pd.DataFrame(data)
    return df

In [12]:
def test_partial_topic_matching():
    norm_up_to_line_break = 'PlanningandDevelopment(SolarPanelsforPublicBuildings,Schools,HomesandOtherPremises)(Amend'
    normalized_line = 'PlanningandDevelopment(SolarPanelsforPublicBuildings,Schools,Homesand'
    
    if normalized_line == norm_up_to_line_break[:len(normalized_line)]:
        print("Match found")
    else:
        print("No match found")

test_partial_topic_matching()


Match found


In [13]:
import re

# Updated regex pattern to handle no space between date and following text
unwanted = re.compile(r"\d{2}/\d{2}/\d{4}[A-Z0-9]*\d+")

l1 = '''
2 p.m. 5/07/2015Y00100Events at Ballymurphy in 1971 and Legacy Issues: '''
line = unwanted.sub("", l1).strip()
print(line)


2 p.m. 5/07/2015Y00100Events at Ballymurphy in 1971 and Legacy Issues:


### Data Pre-Processing and Cleaning

#### Remove Header and Footer from each page. Each page contains either Date or Abbrevaited Department Name as Header and Page Number as Footer.

In [14]:
headers = ['Dáil Éireann']

In [15]:
def remove_header_footer(text, headers):
    # Create a regex pattern to match any string from the headers list and dates
    header_pattern = r"(" + "|".join(re.escape(header) for header in headers) + r")"
    date_pattern = r"\d{1,2}\s+(January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}"
    page_number_pattern = r"\d+"

    combined_pattern = re.compile(
        rf"(\s*{header_pattern}\s*{page_number_pattern}\s*|\s*{page_number_pattern}\s*{header_pattern}\s*|\s*{date_pattern}\s*{page_number_pattern}\s*|\s*{page_number_pattern}\s*{date_pattern}\s*)", re.IGNORECASE
    )
    
    cleaned_text = combined_pattern.sub(" ", text)
    return cleaned_text

    
text = 'public health nurse who will do an assessment and make a referral for 29 November 2023 613 the vital home care supports. e well educated and enthusiastic about their job. They are finding Dáil Éireann 614 a huge impediment getting a place to live in the areas we are talking about.'
cleaned_text = remove_header_footer(text, headers)
print(cleaned_text)

public health nurse who will do an assessment and make a referral for the vital home care supports. e well educated and enthusiastic about their job. They are finding a huge impediment getting a place to live in the areas we are talking about.


#### Removing Stopwords from the text.

In [16]:
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

t = 'correspondence committee meeting agreed department item dated note members public ask information matter week witnesses audir regarding publish statements clerk secretary session NAMA issues issue accounts propose minutes HSE local education financial board published county received requested day make hear housing letter come respect june want items meetings matters individual proposed discussion april questions privilege chief school follow 2017 deal'
print(remove_stop_words(t))

correspondence committee meeting agreed department item dated note members public ask information matter week witnesses audir regarding publish statements clerk secretary session NAMA issues issue accounts propose minutes HSE local education financial board published county received requested day make hear housing letter come respect june want items meetings matters individual proposed discussion april questions privilege chief school follow 2017 deal


#### Removing the grammer terms; Personal and Possesive Nouns, Verb, Adjectives, Adverbs, Preposition, Conjunction, Interjection, Determiner, Pronouns. Keeping everything else from the text, which should include mostly Nouns, numbers and some other words which remained untagged.

In [17]:
def extract_grammer(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    remove_pos = ['PRP', 'PRP$', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS','IN', 'CC', 'UH', 'DT', 'PDT', 'WDT', 'WP', 'WP$', 'WRB']
    filtered_words = [word for word, pos in tagged_words if pos not in remove_pos]
    return ' '.join(filtered_words)
    #return tagged_words

t = 'correspondence committee meeting agreed department item dated note members public ask information matter week witnesses audir regarding publish statements clerk secretary session NAMA issues issue accounts propose minutes HSE local education financial board published county received requested day make hear housing letter come respect june want items meetings matters individual proposed discussion april questions privilege chief school follow 2017 deal'
print(extract_grammer(t))

correspondence committee department item members information matter week witnesses statements secretary session NAMA issues issue accounts minutes HSE education board day housing letter respect june items meetings matters discussion april questions school 2017 deal


#### Remove words with just two or less letters. As well as, sentences with less than five words. Since, in both these cases, the words / sentences are not very informative.

In [18]:
def remove_two_letter_words(text):
    return ' '.join(word for word in text.split() if len(word) > 2)
                    
def remove_line_less_than_five(df):
    def word_count(text):
        return len(text.split())
        
    df['Text'] = df['Text'].apply(remove_two_letter_words)
        
    filtered_df = df[df['Text'].apply(word_count) > 5]
    return filtered_df

#### Removing commas, special characters with commas, and extra spaces.

In [19]:
def remove_special_char(text):
    # Remove commas
    cleaned_text = text.replace(',', '')
    # Replace special characters with blank spaces
    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', ' ', cleaned_text)
    # Remove extra spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

t = 'correspondence committee meeting agreed department item dated note members public ask information matter week witnesses audir regarding publish statements clerk secretary session NAMA issues issue accounts propose minutes HSE local education financial board published county received requested day make hear housing letter come respect june want items meetings matters individual proposed discussion april questions privilege chief school follow 2017 deal'
print(remove_special_char(t))

correspondence committee meeting agreed department item dated note members public ask information matter week witnesses audir regarding publish statements clerk secretary session NAMA issues issue accounts propose minutes HSE local education financial board published county received requested day make hear housing letter come respect june want items meetings matters individual proposed discussion april questions privilege chief school follow 2017 deal


#### Removing extra spaces and numbers, since they are not very informative without context.

In [20]:
def remove_extra_spaces_and_numbers(text):
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\b\w\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

t = 'cember 5 30 p m agenda examation update eradication TB may meetg Wednesday morng new CBF classifications confirmed likely jot committee adjourned 8 50 p m 5 30 p m W'
print(remove_extra_spaces_and_numbers(t))

cember agenda examation update eradication TB may meetg Wednesday morng new CBF classifications confirmed likely jot committee adjourned


#### Remove names of speakers from the final text, since they are just addressing each other in a dialog and it is non-informative.

In [21]:
def remove_speaker_words(df):
    all_speaker_words = set(speaker_titles)
    for speaker_name in df['Speaker']:
        speaker_words = speaker_name.split()
        all_speaker_words.update(speaker_words)
    print(f"All speaker words :{all_speaker_words}")
    for index, row in df.iterrows():
        text = row['Text']
        for word in all_speaker_words:
            text = text.replace(word, '')
        df.at[index, 'Text'] = text.strip()
    return df

#### A function to detect the language of a piece of text, since some of the texts are available in Irish.

In [22]:
def detect_language(text):
    try:
        lang = detect(text)
        return 'Irish' if lang == 'ga' else 'English'
    except:
        return 'Unknown'

### Extracting all the texts from all the PDF files in one go and storing all the data in a dataframe, df_with_text.

In [23]:
df_with_text = all_document(folder_path)
print(df_with_text.shape)
print(df_with_text)

(302, 5)
                                       file_name  \
0    Seanad Éireann debate - Monday, 21 Jun 2021   
1    Seanad Éireann debate - Monday, 21 Jun 2021   
2    Seanad Éireann debate - Monday, 21 Jun 2021   
3    Seanad Éireann debate - Monday, 21 Jun 2021   
4    Seanad Éireann debate - Monday, 21 Jun 2021   
..                                           ...   
297  Seanad Éireann debate - Monday, 21 Jun 2021   
298  Seanad Éireann debate - Monday, 21 Jun 2021   
299  Seanad Éireann debate - Monday, 21 Jun 2021   
300  Seanad Éireann debate - Monday, 21 Jun 2021   
301  Seanad Éireann debate - Monday, 21 Jun 2021   

                                                 Topic  \
0                 Gnó an tSeanaid - Business of Seanad   
1                                     Travel Documents   
2                                     Travel Documents   
3                                     Travel Documents   
4                                 School Accommodation   
..                

In [24]:
distinct_topics = df_with_text['Topic'].unique()
print(distinct_topics)

['Gnó an tSeanaid - Business of Seanad' 'Travel Documents'
 'School Accommodation' 'Job Creation' 'Pharmacy Services'
 'Employment Support Services' 'Defence Forces'
 'An tOrd Gnó - Order of Business'
 'Eleventh Report of Committee on Parliamentary Privileges and Oversight: Motion'
 'Climate Action and Low Carbon Development (Amendment) Bill 2021: Second Stage'
 'Aviation Sector: Statements'
 'Gender Pay Gap Information Bill 2019: Committee Stage']


In [26]:
def save_to_excel(df, excel_path):
    df.to_excel(excel_path, index=False)

excel_path_separate = r'C:\Users\0132499s\Documents\Documents\Transcript\test\text.xlsx'

save_to_excel(df_with_text, excel_path_separate)

In [48]:
def normalize_text(text):
    # Remove all spaces and newlines for comparison purposes
    return re.sub(r'\s+', '', text).strip()

def update_topics_with_normalization(df, topics_df):
    # Filter topics_df to only include rows where line_break is 1
    filtered_topics_df = topics_df[topics_df[2] == 1]
    print(str(filtered_topics_df[4]))
    # Iterate through each row in the DataFrame
    for i in range(len(df)):
        # Split the Exact Text into separate lines
        lines = df.at[i, 'Exact Text'].split('. ')
        for line in lines:
            normalized_line = normalize_text(line)
            print(f"LINE: {line}, NORMALIZED: {normalized_line}")
            # Check if the normalized line matches the start of any normalized topic
            for norm_up_to_line_break in filtered_topics_df[4]:
                print(f"NORMALIZED TOPIC: {norm_up_to_line_break}, NORMALIZED: {normalized_line}")
                if normalized_line == norm_up_to_line_break[:len(normalized_line)]:
                    print('norm_up_to_line_break: ',norm_up_to_line_break)
                    # Remove the partial topic from the Exact Text
                    df.at[i, 'Exact Text'] = df.at[i, 'Exact Text'].replace(line, '').strip()
                    # Update the Topic for subsequent rows until a new topic is encountered
                    partial_topic = line.strip()
                    for j in range(i + 1, len(df)):
                        if df.at[j, 'Topic'] == '':
                            df.at[j, 'Topic'] = partial_topic
                        else:
                            break
                    break
    return df

In [None]:
PlanningandDevelopment(SolarPanelsforPublicBuildings,Schools,HomesandOtherPremises)(Amendment)Bill2021
PlanningandDevelopment(SolarPanelsforPublicBuildings,Schools,HomesandOtherPremises)(Amend

In [49]:
updated_df = update_topics_with_normalization(df_with_text, topics_df)
print(updated_df)

10    PlanningandDevelopment(SolarPanelsforPublicBui...
Name: 4, dtype: object
LINE: I have received notice from Senator Regina Doherty that, on the motion for the Commencement of the House today, she proposes to raise the following matter: The need for the Minister of State with responsibility for public procurement and eGov­ ernment to outline his plan for the implementation and operation of the EU digital Covid certificate in Ireland, NORMALIZED: IhavereceivednoticefromSenatorReginaDohertythat,onthemotionfortheCommencementoftheHousetoday,sheproposestoraisethefollowingmatter:TheneedfortheMinisterofStatewithresponsibilityforpublicprocurementandeGov­ernmenttooutlinehisplanfortheimplementationandoperationoftheEUdigitalCovidcertificateinIreland
NORMALIZED TOPIC: PlanningandDevelopment(SolarPanelsforPublicBuildings,Schools,HomesandOtherPremises)(Amend, NORMALIZED: IhavereceivednoticefromSenatorReginaDohertythat,onthemotionfortheCommencementoftheHousetoday,sheproposestoraisethefollowingmat

In [87]:
distinct_topics_updated_df = updated_df['Topic'].unique()
print(distinct_topics_updated_df)

['Gnó an tSeanaid - Business of Seanad' 'Travel Documents'
 'School Accommodation' 'Job Creation' 'Pharmacy Services'
 'Employment Support Services' 'Defence Forces'
 'An tOrd Gnó - Order of Business'
 'Eleventh Report of Committee on Parliamentary Privileges and Oversight: Motion'
 'Climate Action and Low Carbon Development (Amendment) Bill 2021: Second Stage'
 'Aviation Sector: Statements'
 'Gender Pay Gap Information Bill 2019: Committee Stage']


In [75]:
def save_to_excel(df, excel_path):
    df.to_excel(excel_path, index=False)

excel_path_separate = r'C:\Users\0132499s\Documents\Documents\Transcript\test\test2.xlsx'

save_to_excel(updated_df, excel_path_separate)

### Merging the extracted dataframe with the year category dataframe, so that each of the Speaker and Texts is now categorised in two-year intervals.

In [67]:
f = 'Dáil Éireann debate - Thursday, 31 Mar 2022'
matching_row = files_with_year.loc[files_with_year['file_name'] == f.lower()]
print(matching_row)

                                     file_name   ID  Year Sliding_year
0  dáil éireann debate - thursday, 31 mar 2022  512  2022    2022-2024


In [68]:
final_df = merge_text_year(df_with_text, files_with_year)
print(final_df)

                                         file_name   ID  Year Sliding_year  \
0     dáil éireann debate - wednesday, 15 jul 2015   59  2015    2014-2016   
1     dáil éireann debate - wednesday, 15 jul 2015   59  2015    2014-2016   
2     dáil éireann debate - wednesday, 15 jul 2015   59  2015    2014-2016   
3     dáil éireann debate - wednesday, 15 jul 2015   59  2015    2014-2016   
4     dáil éireann debate - wednesday, 15 jul 2015   59  2015    2014-2016   
...                                            ...  ...   ...          ...   
1867  dáil éireann debate - wednesday, 29 nov 2023  669  2023    2022-2024   
1868  dáil éireann debate - wednesday, 29 nov 2023  669  2023    2022-2024   
1869  dáil éireann debate - wednesday, 29 nov 2023  669  2023    2022-2024   
1870  dáil éireann debate - wednesday, 29 nov 2023  669  2023    2022-2024   
1871  dáil éireann debate - wednesday, 29 nov 2023  669  2023    2022-2024   

                                                  Topic  \
0   

In [69]:
distinct_topic_count = final_df.groupby('Topic').size().reset_index(name = 'Counts')
print(distinct_topic_count)

                                                Topic  Counts
0                                Agriculture Industry       5
1                     An tOrd Gnó - Order of Business       3
2                           Apprenticeship Programmes       1
3                         Aviation Sector: Statements       2
4                                       Beef Industry       2
..                                                ...     ...
79                                   Travel Documents       1
80                         Visit of Kenyan Delegation       1
81             Women’s Health Action Plan: Statements       1
82                                Youth Work Projects       4
83  Ábhair Shaincheisteanna Tráthúla - Topical Iss...       2

[84 rows x 2 columns]


### Data Pre-Processing and Cleaning steps applied on the final_df, which has Speaker and Text details categorised by years.

In [70]:
# Remove Header-Footer
final_df['Text'] = final_df['Exact Text'].apply(lambda x: remove_header_footer(x, headers))
final_df['Exact Text'] = final_df['Exact Text'].apply(lambda x: remove_header_footer(x, headers))
final_df['Exact Text'] = final_df['Exact Text'].str.replace('\u00A0', ' ', regex=False)

# Remove lines with less than 5 words
final_df = remove_line_less_than_five(final_df)

# Remove stop words
final_df['Text'] = final_df['Text'].apply(remove_stop_words)

# Remove
final_df['Text'] = final_df['Text'].apply(extract_grammer)

# Remove speaker names from the final speeches
final_df = remove_speaker_words(final_df)

final_df['Text'] = final_df['Text'].apply(remove_special_char)

# Remove extra spaces from the Speaker names
final_df['Speaker'] = final_df['Speaker'].apply(remove_extra_spaces_and_numbers)
final_df['Text'] = final_df['Text'].apply(remove_extra_spaces_and_numbers)

All speaker words :{'Comhairle', 'Sherlock', 'Professor', 'Frankie', 'Patrick', 'Durkan)', 'Corcoran', 'Donnghaile', 'Browne', 'Paul', 'Simon', 'Andrew', 'Ross', 'Children', 'An Ceann Comhairle', 'Matt', 'Reilly', 'Lochlainn', 'General', 'Higgins', 'O’Brien)', 'Richard', 'Seery', 'Denise', 'Keating)', 'Whitmore', 'Robbie', 'McEntee)', 'Buttimer)', 'Environment,', 'Shane', 'Mary', 'Ríordáin', 'Fleming', 'Berry', 'Sharon', 'Farrell', 'McEntee', 'The Tánaiste', 'Troy)', 'Communications', 'Chairman', 'Doyle', 'Smith', 'Butler)', 'Dolan)', 'Ruairí', 'Crowe', 'McGahon)', 'Cannon', 'Marine', 'Ferris', 'Brendan', 'Josepha', 'Neville', 'Protection', 'Warfield', 'Louise', 'Fergus', 'Timmy', 'for', 'Cathal', 'Gallagher', 'McLellan', 'Pa', 'Currie', 'Bacik', 'Joan', 'Smyth', 'McGreehan', 'Sorca', 'Chairperson', 'Tom', 'O’Gorman', 'Gníomhach', 'Paschal', 'Gino', 'Jennifer', 'Vincent', 'Mark', 'Health', 'Damien', 'Mitchell', 'Bernard', 'Ahearn', 'Sean', 'Sport', 'Horkan', 'Róisín', 'Garvey', 'Cullin

In [71]:
final_df = final_df[final_df['Topic'].notna()
&
final_df['Topic'].str.strip().astype(bool)]

In [72]:
distinct_topics = final_df['Topic'].unique()
print(distinct_topics)

['Company Closures'
 'Child Care (Guardian Ad Litem) Bill 2015: First Stage 60 5072015Y 00100Events at Ballymurphy in 1971 and Legacy Issues: Motion'
 'Company Law' 'Employment Rights' 'IDA Supports' 'Trade Agreements'
 'Enterprise Support Schemes' 'Departmental Communications'
 'Construction Contracts'
 'Defence (Amendment) Bill 2015 [Seanad]: Second Stage'
 'Defence (Amendment) Bill 2015 [Seanad]: Committee and Remaining Stages'
 'Leaders’ Questions' 'Order of Business' 'Northern Ireland: Statements'
 'Message from Seanad' 'Tax Code' 'Apprenticeship Programmes'
 'Teaching Qualifications' 'Cycling Facilities Funding'
 'Climate Action and Low Carbon Development Bill 2015: Report and Final Stages 120 2015DDD 00050Children (Amendment) Bill 2015 [Seanad]: Order for Report Stage'
 'Children (Amendment) Bill 2015 [Seanad]: Report and Final Stages 128 0Petroleum (Exploration and Extraction) Safety Bill 2015 [Seanad]: Second Stage'
 'Social Services and Support: Motion (Resumed) [Private Memb

In [73]:
distinct_speaker = final_df['Speaker'].unique()
print(distinct_speaker)

['Deputy Dara Calleary' 'Deputy Gerald Nash' 'Deputy Peadar Tóibín'
 'Minister for Jobs, Enterprise and Innovation (Deputy Richard Bruton)'
 'Deputy Richard Bruton' 'Deputy Clare Daly' 'Deputy Damien English'
 '(Deputy Richard Bruton)' 'An Leas-Cheann Comhairle'
 'Deputy Terence Flanagan' '(Deputy Gerald Nash)' 'Deputy John Browne'
 'Minister for Defence (Deputy Simon Coveney)' 'Deputy Seán Fearghaíl'
 'Deputy Simon Coveney' 'Deputy Denis Naughten'
 'Acting Chairman (Deputy Liam Twomey)' 'Deputy Gabrielle McFadden'
 'Deputy Micheál Martin' 'Deputy Noel Coonan' 'An Ceann Comhairle'
 'The Taoiseach' 'Deputy Barry Cowen' 'Deputy Willie ’Dea'
 'Deputy Mattie McGrath' 'Deputy Mary Lou McDonald'
 'Deputy Finian McGrath' 'Deputy Joe Carey' 'Deputy James Reilly'
 'Deputy Dessie Ellis' 'Deputy Ray Butler' 'Deputy Brian Stanley'
 'Deputy Paul Murphy' 'Deputy Martin Ferris' 'Deputy Mick Wallace'
 'Deputy Eric Byrne' 'Deputy Róisín Shortall' 'Deputy Paul Kehoe'
 'Deputy Ruth Coppinger' 'Deputy Mar

### Append the Topics at the starting of the Exact Text string, in a separate column.

In [74]:
def append_topic(row):
    return f"'{row['Topic'].lower()}' {row['Exact Text']}"

final_df['Exact Text For Combined'] = final_df['Exact Text']
final_df['Exact Text'] = final_df.apply(append_topic, axis = 1)

In [75]:
print(final_df)

                                         file_name   ID  Year Sliding_year  \
0     dáil éireann debate - wednesday, 15 jul 2015   59  2015    2014-2016   
1     dáil éireann debate - wednesday, 15 jul 2015   59  2015    2014-2016   
2     dáil éireann debate - wednesday, 15 jul 2015   59  2015    2014-2016   
3     dáil éireann debate - wednesday, 15 jul 2015   59  2015    2014-2016   
4     dáil éireann debate - wednesday, 15 jul 2015   59  2015    2014-2016   
...                                            ...  ...   ...          ...   
1867  dáil éireann debate - wednesday, 29 nov 2023  669  2023    2022-2024   
1868  dáil éireann debate - wednesday, 29 nov 2023  669  2023    2022-2024   
1869  dáil éireann debate - wednesday, 29 nov 2023  669  2023    2022-2024   
1870  dáil éireann debate - wednesday, 29 nov 2023  669  2023    2022-2024   
1871  dáil éireann debate - wednesday, 29 nov 2023  669  2023    2022-2024   

                                                  Topic  \
0   

### Export as Excel
#### Saving both dataframes with separate Speaker Texts and with grouped by Speaker and Year Interval Texts as excel files. Can be skipped since the complete 'Text' did not fit in the excel, more than half of the 'Text' got cut off.

In [None]:
def save_to_excel(df, excel_path):
    df.to_excel(excel_path, index=False)

excel_path_separate = r'C:\Users\0132499s\Documents\Documents\Transcript\test\SlidingYear_Separate_Speaker.xlsx'

save_to_excel(final_df, excel_path_separate)

### Update: 15.10.2024

#### Pillar I of the technical architecture does not require grouped texts, since we will be extracting the exact texts said by the stakeholder. So inserting each text individually for each dialogue.

The function will check if there is already a records available for same 'Speaker', 'File Name' and 'Exact Text', the insert will only be done if no such record is available in the collection.

In [54]:
def store_exact_df_in_mongodb(df, db_url, db_name, collection_name):
    client = MongoClient(db_url)
    db = client[db_name]
    collection = db[collection_name]
    inserted_count = 0
    
    for index, row in df.iterrows():
        query = {
            "Speaker": row["Speaker"],
            "file_name": row["file_name"],
            "Exact Text": row["Exact Text"],
            "Exact Text For Combined": row["Exact Text For Combined"]
        }
        
        # Check if the document already exists
        if collection.count_documents(query) == 0:
            record = row.to_dict()
            collection.insert_one(record)
            inserted_count += 1
    
    print(f'Number of records inserted: {inserted_count}')

In [55]:
store_exact_df_in_mongodb(final_df, db_url, 'transcripts', 'P1_speaker_speech')

Number of records inserted: 93292


### Store the extracted topics in the 'keywords' collection.

In [11]:
def insert_topics_to_mongodb(folder_path, db_name, collection_name):
    # Connect to MongoDB
    client = MongoClient('mongodb://localhost:27017/')
    db = client[db_name]
    collection = db[collection_name]
    
    # Iterate through all PDF files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(folder_path, filename)
            final_topics, _ = extract_topics_from_index(pdf_path)
            
            # Convert topics to lowercase and insert into MongoDB if not already present
            for topic in final_topics:
                topic_lower = topic.lower()
                if not collection.find_one({'keywords': topic_lower}):
                    collection.insert_one({'keywords': topic_lower})

folder_path = r'C:\Users\0132499s\Documents\Documents\Transcripts - Miscellanous\Dail Eireann'
db_name = 'foodsystems'
collection_name = 'keywords'
insert_topics_to_mongodb(folder_path, db_name, collection_name)

### To check if any of the rows has more than 32,000 characters(maximum limit for an excel cell) for any Text column.

In [77]:
long_text_row = final_df[final_df['Text'].str.len() > 32000]
if not long_text_row.empty:
    print("Row with more than 32000 characters found: ")
    print(long_text_row)
else:
    print("No rows with more than 32000 characters")

No rows with more than 32000 characters
