# Term - Document Matrix for Transcripts

In [1]:
import os
import PyPDF2
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [20]:
import pymongo
from pymongo import MongoClient

### Function to extract the text from PDF Files.

In [3]:
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in range(len(reader.pages)):
            text += reader.pages[page].extract_text()
    return text

In [4]:
file_names = []
categories = []
texts = []
root_dir = r'C:\Users\0132499s\Documents\Documents\Transcript'

### Extract the files names and their categories from the folders.
#### The File Names will be the name of the file and the category will be the sub-folders inside the transcript documents.

In [5]:
for folder_name, subfolders, filenames in os.walk(root_dir):
    for filename in filenames:
        if filename.endswith('.pdf'):
            file_path = os.path.join(folder_name, filename)
            text = extract_text_from_pdf(file_path)
            file_names.append(filename)
            categories.append(os.path.basename(folder_name))
            texts.append(text)

df = pd.DataFrame({
    'File Name' : file_names,
    'Category' : categories,
    'Text' : texts
})

In [6]:
print(df.head())

                                           File Name               Category  \
0  Joint Committee on Agriculture and the Marine ...  Agri, Food and Marine   
1  Joint Committee on Agriculture and the Marine ...  Agri, Food and Marine   
2  Joint Committee on Agriculture and the Marine ...  Agri, Food and Marine   
3  Joint Committee on Agriculture and the Marine ...  Agri, Food and Marine   
4  Joint Committee on Agriculture and the Marine ...  Agri, Food and Marine   

                                                Text  
0  AN COMHCHOISTE UM THALMHAÍOCHT AGUS MUIR\nJOIN...  
1  AN COMHCHOISTE UM THALMHAÍOCHT AGUS MUIR\nJOIN...  
2  AN COMHCHOISTE UM THALMHAÍOCHT AGUS MUIR\nJOIN...  
3  AN COMHCHOISTE UM THALMHAÍOCHT AGUS MUIR\nJOIN...  
4  AN COMHCHOISTE UM THALMHAÍOCHT AGUS MUIR\nJOIN...  


### Remove Stop Words

In [7]:
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    words = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

### Remove grammer other than Nouns

In [8]:
def extract_grammer(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    remove_pos = ['PRP', 'PRP$', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS','IN', 'CC', 'UH', 'DT', 'PDT', 'WDT', 'WP', 'WP$', 'WRB']
    filtered_words = [word for word, pos in tagged_words if pos not in remove_pos]
    return ' '.join(filtered_words)

### Removing commas, special characters with commas, and extra spaces.

In [9]:
def remove_special_char(text):
    # Remove commas
    cleaned_text = text.replace(',', '')
    # Replace special characters with blank spaces
    cleaned_text = re.sub(r'[^A-Za-z0-9\s]', ' ', cleaned_text)
    # Remove extra spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

In [10]:
def remove_extra_spaces_and_numbers(text):
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\b\w\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

### Remove Speaker Titles

In [11]:
speaker_titles = ['Chairman',
'Dr',
'Mr',
'Deputy',
'Ms',
'An Leas Cheann Comhairle',
'Minister',
'Acting Chairman',
'An Ceann Comhairle',
'Senator',
'Co Chairman',
'An Cathaoirleach',
'An Leas-Chathaoirleach',
'Acting Chairperson',
'Professor',
'Vice',
'Clerk',
'Deputies',
'Comptroller',
'Audit',
'General']

def remove_terms(text, terms):
    pattern = r'\b(?:' + '|'.join(re.escape(term) for term in terms) + r')\b'
    return re.sub(pattern, '', text)

### Remove Names: Using PERSON tag from the spacy library.

In [12]:
def remove_names(text):
    doc = nlp(text)
    result = []
    for token in doc:
        if token.ent_type_ != 'PERSON':
            result.append(token.text)
    return ' '.join(result)

### Applying all the cleaning functions to get the final 'Cleaned Text'

In [13]:
df['Cleaned Text'] = df['Text'].apply(remove_stop_words)

In [14]:
df['Cleaned Text'] = df['Cleaned Text'].apply(extract_grammer)
df['Cleaned Text'] = df['Cleaned Text'].apply(remove_special_char)
df['Cleaned Text'] = df['Cleaned Text'].apply(remove_extra_spaces_and_numbers)

In [15]:
df['Cleaned Text'] = df['Cleaned Text'].apply(lambda x: remove_terms(x, speaker_titles))

In [16]:
df['Cleaned Text'] = df['Cleaned Text'].apply(remove_names)

In [17]:
print(df.head())

                                           File Name               Category  \
0  Joint Committee on Agriculture and the Marine ...  Agri, Food and Marine   
1  Joint Committee on Agriculture and the Marine ...  Agri, Food and Marine   
2  Joint Committee on Agriculture and the Marine ...  Agri, Food and Marine   
3  Joint Committee on Agriculture and the Marine ...  Agri, Food and Marine   
4  Joint Committee on Agriculture and the Marine ...  Agri, Food and Marine   

                                                Text  \
0  AN COMHCHOISTE UM THALMHAÍOCHT AGUS MUIR\nJOIN...   
1  AN COMHCHOISTE UM THALMHAÍOCHT AGUS MUIR\nJOIN...   
2  AN COMHCHOISTE UM THALMHAÍOCHT AGUS MUIR\nJOIN...   
3  AN COMHCHOISTE UM THALMHAÍOCHT AGUS MUIR\nJOIN...   
4  AN COMHCHOISTE UM THALMHAÍOCHT AGUS MUIR\nJOIN...   

                                        Cleaned Text  
0  COMHCHOISTE UM THALMHA OCHT AGUS MUIR JOINT CO...  
1  COMHCHOISTE UM THALMHA OCHT AGUS MUIR JOINT CO...  
2  COMHCHOISTE UM THALM

In [18]:
stringtest = df['Cleaned Text'][0]

In [19]:
stringtest

'COMHCHOISTE UM THALMHA OCHT AGUS MUIR JOINT COMMITTEE AGRICULTURE MARINE ardaoin Thursday November Th inig le ch ile Joint Committee Comhalta bh thair Members Senators Boyhan Lombard Donovan Paul Kehoe Brian Leddin thair attendance     IREANN JAM Business Joint Committee   begin members phones Members duration meeting phones aeroplane flight mode device members phones mode may casting system session end meeting deal housekeeping matters correspondence business Agreed Regulation Veterinary Medicinal Products Discussion   proceed consideration changes medicines two sets witnesses representatives Pharmacy Union IPU     Barry   Ian Scott Scott Consulting UK Limited consultant Independent Merchants Association ILMA   Terence Shea ILMA maximise use time call witnesses statements discussion members consideration Department officials regard members Department materials welcome officials meeting evidence Parliament cincts statute absolute privilege participants evidence location Parliament pre

## Checking MongoDB Connections

In [38]:
client = MongoClient('mongodb://localhost:27017/')
db = client['foodsystems']
collection = db['complete_transcripts']
print(f"DB : {db} \nCollection : {collection}")

DB : Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'foodsystems') 
Collection : Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'foodsystems'), 'complete_transcripts')


### Inserting the extracted text in the MongoDB Collection. 
#### The script will first establish the connection and then insert the records if the file name is not already in the collection and update (Cleaned Text and Category) if the file name is already there.

In [47]:
import pandas as pd
from pymongo import MongoClient

def upsert_dataframe_to_mongodb(dataframe, db_name, collection_name, mongo_uri="mongodb://localhost:27017/"):
    try:
        client = MongoClient(mongo_uri)
        db = client[db_name]
        collection = db[collection_name]
        data_dict = dataframe.to_dict("records")
        for record in data_dict:
            collection.update_one(
                {'File Name': record['File Name']},
                {'$set': {'Category': record['Category'], 'Cleaned Text': record.get('Cleaned Text', '')}},
                upsert=True
            )
    except Exception as e:
        print(f"An error occurred: {e}")

In [48]:
upsert_dataframe_to_mongodb(df, 'transcripts', 'complete_documents')